[ovs-dev] [PATCH v2 3/3] dpctl: Support flush conntrack by conntrack 5-tuple
With this patch, "flush-conntrack" in ovs-dpctl and ovs-appctl accept a conntrack 5-tuple to delete the conntrack entry specified by the 5-tuple. For example, user can use the following command to flush a conntrack entry in zone 5. $ ovs-dpctl flush-conntrack zone=5 \ 'ct_nw_src=10.1.1.2,ct_nw_dst=10.1.1.1,ct_nw_proto=17,ct_tp_src=2,ct_tp_dst=1' $ ovs-appctl dpctl/flush-conntrack zone=5 \ 'ct_nw_src=10.1.1.2,ct_nw_dst=10.1.1.1,ct_nw_proto=17,ct_tp_src=2,ct_tp_dst=1' VMWare-BZ: #1983178 Signed-off-by: Yi-Hung Wei--- NEWS| 2 + lib/ct-dpif.c | 108 lib/ct-dpif.h | 1 + lib/dpctl.c | 76 +++--- lib/dpctl.man | 20 +++-- tests/system-traffic.at | 64 utilities/ovs-dpctl.c | 4 +- 7 files changed, 253 insertions(+), 22 deletions(-) diff --git a/NEWS b/NEWS index a93237f9078b..198ab7e7d09f 100644 --- a/NEWS +++ b/NEWS @@ -12,6 +12,8 @@ Post-v2.8.0 IPv6 packets. - Linux kernel 4.13 * Add support for compiling OVS with the latest Linux 4.13 kernel + - "flush-conntrack" in ovs-dpctl and ovs-appctl now accept a 5-tuple to + delete a specific connection tracking entry. v2.8.0 - 31 Aug 2017 diff --git a/lib/ct-dpif.c b/lib/ct-dpif.c index 5cd6b5cfd870..3cad5c928e8b 100644 --- a/lib/ct-dpif.c +++ b/lib/ct-dpif.c @@ -20,6 +20,7 @@ #include #include "ct-dpif.h" +#include "openvswitch/ofp-parse.h" #include "openvswitch/vlog.h" VLOG_DEFINE_THIS_MODULE(ct_dpif); @@ -434,3 +435,110 @@ ct_dpif_format_tcp_stat(struct ds * ds, int tcp_state, int conn_per_state) ds_put_cstr(ds, "]"); ds_put_format(ds, "=%u", conn_per_state); } + +/* Parses a specification of a conntrack 5-tuple from 's' into 'tuple'. + * Returns true on success. Otherwise, returns false and puts the error + * message in 'ds'. 
*/ +bool +ct_dpif_parse_tuple(struct ct_dpif_tuple *tuple, const char *s, struct ds *ds) +{ +char *pos, *key, *value, *copy; +memset(tuple, 0, sizeof *tuple); + +pos = copy = xstrdup(s); +while (ofputil_parse_key_value(, , )) { +if (!*value) { +ds_put_format(ds, "field %s missing value", key); +goto error; +} + +if (!strcmp(key, "ct_nw_src") || !strcmp(key, "ct_nw_dst")) { +if (tuple->l3_type && tuple->l3_type != AF_INET) { +ds_put_cstr(ds, "L3 type set multiple times"); +goto error; +} else { +tuple->l3_type = AF_INET; +} +if (!ip_parse(value, key[6] == 's' ? >src.ip : + >dst.ip)) { +goto error_with_msg; +} +} else if (!strcmp(key, "ct_ipv6_src") || + !strcmp(key, "ct_ipv6_dst")) { +if (tuple->l3_type && tuple->l3_type != AF_INET6) { +ds_put_cstr(ds, "L3 type set multiple times"); +goto error; +} else { +tuple->l3_type = AF_INET6; +} +if (!ipv6_parse(value, key[8] == 's' ? >src.in6 : + >dst.in6)) { +goto error_with_msg; +} +} else if (!strcmp(key, "ct_nw_proto")) { +char *err = str_to_u8(value, key, >ip_proto); +if (err) { +free(err); +goto error_with_msg; +} +} else if (!strcmp(key, "ct_tp_src") || !strcmp(key,"ct_tp_dst")) { +uint16_t port; +char *err = str_to_u16(value, key, ); +if (err) { +free(err); +goto error_with_msg; +} +if (key[6] == 's') { +tuple->src_port = htons(port); +} else { +tuple->dst_port = htons(port); +} +} else if (!strcmp(key, "icmp_type") || !strcmp(key, "icmp_code") || + !strcmp(key, "icmp_id") ) { +if (tuple->ip_proto != IPPROTO_ICMP && +tuple->ip_proto != IPPROTO_ICMPV6) { +ds_put_cstr(ds, "Invalid L4 fields"); +goto error; +} +uint16_t icmp_id; +char *err; +if (key[5] == 't') { +err = str_to_u8(value, key, >icmp_type); +} else if (key[5] == 'c') { +err = str_to_u8(value, key, >icmp_code); +} else { +err = str_to_u16(value, key, _id); +tuple->icmp_id = htons(icmp_id); +} +if (err) { +free(err); +goto error_with_msg; +} +}else { +ds_put_format(ds, "Invalid conntrack tuple field: %s", key); +goto error; +} +} + +if (ipv6_is_zero(>src.in6)
[ovs-dev] [PATCH v2 2/3] conntrack: Support conntrack flush by ct 5-tuple
This patch adds support of flushing a conntrack entry specified by the conntrack 5-tuple in dpif-netdev. Signed-off-by: Yi-Hung Wei--- lib/conntrack.c | 64 +++ lib/conntrack.h | 3 +++ lib/dpif-netdev.c | 2 +- 3 files changed, 68 insertions(+), 1 deletion(-) diff --git a/lib/conntrack.c b/lib/conntrack.c index f5a3aa9fab34..c9077c07042c 100644 --- a/lib/conntrack.c +++ b/lib/conntrack.c @@ -2335,6 +2335,18 @@ ct_endpoint_to_ct_dpif_inet_addr(const struct ct_addr *a, } static void +ct_dpif_inet_addr_to_ct_endpoint(const union ct_dpif_inet_addr *a, + struct ct_addr *b, + const uint16_t l3_type) +{ +if (l3_type == AF_INET) { +b->ipv4_aligned = a->ip; +} else if (l3_type == AF_INET6) { +b->ipv6_aligned = a->in6; +} +} + +static void conn_key_to_tuple(const struct conn_key *key, struct ct_dpif_tuple *tuple) { if (key->dl_type == htons(ETH_TYPE_IP)) { @@ -2359,6 +2371,35 @@ conn_key_to_tuple(const struct conn_key *key, struct ct_dpif_tuple *tuple) } static void +tuple_to_conn_key(const struct ct_dpif_tuple *tuple, uint16_t zone, + struct conn_key *key) +{ +if (tuple->l3_type == AF_INET) { +key->dl_type = htons(ETH_TYPE_IP); +} else if (tuple->l3_type == AF_INET6) { +key->dl_type = htons(ETH_TYPE_IPV6); +} +key->nw_proto = tuple->ip_proto; +ct_dpif_inet_addr_to_ct_endpoint(>src, >src.addr, + tuple->l3_type); +ct_dpif_inet_addr_to_ct_endpoint(>dst, >dst.addr, + tuple->l3_type); + +if (tuple->ip_proto == IPPROTO_ICMP || tuple->ip_proto == IPPROTO_ICMPV6) { +key->src.icmp_id = tuple->icmp_id; +key->src.icmp_type = tuple->icmp_type; +key->src.icmp_code = tuple->icmp_code; +key->dst.icmp_id = tuple->icmp_id; +key->dst.icmp_type = reverse_icmp_type(tuple->icmp_type); +key->dst.icmp_code = tuple->icmp_code; +} else { +key->src.port = tuple->src_port; +key->dst.port = tuple->dst_port; +} +key->zone = zone; +} + +static void conn_to_ct_dpif_entry(const struct conn *conn, struct ct_dpif_entry *entry, long long now, int bkt) { @@ -2485,6 +2526,29 @@ conntrack_flush(struct conntrack 
*ct, const uint16_t *zone) return 0; } +int +conntrack_flush_tuple(struct conntrack *ct, const struct ct_dpif_tuple *tuple, + uint16_t zone) +{ +struct conn_lookup_ctx ctx; +int error = 0; + +memset(, 0, sizeof(ctx)); +tuple_to_conn_key(tuple, zone, ); +ctx.hash = conn_key_hash(, ct->hash_basis); +unsigned bucket = hash_to_bucket(ctx.hash); + +ct_lock_lock(>buckets[bucket].lock); +conn_key_lookup(>buckets[bucket], , time_msec()); +if (ctx.conn) { +conn_clean(ct, ctx.conn, >buckets[bucket]); +} else { +error = ENOENT; +} +ct_lock_unlock(>buckets[bucket].lock); +return error; +} + /* This function must be called with the ct->resources read lock taken. */ static struct alg_exp_node * expectation_lookup(struct hmap *alg_expectations, diff --git a/lib/conntrack.h b/lib/conntrack.h index fbeef1c4754e..b5b26dd96e00 100644 --- a/lib/conntrack.h +++ b/lib/conntrack.h @@ -107,6 +107,7 @@ struct conntrack_dump { }; struct ct_dpif_entry; +struct ct_dpif_tuple; int conntrack_dump_start(struct conntrack *, struct conntrack_dump *, const uint16_t *pzone, int *); @@ -114,6 +115,8 @@ int conntrack_dump_next(struct conntrack_dump *, struct ct_dpif_entry *); int conntrack_dump_done(struct conntrack_dump *); int conntrack_flush(struct conntrack *, const uint16_t *zone); +int conntrack_flush_tuple(struct conntrack *, const struct ct_dpif_tuple *, + uint16_t zone); /* 'struct ct_lock' is a wrapper for an adaptive mutex. It's useful to try * different types of locks (e.g. spinlocks) */ diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index b1ef9a6a5992..527442ebfc2a 100644 --- a/lib/dpif-netdev.c +++ b/lib/dpif-netdev.c @@ -5740,7 +5740,7 @@ dpif_netdev_ct_flush(struct dpif *dpif, const uint16_t *zone, struct dp_netdev *dp = get_dp_netdev(dpif); if (tuple) { -return EOPNOTSUPP; +return conntrack_flush_tuple(>conntrack, tuple, zone ? 
*zone : 0); } return conntrack_flush(>conntrack, zone); } -- 2.7.4 ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
[ovs-dev] [PATCH v2 1/3] ct-dpif, dpif-netlink: Support conntrack flush by ct 5-tuple
This patch adds support of flushing a conntrack entry specified by the conntrack 5-tuple, and provides the implementation in dpif-netlink. The implementation of dpif-netlink in the linux datapath utilizes the NFNL_SUBSYS_CTNETLINK netlink subsystem to delete a conntrack entry in nf_conntrack. VMWare-BZ: #1983178 Signed-off-by: Yi-Hung Wei--- lib/ct-dpif.c | 23 +--- lib/ct-dpif.h | 3 +- lib/dpctl.c | 2 +- lib/dpif-netdev.c | 6 ++- lib/dpif-netlink.c | 7 +++- lib/dpif-provider.h | 13 +-- lib/netlink-conntrack.c | 97 + lib/netlink-conntrack.h | 1 + ofproto/ofproto-dpif.c | 2 +- tests/ovs-ofctl.at | 2 +- 10 files changed, 140 insertions(+), 16 deletions(-) diff --git a/lib/ct-dpif.c b/lib/ct-dpif.c index c79e69e23970..5cd6b5cfd870 100644 --- a/lib/ct-dpif.c +++ b/lib/ct-dpif.c @@ -111,19 +111,30 @@ ct_dpif_dump_done(struct ct_dpif_dump_state *dump) } /* Flush the entries in the connection tracker used by 'dpif'. - * - * If 'zone' is not NULL, flush only the entries in '*zone'. */ + * If both 'zone' and 'tuple' are NULL, flush all the conntrack entries. + * If 'zone' is not NULL, and 'tuple' is NULL, flush all the conntrack + * entries in '*zone'. + * If 'tuple' is not NULL, flush the conntrack entry specified by 'tuple' + * in '*zone'. In this case, we use default zone (zone 0) if 'zone' is + * NULL. */ int -ct_dpif_flush(struct dpif *dpif, const uint16_t *zone) +ct_dpif_flush(struct dpif *dpif, const uint16_t *zone, + const struct ct_dpif_tuple *tuple) { -if (zone) { -VLOG_DBG("%s: ct_flush: %"PRIu16, dpif_name(dpif), *zone); +if (tuple) { +struct ds ds = DS_EMPTY_INITIALIZER; +ct_dpif_format_tuple(, tuple); +VLOG_DBG("%s: ct_flush: %s in zone %d", dpif_name(dpif), ds_cstr(), +zone ? *zone : 0); +ds_destroy(); +} else if (zone) { +VLOG_DBG("%s: ct_flush: zone %"PRIu16, dpif_name(dpif), *zone); } else { VLOG_DBG("%s: ct_flush: ", dpif_name(dpif)); } return (dpif->dpif_class->ct_flush -? dpif->dpif_class->ct_flush(dpif, zone) +? 
dpif->dpif_class->ct_flush(dpif, zone, tuple) : EOPNOTSUPP); } diff --git a/lib/ct-dpif.h b/lib/ct-dpif.h index d5f966175bc0..ef019050c78e 100644 --- a/lib/ct-dpif.h +++ b/lib/ct-dpif.h @@ -195,7 +195,8 @@ int ct_dpif_dump_start(struct dpif *, struct ct_dpif_dump_state **, const uint16_t *zone, int *); int ct_dpif_dump_next(struct ct_dpif_dump_state *, struct ct_dpif_entry *); int ct_dpif_dump_done(struct ct_dpif_dump_state *); -int ct_dpif_flush(struct dpif *, const uint16_t *zone); +int ct_dpif_flush(struct dpif *, const uint16_t *zone, + const struct ct_dpif_tuple *); void ct_dpif_entry_uninit(struct ct_dpif_entry *); void ct_dpif_format_entry(const struct ct_dpif_entry *, struct ds *, bool verbose, bool print_stats); diff --git a/lib/dpctl.c b/lib/dpctl.c index 6520788aa428..7fc0e3afab37 100644 --- a/lib/dpctl.c +++ b/lib/dpctl.c @@ -1353,7 +1353,7 @@ dpctl_flush_conntrack(int argc, const char *argv[], return error; } -error = ct_dpif_flush(dpif, pzone); +error = ct_dpif_flush(dpif, pzone, NULL); dpif_close(dpif); return error; diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index 0a62630c2712..b1ef9a6a5992 100644 --- a/lib/dpif-netdev.c +++ b/lib/dpif-netdev.c @@ -5734,10 +5734,14 @@ dpif_netdev_ct_dump_done(struct dpif *dpif OVS_UNUSED, } static int -dpif_netdev_ct_flush(struct dpif *dpif, const uint16_t *zone) +dpif_netdev_ct_flush(struct dpif *dpif, const uint16_t *zone, + const struct ct_dpif_tuple *tuple) { struct dp_netdev *dp = get_dp_netdev(dpif); +if (tuple) { +return EOPNOTSUPP; +} return conntrack_flush(>conntrack, zone); } diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c index c265909f8587..3f76128999d1 100644 --- a/lib/dpif-netlink.c +++ b/lib/dpif-netlink.c @@ -2892,9 +2892,12 @@ dpif_netlink_ct_dump_done(struct dpif *dpif OVS_UNUSED, } static int -dpif_netlink_ct_flush(struct dpif *dpif OVS_UNUSED, const uint16_t *zone) +dpif_netlink_ct_flush(struct dpif *dpif OVS_UNUSED, const uint16_t *zone, + const struct ct_dpif_tuple *tuple) { -if 
(zone) { +if (tuple) { +return nl_ct_flush_tuple(tuple, zone ? *zone : 0); +} else if (zone) { return nl_ct_flush_zone(*zone); } else { return nl_ct_flush(); diff --git a/lib/dpif-provider.h b/lib/dpif-provider.h index 1d82a0939aa1..33d7f2a64367 100644 --- a/lib/dpif-provider.h +++ b/lib/dpif-provider.h @@ -75,6 +75,7 @@ dpif_flow_dump_thread_init(struct dpif_flow_dump_thread *thread, struct ct_dpif_dump_state; struct ct_dpif_entry; +struct
Re: [ovs-dev] [PATCH 2/2] dpctl: Support flush conntrack by conntrack 5-tuple
On Wed, Nov 15, 2017 at 5:28 PM, Justin Pettitwrote > > > > With this patch, ovs-dpctl flush-conntrack accepts a conntrack 5-tuple > > I'd also mention ovs-appctl, since ovs-dpctl doesn't work on all platforms. > Sure. > diff --git a/NEWS b/NEWS > > index 047f34b9f402..800eb84cd24f 100644 > > --- a/NEWS > > +++ b/NEWS > > @@ -11,6 +11,8 @@ Post-v2.8.0 > > * Add support for compiling OVS with the latest Linux 4.13 kernel > > + - "ovs-dpctl flush-conntrack" now accepts conntrack 5-tuple to > delete a > > + connection tracking entry. > I would also mention the ovs-appctl, too. Maybe: > > "flush-conntrack" in ovs-dpctl and ovs-appctl now accept a 5-tuple > to delete a specific connection tracking entry. > Sure. > diff --git a/lib/ct-dpif.c b/lib/ct-dpif.c > > index a9f58f4eb449..bdd9ef5d2551 100644 > > --- a/lib/ct-dpif.c > > +++ b/lib/ct-dpif.c > > @@ -20,6 +20,7 @@ > > #include > > > > #include "ct-dpif.h" > > +#include "openvswitch/ofp-parse.h" > > #include "openvswitch/vlog.h" > > > > VLOG_DEFINE_THIS_MODULE(ct_dpif); > > @@ -434,3 +435,83 @@ ct_dpif_format_tcp_stat(struct ds * ds, int > tcp_state, int conn_per_state) > > ds_put_cstr(ds, "]"); > > ds_put_format(ds, "=%u", conn_per_state); > > } > > + > > +/* Parses a specification of a conntrack 5-tuple from 's' into 'tuple'. > > + * Returns true on success. Otherwise, returns false and puts the error > msg > > + * in 'ds'. > > + */ > > We would normally close the function comment on the previous line. Also, > I wouldn't abbreviate "message" in a comment. > Thanks, done in v2. > +bool > > +ct_dpif_parse_tuple(struct ct_dpif_tuple *tuple, const char *s, struct > ds *ds) > > +{ > > ... > > +if (ipv6_is_zero(>src.in6) || ipv6_is_zero(>dst.in6) > || > > +!tuple->ip_proto || !tuple->src_port || !tuple->dst_port) { > > +ds_put_cstr(ds, "Miss at least one of the conntrack 5-tuple > field"); > > I think this error message might be a little clearer: "At least one of the > conntrack 5-tuple fields is missing." 
> Sure, it is more clear. > > +goto out2; > > +} > > + > > +free(copy); > > +return true; > > + > > +out: > > +ds_put_format(ds, "Failed to parse field %s", key); > > +out2: > > +free(copy); > > +return false; > > Minor, but I think these labels could be more descriptive. What about > "error_with_msg" and "error" or something? > Sounds good. I will fold in that in v2. > diff --git a/lib/dpctl.c b/lib/dpctl.c > > index 7569189d8f22..682b9bf35a75 100644 > > --- a/lib/dpctl.c > > +++ b/lib/dpctl.c > > @@ -1331,14 +1331,32 @@ dpctl_flush_conntrack(int argc, const char > *argv[], > > struct dpctl_params *dpctl_p) > > { > > struct dpif *dpif; > > +struct ct_dpif_tuple tuple, *ptuple = NULL; > > +struct ds ds = DS_EMPTY_INITIALIZER; > > uint16_t zone, *pzone = NULL; > > char *name; > > int error; > > > > +/* Parse ct tuple */ > > +if (argc > 1 && ct_dpif_parse_tuple(, argv[argc-1], )) { > > +ptuple = > > +argc--; > > +} else if (argc == 4) { > > +dpctl_error(dpctl_p, EINVAL, "%s", ds_cstr()); > > +ds_destroy(); > > +return EINVAL; > > +} > > +ds_destroy(); > > I don't think this handles the case of an error parsing 'tuple' when a > datapath name and zone isn't provided. It also reads a little funny, since > 'ds' can only have a value if there was an error, but it's called even on > the non-error condition. It's obviously correct, since it's a safe > operation, but it's unnecessary. > > + > > +/* Parse zone */ > > if (argc > 1 && ovs_scan(argv[argc - 1], "zone=%"SCNu16, )) { > > pzone = > > argc--; > > +} else if (argc == 3) { > > +dpctl_error(dpctl_p, EINVAL, "Invalid zone or conntrack tuple"); > > +return EINVAL; > > Same thing here about this only triggering if there wasn't a datapath name. > I rewrite the error reporting logic in v2. It catches more error cases, but it also makes the code to be more complicated since all the 3 parameters are optional on this command. I hope it can reduce the chance to accidentally flush conntrack entries. 
> diff --git a/lib/dpctl.man b/lib/dpctl.man > > index 675fe5af4914..f429f1653fa7 100644 > > --- a/lib/dpctl.man > > +++ b/lib/dpctl.man > > @@ -217,10 +217,20 @@ are included. With \fB\-\-statistics\fR timeouts > and timestamps are > > added to the output. > > . > > .TP > > -\*(DX\fBflush\-conntrack\fR [\fIdp\fR] [\fBzone=\fIzone\fR] > > -Flushes all the connection entries in the tracker used by \fIdp\fR. > > +\*(DX\fBflush\-conntrack\fR [\fIdp\fR] [\fBzone=\fIzone\fR] > [\fBct-tuple\fR] > > Shouldn't "ct-tuple" (and all the references below) be using "\fI", since > it should be an argument? > Yes, I will fold in all the suggestion in dpctl.man in v2. Thanks, -Yi-Hung ___ dev mailing list d...@openvswitch.org
[ovs-dev] [PATCH v2 0/3] Support conntrack flush by ct 5-tuple
This patch series add support to delete a connection tracking entry specified by a 5-tuple. v1 -> v2: * Incoporate suggestions from Justin * Add dpif-netdev implementation * Improve error reporting logic in dpctl.c * Support ICMP 5-tuple * Rebase to master Yi-Hung Wei (3): ct-dpif,dpif-netlink: Support conntrack flush by ct 5-tuple conntrack: Support conntrack flush by ct 5-tuple dpctl: Support flush conntrack by conntrack 5-tuple NEWS| 2 + lib/conntrack.c | 64 +++ lib/conntrack.h | 3 ++ lib/ct-dpif.c | 131 +--- lib/ct-dpif.h | 4 +- lib/dpctl.c | 76 ++-- lib/dpctl.man | 20 ++-- lib/dpif-netdev.c | 6 ++- lib/dpif-netlink.c | 7 ++- lib/dpif-provider.h | 13 +++-- lib/netlink-conntrack.c | 97 +++ lib/netlink-conntrack.h | 1 + ofproto/ofproto-dpif.c | 2 +- tests/ovs-ofctl.at | 2 +- tests/system-traffic.at | 64 +++ utilities/ovs-dpctl.c | 4 +- 16 files changed, 459 insertions(+), 37 deletions(-) -- 2.7.4 ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
Re: [ovs-dev] [PATCH 1/2] ct-dpif, dpif-netlink: Support conntrack flush by ct 5-tuple
Thanks Justin for reviewing this series. I will repin v2 based on your comments. On Wed, Nov 15, 2017 at 4:28 PM, Justin Pettitwrote: > > --- a/lib/ct-dpif.c > > +++ b/lib/ct-dpif.c > > @@ -111,19 +111,30 @@ ct_dpif_dump_done(struct ct_dpif_dump_state *dump) > > } > > > > /* Flush the entries in the connection tracker used by 'dpif'. > > - * > > - * If 'zone' is not NULL, flush only the entries in '*zone'. */ > > + * If both 'zone' and 'tuple' are NULL, flush all the conntrack entries. > > + * If 'zone' is not NULL, and 'tuple's is NULL, flush all the conntrack > > I'd drop the "s" in "'tuple's". > Sure. > > > diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c > > index 599308d257a8..0e1696a94e31 100644 > > --- a/lib/dpif-netdev.c > > +++ b/lib/dpif-netdev.c > > @@ -5741,10 +5741,14 @@ dpif_netdev_ct_dump_done(struct dpif *dpif > OVS_UNUSED, > > } > > > > static int > > -dpif_netdev_ct_flush(struct dpif *dpif, const uint16_t *zone) > > +dpif_netdev_ct_flush(struct dpif *dpif, const uint16_t *zone, > > + const struct ct_dpif_tuple *tuple) > > { > > struct dp_netdev *dp = get_dp_netdev(dpif); > > > > +if (tuple) { > > +return EOPNOTSUPP; > > +} > > return conntrack_flush(>conntrack, zone); > > } > > Do you plan on implementing this? > Yes. Will have that in v2. > > diff --git a/lib/dpif-provider.h b/lib/dpif-provider.h > > index 1d82a0939aa1..083d67c8d5b0 100644 > > --- a/lib/dpif-provider.h > > +++ b/lib/dpif-provider.h > > ... > > -/* Flushes the connection tracking tables. If 'zone' is not NULL, > > - * only deletes connections in '*zone'. */ > > -int (*ct_flush)(struct dpif *, const uint16_t *zone); > > +/* Flushes the connection tracking tables. > > + * If both 'zone' and 'tuple' are NULL, flush all the conntrack > entries. > > + * If 'zone' is not NULL, and 'tuple's is NULL, flush all the > conntrack > > + * entries in '*zone'. > > Same comment as above about "'tuple's'". > Done in v2. 
> > > diff --git a/lib/netlink-conntrack.c b/lib/netlink-conntrack.c > > index 1e1bb2f79d1d..9094779aaa55 100644 > > --- a/lib/netlink-conntrack.c > > +++ b/lib/netlink-conntrack.c > > > > +int > > +nl_ct_delete(const struct ct_dpif_tuple *tuple, uint16_t zone) > > +{ > > +int err; > > +struct ofpbuf buf; > > + > > +ofpbuf_init(, NL_DUMP_BUFSIZE); > > +nl_msg_put_nfgenmsg(, 0, tuple->l3_type, NFNL_SUBSYS_CTNETLINK, > > +IPCTNL_MSG_CT_DELETE, NLM_F_REQUEST); > > + > > +nl_msg_put_be16(, CTA_ZONE, htons(zone)); > > +if (!nl_ct_put_ct_tuple(, tuple, CTA_TUPLE_ORIG)) { > > +err = EOPNOTSUPP; > > +goto out; > > +} > > +err = nl_transact(NETLINK_NETFILTER, , NULL); > > +out: > > +ofpbuf_uninit(); > > +return err; > > +} > > The Linux code for nl_ct_flush_zone() seems to go through some contortions > due to there being a problem with flushing a specific zone. I took a brief > look at the kernel code, and it looks like it does properly handle zones if > the tuple is provided, but not otherwise. Is that a correct reading? > Yes, that is correct. On a more cosmetic note, most of the external interface refers to deleting > a specific conntrack entry as a "flush", but the internal code refers to it > as a "delete". I think this is a reasonable distinction, but if it's not > carried externally, then it doesn't seem necessary internally. > Right, it make sense to rename the "delete" to "flush" to make it consistent. Also, this delete and flush code is nearly identical, so it seems like > there's some opportunity to just call it "flush" and consolidate to a > single implementation that handles a tuple if it's provided. > I am more incline to rename the nl_ct_delete() to nl_ct_flush_tuple(). I agree that there are some code duplication in the delete and flush code, but consolidate them to a single implementation will lead to the following two functions. It is a bit confusing since the first one will either flush all entries or flush by a tuple. 
- nl_ct_flush(const struct ct_dpif_tuple *tupe, uint16_t zone); // flush all or a tuple - nl_ct_flush_zone(uint16_t zone) // flush a particular zone Instead, if we rename nl_ct_delete() to nl_ct_flush_tuple(), we will have the following three functions, and it seems to be more clear to me. - nl_ct_flush(void) // flush all - nl_ct_flush_zone(uint16_t zone) // flush a particular zone - nl_ct_flush_tuple(const struct ct_dpif_tuple *tuple, uint16_t zone) // flush a tuple > +static bool > > +nl_ct_put_tuple(struct ofpbuf *buf, const struct ct_dpif_tuple *tuple) > > +{ > > +if (!nl_ct_put_tuple_ip(buf, tuple)) { > > +return false; > > +} > > +if (!nl_ct_put_tuple_proto(buf, tuple)) { > > +return false; > > +} > > +return true; > > +} > > + > > +static bool > > +nl_ct_put_ct_tuple(struct ofpbuf *buf, const struct ct_dpif_tuple > *tuple, > > + enum ctattr_type
[ovs-dev] [PATCH v1] datapath-windows: Add support for deleting conntrack entry by 5-tuple.
To delete a conntrack entry specified by 5-tuple pass an additional conntrack 5-tuple parameter to flush-conntrack. Signed-off-by: Anand Kumar--- datapath-windows/ovsext/Conntrack.c | 146 +--- 1 file changed, 134 insertions(+), 12 deletions(-) diff --git a/datapath-windows/ovsext/Conntrack.c b/datapath-windows/ovsext/Conntrack.c index 3203411..dc268b3 100644 --- a/datapath-windows/ovsext/Conntrack.c +++ b/datapath-windows/ovsext/Conntrack.c @@ -35,8 +35,10 @@ static PNDIS_RW_LOCK_EX ovsConntrackLockObj; extern POVS_SWITCH_CONTEXT gOvsSwitchContext; static UINT64 ctTotalEntries; -static __inline NDIS_STATUS OvsCtFlush(UINT16 zone); - +static __inline OvsCtFlush(UINT16 zone, struct ovs_key_ct_tuple_ipv4 *tuple); +static __inline NDIS_STATUS +MapNlToCtTuple(POVS_MESSAGE msgIn, PNL_ATTR attr, + struct ovs_key_ct_tuple_ipv4 *ct_tuple); /* * * OvsInitConntrack @@ -120,7 +122,7 @@ OvsCleanupConntrack(VOID) ObDereferenceObject(ctThreadCtx.threadObject); /* Force flush all entries before removing */ -OvsCtFlush(0); +OvsCtFlush(0, NULL); if (ovsConntrackTable) { OvsFreeMemoryWithTag(ovsConntrackTable, OVS_CT_POOL_TAG); @@ -1018,11 +1020,11 @@ OvsConntrackEntryCleaner(PVOID data) /* * * OvsCtFlush - * Flushes out all Conntrack Entries that match the given zone + * Flushes out all Conntrack Entries that match any of the arguments * */ static __inline NDIS_STATUS -OvsCtFlush(UINT16 zone) +OvsCtFlush(UINT16 zone, struct ovs_key_ct_tuple_ipv4 *tuple) { PLIST_ENTRY link, next; POVS_CT_ENTRY entry; @@ -1034,9 +1036,26 @@ OvsCtFlush(UINT16 zone) for (int i = 0; i < CT_HASH_TABLE_SIZE; i++) { LIST_FORALL_SAFE([i], link, next) { entry = CONTAINING_RECORD(link, OVS_CT_ENTRY, link); -/* zone is a non-zero value */ -if (!zone || zone == entry->key.zone) +if (tuple) { +if (tuple->ipv4_proto != IPPROTO_ICMP && +tuple->ipv4_src == entry->key.src.addr.ipv4_aligned && +tuple->ipv4_dst == entry->key.dst.addr.ipv4_aligned && +tuple->ipv4_proto == entry->key.nw_proto && +tuple->src_port == 
entry->key.src.port && +tuple->dst_port == entry->key.dst.port && +(zone ? entry->key.zone == zone: TRUE)) { +OvsCtEntryDelete(entry); +} else if (tuple->ipv4_src == entry->key.src.addr.ipv4_aligned && +tuple->ipv4_dst == entry->key.dst.addr.ipv4_aligned && +tuple->ipv4_proto == entry->key.nw_proto && +tuple->src_port == entry->key.src.icmp_type && +tuple->dst_port == entry->key.src.icmp_code && +(zone ? entry->key.zone == zone: TRUE)) { +OvsCtEntryDelete(entry); +} +} else if (!zone || zone == entry->key.zone) { OvsCtEntryDelete(entry); +} } } } @@ -1058,19 +1077,21 @@ OvsCtDeleteCmdHandler(POVS_USER_PARAMS_CONTEXT usrParamsCtx, NL_ERROR nlError = NL_ERROR_SUCCESS; NTSTATUS status; UINT16 zone = 0; +struct ovs_key_ct_tuple_ipv4 *ct_tuple = NULL; NL_BUFFER nlBuf; UINT16 nlmsgType; PNL_MSG_HDR nlMsg; -static const NL_POLICY ctZonePolicy[] = { -[CTA_ZONE] = { .type = NL_A_BE16, .optional = TRUE }, +static const NL_POLICY ctAttrPolicy[] = { +[CTA_TUPLE_ORIG] = {.type = NL_A_NESTED, .optional = TRUE}, +[CTA_ZONE] = {.type = NL_A_BE16, .optional = TRUE }, }; if ((NlAttrParse(nlMsgHdr, attrOffset, NlNfMsgAttrsLen(nlMsgHdr), -ctZonePolicy, ARRAY_SIZE(ctZonePolicy), +ctAttrPolicy, ARRAY_SIZE(ctAttrPolicy), ctAttrs, ARRAY_SIZE(ctAttrs))) != TRUE) { -OVS_LOG_ERROR("Zone attr parsing failed for msg: %p", nlMsgHdr); +OVS_LOG_ERROR("Ct attr parsing failed for msg: %p", nlMsgHdr); status = STATUS_INVALID_PARAMETER; goto done; } @@ -1079,7 +1100,21 @@ OvsCtDeleteCmdHandler(POVS_USER_PARAMS_CONTEXT usrParamsCtx, zone = NlAttrGetU16(ctAttrs[CTA_ZONE]); } -status = OvsCtFlush(zone); +if (ctAttrs[CTA_TUPLE_ORIG]) { +ct_tuple = OvsAllocateMemoryWithTag(sizeof(struct ovs_key_ct_tuple_ipv4), +OVS_CT_POOL_TAG); +if (ct_tuple == NULL) { +status = STATUS_INSUFFICIENT_RESOURCES; +goto done; +} +/*
[ovs-dev] [PATCH v1] Add support to delete a conntrack entry by 5-tuple
This patch adds support for deleting conntrack entry by conntrack 5-tuple in windows kernel. The related userspace changes are currrently being reviewed https://patchwork.ozlabs.org/project/openvswitch/list/?series=13345 The windows kernel changes can go in independent of the userspace changes since conntrack 5-tuple is an optional parameter. Anand Kumar (1): datapath-windows: Add support for deleting conntrack entry by 5 tuple. datapath-windows/ovsext/Conntrack.c | 146 +--- 1 file changed, 134 insertions(+), 12 deletions(-) -- 2.9.3.windows.1 ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
Re: [ovs-dev] [PATCH v2 0/5] dpif-netdev: Cuckoo-Distributor implementation
Hi, Jan, Thanks for sharing the patch and nice talking with you in the conference. Could you tell us more about the test case you used such as the rule set and traffic pattern? Since we have some other work at hand to be done soon, we will definitely test the patch later and get back to you in a week or two. Thank Yipeng > -Original Message- > From: Jan Scheurich [mailto:jan.scheur...@ericsson.com] > Sent: Monday, November 20, 2017 9:39 AM > To: Wang, Yipeng1; d...@openvswitch.org > Cc: Gobriel, Sameh ; Fischetti, Antonio > ; db...@vmware.com; Ben Pfaff (b...@ovn.org) > > Subject: RE: [PATCH v2 0/5] dpif-netdev: Cuckoo-Distributor implementation > > Hi Yipeng and Sameh, > > As discussed on the OVS conference, I have just sent my Datapath Flow > Cache patch to the ML: > https://mail.openvswitch.org/pipermail/ovs-dev/2017- > November/341066.html > http://patchwork.ozlabs.org/patch/839645/ > > I would be happy if you could test it in your setup and compare the > performance with your CD implementation. I don't have a test bed at hand > where I could easily test the speedup for many visited subtables. > > Thanks, Jan > > > -Original Message- > > From: Yipeng Wang [mailto:yipeng1.w...@intel.com] > > Sent: Wednesday, 01 November, 2017 00:40 > > To: d...@openvswitch.org > > Cc: yipeng1.w...@intel.com; sameh.gobr...@intel.com; > antonio.fische...@intel.com; db...@vmware.com; Jan Scheurich > > > > Subject: [PATCH v2 0/5] dpif-netdev: Cuckoo-Distributor implementation > > > > The Datapath Classifier uses tuple space search for flow classification. > > The rules are arranged into a set of tuples/subtables (each with a > > distinct mask). Each subtable is implemented as a hash table and lookup > > is done with flow keys formed by selecting the bits from the packet header > > based on each subtable's mask. Tuple space search will sequentially search > > each subtable until a match is found. 
With a large number of subtables, a > > sequential search of the subtables could consume a lot of CPU cycles. In > > a testbench with a uniform traffic pattern equally distributed across 20 > > subtables, we measured that up to 65% of total execution time is > attributed > > to the megaflow cache lookup. > > > > This patch presents the idea of the two-layer hierarchical lookup, where a > > low overhead first level of indirection is accessed first, we call this > > level cuckoo distributor (CD). If a flow key has been inserted in the flow > > table the first level will indicate with high probability that which > > subtable to look into. A lookup is performed on the second level (the > > target subtable) to retrieve the result. If the key doesn’t have a match, > > then we revert back to the sequential search of subtables. The patch is > > partially inspired by earlier concepts proposed in "simTable"[1] and > > "Cuckoo Filter"[2], and DPDK's Cuckoo Hash implementation. > > > > This patch can improve the already existing Subtable Ranking when traffic > > data has high entropy. Subtable Ranking helps minimize the number of > > traversed subtables when most of the traffic hit the same subtable. > > However, in the case of high entropy traffic such as traffic coming from > > a physical port, multiple subtables could be hit with a similar frequency. > > In this case the average subtable lookups per hit would be much greater > > than 1. In addition, CD can adaptively turn off when it finds the traffic > > mostly hit one subtable. Thus, CD will not be an overhead when Subtable > > Ranking works well. > > > > Scheme: > > CD is in front of the subtables. Packets are directed to corresponding > subtable > > if hit in CD instead of searching each subtable sequentially. > > --- > > | CD | > > --- > >\ > > \ > > - - - > > |sub ||sub |...|sub | > > |table||table| |table| > > - - - > > > > Evaluation: > > -- > > We create a set of rules with various src IP. 
We feed traffic containing > various > > numbers of flows with various src IP and dst IP. All the flows hit 10/20/30 > > rules creating 10/20/30 subtables. We will explain the rule/traffic setup > > in detail later. > > > > The table below shows the preliminary continuous testing results (full line > > speed test) we collected with a uni-directional phy-to-phy setup. OvS > > runs with 1 PMD. We use Spirent as the hardware traffic generator. > > > > Before v2 rebase: > > > > AVX2 data: > > 20k flows: > > no.subtable: 10 20 30 > > cd-ovs 4267332 3478251 3126763 > > orig-ovs 3260883 2174551 1689981 > > speedup 1.31x 1.60x 1.85x > > > > 100k flows: > > no.subtable: 10 20 30 > > cd-ovs 4015783 3276100 2970645 > > orig-ovs 2692882 1711955 1302321 > > speedup
[ovs-dev] Gestión de Tesorería
Liquidez en equilibrio para hacer frente a los retos financieros Gestión de Tesorería: Importancia ante una crisis financiera 24 de noviembre - C.P. y MAN. Fernando García Zárate 9am-6pm Obtendrá las herramientas necesarias que le permitirán hacer frente a los retos en caso de una crisis financiera, a través de la optimización del ciclo económico, maximización del rendimiento operativo y de inversión, así como en la reducción de costos. Identifique y prevenga riesgos de liquidez, conociendo los instrumentos para su cobertura. BENEFICIOS DE ASISTIR: -Entenderá el papel del tesorero en la empresa y su relación con el Director Financiero -Aprenderá las herramientas para optimizar el flujo de caja -Conocerá la importancia del presupuesto y liquidez de la tesorería -Identificará los riesgos de un mal manejo de tesorería ¿Requiere la información a la Brevedad? responda este email con la palabra: Tesorería + nombre - teléfono - correo. centro telefónico:018002120744 ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
Re: [ovs-dev] [PATCH v2 0/9] convert Perl to Python
Ben Pfaffwrites: > This series gets rid of all usage of Perl in the tree, converting > it to Python instead. > > v1->v2: > * Minor fixes from Aaron Conole. > * Break soexpand and sodepends common code into library (suggested by > Aaron Conole) > > Ben Pfaff (9): > tests: Convert uuidfilt utility from Perl to Python. > tests: Convert ovsdb-monitor-sort utility from Perl to Python. > tests: Convert flowgen utility from Perl to Python. > tests: Convert dpdkstrip utility from Perl to Python. > tests: Convert soexpand build tool from Perl to Python. > tests: Convert sodepends build tool from Perl to Python. > tests: Convert dot2pic build tool from Perl to Python. > tests: Convert miscellaneous test code from Perl to Python. > Remove Perl dependency. For the series: Acked-by: Aaron Conole > Documentation/intro/install/general.rst | 8 - > Makefile.am | 16 +- > build-aux/automake.mk | 5 +- > build-aux/dpdkstrip.pl | 35 - > build-aux/dpdkstrip.py | 48 ++ > build-aux/sodepends.pl | 70 - > build-aux/sodepends.py | 68 + > build-aux/soexpand.pl | 40 - > build-aux/soexpand.py | 59 > configure.ac| 6 - > ovn/automake.mk | 4 +- > ovsdb/dot2pic | 155 +++ > python/automake.mk | 3 +- > python/build/soutil.py | 56 +++ > rhel/openvswitch-fedora.spec.in | 2 +- > tests/atlocal.in| 9 +- > tests/automake.mk | 7 +- > tests/flowgen.pl| 253 > > tests/flowgen.py| 240 ++ > tests/library.at| 2 +- > tests/ofproto-macros.at | 4 +- > tests/ofproto.at| 62 > tests/ovn-controller.at | 2 +- > tests/ovn-nbctl.at | 102 ++--- > tests/ovs-macros.at | 5 +- > tests/ovs-vsctl.at | 16 +- > tests/ovsdb-execution.at| 4 +- > tests/ovsdb-idl.at | 30 ++-- > tests/ovsdb-monitor-sort.pl | 52 --- > tests/ovsdb-monitor-sort.py | 85 +++ > tests/ovsdb-monitor.at | 4 +- > tests/ovsdb-rbac.at | 28 ++-- > tests/ovsdb-server.at | 36 ++--- > tests/ovsdb-tool.at | 18 +-- > tests/ovsdb-trigger.at | 2 +- > tests/system-kmod-macros.at | 4 +- > tests/system-userspace-macros.at| 4 +- > tests/uuidfilt.pl | 33 - > tests/uuidfilt.py | 
50 +++ > vswitchd/automake.mk| 2 +- > vtep/automake.mk| 2 +- > 41 files changed, 891 insertions(+), 740 deletions(-) > delete mode 100644 build-aux/dpdkstrip.pl > create mode 100755 build-aux/dpdkstrip.py > delete mode 100644 build-aux/sodepends.pl > create mode 100755 build-aux/sodepends.py > delete mode 100644 build-aux/soexpand.pl > create mode 100755 build-aux/soexpand.py > create mode 100755 python/build/soutil.py > delete mode 100755 tests/flowgen.pl > create mode 100755 tests/flowgen.py > delete mode 100755 tests/ovsdb-monitor-sort.pl > create mode 100755 tests/ovsdb-monitor-sort.py > delete mode 100755 tests/uuidfilt.pl > create mode 100755 tests/uuidfilt.py ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
Re: [ovs-dev] [PATCH v2 1/9] tests: Convert uuidfilt utility from Perl to Python.
Ben Pfaffwrites: > Perl is unfashionable and Python is more widely available and understood, > so this commit converts one of the OVS uses of Perl into Python. > > Signed-off-by: Ben Pfaff > --- > tests/automake.mk| 5 +- > tests/ofproto-macros.at | 4 +- > tests/ofproto.at | 24 - > tests/ovn-controller.at | 2 +- > tests/ovn-nbctl.at | 102 > +++ > tests/ovs-macros.at | 4 ++ > tests/ovs-vsctl.at | 16 +++--- > tests/ovsdb-execution.at | 4 +- > tests/ovsdb-idl.at | 30 ++-- > tests/ovsdb-monitor.at | 4 +- > tests/ovsdb-rbac.at | 28 +-- > tests/ovsdb-server.at| 36 +++--- > tests/ovsdb-tool.at | 18 +++ > tests/ovsdb-trigger.at | 2 +- > tests/system-kmod-macros.at | 4 +- > tests/system-userspace-macros.at | 4 +- > tests/uuidfilt.pl| 33 - > tests/uuidfilt.py| 50 +++ > 18 files changed, 196 insertions(+), 174 deletions(-) > delete mode 100755 tests/uuidfilt.pl > create mode 100755 tests/uuidfilt.py > > diff --git a/tests/automake.mk b/tests/automake.mk > index 156b40f58d7c..3ca60e2ea450 100644 > --- a/tests/automake.mk > +++ b/tests/automake.mk > @@ -290,7 +290,7 @@ $(srcdir)/package.m4: $(top_srcdir)/configure.ac > noinst_PROGRAMS += tests/test-ovsdb > tests_test_ovsdb_SOURCES = tests/test-ovsdb.c > nodist_tests_test_ovsdb_SOURCES = tests/idltest.c tests/idltest.h > -EXTRA_DIST += tests/uuidfilt.pl tests/ovsdb-monitor-sort.pl > +EXTRA_DIST += tests/ovsdb-monitor-sort.pl > tests_test_ovsdb_LDADD = ovsdb/libovsdb.la lib/libopenvswitch.la > > noinst_PROGRAMS += tests/test-lib > @@ -389,7 +389,8 @@ CHECK_PYFILES = \ > tests/MockXenAPI.py \ > tests/test-unix-socket.py \ > tests/test-unixctl.py \ > - tests/test-vlog.py > + tests/test-vlog.py \ > + tests/uuidfilt.py > EXTRA_DIST += $(CHECK_PYFILES) > PYCOV_CLEAN_FILES += $(CHECK_PYFILES:.py=.py,cover) .coverage > > diff --git a/tests/ofproto-macros.at b/tests/ofproto-macros.at > index 38449db32ee0..4fbb10070342 100644 > --- a/tests/ofproto-macros.at > +++ b/tests/ofproto-macros.at > @@ -353,7 +353,7 @@ 
m4_define([_OVS_VSWITCHD_START], > # br0 with predictable settings, passing 'vsctl-args' as additional > # commands to ovs-vsctl. If 'vsctl-args' causes ovs-vsctl to provide > # output (e.g. because it includes "create" commands) then 'vsctl-output' > -# specifies the expected output after filtering through uuidfilt.pl. > +# specifies the expected output after filtering through uuidfilt. > # > # If a test needs to use "system" devices (as dummies), then specify > # =override (literally) as the third argument. Otherwise, system devices > @@ -364,7 +364,7 @@ m4_define([_OVS_VSWITCHD_START], > # to ovs-vswitchd > m4_define([OVS_VSWITCHD_START], >[_OVS_VSWITCHD_START([--enable-dummy$3 --disable-system $4]) > - AT_CHECK([add_of_br 0 $1 m4_if([$2], [], [], [| ${PERL} > $srcdir/uuidfilt.pl])], [0], [$2]) > + AT_CHECK([add_of_br 0 $1 m4_if([$2], [], [], [| uuidfilt])], [0], [$2]) > ]) > > # check_logs scans through all *.log files (except '*.log' and testsuite.log) > diff --git a/tests/ofproto.at b/tests/ofproto.at > index 31cb5208fc1c..9853a21db989 100644 > --- a/tests/ofproto.at > +++ b/tests/ofproto.at > @@ -2196,7 +2196,7 @@ AT_CHECK( > -- --id=@t0 create Flow_Table name=main \ > -- --id=@t1 create Flow_Table flow-limit=1024 \ > -- set bridge br0 'flow_tables={1=@t1,0=@t0}' \ > - | ${PERL} $srcdir/uuidfilt.pl], > + | uuidfilt], >[0], [<0> > <1> > ]) > @@ -2348,7 +2348,7 @@ AT_CHECK( > -- --id=@t0 create Flow_Table name=main \ > -- --id=@t1 create Flow_Table flow-limit=1024 \ > -- set bridge br0 'flow_tables={1=@t1,0=@t0}' \ > - | ${PERL} $srcdir/uuidfilt.pl], > + | uuidfilt], >[0], [<0> > <1> > ]) > @@ -2603,7 +2603,7 @@ AT_CHECK( > -- --id=@t0 create Flow_Table name=main \ > -- --id=@t1 create Flow_Table flow-limit=1024 \ > -- set bridge br0 'flow_tables={1=@t1,0=@t0}' \ > - | ${PERL} $srcdir/uuidfilt.pl], > + | uuidfilt], >[0], [<0> > <1> > ]) > @@ -2655,7 +2655,7 @@ AT_CHECK( >[ovs-vsctl \ > -- --id=@t0 create Flow_Table flow-limit=4 \ > -- set bridge br0 
flow_tables:0=@t0 \ > - | ${PERL} $srcdir/uuidfilt.pl], > + | uuidfilt], >[0], [<0> > ]) > # Add 4 flows. > @@ -2699,7 +2699,7 @@ AT_CHECK( >[ovs-vsctl \ > -- --id=@t0 create Flow_Table flow-limit=4 \ > -- set bridge br0 flow_tables:0=@t0 \ > - | ${PERL} $srcdir/uuidfilt.pl], > + | uuidfilt], >[0], [<0> > ]) > # Add 4 flows. > @@ -2738,7 +2738,7 @@ AT_CHECK( >[ovs-vsctl \ > -- --id=@t0 create Flow_Table flow-limit=4 overflow-policy=evict
[ovs-dev] [RFC PATCH v3 8/8] netdev-dpdk: support multi-segment jumbo frames
Currently, jumbo frame support for OvS-DPDK is implemented by increasing the size of mbufs within a mempool, such that each mbuf within the pool is large enough to contain an entire jumbo frame of a user-defined size. Typically, for each user-defined MTU, 'requested_mtu', a new mempool is created, containing mbufs of size ~requested_mtu. With the multi-segment approach, a port uses a single mempool, (containing standard/default-sized mbufs of ~2k bytes), irrespective of the user-requested MTU value. To accommodate jumbo frames, mbufs are chained together, where each mbuf in the chain stores a portion of the jumbo frame. Each mbuf in the chain is termed a segment, hence the name. == Enabling multi-segment mbufs == Multi-segment and single-segment mbufs are mutually exclusive, and the user must decide on which approach to adopt on init. The introduction of a new OVSDB field, 'dpdk-multi-seg-mbufs', facilitates this. This is a global boolean value, which determines how jumbo frames are represented across all DPDK ports. In the absence of a user-supplied value, 'dpdk-multi-seg-mbufs' defaults to false, i.e. multi-segment mbufs must be explicitly enabled / single-segment mbufs remain the default. Setting the field is identical to setting existing DPDK-specific OVSDB fields: ovs-vsctl set Open_vSwitch . other_config:dpdk-init=true ovs-vsctl set Open_vSwitch . other_config:dpdk-lcore-mask=0x10 ovs-vsctl set Open_vSwitch . other_config:dpdk-socket-mem=4096,0 ==> ovs-vsctl set Open_vSwitch . 
other_config:dpdk-multi-seg-mbufs=true Signed-off-by: Mark Kavanagh--- NEWS | 1 + lib/dpdk.c | 7 +++ lib/netdev-dpdk.c| 43 --- lib/netdev-dpdk.h| 1 + vswitchd/vswitch.xml | 20 5 files changed, 69 insertions(+), 3 deletions(-) diff --git a/NEWS b/NEWS index c15dc24..657b598 100644 --- a/NEWS +++ b/NEWS @@ -15,6 +15,7 @@ Post-v2.8.0 - DPDK: * Add support for DPDK v17.11 * Add support for vHost IOMMU feature + * Add support for multi-segment mbufs v2.8.0 - 31 Aug 2017 diff --git a/lib/dpdk.c b/lib/dpdk.c index 8da6c32..4c28bd0 100644 --- a/lib/dpdk.c +++ b/lib/dpdk.c @@ -450,6 +450,13 @@ dpdk_init__(const struct smap *ovs_other_config) /* Finally, register the dpdk classes */ netdev_dpdk_register(); + +bool multi_seg_mbufs_enable = smap_get_bool(ovs_other_config, +"dpdk-multi-seg-mbufs", false); +if (multi_seg_mbufs_enable) { +VLOG_INFO("DPDK multi-segment mbufs enabled\n"); +netdev_dpdk_multi_segment_mbufs_enable(); +} } void diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index 36275bd..293edad 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -65,6 +65,7 @@ enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM}; VLOG_DEFINE_THIS_MODULE(netdev_dpdk); static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20); +static bool dpdk_multi_segment_mbufs = false; #define DPDK_PORT_WATCHDOG_INTERVAL 5 @@ -500,6 +501,7 @@ dpdk_mp_create(struct netdev_dpdk *dev, uint16_t frame_len) + dev->requested_n_txq * dev->requested_txq_size + MIN(RTE_MAX_LCORE, dev->requested_n_rxq) * NETDEV_MAX_BURST + MIN_NB_MBUF; +/* XXX (RFC) - should n_mbufs be increased if multi-seg mbufs are used? */ ovs_mutex_lock(_mp_mutex); do { @@ -568,7 +570,13 @@ dpdk_mp_free(struct rte_mempool *mp) /* Tries to allocate a new mempool - or re-use an existing one where * appropriate - on requested_socket_id with a size determined by - * requested_mtu and requested Rx/Tx queues. + * requested_mtu and requested Rx/Tx queues. 
Some properties of the mempool's + * elements are dependent on the value of 'dpdk_multi_segment_mbufs': + * - if 'true', then the mempool contains standard-sized mbufs that are chained + * together to accommodate packets of size 'requested_mtu'. + * - if 'false', then the members of the allocated mempool are + * non-standard-sized mbufs. Each mbuf in the mempool is large enough to fully + * accommodate packets of size 'requested_mtu'. * On success - or when re-using an existing mempool - the new configuration * will be applied. * On error, device will be left unchanged. */ @@ -576,10 +584,18 @@ static int netdev_dpdk_mempool_configure(struct netdev_dpdk *dev) OVS_REQUIRES(dev->mutex) { -uint16_t buf_size = dpdk_buf_size(dev->requested_mtu); +uint16_t buf_size = 0; struct rte_mempool *mp; int ret = 0; +/* Contiguous mbufs in use - permit oversized mbufs */ +if (!dpdk_multi_segment_mbufs) { +buf_size = dpdk_buf_size(dev->requested_mtu); +} else { +/* multi-segment mbufs - use standard mbuf size */ +buf_size = dpdk_buf_size(ETHER_MTU); +} + mp = dpdk_mp_create(dev, buf_size); if (!mp) { VLOG_ERR("Failed to create memory pool for
[ovs-dev] [RFC PATCH v3 7/8] netdev-dpdk: copy large packet to multi-seg. mbufs
From: Michael QiuCurrently, packets are only copied to a single segment in the function dpdk_do_tx_copy(). This could be an issue in the case of jumbo frames, particularly when multi-segment mbufs are involved. This patch calculates the number of segments needed by a packet and copies the data to each segment. Signed-off-by: Michael Qiu Signed-off-by: Mark Kavanagh --- lib/netdev-dpdk.c | 55 +++ 1 file changed, 51 insertions(+), 4 deletions(-) diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index 61a0dca..36275bd 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -1824,8 +1824,10 @@ dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet_batch *batch) #endif struct netdev_dpdk *dev = netdev_dpdk_cast(netdev); struct rte_mbuf *pkts[PKT_ARRAY_SIZE]; +struct rte_mbuf *temp, *head = NULL; uint32_t cnt = batch_cnt; uint32_t dropped = 0; +uint32_t i, j, nb_segs; if (dev->type != DPDK_DEV_VHOST) { /* Check if QoS has been configured for this netdev. */ @@ -1838,9 +1840,10 @@ dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet_batch *batch) uint32_t txcnt = 0; -for (uint32_t i = 0; i < cnt; i++) { +for (i = 0; i < cnt; i++) { struct dp_packet *packet = batch->packets[i]; uint32_t size = dp_packet_size(packet); +uint16_t max_data_len, data_len; if (OVS_UNLIKELY(size > dev->max_packet_len)) { VLOG_WARN_RL(, "Too big size %u max_packet_len %d", @@ -1850,15 +1853,59 @@ dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet_batch *batch) continue; } -pkts[txcnt] = rte_pktmbuf_alloc(dev->mp); +temp = pkts[txcnt] = rte_pktmbuf_alloc(dev->mp); if (OVS_UNLIKELY(!pkts[txcnt])) { dropped += cnt - i; break; } +/* All new allocated mbuf's max data len is the same */ +max_data_len = temp->buf_len - temp->data_off; + +/* Calculate # of output mbufs. */ +nb_segs = size / max_data_len; +if (size % max_data_len) +nb_segs = nb_segs + 1; + +/* Allocate additional mbufs when multiple output mbufs required. 
*/ +for (j = 1; j < nb_segs; j++) { +temp->next = rte_pktmbuf_alloc(dev->mp); +if (!temp->next) { +rte_pktmbuf_free(pkts[txcnt]); +pkts[txcnt] = NULL; +break; +} +temp = temp->next; +} /* We have to do a copy for now */ -memcpy(rte_pktmbuf_mtod(pkts[txcnt], void *), - dp_packet_data(packet), size); +rte_pktmbuf_pkt_len(pkts[txcnt]) = size; +temp = pkts[txcnt]; + +data_len = size < max_data_len ? size: max_data_len; +if (packet->source == DPBUF_DPDK) { +head = &(packet->mbuf); +while (temp && head && size > 0) { +rte_memcpy(rte_pktmbuf_mtod(temp, void*), +dp_packet_data((struct dp_packet *)head), data_len); +rte_pktmbuf_data_len(temp) = data_len; +head = head->next; +size = size - data_len; +data_len = size < max_data_len ? size: max_data_len; +temp = temp->next; +} +} else { +int offset = 0; +while (temp && size > 0) { +memcpy(rte_pktmbuf_mtod(temp, void *), +dp_packet_at(packet, offset, data_len), data_len); +rte_pktmbuf_data_len(temp) = data_len; +temp = temp->next; +size = size - data_len; +offset += data_len; +data_len = size < max_data_len ? size: max_data_len; +} +} + dp_packet_set_size((struct dp_packet *)pkts[txcnt], size); pkts[txcnt]->nb_segs = packet->mbuf.nb_segs; pkts[txcnt]->ol_flags = packet->mbuf.ol_flags; -- 1.9.3 ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
[ovs-dev] [RFC PATCH v3 6/8] lib/dp-packet: copy data from multi-seg. DPDK mbuf
From: Michael QiuWhen doing packet clone, if packet source is from DPDK driver, multi-segment must be considered, and copy the segment's data one by one. Signed-off-by: Michael Qiu Signed-off-by: Mark Kavanagh --- lib/dp-packet.c | 24 ++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/lib/dp-packet.c b/lib/dp-packet.c index 5c590e5..26fff02 100644 --- a/lib/dp-packet.c +++ b/lib/dp-packet.c @@ -166,10 +166,30 @@ struct dp_packet * dp_packet_clone_with_headroom(const struct dp_packet *buffer, size_t headroom) { struct dp_packet *new_buffer; +uint32_t pkt_len = dp_packet_size(buffer); +#ifdef DPDK_NETDEV +/* copy multi-seg data */ +if (buffer->source == DPBUF_DPDK && buffer->mbuf.nb_segs > 1) { +uint32_t offset = 0; +void *dst = NULL; +struct rte_mbuf *tmbuf = CONST_CAST(struct rte_mbuf *, &(buffer->mbuf)); + +new_buffer = dp_packet_new_with_headroom(pkt_len, headroom); +dp_packet_set_size(new_buffer, pkt_len + headroom); +dst = dp_packet_tail(new_buffer); + +while (tmbuf) { +rte_memcpy((char *)dst + offset, + rte_pktmbuf_mtod(tmbuf, void *), tmbuf->data_len); +offset += tmbuf->data_len; +tmbuf = tmbuf->next; +} +} +else +#endif new_buffer = dp_packet_clone_data_with_headroom(dp_packet_data(buffer), - dp_packet_size(buffer), - headroom); + pkt_len, headroom); /* Copy the following fields into the returned buffer: l2_pad_size, * l2_5_ofs, l3_ofs, l4_ofs, cutlen, packet_type and md. */ memcpy(_buffer->l2_pad_size, >l2_pad_size, -- 1.9.3 ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
[ovs-dev] [RFC PATCH v3 5/8] lib/dp-packet: fix dp_packet_put_uninit for multi-seg mbufs
dp_packet_put_uninit(dp_packet, size) appends 'size' bytes to the tail of a dp_packet. In the case of multi-segment mbufs, it is the data length of the last mbuf in the mbuf chain that should be adjusted by 'size' bytes. In its current implementation, dp_packet_put_uninit() adjusts the dp_packet's size via a call to dp_packet_set_size(); however, this adjusts the data length of the first mbuf in the chain, which is incorrect in the case of multi-segment mbufs. Instead, traverse the mbuf chain to locate the final mbuf of said chain, and update its data_len [1]. To finish, increase the packet length of the entire mbuf [2] by 'size'. [1] In the case of a single-segment mbuf, this is the mbuf itself. [2] This is stored in the first mbuf of an mbuf chain. Signed-off-by: Mark Kavanagh--- lib/dp-packet.c | 16 1 file changed, 16 insertions(+) diff --git a/lib/dp-packet.c b/lib/dp-packet.c index 5078211..5c590e5 100644 --- a/lib/dp-packet.c +++ b/lib/dp-packet.c @@ -325,6 +325,22 @@ dp_packet_put_uninit(struct dp_packet *b, size_t size) void *p; dp_packet_prealloc_tailroom(b, size); p = dp_packet_tail(b); +#ifdef DPDK_NETDEV +if (b->source == DPBUF_DPDK) { +struct rte_mbuf *buf = &(b->mbuf); +/* In the case of multi-segment mbufs, the data length of the last mbuf + * should be adjusted by 'size' bytes. A call to dp_packet_size() would + * adjust the data length of the first mbuf in the segment, so we avoid + * invoking same; as a result, the packet length of the entire mbuf + * chain (stored in the first mbuf of said chain) must be adjusted here + * instead. + */ +while (buf->next) +buf = buf->next; +buf->data_len += size; +b->mbuf.pkt_len += size; +} else +#endif dp_packet_set_size(b, dp_packet_size(b) + size); return p; } -- 1.9.3 ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
[ovs-dev] [RFC PATCH v3 4/8] lib/dp-packet: Fix data_len issue with multi-segs
From: Michael QiuWhen a packet is from DPDK source, and it contains multiple segments, data_len is not equal to the packet size. This patch fixes this issue. Co-authored-by: Mark Kavanagh Co-authored-by: Przemyslaw Lal Co-authored-by: Marcin Ksiadz Co-authored-by: Yuanhan Liu Signed-off-by: Michael Qiu Signed-off-by: Mark Kavanagh Signed-off-by: Przemyslaw Lal Signed-off-by: Marcin Ksiadz Signed-off-by: Yuanhan Liu --- lib/dp-packet.h | 18 -- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/lib/dp-packet.h b/lib/dp-packet.h index 7aa440f..c2736d3 100644 --- a/lib/dp-packet.h +++ b/lib/dp-packet.h @@ -23,6 +23,7 @@ #ifdef DPDK_NETDEV #include #include +#include "rte_ether.h" #endif #include "netdev-dpdk.h" @@ -429,17 +430,14 @@ dp_packet_size(const struct dp_packet *b) static inline void dp_packet_set_size(struct dp_packet *b, uint32_t v) { -/* netdev-dpdk does not currently support segmentation; consequently, for - * all intents and purposes, 'data_len' (16 bit) and 'pkt_len' (32 bit) may - * be used interchangably. - * - * On the datapath, it is expected that the size of packets - * (and thus 'v') will always be <= UINT16_MAX; this means that there is no - * loss of accuracy in assigning 'v' to 'data_len'. +/* + * Assign current segment length. If total length is greater than + * max data length in a segment, additional calculation is needed */ -b->mbuf.data_len = (uint16_t)v; /* Current seg length. */ -b->mbuf.pkt_len = v; /* Total length of all segments linked to - * this segment. */ +b->mbuf.data_len = MIN(v, b->mbuf.buf_len - b->mbuf.data_off); + +/* Total length of all segments linked to this segment. */ +b->mbuf.pkt_len = v; } static inline uint16_t -- 1.9.3 ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
[ovs-dev] [RFC PATCH v3 3/8] lib/dp-packet: copy mbuf info for packet copy
From: Michael QiuCurrently, when doing packet copy, lots of DPDK mbuf's info will be missed, like packet type, ol_flags, etc. Those information is very important for DPDK to do packets processing. Signed-off-by: Michael Qiu [mark.b.kavan...@intel.com rebased] Signed-off-by: Mark Kavanagh --- lib/dp-packet.c | 3 +++ lib/netdev-dpdk.c | 4 2 files changed, 7 insertions(+) diff --git a/lib/dp-packet.c b/lib/dp-packet.c index 443c225..5078211 100644 --- a/lib/dp-packet.c +++ b/lib/dp-packet.c @@ -178,6 +178,9 @@ dp_packet_clone_with_headroom(const struct dp_packet *buffer, size_t headroom) #ifdef DPDK_NETDEV new_buffer->mbuf.ol_flags = buffer->mbuf.ol_flags; +new_buffer->mbuf.tx_offload = buffer->mbuf.tx_offload; +new_buffer->mbuf.packet_type = buffer->mbuf.packet_type; +new_buffer->mbuf.nb_segs = buffer->mbuf.nb_segs; #else new_buffer->rss_hash_valid = buffer->rss_hash_valid; #endif diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index c5eb851..61a0dca 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -1860,6 +1860,10 @@ dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet_batch *batch) memcpy(rte_pktmbuf_mtod(pkts[txcnt], void *), dp_packet_data(packet), size); dp_packet_set_size((struct dp_packet *)pkts[txcnt], size); +pkts[txcnt]->nb_segs = packet->mbuf.nb_segs; +pkts[txcnt]->ol_flags = packet->mbuf.ol_flags; +pkts[txcnt]->packet_type = packet->mbuf.packet_type; +pkts[txcnt]->tx_offload = packet->mbuf.tx_offload; txcnt++; } -- 1.9.3 ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
[ovs-dev] [RFC PATCH v3 0/8] netdev-dpdk: support multi-segment mbufs
Overview This patchset introduces support for multi-segment mbufs to OvS-DPDK. Multi-segment mbufs are typically used when the size of an mbuf is insufficient to contain the entirety of a packet's data. Instead, the data is split across numerous mbufs, each carrying a portion, or 'segment', of the packet data. mbufs are chained via their 'next' attribute (an mbuf pointer). Use Cases = i. Handling oversized (guest-originated) frames, which are marked for hardware acceleration/offload (TSO, for example). Packets which originate from a non-DPDK source may be marked for offload; as such, they may be larger than the permitted ingress interface's MTU, and may be stored in an oversized dp-packet. In order to transmit such packets over a DPDK port, their contents must be copied to a DPDK mbuf (via dpdk_do_tx_copy). However, in its current implementation, that function only copies data into a single mbuf; if the space available in the mbuf is exhausted, but not all packet data has been copied, then it is lost. Similarly, when cloning a DPDK mbuf, it must be considered whether that mbuf contains multiple segments. Both issues are resolved within this patchset. ii. Handling jumbo frames. While OvS already supports jumbo frames, it does so by increasing mbuf size, such that the entirety of a jumbo frame may be handled in a single mbuf. This is certainly the preferred, and most performant approach (and remains the default). However, it places high demands on system memory; multi-segment mbufs may be preferable for systems which are memory-constrained. Enabling multi-segment mbufs Multi-segment and single-segment mbufs are mutually exclusive, and the user must decide on which approach to adopt on init. The introduction of a new OVSDB field, 'dpdk-multi-seg-mbufs', facilitates this. This is a global boolean value, which determines how jumbo frames are represented across all DPDK ports. In the absence of a user-supplied value, 'dpdk-multi-seg-mbufs' defaults to false, i.e.
multi-segment mbufs must be explicitly enabled / single-segment mbufs remain the default. Setting the field is identical to setting existing DPDK-specific OVSDB fields: ovs-vsctl set Open_vSwitch . other_config:dpdk-init=true ovs-vsctl set Open_vSwitch . other_config:dpdk-lcore-mask=0x10 ovs-vsctl set Open_vSwitch . other_config:dpdk-socket-mem=4096,0 ==> ovs-vsctl set Open_vSwitch . other_config:dpdk-multi-seg-mbufs=true Notes = This patchset was generated and built against: - Current HEAD of OvS master [1] + DPDK v17.11 support patchset [2] - DPDK v17.11 [1] 3728b3b ("Merge branch 'dpdk_merge' of https://github.com/ist...;) [2] https://patchwork.ozlabs.org/series/13829/mbox/ Mark Kavanagh (4): netdev-dpdk: simplify mbuf sizing lib/dp-packet: init specific mbuf fields to 0 lib/dp-packet: fix dp_packet_put_uninit for multi-seg mbufs netdev-dpdk: support multi-segment jumbo frames Michael Qiu (4): lib/dp-packet: copy mbuf info for packet copy lib/dp-packet: Fix data_len issue with multi-segs lib/dp-packet: copy data from multi-seg. DPDK mbuf netdev-dpdk: copy large packet to multi-seg. mbufs NEWS | 1 + lib/dp-packet.c | 43 +- lib/dp-packet.h | 24 +- lib/dpdk.c | 7 +++ lib/netdev-dpdk.c| 122 ++- lib/netdev-dpdk.h| 1 + vswitchd/vswitch.xml | 20 + 7 files changed, 183 insertions(+), 35 deletions(-) -- 1.9.3 ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
Re: [ovs-dev] [PATCH] datapath-windows: Account for VLAN tag in tunnel Decap
Acked-by: Anand KumarThanks, Anand Kumar On 11/20/17, 3:06 PM, "ovs-dev-boun...@openvswitch.org on behalf of Shashank Ram" wrote: Decap functions for tunneling protocols do not compute the packet header offsets correctly when there is a VLAN tag in the L2 header. This results in incorrect checksum computation causing the packet to be dropped. This patch adds support to account for the VLAN tag in the packet if its present, and makes use of the OvsExtractLayers() function to correctly compute the header offsets for different layers. Testing done: - Tested Geneve, STT, Vxlan and Gre and verified that there are no regressions. - Verified that packets with VLAN tags are correctly handled in the decap code of all tunneling protocols. Previously, this would result in packet drops due to invalid checksums being computed. - Verified that non-VLAN tagged packets are handled correctly. Signed-off-by: Shashank Ram --- datapath-windows/ovsext/Geneve.c | 14 + datapath-windows/ovsext/Geneve.h | 6 ++ datapath-windows/ovsext/Gre.c | 29 -- datapath-windows/ovsext/Gre.h | 16 ++ datapath-windows/ovsext/Offload.c | 10 + datapath-windows/ovsext/Offload.h | 3 ++- datapath-windows/ovsext/Stt.c | 44 +++ datapath-windows/ovsext/Stt.h | 6 ++ datapath-windows/ovsext/Vxlan.c | 14 + datapath-windows/ovsext/Vxlan.h | 6 ++ 10 files changed, 111 insertions(+), 37 deletions(-) diff --git a/datapath-windows/ovsext/Geneve.c b/datapath-windows/ovsext/Geneve.c index 6dca69b..210716d 100644 --- a/datapath-windows/ovsext/Geneve.c +++ b/datapath-windows/ovsext/Geneve.c @@ -262,10 +262,16 @@ NDIS_STATUS OvsDecapGeneve(POVS_SWITCH_CONTEXT switchContext, PUINT8 bufferStart; PVOID optStart; NDIS_STATUS status; +OVS_PACKET_HDR_INFO layers = { 0 }; + +status = OvsExtractLayers(curNbl, ); +if (status != NDIS_STATUS_SUCCESS) { +return status; +} /* Check the length of the UDP payload */ curNb = NET_BUFFER_LIST_FIRST_NB(curNbl); -tunnelSize = OvsGetGeneveTunHdrMinSize(); +tunnelSize = 
OvsGetGeneveTunHdrSizeFromLayers(); packetLength = NET_BUFFER_DATA_LENGTH(curNb); if (packetLength <= tunnelSize) { return NDIS_STATUS_INVALID_LENGTH; @@ -295,13 +301,13 @@ NDIS_STATUS OvsDecapGeneve(POVS_SWITCH_CONTEXT switchContext, ethHdr = (EthHdr *)bufferStart; /* XXX: Handle IP options. */ -ipHdr = (IPHdr *)((PCHAR)ethHdr + sizeof *ethHdr); +ipHdr = (IPHdr *)(bufferStart + layers.l3Offset); tunKey->src = ipHdr->saddr; tunKey->dst = ipHdr->daddr; tunKey->tos = ipHdr->tos; tunKey->ttl = ipHdr->ttl; tunKey->pad = 0; -udpHdr = (UDPHdr *)((PCHAR)ipHdr + sizeof *ipHdr); +udpHdr = (UDPHdr *)(bufferStart + layers.l4Offset); /* Validate if NIC has indicated checksum failure. */ status = OvsValidateUDPChecksum(curNbl, udpHdr->check == 0); @@ -312,7 +318,7 @@ NDIS_STATUS OvsDecapGeneve(POVS_SWITCH_CONTEXT switchContext, /* Calculate and verify UDP checksum if NIC didn't do it. */ if (udpHdr->check != 0) { status = OvsCalculateUDPChecksum(curNbl, curNb, ipHdr, udpHdr, - packetLength); + packetLength, ); tunKey->flags |= OVS_TNL_F_CSUM; if (status != NDIS_STATUS_SUCCESS) { goto dropNbl; diff --git a/datapath-windows/ovsext/Geneve.h b/datapath-windows/ovsext/Geneve.h index 019c0dd..db758dd 100644 --- a/datapath-windows/ovsext/Geneve.h +++ b/datapath-windows/ovsext/Geneve.h @@ -113,6 +113,12 @@ OvsGetGeneveTunHdrMaxSize(VOID) return OvsGetGeneveTunHdrMinSize() + TUN_OPT_MAX_LEN; } +static __inline UINT32 +OvsGetGeneveTunHdrSizeFromLayers(POVS_PACKET_HDR_INFO layers) +{ +return layers->l7Offset + sizeof(GeneveHdr); +} + #define GENEVE_UDP_PORT 6081 #define GENEVE_UDP_PORT_NBO 0xC117 #define GENEVE_VER 0 diff --git a/datapath-windows/ovsext/Gre.c b/datapath-windows/ovsext/Gre.c index f095742..1f38ee7 100644 --- a/datapath-windows/ovsext/Gre.c +++ b/datapath-windows/ovsext/Gre.c @@ -317,35 +317,42 @@ OvsDecapGre(POVS_SWITCH_CONTEXT switchContext, GREHdr *greHdr; UINT32 tunnelSize, packetLength; UINT32 headRoom = 0; +UINT32 maxGreLen; PUINT8 bufferStart; NDIS_STATUS status =
Re: [ovs-dev] [PATCH] dpif-netdev: Refactor datapath flow cache
Hi Billy, Thanks for your feedback. I continue to have problems with sending out patches via mail from our company network. This time it seems it is the "Page Feed" (^L) character in the dpif-netdev.c file that got corrupted in the emailed version of the patch. I wish there was a way to upload patches other than through email. Please try the attached patch file instead. This one applies cleanly on the tip of master for me. Some more answers below. BR, Jan > > The EMC implementation is simplified by removing the possibility to store a > > flow > > in two slots, as there is no particular reason why two flows should > > systematically > > collide (the RSS hash is not symmetric). > > The maximum size of the EMC flow key is limited to 256 bytes to reduce the > > memory footprint. This should be sufficient to hold most real life packet > > flow > > keys. Larger flows are not installed in the EMC. > [[BO'M]] Does miniflow_extract work ok with the reduced miniflow size? > Miniflow comment says: > Caller is responsible for initializing 'dst' with enough storage for > FLOW_U64S * 8 bytes. > [Jan] miniflow_extract() is not used here. At EMC insertion I just copy the extracted miniflow into the EMC entry, provided that its length does not exceed the 248 available octets. If it is too large I simply don't insert the flow. > > The pmd-stats-show command is enhanced to show both EMC and DFC hits > > separately. > > > > The sweep speed for cleaning up obsolete EMC and DFC flow entries and > > freeing > > dead megaflow entries is increased. With a typical PMD cycle duration of > > 100us > > under load and checking one DFC entry per cycle, the DFC sweep should > > normally complete within in 100s. > > > > In PVP performance tests with an L3 pipeline over VXLAN we determined the > > optimal EMC size to be 16K entries to obtain a uniform speedup compared to > > the master branch over the full range of parallel flows. 
The measurement > > below > > is for 64 byte packets and the average number of subtable lookups per DPCLS > > hit > > in this pipeline is 1.0, i.e. the acceleration already starts for a single > > busy mask. > > Tests with many visited subtables should show a strong increase of the gain > > through DFC. > > > > Flows master DFC+EMC Gain > > [Mpps] [Mpps] > > -- > > 8 4.454.62 3.8% > > 100 4.174.47 7.2% > > 10003.884.3412.0% > > 20003.544.1717.8% > > 50003.013.8227.0% > > 1 2.753.6331.9% > > 2 2.643.5032.8% > > 5 2.603.3328.1% > > 10 2.593.2324.7% > > 50 2.593.1621.9% > [[BO'M]] > What is the flow distribution here? > > Are there other flow distributions that we want to ensure do not suffer a > possible regression. I'm not sure what they are exactly - I have > some ideas but admittedly I've only every tested with either a uniform flow > distribution or else a round-robin distribution. [Jan] Since we are using DPDK Pktgen as traffic generator in our standard setup, this is a round-robin flow distribution, which from all I understand, is the worst-case for flow caching as the flow packets are equidistant and there is no clustering. I would be happy if people with other traffic generators can provide complementary measurements. > > + > > +/* EMC lookup not successful: try DFC lookup. */ > > +struct dfc_entry *current_entry = dfc_entry_get(cache, key->hash); > > +flow = current_entry->flow; > > [[BO'M]] There is no equivalent of netdev_flow_key_equal_mf() used in > emc_lookup here - which does a memcmp to verify the minflows > are exactly equal. Without something similar for the DFC Isn't there a > possibility that the 20bit hashes of two packets will accidently point > to the same flow struct? > Yes, there is a risk for collision. The check whether the DFC entry points to the correct megaflow entry is done below. 
> > > > -if (min && random_uint32() <= min) { > > -emc_insert(&pmd->flow_cache, key, flow); > > +if (dfc_entry_alive(current_entry) && > > +dpcls_rule_matches_key(&flow->cr, key)) { [Jan] If the pointed to megaflow is alive, we use the existing function dpcls_rule_matches_key() to do a masked comparison of the packet's flow key with the megaflow's rule. This function is used by the DPCLS classifier to check the packet against rules in the subtable matching the subtable hash and we can reuse it as is. In case the match fails, we have a DFC miss. The subsequent DPCLS lookup will then overwrite the current DFC entry. I decided not to implement probabilistic DFC insertion for two reasons: a) the DFC is much larger so will happen at much higher levels, and b) updating the megaflow pointer in the DFC is far cheaper than EMC insertion. ___ dev mailing list d...@openvswitch.org
Re: [ovs-dev] [PATCH 1/2] ovn-ctl: Add -vfile:info option to OVN_NB/SB_LOG options
Thank you! On Nov 20, 2017 10:07 PM, "Ben Pfaff"wrote: > Yes, of course. I didn't get much work done last week with ovscon > going. > > On Mon, Nov 20, 2017 at 09:11:41PM +0530, Numan Siddique wrote: > > A gentle ping. Ben, is it possible to have a look at this patch set - > > https://patchwork.ozlabs.org/project/openvswitch/list/?series=12481. > > > > Thanks > > Numan > > > > > > On Wed, Nov 8, 2017 at 2:30 PM, Numan Siddique > wrote: > > > > > > > > > > > On Wed, Nov 8, 2017 at 9:36 AM, Numan Siddique > > > wrote: > > > > > >> > > >> > > >> On Wed, Nov 8, 2017 at 9:25 AM, Ben Pfaff wrote: > > >> > > >>> On Wed, Nov 08, 2017 at 08:57:38AM +0530, Numan Siddique wrote: > > >>> > On Wed, Nov 8, 2017 at 8:36 AM, Ben Pfaff wrote: > > >>> > > > >>> > > On Tue, Nov 07, 2017 at 09:01:06PM +0530, nusid...@redhat.com > wrote: > > >>> > > > From: Numan Siddique > > >>> > > > > > >>> > > > In the RHEL environment, when OVN db servers are started using > > >>> ovn-ctl, > > >>> > > > log files are empty. Adding "-vfile:info" option to > ovsdb-server is > > >>> > > > resolving this issue. Running 'ovs-apptctl -t .. vlog/reopen" > > >>> results in > > >>> > > the > > >>> > > > logs appearing in the log files. This issue is seen with 2.7.2. > > >>> > > > > > >>> > > > "-vfile:info" option is passed to ovn-northd and ovn-controller > > >>> when > > >>> > > starting. > > >>> > > > There is no harm in adding this to OVN db servers. > > >>> > > > > > >>> > > > Signed-off-by: Numan Siddique > > >>> > > > > >>> > > This should be harmless, since "info" is the default log level, > but I > > >>> > > don't understand why it makes a difference. Is there an > underlying > > >>> bug > > >>> > > that should be fixed? (Is the --log-file option being passed in? > > >>> That > > >>> > > is the documented way to enable logging to a file.) > > >>> > > > > >>> > > > >>> > Yes. --log-file option is passed. I only see this issue with RHEL > and > > >>> not > > >>> > with centos. 
> > >>> > And if I restart the DB servers later, the log file gets generated > > >>> > properly. It is seen > > >>> > only for the first time durng tripleo deployment in RHEL > environment. I > > >>> > couldn't figure out the underlying > > >>> > cause. Since "vlog:info" is passed for ovn-northd and > ovn-controller, I > > >>> > thought of taking this approach. > > >>> > But I agree there seems to be underlying issue and this patch might > > >>> hide > > >>> > it. > > >>> > > >>> Have you spent a little time trying to track down the root cause? If > > >>> you have already done that, and still did not figure it out, then I'm > > >>> happy to apply this patch. Let me know. > > >>> > > >> > > >> Thanks Ben. I did try. I am trying again with a fresh deployment to > see > > >> if I can find anything. > > >> I will come back to you with the results. > > >> > > >> > > > Hi Ben, > > > > > > I couldn't find the root cause. I submitted v2 of the patch series. > > > > > > Thanks > > > Numan > > > > > > > > >> Thanks > > >> Numan > > >> > > >> > > >> > > > > ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
Re: [ovs-dev] OVS DPDK: dpdk_merge pull request
> On Fri, Nov 17, 2017 at 07:27:51PM +, Stokes, Ian wrote: > > The following changes since commit > 1ae83bb20677b42d63dbb2140fa8ed3144c6260f: > > > > netdev-tc-offloads: Add support for action set (2017-11-16 08:10:29 > > -0800) > > > > are available in the git repository at: > > > > https://github.com/istokes/ovs dpdk_merge > > > > for you to fetch changes up to af5b0dad30359d14d6412eb80e81783a83a678ce: > > > > netdev-dpdk: Fix mempool creation with large MTU. (2017-11-17 > > 16:26:33 +) > > Thanks a lot. I merged this branch into master. Thanks Ben. > > This yields a new "sparse" warning: > ../lib/netdev-tc-offloads.c:584:20: warning: Variable length array is > used. > because of this declaration in parse_put_flow_set_masked_action(): > char *set_buff[set_len], *set_data, *set_mask; > So this was introduced in commit 1ae83bb20677b42d63dbb2140fa8ed3144c6260f: netdev-tc-offloads: Add support for action set. This was pushed just after I had submitted the pull request (but wasn't actually part of the pull request patches). It's strange though, I ran sparse before submitting the pull request and it came back clean. Checking after the merge and I don't see the warning you see. (Normally I would not submit a pull request if a sparse warning occurred). What version of sparse are you using out of interest? I'm using sparse-0.5.0-10. > It may or may not be worth fixing that. In favor of fixing it is keeping > OVS sparse warning free, but against it is that the replacement would > probably be malloc(), which is slower. > > However, looking a bit closer (now that I've already done the merge), I > think there is a bug here. set_buff[] is declared as char > *set_buff[set_len], which has size (set_len * sizeof(char *)), but only > set_len bytes of it are used. I think that the right declaration would be > more like uint8_t set_buff[set_len];, although that would lack the proper > alignment for struct nl_attr. 
> > Maybe something like this would be the appropriate fix for both issues. > Will you please review it? Sure, will be happy to review, but would also be good to get someone with tc-offload experience to sign off also. Roi has also submitted a patch to fix the issue, maybe Roi could review below also? https://mail.openvswitch.org/pipermail/ovs-dev/2017-November/34.html > > Thanks, > > Ben. > > diff --git a/lib/netdev-tc-offloads.c b/lib/netdev-tc-offloads.c index > 781d849e4a87..d6b61f6360d0 100644 > --- a/lib/netdev-tc-offloads.c > +++ b/lib/netdev-tc-offloads.c > @@ -581,7 +581,9 @@ parse_put_flow_set_masked_action(struct tc_flower > *flower, > bool hasmask) { > static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20); > -char *set_buff[set_len], *set_data, *set_mask; > +uint64_t set_stub[1024 / 8]; > +struct ofpbuf set_buf = OFPBUF_STUB_INITIALIZER(set_stub); > +char *set_data, *set_mask; > char *key = (char *) >rewrite.key; > char *mask = (char *) >rewrite.mask; > const struct nlattr *attr; > @@ -589,8 +591,7 @@ parse_put_flow_set_masked_action(struct tc_flower > *flower, > size_t size; > > /* copy so we can set attr mask to 0 for used ovs key struct members > */ > -memcpy(set_buff, set, set_len); > -attr = (struct nlattr *) set_buff; > +attr = ofpbuf_put(_buf, set, set_len); > > type = nl_attr_type(attr); > size = nl_attr_get_size(attr) / 2; > @@ -600,6 +601,7 @@ parse_put_flow_set_masked_action(struct tc_flower > *flower, > if (type >= ARRAY_SIZE(set_flower_map) > || !set_flower_map[type][0].size) { > VLOG_DBG_RL(, "unsupported set action type: %d", type); > +ofpbuf_uninit(_buf); > return EOPNOTSUPP; > } > > @@ -632,9 +634,11 @@ parse_put_flow_set_masked_action(struct tc_flower > *flower, > if (hasmask && !is_all_zeros(set_mask, size)) { > VLOG_DBG_RL(, "unsupported sub attribute of set action type > %d", > type); > +ofpbuf_uninit(_buf); > return EOPNOTSUPP; > } > > +ofpbuf_uninit(_buf); > return 0; > } > ___ dev mailing list d...@openvswitch.org 
https://mail.openvswitch.org/mailman/listinfo/ovs-dev
[ovs-dev] [PATCH 5/7] netdev-tc-offloads: Remove redundant loop handling ovs action set
From: Paul BlakeyOVS action set always has a single nested OVS_KEY_ATTR_* so there is no need to iterate it. Signed-off-by: Paul Blakey Reviewed-by: Roi Dayan --- lib/netdev-tc-offloads.c | 90 +++- 1 file changed, 44 insertions(+), 46 deletions(-) diff --git a/lib/netdev-tc-offloads.c b/lib/netdev-tc-offloads.c index 6ad2551..1f77b55 100644 --- a/lib/netdev-tc-offloads.c +++ b/lib/netdev-tc-offloads.c @@ -644,56 +644,54 @@ static int parse_put_flow_set_action(struct tc_flower *flower, const struct nlattr *set, size_t set_len) { -const struct nlattr *set_attr; -size_t set_left; - -NL_ATTR_FOR_EACH_UNSAFE (set_attr, set_left, set, set_len) { -if (nl_attr_type(set_attr) == OVS_KEY_ATTR_TUNNEL) { -const struct nlattr *tunnel = nl_attr_get(set_attr); -const size_t tunnel_len = nl_attr_get_size(set_attr); -const struct nlattr *tun_attr; -size_t tun_left; - -flower->set.set = true; -NL_ATTR_FOR_EACH_UNSAFE (tun_attr, tun_left, tunnel, tunnel_len) { -switch (nl_attr_type(tun_attr)) { -case OVS_TUNNEL_KEY_ATTR_ID: { -flower->set.id = nl_attr_get_be64(tun_attr); -} -break; -case OVS_TUNNEL_KEY_ATTR_IPV4_SRC: { -flower->set.ipv4.ipv4_src = nl_attr_get_be32(tun_attr); -} -break; -case OVS_TUNNEL_KEY_ATTR_IPV4_DST: { -flower->set.ipv4.ipv4_dst = nl_attr_get_be32(tun_attr); -} -break; -case OVS_TUNNEL_KEY_ATTR_IPV6_SRC: { -flower->set.ipv6.ipv6_src = -nl_attr_get_in6_addr(tun_attr); -} -break; -case OVS_TUNNEL_KEY_ATTR_IPV6_DST: { -flower->set.ipv6.ipv6_dst = -nl_attr_get_in6_addr(tun_attr); -} -break; -case OVS_TUNNEL_KEY_ATTR_TP_SRC: { -flower->set.tp_src = nl_attr_get_be16(tun_attr); -} -break; -case OVS_TUNNEL_KEY_ATTR_TP_DST: { -flower->set.tp_dst = nl_attr_get_be16(tun_attr); -} -break; -} -} -} else { +const struct nlattr *tunnel; +const struct nlattr *tun_attr; +size_t tun_left, tunnel_len; + +if (nl_attr_type(set) != OVS_KEY_ATTR_TUNNEL) { return parse_put_flow_set_masked_action(flower, set, set_len, false); +} + +tunnel = nl_attr_get(set); +tunnel_len = 
nl_attr_get_size(set); + +flower->set.set = true; +NL_ATTR_FOR_EACH_UNSAFE(tun_attr, tun_left, tunnel, tunnel_len) { +switch (nl_attr_type(tun_attr)) { +case OVS_TUNNEL_KEY_ATTR_ID: { +flower->set.id = nl_attr_get_be64(tun_attr); +} +break; +case OVS_TUNNEL_KEY_ATTR_IPV4_SRC: { +flower->set.ipv4.ipv4_src = nl_attr_get_be32(tun_attr); +} +break; +case OVS_TUNNEL_KEY_ATTR_IPV4_DST: { +flower->set.ipv4.ipv4_dst = nl_attr_get_be32(tun_attr); +} +break; +case OVS_TUNNEL_KEY_ATTR_IPV6_SRC: { +flower->set.ipv6.ipv6_src = +nl_attr_get_in6_addr(tun_attr); +} +break; +case OVS_TUNNEL_KEY_ATTR_IPV6_DST: { +flower->set.ipv6.ipv6_dst = +nl_attr_get_in6_addr(tun_attr); +} +break; +case OVS_TUNNEL_KEY_ATTR_TP_SRC: { +flower->set.tp_src = nl_attr_get_be16(tun_attr); +} +break; +case OVS_TUNNEL_KEY_ATTR_TP_DST: { +flower->set.tp_dst = nl_attr_get_be16(tun_attr); +} +break; } } + return 0; } -- 2.7.5 ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
[ovs-dev] [PATCH 6/7] tc: Fix wrong struct variable order
From: Paul Blakey

Fix the struct variable order to correspond with its usage. Signed-off-by: Paul Blakey Reviewed-by: Roi Dayan --- lib/tc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/tc.c b/lib/tc.c index 4f82623..9170e3e 100644 --- a/lib/tc.c +++ b/lib/tc.c @@ -62,8 +62,8 @@ struct tc_pedit_key_ex { struct flower_key_to_pedit { enum pedit_header_type htype; -int flower_offset; int offset; +int flower_offset; int size; }; -- 2.7.5 ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
[ovs-dev] [PATCH 2/7] netdev-tc-offloads: Fix accidental skipping of extended pedit keys
From: Paul Blakey

We only support extended pedit keys for now, so it's the type we expect. Skip the legacy ones instead. Signed-off-by: Paul Blakey Reviewed-by: Roi Dayan --- lib/tc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/tc.c b/lib/tc.c index 627de8a..401 100644 --- a/lib/tc.c +++ b/lib/tc.c @@ -484,7 +484,7 @@ nl_parse_act_pedit(struct nlattr *options, struct tc_flower *flower) break; } -if (nl_attr_type(nla) == TCA_PEDIT_KEY_EX) { +if (nl_attr_type(nla) != TCA_PEDIT_KEY_EX) { VLOG_ERR_RL(_rl, "unable to parse legacy pedit type: %d", nl_attr_type(nla)); return EOPNOTSUPP; } -- 2.7.5 ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
[ovs-dev] [PATCH 7/7] tc: Send csum action only if we need to update csum
From: Paul Blakey

Currently we send the tc csum action even if it's not needed. Fix that by sending it only if csum update flags isn't zero. Signed-off-by: Paul Blakey Reviewed-by: Roi Dayan --- lib/tc.c | 8 +--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/lib/tc.c b/lib/tc.c index 9170e3e..d3a0312 100644 --- a/lib/tc.c +++ b/lib/tc.c @@ -1371,9 +1371,11 @@ nl_msg_put_flower_acts(struct ofpbuf *request, struct tc_flower *flower) } nl_msg_end_nested(request, act_offset); -act_offset = nl_msg_start_nested(request, act_index++); -nl_msg_put_act_csum(request, flower->csum_update_flags); -nl_msg_end_nested(request, act_offset); +if (flower->csum_update_flags) { +act_offset = nl_msg_start_nested(request, act_index++); +nl_msg_put_act_csum(request, flower->csum_update_flags); +nl_msg_end_nested(request, act_offset); +} } if (flower->set.set) { act_offset = nl_msg_start_nested(request, act_index++); -- 2.7.5 ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
[ovs-dev] [PATCH 4/7] netdev-tc-offloads: Remove redundant brackets
From: Paul Blakey

Signed-off-by: Paul Blakey Reviewed-by: Roi Dayan --- lib/tc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/tc.c b/lib/tc.c index 360a770..4f82623 100644 --- a/lib/tc.c +++ b/lib/tc.c @@ -1059,7 +1059,7 @@ static void nl_msg_put_act_pedit(struct ofpbuf *request, struct tc_pedit *parm, struct tc_pedit_key_ex *ex) { -size_t ksize = sizeof *parm + (parm->nkeys * sizeof(struct tc_pedit_key)); +size_t ksize = sizeof *parm + parm->nkeys * sizeof(struct tc_pedit_key); size_t offset, offset_keys_ex, offset_key; int i; -- 2.7.5 ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
[ovs-dev] [PATCH 3/7] netdev-tc-offloads: Verify csum flags on dump from tc
From: Paul BlakeyOn dump, parse and verify the tc csum action update flags in the same way as we put them. Signed-off-by: Paul Blakey Reviewed-by: Roi Dayan --- lib/tc.c | 117 ++- lib/tc.h | 2 ++ 2 files changed, 103 insertions(+), 16 deletions(-) diff --git a/lib/tc.c b/lib/tc.c index 401..360a770 100644 --- a/lib/tc.c +++ b/lib/tc.c @@ -131,6 +131,10 @@ static struct flower_key_to_pedit flower_pedit_map[] = { }, }; +static inline int +csum_update_flag(struct tc_flower *flower, + enum pedit_header_type htype); + struct tcmsg * tc_make_request(int ifindex, int type, unsigned int flags, struct ofpbuf *request) @@ -465,7 +469,7 @@ nl_parse_act_pedit(struct nlattr *options, struct tc_flower *flower) char *rewrite_key = (void *) >rewrite.key; char *rewrite_mask = (void *) >rewrite.mask; size_t keys_ex_size, left; -int type, i = 0; +int type, i = 0, err; if (!nl_parse_nested(options, pedit_policy, pe_attrs, ARRAY_SIZE(pedit_policy))) { @@ -493,6 +497,11 @@ nl_parse_act_pedit(struct nlattr *options, struct tc_flower *flower) ex_type = nl_attr_find_nested(nla, TCA_PEDIT_KEY_EX_HTYPE); type = nl_attr_get_u16(ex_type); +err = csum_update_flag(flower, type); +if (err) { +return err; +} + for (int j = 0; j < ARRAY_SIZE(flower_pedit_map); j++) { struct flower_key_to_pedit *m = _pedit_map[j]; int flower_off = m->flower_offset; @@ -736,6 +745,46 @@ nl_parse_act_vlan(struct nlattr *options, struct tc_flower *flower) return 0; } +static const struct nl_policy csum_policy[] = { +[TCA_CSUM_PARMS] = { .type = NL_A_UNSPEC, + .min_len = sizeof(struct tc_csum), + .optional = false, }, +}; + +static int +nl_parse_act_csum(struct nlattr *options, struct tc_flower *flower) +{ +struct nlattr *csum_attrs[ARRAY_SIZE(csum_policy)]; +const struct tc_csum *c; +const struct nlattr *csum_parms; + +if (!nl_parse_nested(options, csum_policy, csum_attrs, + ARRAY_SIZE(csum_policy))) { +VLOG_ERR_RL(_rl, "failed to parse csum action options"); +return EPROTO; +} + +csum_parms = 
csum_attrs[TCA_CSUM_PARMS]; +c = nl_attr_get_unspec(csum_parms, sizeof *c); + +/* sanity checks */ +if (c->update_flags != flower->csum_update_flags) { +VLOG_WARN_RL(_rl, + "expected different act csum flags: 0x%x != 0x%x", + flower->csum_update_flags, c->update_flags); +return EINVAL; +} +flower->csum_update_flags = 0; /* so we know csum was handled */ + +if (flower->needs_full_ip_proto_mask +&& flower->mask.ip_proto != UINT8_MAX) { +VLOG_WARN_RL(_rl, "expected full matching on flower ip_proto"); +return EINVAL; +} + +return 0; +} + static const struct nl_policy act_policy[] = { [TCA_ACT_KIND] = { .type = NL_A_STRING, .optional = false, }, [TCA_ACT_COOKIE] = { .type = NL_A_UNSPEC, .optional = true, }, @@ -782,8 +831,7 @@ nl_parse_single_action(struct nlattr *action, struct tc_flower *flower) } else if (!strcmp(act_kind, "pedit")) { nl_parse_act_pedit(act_options, flower); } else if (!strcmp(act_kind, "csum")) { -/* not doing anything for now, ovs has an implicit csum recalculation - * with rewriting of packet headers (translating of pedit acts). */ +nl_parse_act_csum(act_options, flower); } else { VLOG_ERR_RL(_rl, "unknown tc action kind: %s", act_kind); return EINVAL; @@ -840,6 +888,13 @@ nl_parse_flower_actions(struct nlattr **attrs, struct tc_flower *flower) } } +if (flower->csum_update_flags) { +VLOG_WARN_RL(_rl, + "expected act csum with flags: 0x%x", + flower->csum_update_flags); +return EINVAL; +} + return 0; } @@ -1181,28 +1236,49 @@ calc_offsets(struct tc_flower *flower, struct flower_key_to_pedit *m, *mask = (void *) (rewrite_mask + m->flower_offset - diff); } -static inline void +static inline int csum_update_flag(struct tc_flower *flower, enum pedit_header_type htype) { -if (htype == TCA_PEDIT_KEY_EX_HDR_TYPE_IP4) { +/* Explictily specifiy the csum flags so HW can return EOPNOTSUPP + * if it doesn't support a checksum recalculation of some headers. 
+ * And since OVS allows a flow such as + * eth(dst=),eth_type(0x0800) actions=set(ipv4(src=)) + * we need to force a more specific flow as this can, for example, + * need a recalculation of icmp checksum if the packet that passes + * is icmp and tcp checksum if its tcp. */ + +switch (htype) { +case TCA_PEDIT_KEY_EX_HDR_TYPE_IP4:
[ovs-dev] [PATCH 1/7] netdev-tc-offloads: Fix travis compilation error
From: Paul Blakey

Travis complains about variable length array usage, use a fixed size instead. Signed-off-by: Paul Blakey Reviewed-by: Roi Dayan --- lib/netdev-tc-offloads.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/netdev-tc-offloads.c b/lib/netdev-tc-offloads.c index 781d849..6ad2551 100644 --- a/lib/netdev-tc-offloads.c +++ b/lib/netdev-tc-offloads.c @@ -581,13 +581,15 @@ parse_put_flow_set_masked_action(struct tc_flower *flower, bool hasmask) { static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20); -char *set_buff[set_len], *set_data, *set_mask; +char *set_buff[128], *set_data, *set_mask; char *key = (char *) &flower->rewrite.key; char *mask = (char *) &flower->rewrite.mask; const struct nlattr *attr; int i, j, type; size_t size; +ovs_assert(set_len <= 128); + /* copy so we can set attr mask to 0 for used ovs key struct members */ memcpy(set_buff, set, set_len); attr = (struct nlattr *) set_buff; -- 2.7.5 ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
[ovs-dev] [PATCH 0/7] Fixes for header rewrite feature
Hi, The following series fixes some issues with header rewrite and some small refactoring. Thanks, Roi Paul Blakey (7): netdev-tc-offloads: Fix travis compilation error netdev-tc-offloads: Fix accidental skipping of extended pedit keys netdev-tc-offloads: Verify csum flags on dump from tc netdev-tc-offloads: Remove redundant brackets netdev-tc-offloads: Remove redundant loop handling ovs action set tc: Fix wrong struct variable order tc: Send csum action only if we need to update csum lib/netdev-tc-offloads.c | 94 +- lib/tc.c | 131 +++ lib/tc.h | 2 + 3 files changed, 158 insertions(+), 69 deletions(-) -- 2.7.5 ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
Re: [ovs-dev] [PATCH] dpif-netdev: Refactor datapath flow cache
Hi Jan, Thanks, that's a really interesting patch. Currently does not apply to head of master - what rev can I apply it to? Some more below - including one way down in the code. Thanks, /Billy. > -Original Message- > From: ovs-dev-boun...@openvswitch.org [mailto:ovs-dev- > boun...@openvswitch.org] On Behalf Of Jan Scheurich > Sent: Monday, November 20, 2017 5:33 PM > To: d...@openvswitch.org > Subject: [ovs-dev] [PATCH] dpif-netdev: Refactor datapath flow cache > > So far the netdev datapath uses an 8K EMC to speed up the lookup of frequently > used flows by comparing the parsed packet headers against the miniflow of a > cached flow, using 13 bits of the packet RSS hash as index. The EMC is too > small > for many applications with 100K or more parallel packet flows so that EMC > threshing actually degrades performance. > Furthermore, the size of struct miniflow and the flow copying cost prevents us > from making it much larger. > > At the same time the lookup cost of the megaflow classifier (DPCLS) is > increasing > as the number of frequently hit subtables grows with the complexity of > pipeline > and the number of recirculations. > > To close the performance gap for many parallel flows, this patch introduces > the > datapath flow cache (DFC) with 1M entries as lookup stage between EMC and > DPCLS. It directly maps 20 bits of the RSS hash to a pointer to the last hit > megaflow entry and performs a masked comparison of the packet flow with the > megaflow key to confirm the hit. This avoids the costly DPCLS lookup even for > very large number of parallel flows with a small memory overhead. > > Due the large size of the DFC and the low risk of DFC thrashing, any DPCLS hit > immediately inserts an entry in the DFC so that subsequent packets get speeded > up. The DFC, thus, accelerate also short-lived flows. > > To further accelerate the lookup of few elephant flows, every DFC hit > triggers a > probabilistic EMC insertion of the flow. 
As the DFC entry is already in place > the > default EMC insertion probability can be reduced to > 1/1000 to minimize EMC thrashing should there still be many fat flows. > The inverse EMC insertion probability remains configurable. > > The EMC implementation is simplified by removing the possibility to store a > flow > in two slots, as there is no particular reason why two flows should > systematically > collide (the RSS hash is not symmetric). > The maximum size of the EMC flow key is limited to 256 bytes to reduce the > memory footprint. This should be sufficient to hold most real life packet flow > keys. Larger flows are not installed in the EMC. [[BO'M]] Does miniflow_extract work ok with the reduced miniflow size? Miniflow comment says: Caller is responsible for initializing 'dst' with enough storage for FLOW_U64S * 8 bytes. > > The pmd-stats-show command is enhanced to show both EMC and DFC hits > separately. > > The sweep speed for cleaning up obsolete EMC and DFC flow entries and freeing > dead megaflow entries is increased. With a typical PMD cycle duration of 100us > under load and checking one DFC entry per cycle, the DFC sweep should > normally complete within in 100s. > > In PVP performance tests with an L3 pipeline over VXLAN we determined the > optimal EMC size to be 16K entries to obtain a uniform speedup compared to > the master branch over the full range of parallel flows. The measurement below > is for 64 byte packets and the average number of subtable lookups per DPCLS > hit > in this pipeline is 1.0, i.e. the acceleration already starts for a single > busy mask. > Tests with many visited subtables should show a strong increase of the gain > through DFC. > > Flows master DFC+EMC Gain > [Mpps] [Mpps] > -- > 8 4.454.62 3.8% > 100 4.174.47 7.2% > 10003.884.3412.0% > 20003.544.1717.8% > 50003.013.8227.0% > 1 2.753.6331.9% > 2 2.643.5032.8% > 5 2.603.3328.1% > 10 2.593.2324.7% > 50 2.593.1621.9% [[BO'M]] What is the flow distribution here? 
Are there other flow distributions that we want to ensure do not suffer a possible regression. I'm not sure what they are exactly - I have some ideas but admittedly I've only every tested with either a uniform flow distribution or else a round-robin distribution. > > > Signed-off-by: Jan Scheurich> --- > lib/dpif-netdev.c | 349 --- > --- > 1 file changed, 235 insertions(+), 114 deletions(-) > > diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index db78318..efcf2e9 > 100644 > --- a/lib/dpif-netdev.c > +++ b/lib/dpif-netdev.c > @@ -127,19 +127,19 @@ struct netdev_flow_key { > uint64_t buf[FLOW_MAX_PACKET_U64S]; }; > > -/* Exact match cache for frequently used flows > +/* Datapath flow cache (DFC) for frequently used flows >
Re: [ovs-dev] [PATCH v2] odp-execute: Skip processing actions when batch is emptied
Do anyone sees any issue with this change ? Though it does not address any existing bug, but will make the code less error prone. Regards, Vishal -Original Message- From: ovs-dev-boun...@openvswitch.org [mailto:ovs-dev-boun...@openvswitch.org] On Behalf Of Vishal Deep Ajmera Sent: Thursday, November 16, 2017 11:32 AM To: d...@openvswitch.org Subject: [ovs-dev] [PATCH v2] odp-execute: Skip processing actions when batch is emptied Today in OVS, when errors are encountered during the execution of an action the entire batch of packets may be deleted (for e.g. in processing push_tnl_action, if the port is not found in the port_cache of PMD). The remaining actions continue to be executed even though there are no packets to be processed. It is assumed that the code dealing with each action checks that the batch is not empty before executing. Crashes may occur if the assumption is not met. The patch makes OVS skip processing of further actions from the action-set once a batch is emptied. Doing so centralizes the check in one place and avoids the possibility of crashes. Signed-off-by: Vishal Deep Ajmera--- lib/odp-execute.c | 9 ++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/lib/odp-execute.c b/lib/odp-execute.c index 3011479..887246d 100644 --- a/lib/odp-execute.c +++ b/lib/odp-execute.c @@ -686,9 +686,12 @@ odp_execute_actions(void *dp, struct dp_packet_batch *batch, bool steal, dp_execute_action(dp, batch, a, may_steal); -if (last_action) { -/* We do not need to free the packets. dp_execute_actions() - * has stolen them */ +if (last_action || (batch->count == 0)) { +/* We do not need to free the packets. + * Either dp_execute_actions() has stolen them + * or the batch is freed due to errors. In either + * case we do not need to execute further actions. 
+ */ return; } } -- 1.9.1 ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
Re: [ovs-dev] [PATCH V2 3/4] tc: Add header rewrite using tc pedit action
On Tue, Nov 21, 2017 at 07:50:24AM +0200, Roi Dayan wrote: > > > On 20/11/2017 14:30, Simon Horman wrote: > > On Sun, Nov 19, 2017 at 08:45:19AM +0200, Roi Dayan wrote: > > > > > > > > > On 16/11/2017 18:29, Simon Horman wrote: > > > > On Wed, Oct 25, 2017 at 02:24:15PM +0300, Roi Dayan wrote: > > > > > > > > > > > > > > > On 27/09/2017 12:08, Simon Horman wrote: > > > > > > On Mon, Sep 25, 2017 at 04:31:42PM +0300, Paul Blakey wrote: > > > > > > > > > > > > > > > > > > > > > On 18/09/2017 18:01, Simon Horman wrote: > > > > > > > > On Mon, Sep 18, 2017 at 07:16:03AM +0300, Roi Dayan wrote: > > > > > > > > > From: Paul Blakey> > > > > > > > > > > > > > > > > > To be later used to implement ovs action set offloading. > > > > > > > > > > > > > > > > > > Signed-off-by: Paul Blakey > > > > > > > > > Reviewed-by: Roi Dayan > > > > > > > > > --- > > > > > > > > > lib/tc.c | 372 > > > > > > > > > ++- > > > > > > > > > lib/tc.h | 16 +++ > > > > > > > > > 2 files changed, 385 insertions(+), 3 deletions(-) > > > > > > > > > > > > > > > > > > diff --git a/lib/tc.c b/lib/tc.c > > > > > > > > > index c9cada2..743b2ee 100644 > > > > > > > > > --- a/lib/tc.c > > > > > > > > > +++ b/lib/tc.c > > > > > > > > > @@ -21,8 +21,10 @@ > > > > > > > > > #include > > > > > > > > > #include > > > > > > > > > #include > > > > > > > > > +#include > > > > > > > > > #include > > > > > > > > > #include > > > > > > > > > +#include > > > > > > > > > #include > > > > > > > > > #include > > > > > > > > > #include > > > > > > > > > @@ -33,11 +35,14 @@ > > > > > > > > > #include "netlink-socket.h" > > > > > > > > > #include "netlink.h" > > > > > > > > > #include "openvswitch/ofpbuf.h" > > > > > > > > > +#include "openvswitch/util.h" > > > > > > > > > #include "openvswitch/vlog.h" > > > > > > > > > #include "packets.h" > > > > > > > > > #include "timeval.h" > > > > > > > > > #include "unaligned.h" > > > > > > > > > +#define MAX_PEDIT_OFFSETS 8 > > > > > > > > > > > > > > > > Why 8? 
> > > > > > > We don't expect anything more right now (ipv6 src/dst rewrite > > > > > > > requires 8 > > > > > > > pedits iirc). I can't think of a larger use case, maybe ipv6 + > > > > > > > macs if > > > > > > > that's makes sens. do you suggest we increase it? to what? > > > > > > > > > > > > It seems strange to me to place a somewhat arbitrary small limit > > > > > > when none exists in the pedit API being used. I would at prefer if > > > > > > it was at least a bigger, say 16 or 32. > > > > > > > > > > > > > > Hi Simon, > > > > > > > > > > Sorry for the late reply due to holidays and vacations. > > > > > Me & Paul going to go over this and do the fixes needed and > > > > > also rebase over latest master and run tests again. > > > > > > > > > > I'll answer what I'm more familiar with now and Paul will continue. > > > > > The 8 here is too low and you right. We used this definition > > > > > for allocation of the pedit keys on the stack in > > > > > nl_msg_put_flower_rewrite_pedits() > > > > > > > > > > It was for convenience instead of calculating the maximum possible > > > > > keys that could exists and allocating it there and freeing it at > > > > > the end. > > > > > > > > > > Increasing it to 32 is probably more than enough and wont waste much. > > > > > > > > I updated the value to 32 when applying the patch. > > > > > > > > ... > > > > > > > > > > > > If I understand the above correctly it is designed to make > > > > > > > > pedit actions disjoint. If so, why is that necessary? > > > > > > > > > > > > > > > It's not, as a single flower key rewrite can be split to multiple > > > > > > > pedit > > > > > > > actions it finds the overlap between a flower key and a pedit > > > > > > > action, if > > > > > > > they do overlap it translates it to the correct offset and masks > > > > > > > it out. > > > > > > > > > > > > Thanks, understood. 
> > > > > > > > > > > > > > > > > > > > > > +} else { > > > > > > > > > +VLOG_ERR_RL(_rl, "unable to parse legacy > > > > > > > > > pedit type: %d", > > > > > > > > > +nl_attr_type(nla)); > > > > > > > > > +return EOPNOTSUPP; > > > > > > > > > +} > > > > > > > > > > > > > > > > I think the code could exit early here as > > > > > > > > nl_msg_put_flower_rewrite_pedits() does below. > > > > > > > > > > > > > > > > > > > > > > Sorry, didn't understand. can you give an example? > > > > > > > > > > > > > > > > > > > I meant something like this. > > > > > > > > > > > >if (nl_attr_type(nla) != TCA_PEDIT_KEY_EX) { > > > > > >VLOG_ERR_RL(...); > > > > > >return EOPNOTSUPP; > > > > > >} > > > > > > > > > > understood. we'll do that. thanks. > > > > > > > > I
Re: [ovs-dev] [PATCH v2 0/3]: dpif-netdev: Detailed PMD performance metrics and supervision
I submitted a rebased version of this series yesterday. Please look at the new v3 series instead. /Jan > -Original Message- > From: Jan Scheurich > Sent: Wednesday, 18 October, 2017 12:46 > To: 'ovs-dev@openvswitch.org'> Cc: Jan Scheurich > Subject: [PATCH v2 0/3]: dpif-netdev: Detailed PMD performance metrics and > supervision > > Friendly reminder to test and/or review this patch series. > > The run-time performance of PMDs is often difficult to understand and > trouble-shoot. The existing PMD statistics counters only provide a > coarse grained average picture. At packet rates of several Mpps sporadic > drops of packet bursts happen at sub-millisecond time scales > and are impossible to capture and analyze with existing tools. > > This patch set refactors the existing PMD statistics into a dedicated > submodule and collects a large number of important PMD > performance metrics per PMD iteration, maintaining histograms and circular > histories for iteration metrics and millisecond averages. To > capture sporadic drop events, the patch set can be configured to monitor > iterations for suspicious metrics and to log the neighborhood of > such iterations for off-line analysis. > > The extra cost for the performance metric collection and the supervision has > been measured to be in the order of 1% compared to the > base commit in a phy-to-phy setup with VXLAN tunnels (two datapath passes per > packet). We believe this is fast enough to not warrant a > build- or run-time configuration option to disable this. > > The first patch in the series fully includes the changes proposed in Darrel's > earlier "[patch_v5 0/3] dpif-netdev: Fix and refactor pmd stats" > (https://mail.openvswitch.org/pipermail/ovs-dev/2017-August/337686.html). 
> > Jan Scheurich (3) > Patch 1/3: Refactor PMD performance into dpif-netdev-perf > Patch 2/3: Detailed performance stats for PMDs > Patch 3/3: Detection and logging of suspicious PMD iterations > > v1 -> v2: > Rebased to OVS master (commit 7468ec788) > No other changes compared to v1 > > lib/automake.mk| 2 + > lib/dp-packet.h| 2 + > lib/dpif-netdev-perf.c | 512 > + > lib/dpif-netdev-perf.h | 317 ++ > lib/dpif-netdev.c | 438 +++--- > lib/netdev-dpdk.c | 28 ++- > lib/netdev-dpdk.h | 14 ++ > ofproto/ofproto-dpif.c | 3 +- > tests/pmd.at | 22 ++- > 9 files changed, 1132 insertions(+), 206 deletions(-) ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev