Re: [ovs-dev] ovn master test failures
On Wed, May 12, 2021 at 09:21:32PM -0400, Ihar Hrachyshka wrote: > Hi Numan, > > both 3187b9fef124e038e474270a2728fe94bdca8eef (ovn-northd: introduce > new allow-stateless ACL verb) and > 127bf166ccf4a2509f670c48a00b0340039f20d2 (northd: Support flow > offloading for logical switches with no ACLs.) got merged in upstream > master, and this combination broke the following tests: > > 774: ovn -- ACL allow-stateless omit conntrack - Port_Group -- > ovn-northd-ddlog -- dp-groups=yes FAILED (ovn-northd.at:2752) > 775: ovn -- ACL allow-stateless omit conntrack - Port_Group -- > ovn-northd-ddlog FAILED (ovn-northd.at:2752) > > while the other scenarios are passing: > > 768: ovn -- ACL allow-stateless omit conntrack - Logical_Switch -- > ovn-northd -- dp-groups=yes ok > 769: ovn -- ACL allow-stateless omit conntrack - Logical_Switch -- ovn-northd > ok > 770: ovn -- ACL allow-stateless omit conntrack - Logical_Switch -- > ovn-northd-ddlog -- dp-groups=yes ok > 771: ovn -- ACL allow-stateless omit conntrack - Logical_Switch -- > ovn-northd-ddlog ok > 772: ovn -- ACL allow-stateless omit conntrack - Port_Group -- > ovn-northd -- dp-groups=yes ok > 773: ovn -- ACL allow-stateless omit conntrack - Port_Group -- ovn-northd ok This turned to out to be an all-afternoon-and-evening saga, but I got it fixed and documented: https://mail.openvswitch.org/pipermail/ovs-dev/2021-May/382993.html https://github.com/vmware/differential-datalog/pull/977 ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
[ovs-dev] [PATCH ovn] ovn-northd-ddlog: Fix weight overflows.
This fixes test failures in "ovn -- ACL allow-stateless omit conntrack - Port_Group -- ovn-northd-ddlog" with and without "dp-groups=yes". See https://github.com/vmware/differential-datalog/pull/977 for the full story. Signed-off-by: Ben Pfaff Reported-by: Ihar Hrachyshka Reported-at: https://mail.openvswitch.org/pipermail/ovs-dev/2021-May/382967.html Suggested-by: Leonid Ryhzyk Suggested-by: Mihai Budiu --- northd/lswitch.dl | 12 ++-- 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/northd/lswitch.dl b/northd/lswitch.dl index 8b1f35ac4378..8fbb313b9666 100644 --- a/northd/lswitch.dl +++ b/northd/lswitch.dl @@ -70,7 +70,7 @@ LogicalSwitchPortWithUnknownAddress(ls_uuid, lsp_uuid) :- lsp in nb::Logical_Switch_Port(._uuid = lsp_uuid), lsp.is_enabled() and lsp.addresses.contains("unknown"). -relation LogicalSwitchHasUnknownPorts(ls: uuid, has_unknown: bool) +output relation LogicalSwitchHasUnknownPorts(ls: uuid, has_unknown: bool) LogicalSwitchHasUnknownPorts(ls, true) :- LogicalSwitchPortWithUnknownAddress(ls, _). LogicalSwitchHasUnknownPorts(ls, false) :- nb::Logical_Switch(._uuid = ls), @@ -116,7 +116,7 @@ LogicalSwitchStatefulACL(ls, acl) :- LogicalSwitchACL(ls, acl), nb::ACL(._uuid = acl, .action = "allow-related"). -relation LogicalSwitchHasStatefulACL(ls: uuid, has_stateful_acl: bool) +output relation LogicalSwitchHasStatefulACL(ls: uuid, has_stateful_acl: bool) LogicalSwitchHasStatefulACL(ls, true) :- LogicalSwitchStatefulACL(ls, _). @@ -125,7 +125,7 @@ LogicalSwitchHasStatefulACL(ls, false) :- nb::Logical_Switch(._uuid = ls), not LogicalSwitchStatefulACL(ls, _). -relation LogicalSwitchHasACLs(ls: uuid, has_acls: bool) +output relation LogicalSwitchHasACLs(ls: uuid, has_acls: bool) LogicalSwitchHasACLs(ls, true) :- LogicalSwitchACL(ls, _). @@ -170,7 +170,7 @@ LogicalSwitchWithDNSRecords(ls) :- nb::DNS(._uuid = dns_uuid, .records = records), not records.is_empty(). -relation LogicalSwitchHasDNSRecords(ls: uuid, has_dns_records: bool) +output relation LogicalSwitchHasDNSRecords(ls: uuid, has_dns_records: bool) LogicalSwitchHasDNSRecords(ls, true) :- LogicalSwitchWithDNSRecords(ls). @@ -186,7 +186,7 @@ LogicalSwitchHasNonRouterPort0(ls_uuid) :- lsp in nb::Logical_Switch_Port(._uuid = lsp_uuid), lsp.__type != "router". -relation LogicalSwitchHasNonRouterPort(ls: uuid, has_non_router_port: bool) +output relation LogicalSwitchHasNonRouterPort(ls: uuid, has_non_router_port: bool) LogicalSwitchHasNonRouterPort(ls, true) :- LogicalSwitchHasNonRouterPort0(ls). LogicalSwitchHasNonRouterPort(ls, false) :- @@ -285,7 +285,7 @@ SwitchLBVIP(sw_uuid, lb, vip, backends) :- var kv = FlatMap(vips), (var vip, var backends) = kv. -relation LogicalSwitchHasLBVIP(sw_uuid: uuid, has_lb_vip: bool) +output relation LogicalSwitchHasLBVIP(sw_uuid: uuid, has_lb_vip: bool) LogicalSwitchHasLBVIP(sw_uuid, true) :- SwitchLBVIP(.sw_uuid = sw_uuid). LogicalSwitchHasLBVIP(sw_uuid, false) :- -- 2.31.1 ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
Re: [ovs-dev] [PATCH net v2] openvswitch: meter: fix race when getting now_ms.
Hello: This patch was applied to netdev/net.git (refs/heads/master): On Thu, 13 May 2021 21:08:00 +0800 you wrote: > We have observed meters working unexpected if traffic is 3+Gbit/s > with multiple connections. > > now_ms is not pretected by meter->lock, we may get a negative > long_delta_ms when another cpu updated meter->used, then: > delta_ms = (u32)long_delta_ms; > which will be a large value. > > [...] Here is the summary with links: - [ovs-dev,net,v2] openvswitch: meter: fix race when getting now_ms. https://git.kernel.org/netdev/net/c/e4df1b0c2435 You are awesome, thank you! -- Deet-doot-dot, I am a bot. https://korg.docs.kernel.org/patchwork/pwbot.html ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
[ovs-dev] [PATCH ovn 5/5] ovn-nbctl: Support ssl cert rotation for daemon mode.
Update SSL in the server_loop so that updated pki files can be reapplied. Signed-off-by: Han Zhou --- utilities/ovn-nbctl.c | 30 +- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/utilities/ovn-nbctl.c b/utilities/ovn-nbctl.c index 48fd0b7ee..290b4d30d 100644 --- a/utilities/ovn-nbctl.c +++ b/utilities/ovn-nbctl.c @@ -57,6 +57,11 @@ static bool oneline; /* --dry-run: Do not commit any changes. */ static bool dry_run; +/* SSL options */ +static const char *ssl_private_key_file; +static const char *ssl_certificate_file; +static const char *ssl_ca_cert_file; + /* --wait=TYPE: Wait for configuration change to take effect? */ enum nbctl_wait_type { NBCTL_WAIT_NONE,/* Do not wait. */ @@ -549,6 +554,16 @@ add_local_option(const char *name, const char *arg, return NULL; } +static void +update_ssl_config(void) +{ +if (!ssl_private_key_file || !ssl_certificate_file || !ssl_ca_cert_file) { +return; +} +stream_ssl_set_key_and_cert(ssl_private_key_file, ssl_certificate_file); +stream_ssl_set_ca_cert_file(ssl_ca_cert_file, false); +} + static void apply_options_direct(const struct ovs_cmdl_parsed_option *parsed_options, size_t n, struct shash *local_options) @@ -621,7 +636,18 @@ apply_options_direct(const struct ovs_cmdl_parsed_option *parsed_options, OVN_DAEMON_OPTION_HANDLERS VLOG_OPTION_HANDLERS TABLE_OPTION_HANDLERS(_style) -STREAM_SSL_OPTION_HANDLERS + +case 'p': +ssl_private_key_file = optarg; +break; + +case 'c': +ssl_certificate_file = optarg; +break; + +case 'C': +ssl_ca_cert_file = optarg; +break; case OPT_BOOTSTRAP_CA_CERT: stream_ssl_set_ca_cert_file(po->arg, true); @@ -641,6 +667,7 @@ apply_options_direct(const struct ovs_cmdl_parsed_option *parsed_options, if (!db) { db = default_nb_db(); } +update_ssl_config(); } static void @@ -6956,6 +6983,7 @@ server_loop(struct ovsdb_idl *idl, int argc, char *argv[]) server_cmd_init(idl, ); for (;;) { +update_ssl_config(); memory_run(); if (memory_should_report()) { struct simap usage = SIMAP_INITIALIZER(); -- 2.30.2 ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
[ovs-dev] [PATCH ovn 3/5] ovn-northd-ddlog: Support ssl cert rotation.
Update SSL in the main loop so that updated pki files can be reapplied. Signed-off-by: Han Zhou --- northd/ovn-northd-ddlog.c | 29 - 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/northd/ovn-northd-ddlog.c b/northd/ovn-northd-ddlog.c index b7d2c8a5e..46f734c11 100644 --- a/northd/ovn-northd-ddlog.c +++ b/northd/ovn-northd-ddlog.c @@ -74,6 +74,11 @@ static const char *ovnnb_db; static const char *ovnsb_db; static const char *unixctl_path; +/* SSL options */ +static const char *ssl_private_key_file; +static const char *ssl_certificate_file; +static const char *ssl_ca_cert_file; + /* Frequently used table ids. */ static table_id WARNING_TABLE_ID; static table_id NB_CFG_TIMESTAMP_ID; @@ -1094,7 +1099,18 @@ parse_options(int argc OVS_UNUSED, char *argv[] OVS_UNUSED) switch (c) { OVN_DAEMON_OPTION_HANDLERS; VLOG_OPTION_HANDLERS; -STREAM_SSL_OPTION_HANDLERS; + +case 'p': +ssl_private_key_file = optarg; +break; + +case 'c': +ssl_certificate_file = optarg; +break; + +case 'C': +ssl_ca_cert_file = optarg; +break; case OPT_DDLOG_RECORD: record_file = optarg; @@ -1140,6 +1156,16 @@ parse_options(int argc OVS_UNUSED, char *argv[] OVS_UNUSED) free(short_options); } +static void +update_ssl_config(void) +{ +if (!ssl_private_key_file || !ssl_certificate_file || !ssl_ca_cert_file) { +return; +} +stream_ssl_set_key_and_cert(ssl_private_key_file, ssl_certificate_file); +stream_ssl_set_ca_cert_file(ssl_ca_cert_file, false); +} + int main(int argc, char *argv[]) { @@ -1219,6 +1245,7 @@ main(int argc, char *argv[]) /* Main loop. */ exiting = false; while (!exiting) { +update_ssl_config(); memory_run(); if (memory_should_report()) { struct simap usage = SIMAP_INITIALIZER(); -- 2.30.2 ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
[ovs-dev] [PATCH ovn 2/5] ovn-northd: Support ssl cert rotation.
Update SSL in the main loop so that updated pki files can be reapplied. Signed-off-by: Han Zhou --- northd/ovn-northd.c | 29 - 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/northd/ovn-northd.c b/northd/ovn-northd.c index f503ddd5e..4804093fd 100644 --- a/northd/ovn-northd.c +++ b/northd/ovn-northd.c @@ -107,6 +107,11 @@ static bool use_ct_inv_match = true; static int northd_probe_interval_nb = 0; static int northd_probe_interval_sb = 0; +/* SSL options */ +static const char *ssl_private_key_file; +static const char *ssl_certificate_file; +static const char *ssl_ca_cert_file; + #define MAX_OVN_TAGS 4096 /* Pipeline stages. */ @@ -13978,7 +13983,18 @@ parse_options(int argc OVS_UNUSED, char *argv[] OVS_UNUSED) switch (c) { OVN_DAEMON_OPTION_HANDLERS; VLOG_OPTION_HANDLERS; -STREAM_SSL_OPTION_HANDLERS; + +case 'p': +ssl_private_key_file = optarg; +break; + +case 'c': +ssl_certificate_file = optarg; +break; + +case 'C': +ssl_ca_cert_file = optarg; +break; case 'd': ovnsb_db = optarg; @@ -14028,6 +14044,16 @@ add_column_noalert(struct ovsdb_idl *idl, ovsdb_idl_omit_alert(idl, column); } +static void +update_ssl_config(void) +{ +if (!ssl_private_key_file || !ssl_certificate_file || !ssl_ca_cert_file) { +return; +} +stream_ssl_set_key_and_cert(ssl_private_key_file, ssl_certificate_file); +stream_ssl_set_ca_cert_file(ssl_ca_cert_file, false); +} + int main(int argc, char *argv[]) { @@ -14344,6 +14370,7 @@ main(int argc, char *argv[]) state.paused = false; while (!exiting) { +update_ssl_config(); memory_run(); if (memory_should_report()) { struct simap usage = SIMAP_INITIALIZER(); -- 2.30.2 ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
[ovs-dev] [PATCH ovn 4/5] ovn-ic: Support ssl cert rotation.
Update SSL in the main loop so that updated pki files can be reapplied. Signed-off-by: Han Zhou --- ic/ovn-ic.c | 29 - 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/ic/ovn-ic.c b/ic/ovn-ic.c index 18e37a31f..dec29fcfd 100644 --- a/ic/ovn-ic.c +++ b/ic/ovn-ic.c @@ -80,6 +80,11 @@ static const char *ovn_ic_nb_db; static const char *ovn_ic_sb_db; static const char *unixctl_path; +/* SSL options */ +static const char *ssl_private_key_file; +static const char *ssl_certificate_file; +static const char *ssl_ca_cert_file; + static void usage(void) @@ -1519,7 +1524,18 @@ parse_options(int argc OVS_UNUSED, char *argv[] OVS_UNUSED) switch (c) { OVN_DAEMON_OPTION_HANDLERS; VLOG_OPTION_HANDLERS; -STREAM_SSL_OPTION_HANDLERS; + +case 'p': +ssl_private_key_file = optarg; +break; + +case 'c': +ssl_certificate_file = optarg; +break; + +case 'C': +ssl_ca_cert_file = optarg; +break; case 'd': ovnsb_db = optarg; @@ -1585,6 +1601,16 @@ add_column_noalert(struct ovsdb_idl *idl, ovsdb_idl_omit_alert(idl, column); } +static void +update_ssl_config(void) +{ +if (!ssl_private_key_file || !ssl_certificate_file || !ssl_ca_cert_file) { +return; +} +stream_ssl_set_key_and_cert(ssl_private_key_file, ssl_certificate_file); +stream_ssl_set_ca_cert_file(ssl_ca_cert_file, false); +} + int main(int argc, char *argv[]) { @@ -1655,6 +1681,7 @@ main(int argc, char *argv[]) state.had_lock = false; state.paused = false; while (!exiting) { +update_ssl_config(); memory_run(); if (memory_should_report()) { struct simap usage = SIMAP_INITIALIZER(); -- 2.30.2 ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
[ovs-dev] [PATCH ovn 1/5] ovn-controller: Support ssl cert rotation when command line options are used.
When SSL configurations are set in Open_vSwitch SSL table, ovn-controller handles file update properly by re-applying the settings in the main loop. However, it is also valid to set the options in command line of ovn-controller without using the SSL table. In this case, the options are set onetime only and it never reapplies when the file content changes. This patch fixes this by allowing reapplying the command line options in the main loop, if they are set. SSL table settings still takes precedence if both exist. Signed-off-by: Han Zhou --- controller/ovn-controller.c | 24 +++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/controller/ovn-controller.c b/controller/ovn-controller.c index 67c51a86f..5a755276b 100644 --- a/controller/ovn-controller.c +++ b/controller/ovn-controller.c @@ -97,6 +97,11 @@ static unixctl_cb_func debug_delay_nb_cfg_report; static char *parse_options(int argc, char *argv[]); OVS_NO_RETURN static void usage(void); +/* SSL options */ +static const char *ssl_private_key_file; +static const char *ssl_certificate_file; +static const char *ssl_ca_cert_file; + /* By default don't set an upper bound for the lflow cache. */ #define DEFAULT_LFLOW_CACHE_MAX_ENTRIES UINT32_MAX #define DEFAULT_LFLOW_CACHE_MAX_MEM_KB (UINT64_MAX / 1024) @@ -441,6 +446,11 @@ update_ssl_config(const struct ovsrec_ssl_table *ssl_table) if (ssl) { stream_ssl_set_key_and_cert(ssl->private_key, ssl->certificate); stream_ssl_set_ca_cert_file(ssl->ca_cert, ssl->bootstrap_ca_cert); +} else if (ssl_private_key_file && ssl_certificate_file && + ssl_ca_cert_file) { +stream_ssl_set_key_and_cert(ssl_private_key_file, +ssl_certificate_file); +stream_ssl_set_ca_cert_file(ssl_ca_cert_file, false); } } @@ -3320,7 +3330,19 @@ parse_options(int argc, char *argv[]) VLOG_OPTION_HANDLERS OVN_DAEMON_OPTION_HANDLERS -STREAM_SSL_OPTION_HANDLERS + +case 'p': +ssl_private_key_file = optarg; +break; + +case 'c': +ssl_certificate_file = optarg; +break; + +case 'C': +ssl_ca_cert_file = optarg; +break; + case OPT_PEER_CA_CERT: stream_ssl_set_peer_ca_cert_file(optarg); -- 2.30.2 ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
[ovs-dev] [PATCH 2/2] stream-ssl.c: Fix the comment of stream_ssl_set_ca_cert_file.
Signed-off-by: Han Zhou --- lib/stream-ssl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/stream-ssl.c b/lib/stream-ssl.c index e67ccb4bd..6a515e465 100644 --- a/lib/stream-ssl.c +++ b/lib/stream-ssl.c @@ -1448,7 +1448,7 @@ stream_ssl_set_ca_cert_file__(const char *file_name, /* Sets 'file_name' as the name of the file from which to read the CA * certificate used to verify the peer within SSL connections. If 'bootstrap' - * is false, the file must exist. If 'bootstrap' is false, then the file is + * is false, the file must exist. If 'bootstrap' is true, then the file is * read if it is exists; if it does not, then it will be created from the CA * certificate received from the peer on the first SSL connection. */ void -- 2.30.2 ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
[ovs-dev] [PATCH 1/2] stream-ssl.c: Fix stream_ssl_set_key_and_cert.
>From the description of this interface, one of the problems it tries to solve is when one of the files is changed before the other: * But, if the private * key is changed before the certificate (e.g. someone "scp"s or "mv"s the new * private key in place before the certificate), then OpenSSL would reject that * change, and then the change of certificate would succeed, but there would be * no associated private key (because it had only changed once and therefore * there was no point in re-reading it). * This function avoids both problems by, whenever either the certificate or * the private key file changes, re-reading both of them ... However, in the implement it used "&&" instead of "||", and so it was in fact re-reading both of them only when both are changed. This patch fixes it by using "||". Reported-by: Girish Moodalbail Reported-at: https://mail.openvswitch.org/pipermail/ovs-discuss/2020-December/050859.html Signed-off-by: Han Zhou --- lib/stream-ssl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/stream-ssl.c b/lib/stream-ssl.c index 078fcbc3a..e67ccb4bd 100644 --- a/lib/stream-ssl.c +++ b/lib/stream-ssl.c @@ -1215,7 +1215,7 @@ stream_ssl_set_key_and_cert(const char *private_key_file, const char *certificate_file) { if (update_ssl_config(_key, private_key_file) -&& update_ssl_config(, certificate_file)) { +|| update_ssl_config(, certificate_file)) { stream_ssl_set_certificate_file__(certificate_file); stream_ssl_set_private_key_file__(private_key_file); } -- 2.30.2 ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
Re: [ovs-dev] [PATCH] flow: Consistent VXLAN UDP src ports for fragmented packets
On 1/12/21 10:50 AM, Parvathy Tarur Ramachandran via dev wrote: > Issue: > The src-port for UDP is based on RSS hash in the packet metadata. > In case of packets coming from VM it will be 5-tuple, if available, > otherwise just IP addresses. If the VM fragments a large IP packet > and sends the fragments to OVS, only the first fragment will contain > the L4 header. Therefore, the first fragment and subsequent fragments > get different UDP src ports in the outgoing VXLAN header. This can > lead to fragment re-ordering in the fabric as packet will take > different paths. > > Fix: > With this patch, we ignore the L4 header during hash calculation in > the case of fragmented packets. Hi. Sorry for late reply. I see the problem, but I'm not sure that this change will actually fix it, because if the first packet is fragmented and the second one is not fragmented (originally had smaller size for any reason), they will still be hashed differently even if they are from the same flow. The solution for this would be to completely disable L4 hashing, but that doesn't sound right. What do you think? Best regards, Ilya Maximets. > > Signed-off-by: Parvathy Tarur Ramachandran > > --- > lib/flow.c | 10 +- > 1 file changed, 9 insertions(+), 1 deletion(-) > > diff --git a/lib/flow.c b/lib/flow.c > index cc1b3f2..38bf377 100644 > --- a/lib/flow.c > +++ b/lib/flow.c > @@ -2178,7 +2178,7 @@ miniflow_hash_5tuple(const struct miniflow *flow, > uint32_t basis) > > if (flow) { > ovs_be16 dl_type = MINIFLOW_GET_BE16(flow, dl_type); > -uint8_t nw_proto; > +uint8_t nw_proto, nw_frag; > > if (dl_type == htons(ETH_TYPE_IPV6)) { > struct flowmap map = FLOWMAP_EMPTY_INITIALIZER; > @@ -2200,6 +2200,14 @@ miniflow_hash_5tuple(const struct miniflow *flow, > uint32_t basis) > > nw_proto = MINIFLOW_GET_U8(flow, nw_proto); > hash = hash_add(hash, nw_proto); > +/* Skip l4 header fields if IP packet is fragmented since > + * only first fragment will carry l4 header. > + */ > +nw_frag = MINIFLOW_GET_U8(flow, nw_frag); > +if (nw_frag) { > +goto out; > +} > + > if (nw_proto != IPPROTO_TCP && nw_proto != IPPROTO_UDP > && nw_proto != IPPROTO_SCTP && nw_proto != IPPROTO_ICMP > && nw_proto != IPPROTO_ICMPV6) { > ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
Re: [ovs-dev] [PATCH] ovsdb-tool: add election timer argument to create-cluster command
On Wed, May 12, 2021 at 08:40:43PM -0500, Dan Williams wrote: > On Tue, 2021-04-13 at 20:50 -0500, Dan Williams wrote: > > After creating the new cluster database write a raft entry that > > sets the desired election timer. This allows CMSes to set the > > election timer at cluster start and avoid an error-prone > > election timer modification process after the cluster is up. > > > > Reported-at: https://bugzilla.redhat.com/1831778 > > Anyone have thoughts on this or a better approach? > > Thanks, > Dan >From a UI point of view, I'd argue for a command-line option rather than an extra argument, because if we need one of these kinds of optional tweaks we'll probably need more at some point, and half a dozen optional arguments are unmanageable without names. >From a docs point of view, this needs documentation in the --help text, in the manpage, and probably a NEWS item too. I'd recommend that the documentation explain what the election timer is, what the default is, what range of values admins should consider, and how they can make a reasonable choice. (That could be just a sentence or two.) ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
Re: [ovs-dev] [PATCH v3] datapath: Add a new action dec_ttl
On 11/24/20 11:43 AM, Eelco Chaudron wrote: > Add support for the dec_ttl action. Instead of programming the datapath with > a flow that matches the packet TTL and an IP set, use a single dec_ttl action. > > The old behavior is kept if the new action is not supported by the datapath. > > # ovs-ofctl dump-flows br0 >cookie=0x0, duration=12.538s, table=0, n_packets=4, n_bytes=392, ip > actions=dec_ttl,NORMAL >cookie=0x0, duration=12.536s, table=0, n_packets=4, n_bytes=168, > actions=NORMAL > > # ping -c1 -t 20 192.168.0.2 > PING 192.168.0.2 (192.168.0.2) 56(84) bytes of data. > IP (tos 0x0, ttl 19, id 45336, offset 0, flags [DF], proto ICMP (1), length > 84) > 192.168.0.1 > 192.168.0.2: ICMP echo request, id 8865, seq 1, length 64 > > Linux netlink datapath support depends on upstream Linux commit: > 744676e77720 ("openvswitch: add TTL decrement action") > > > Note that in the Linux kernel tree the OVS_ACTION_ATTR_ADD_MPLS has been > defined, and to make sure the IDs are in sync, it had to be added to the > OVS source tree. This required some additional case statements, which > should be revisited once the OVS implementation is added. > > > Co-developed-by: Matteo Croce > Co-developed-by: Bindiya Kurle > Signed-off-by: Eelco Chaudron > > --- > v2: - Used definition instead of numeric value in format_dec_ttl_action() > - Changed format from "dec_ttl(ttl<=1()) to > "dec_ttl(le_1())" to be more in line with the check_pkt_len > action. > - Cleaned up format_dec_ttl_action() > v3: > - Fixed parsing of "dec_ttl()" action for adding a dp flow. > - Changed implementation to use the fixed kernel mod implementation > https://marc.info/?l=linux-netdev=160577671609295=2 > - Removed introduced force_last flag from odp_execute_actions > - For now, do not use this new attribute if HW offload is supported, as > it's causing a performance regression due to HW offload not being > supported. I will fix this in a separate patch. > - Added datapath test case for dec_ttl action. > > datapath/linux/compat/include/linux/openvswitch.h | 10 ++ > lib/dpif-netdev.c |2 > lib/dpif.c|2 > lib/odp-execute.c | 87 > + > lib/odp-util.c| 45 +++ > lib/packets.h | 13 +++ > ofproto/ofproto-dpif-ipfix.c |2 > ofproto/ofproto-dpif-sflow.c |2 > ofproto/ofproto-dpif-xlate.c | 60 -- > ofproto/ofproto-dpif.c| 40 ++ > ofproto/ofproto-dpif.h|6 + > tests/system-traffic.at | 28 +++ > 12 files changed, 282 insertions(+), 15 deletions(-) > > diff --git a/datapath/linux/compat/include/linux/openvswitch.h > b/datapath/linux/compat/include/linux/openvswitch.h > index 2d884312f..3016576fe 100644 > --- a/datapath/linux/compat/include/linux/openvswitch.h > +++ b/datapath/linux/compat/include/linux/openvswitch.h > @@ -1021,6 +1021,8 @@ enum ovs_action_attr { > OVS_ACTION_ATTR_METER,/* u32 meter number. */ > OVS_ACTION_ATTR_CLONE,/* Nested OVS_CLONE_ATTR_*. */ > OVS_ACTION_ATTR_CHECK_PKT_LEN, /* Nested OVS_CHECK_PKT_LEN_ATTR_*. */ > + OVS_ACTION_ATTR_ADD_MPLS, /* struct ovs_action_add_mpls. */ > + OVS_ACTION_ATTR_DEC_TTL, /* Nested OVS_DEC_TTL_ATTR_*. */ > > #ifndef __KERNEL__ > OVS_ACTION_ATTR_TUNNEL_PUSH, /* struct ovs_action_push_tnl*/ > @@ -1124,4 +1126,12 @@ struct ovs_zone_limit { > * keys. False otherwise. > */ > > +enum ovs_dec_ttl_attr { > + OVS_DEC_TTL_ATTR_UNSPEC, > + OVS_DEC_TTL_ATTR_ACTION,/* Nested struct nlattr */ > + __OVS_DEC_TTL_ATTR_MAX > + }; Here is an extra space before '}'. > + > +#define OVS_DEC_TTL_ATTR_MAX (__OVS_DEC_TTL_ATTR_MAX - 1) > + > #endif /* _LINUX_OPENVSWITCH_H */ > diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c > index 300861ca5..b6e313304 100644 > --- a/lib/dpif-netdev.c > +++ b/lib/dpif-netdev.c > @@ -7975,6 +7975,8 @@ dp_execute_cb(void *aux_, struct dp_packet_batch > *packets_, > case OVS_ACTION_ATTR_CT_CLEAR: > case OVS_ACTION_ATTR_CHECK_PKT_LEN: > case OVS_ACTION_ATTR_DROP: > +case OVS_ACTION_ATTR_DEC_TTL: > +case OVS_ACTION_ATTR_ADD_MPLS: > case __OVS_ACTION_ATTR_MAX: > OVS_NOT_REACHED(); > } > diff --git a/lib/dpif.c b/lib/dpif.c > index ac2860764..f87afd2f5 100644 > --- a/lib/dpif.c > +++ b/lib/dpif.c > @@ -1273,6 +1273,8 @@ dpif_execute_helper_cb(void *aux_, struct > dp_packet_batch *packets_, > case OVS_ACTION_ATTR_UNSPEC: > case OVS_ACTION_ATTR_CHECK_PKT_LEN: > case OVS_ACTION_ATTR_DROP:
Re: [ovs-dev] [RFC PATCH ovn] Introduce representor port plugging support
On Thu, May 13, 2021 at 5:12 PM Ilya Maximets wrote: > > On 5/9/21 4:03 PM, Frode Nordahl wrote: > > Introduce plugging module that adds and removes ports on the > > integration bridge, as directed by Port_Binding options. > > > > Traditionally it has been the CMSs responsibility to create Virtual > > Interfaces (VIFs) as part of instance (Container, Pod, Virtual > > Machine etc.) life cycle, and subsequently manage plug/unplug > > operations on the Open vSwitch integration bridge. > > > > With the advent of NICs connected to multiple distinct CPUs we can > > have a topology where the instance runs on one host and Open > > vSwitch and OVN runs on a different host, the smartnic CPU. > > > > The act of plugging and unplugging the representor port in Open > > vSwitch running on the smartnic host CPU would be the same for > > every smartnic variant (thanks to the devlink-port[0][1] > > infrastructure) and every CMS (Kubernetes, LXD, OpenStack, etc.). > > As such it is natural to extend OVN to provide this common > > functionality through its CMS facing API. > > Hi, Frode. Thanks for putting this together, but it doesn't look > natural to me. OVN, AFAIK, never touched physical devices or > interacted with the kernel directly. This change introduces completely > new functionality inside OVN. With the same effect we can run a fully > separate service on these smartnic CPUs that will do plugging > and configuration job for CMS. You may even make it independent > from a particular CMS by creating a REST API for it or whatever. > This will additionally allow using same service for non-OVN setups. Ilya, Thank you for taking the time to comment, much appreciated. Yes, this is new functionality, NICs with separate control plane CPUs and isolation from the host are also new, so this is one proposal for how we could go about to enable the use of them. The OVN controller does today get pretty close to the physical realm by maintaining patch ports in Open vSwitch based on bridge mapping configuration and presence of bridges to physical interfaces. It also does react to events of physical interfaces being plugged into the Open vSwitch instance it manages, albeit to date some other entity has been doing the act of adding the port into the bridge. The rationale for proposing to use the OVN database for coordinating this is that the information about which ports to bind, and where to bind them is already there. The timing of the information flow from the CMS is also suitable for the task. OVN relies on OVS library code, and all the necessary libraries for interfacing with the kernel through netlink and friends are there or would be easy to add. The rationale for using the netlink-devlink interface is that it provides a generic infrastructure for these types of NICs. So by using this interface we should be able to support most if not all of the variants of these cards. Providing a separate OVN service to do the task could work, but would have the cost of an extra SB DB connection, IDL and monitors. I fear it would be quite hard to build a whole separate project with its own API, feels like a lot of duplicated effort when the flow of data and APIs in OVN already align so well with CMSs interested in using this? > Interactions with physical devices also makes OVN linux-dependent > at least for this use case, IIUC. This specific bit would be linux-specific in the first iteration, yes. But the vendors manufacturing and distributing the hardware do often have drivers for other platforms, I am sure the necessary infrastructure will become available there too over time, if it is not there already. We do currently have platform specific macros in the OVN build system, so we could enable the functionality when built on a compatible platform. > Maybe, others has different opinions. I appreciate your opinion, and enjoy discussing this topic. > Another though is that there is, obviously, a network connection > between the host and smartnic system. Maybe it's possible to just > add an extra remote to the local ovsdb-server so CMS daemon on the > host system could just add interfaces over the network connection? There are a few issues with such an approach. One of the main goals with providing and using a NIC with control plane CPUs is having an extra layer of security and isolation which is separate from the hypervisor host the card happens to share a PCI complex with and draw power from. Requiring a connection between the two for operation would defy this purpose. In addition to that, this class of cards provide visibility into kernel interfaces, enumeration of representor ports etc. only from the NIC control plane CPU side of the PCI complex, this information is not provided to the host. So if a hypervisor host CMS agent were to do the plugging through a remote ovsdb connection, it would have to communicate with something else running on the NIC control plane CPU to retrieve the information it needs before it can
Re: [ovs-dev] [v2 v2 0/6] MFEX Infrastructure + Optimizations
On Thu, 13 May 2021 10:27:19 -0400 Jean Hsiao wrote: > > On 5/11/21 7:35 AM, Van Haaren, Harry wrote: > >> -Original Message- > >> From: Timothy Redaelli > >> Sent: Monday, May 10, 2021 6:43 PM > >> To: Amber, Kumar ; d...@openvswitch.org > >> Cc: i.maxim...@ovn.org; jhs...@redhat.com; f...@redhat.com; Van Haaren, > >> Harry > >> > >> Subject: Re: [ovs-dev] [v2 v2 0/6] MFEX Infrastructure + Optimizations > > > > > >> Hi, > >> we (as Red Hat) did some tests with a "special" build created on top of > >> master (a019868a6268 at that time) with with the 2 series ("DPIF > >> Framework + Optimizations" and "MFEX Infrastructure + Optimizations") > >> cherry-picked. > >> The spec file was also modified in order to use add "-msse4.2 -mpopcnt" > >> to OVS CFLAGS. > > Hi Timothy, > > > > Thanks for testing and reporting back your findings! Most of the > > configuration is clear to me, but I have a few open questions inline below > > for context. > > > > The performance numbers reported in the email below do not show benefit > > when enabling AVX512, which contradicts our > > recent whitepaper on benchmarking an Optimized Deployment of OVS, which > > includes the AVX512 patches you've benchmarked too. > > Specifically Table 8. for DPIF/MFEX patches, and Table 9. for the overall > > optimizations at a platform level are relevant: > > https://networkbuilders.intel.com/solutionslibrary/open-vswitch-optimized-deployment-benchmark-technology-guide > > > > Based on the differences between these performance reports, there must be > > some discrepancy in our testing/measurements. > > I hope that the questions below help us understand any differences so we > > can all measure the benefits from these optimizations. > > > > Regards, -Harry > > > > > >> RPM=openvswitch2.15-2.15.0-37.avx512.1.el8fdp (the "special" build with > >> the patches backported) > >> > >> * Master --- 15.2 Mpps > >> * Plus "avx512_gather 3" Only --- 15.2 Mpps > >> * Plus "dpif-set dpif_avx512" Only --- 10.1 Mpps > >> * Plus "miniflow-parser-set study" --- Failed to converge > >> * Plus all three --- 13.5 Mpps > > Open questions: > > 1) Is CPU frequency turbo enabled in any scenario, or always pinned to the > > 2.6 GHz base frequency? > > - A "perf top -C x,y" (where x,y are datapath hyperthread ids) would > > be interesting to compare with 3) below. > See attached screentshoots for two samples --- master-0 and master-1 > > > > 2) "plus Avx512 gather 3" (aka, DPCLS in AVX512), we see same performance. > > Is DPCLS in use, or is EMC doing all the work? > > - The output of " ovs-appctl dpif-netdev/pmd-perf-show" would be > > interesting to understand where packets are classified. > > EMC doing all the work --- see log below. This could explain why setting > avx512 is not helping. > > NOTE: Our initial study showed that disabling EMC didn't help avx512 > wining the case. > > [root@netqe29 jhsiao]# ovs-appctl dpif-netdev/subtable-lookup-prio-get > Available lookup functions (priority : name) > 0 : autovalidator > *1 : generic* > 0 : avx512_gather > [root@netqe29 jhsiao]# > > sleep 60; ovs-appctl dpif-netdev/pmd-perf-show > > > Time: 13:54:40.213 > Measurement duration: 2242.679 s > > pmd thread numa_id 0 core_id 24: > > Iterations: 17531214131 (0.13 us/it) > - Used TSC cycles: 5816810246080 (100.1 % of total cycles) > - idle iterations: 17446464548 ( 84.1 % of used cycles) > - busy iterations: 84749583 ( 15.9 % of used cycles) > Rx packets: 2711982944 (1209 Kpps, 340 cycles/pkt) > Datapath passes: 2711982944 (1.00 passes/pkt) > - EMC hits: 2711677677 (100.0 %) > - SMC hits: 0 ( 0.0 %) > - Megaflow hits: 305261 ( 0.0 %, 1.00 subtbl lookups/hit) > - Upcalls: 6 ( 0.0 %, 0.0 us/upcall) > - Lost upcalls: 0 ( 0.0 %) > Tx packets: 2711982944 (1209 Kpps) > Tx batches: 84749583 (32.00 pkts/batch) > > Time: 13:54:40.213 > Measurement duration: 2242.675 s > > pmd thread numa_id 0 core_id 52: > > Iterations: 17529480287 (0.13 us/it) > - Used TSC cycles: 5816709563052 (100.1 % of total cycles) > - idle iterations: 17444555421 ( 84.1 % of used cycles) > - busy iterations: 84924866 ( 15.9 % of used cycles) > Rx packets: 2717592640 (1212 Kpps, 340 cycles/pkt) > Datapath passes: 2717592640 (1.00 passes/pkt) > - EMC hits: 2717280240 (100.0 %) > - SMC hits: 0 ( 0.0 %) > - Megaflow hits: 312362 ( 0.0 %, 1.00 subtbl lookups/hit) > - Upcalls: 6 ( 0.0 %, 0.0 us/upcall) > - Lost upcalls: 0 ( 0.0 %) > Tx packets: 2717592608 (1212 Kpps) > Tx batches: 84924866 (32.00 pkts/batch) > [root@netqe29 jhsiao]# > > > > > 3) "dpif-set dpif_avx512" only. The performance here is very
[ovs-dev] [PATCH ovn] controller: fix physical flow update for localport
Properly update logical/openflow flows for localport removing the interface from the ovs bridge. Openflows in table 65 are not recomputed removing a localport from an ovs-bridge and the ovs bridge ends-up with a stale configuration adding the interface back. Fix the issue taking care of localport special case in physical_handle_ovs_iface_changes routine. Signed-off-by: Lorenzo Bianconi --- controller/ovn-controller.c | 1 + controller/physical.c | 6 +- tests/ovn.at| 21 + 3 files changed, 27 insertions(+), 1 deletion(-) diff --git a/controller/ovn-controller.c b/controller/ovn-controller.c index 67c51a86f..8514e35ea 100644 --- a/controller/ovn-controller.c +++ b/controller/ovn-controller.c @@ -1836,6 +1836,7 @@ en_physical_flow_changes_run(struct engine_node *node, void *data) { struct ed_type_pfc_data *pfc_tdata = data; pfc_tdata->recompute_physical_flows = true; +pfc_tdata->ovs_ifaces_changed = true; engine_set_node_state(node, EN_UPDATED); } diff --git a/controller/physical.c b/controller/physical.c index 96c959d18..725959678 100644 --- a/controller/physical.c +++ b/controller/physical.c @@ -1874,7 +1874,11 @@ physical_handle_ovs_iface_changes(struct physical_ctx *p_ctx, const struct sbrec_port_binding *lb_pb = local_binding_get_primary_pb(p_ctx->local_bindings, iface_id); if (!lb_pb) { -continue; +lb_pb = lport_lookup_by_name(p_ctx->sbrec_port_binding_by_name, + iface_id); +if (!lb_pb || strcmp(lb_pb->type, "localport")) { +continue; +} } int64_t ofport = iface_rec->n_ofport ? *iface_rec->ofport : 0; diff --git a/tests/ovn.at b/tests/ovn.at index 747967576..06ec60a02 100644 --- a/tests/ovn.at +++ b/tests/ovn.at @@ -11870,6 +11870,27 @@ AT_CHECK([ test 0 -eq $pkts ]) +AT_CHECK([ovs-ofctl dump-flows br-int |awk '/output/{print substr($8, 16, 16)}' |sort], [0], [dnl +1 +2 +3 +]) + +# remove the localport from br-int and re-create it +check ovs-vsctl del-port vif2 +AT_CHECK([ovs-ofctl dump-flows br-int |awk '/output/{print substr($8, 16, 16)}' |sort], [0], [dnl +1 +3 +]) + +check ovs-vsctl add-port br-int vif2 \ +-- set Interface vif2 external-ids:iface-id=lsp +AT_CHECK([ovs-ofctl dump-flows br-int |awk '/output/{print substr($8, 16, 16)}' |sort], [0], [dnl +1 +3 +4 +]) + OVN_CLEANUP([hv1]) AT_CLEANUP ]) -- 2.31.1 ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
Re: [ovs-dev] [RFC PATCH ovn] Introduce representor port plugging support
On 5/9/21 4:03 PM, Frode Nordahl wrote: > Introduce plugging module that adds and removes ports on the > integration bridge, as directed by Port_Binding options. > > Traditionally it has been the CMSs responsibility to create Virtual > Interfaces (VIFs) as part of instance (Container, Pod, Virtual > Machine etc.) life cycle, and subsequently manage plug/unplug > operations on the Open vSwitch integration bridge. > > With the advent of NICs connected to multiple distinct CPUs we can > have a topology where the instance runs on one host and Open > vSwitch and OVN runs on a different host, the smartnic CPU. > > The act of plugging and unplugging the representor port in Open > vSwitch running on the smartnic host CPU would be the same for > every smartnic variant (thanks to the devlink-port[0][1] > infrastructure) and every CMS (Kubernetes, LXD, OpenStack, etc.). > As such it is natural to extend OVN to provide this common > functionality through its CMS facing API. Hi, Frode. Thanks for putting this together, but it doesn't look natural to me. OVN, AFAIK, never touched physical devices or interacted with the kernel directly. This change introduces completely new functionality inside OVN. With the same effect we can run a fully separate service on these smartnic CPUs that will do plugging and configuration job for CMS. You may even make it independent from a particular CMS by creating a REST API for it or whatever. This will additionally allow using same service for non-OVN setups. Interactions with physical devices also makes OVN linux-dependent at least for this use case, IIUC. Maybe, others has different opinions. Another though is that there is, obviously, a network connection between the host and smartnic system. Maybe it's possible to just add an extra remote to the local ovsdb-server so CMS daemon on the host system could just add interfaces over the network connection? Best regards, Ilya Maximets. ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
Re: [ovs-dev] [PATCH net v2] openvswitch: meter: fix race when getting now_ms.
On 5/13/21 3:08 PM, Tao Liu wrote: > We have observed meters working unexpected if traffic is 3+Gbit/s > with multiple connections. > > now_ms is not pretected by meter->lock, we may get a negative > long_delta_ms when another cpu updated meter->used, then: > delta_ms = (u32)long_delta_ms; > which will be a large value. > > band->bucket += delta_ms * band->rate; > then we get a wrong band->bucket. > > OpenVswitch userspace datapath has fixed the same issue[1] some > time ago, and we port the implementation to kernel datapath. > > [1] > https://patchwork.ozlabs.org/project/openvswitch/patch/20191025114436.9746-1-i.maxim...@ovn.org/ > > Fixes: 96fbc13d7e77 ("openvswitch: Add meter infrastructure") > Signed-off-by: Tao Liu > Suggested-by: Ilya Maximets > --- > Changelog: > v2: just set negative long_delta_ms to zero in case of race for meter lock. > v1: make now_ms protected by meter lock. > --- Thanks! I didn't test it, but the change looks good to me. Reviewed-by: Ilya Maximets > net/openvswitch/meter.c | 8 > 1 file changed, 8 insertions(+) > > diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c > index 96b524c..896b8f5 100644 > --- a/net/openvswitch/meter.c > +++ b/net/openvswitch/meter.c > @@ -611,6 +611,14 @@ bool ovs_meter_execute(struct datapath *dp, struct > sk_buff *skb, > spin_lock(>lock); > > long_delta_ms = (now_ms - meter->used); /* ms */ > + if (long_delta_ms < 0) { > + /* This condition means that we have several threads fighting > + * for a meter lock, and the one who received the packets a > + * bit later wins. Assuming that all racing threads received > + * packets at the same time to avoid overflow. > + */ > + long_delta_ms = 0; > + } > > /* Make sure delta_ms will not be too large, so that bucket will not >* wrap around below. > ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
Re: [ovs-dev] [v2 v2 0/6] MFEX Infrastructure + Optimizations
On 5/11/21 7:35 AM, Van Haaren, Harry wrote: -Original Message- From: Timothy Redaelli Sent: Monday, May 10, 2021 6:43 PM To: Amber, Kumar ; d...@openvswitch.org Cc: i.maxim...@ovn.org; jhs...@redhat.com; f...@redhat.com; Van Haaren, Harry Subject: Re: [ovs-dev] [v2 v2 0/6] MFEX Infrastructure + Optimizations Hi, we (as Red Hat) did some tests with a "special" build created on top of master (a019868a6268 at that time) with with the 2 series ("DPIF Framework + Optimizations" and "MFEX Infrastructure + Optimizations") cherry-picked. The spec file was also modified in order to use add "-msse4.2 -mpopcnt" to OVS CFLAGS. Hi Timothy, Thanks for testing and reporting back your findings! Most of the configuration is clear to me, but I have a few open questions inline below for context. The performance numbers reported in the email below do not show benefit when enabling AVX512, which contradicts our recent whitepaper on benchmarking an Optimized Deployment of OVS, which includes the AVX512 patches you've benchmarked too. Specifically Table 8. for DPIF/MFEX patches, and Table 9. for the overall optimizations at a platform level are relevant: https://networkbuilders.intel.com/solutionslibrary/open-vswitch-optimized-deployment-benchmark-technology-guide Based on the differences between these performance reports, there must be some discrepancy in our testing/measurements. I hope that the questions below help us understand any differences so we can all measure the benefits from these optimizations. Regards, -Harry RPM=openvswitch2.15-2.15.0-37.avx512.1.el8fdp (the "special" build with the patches backported) * Master --- 15.2 Mpps * Plus "avx512_gather 3" Only --- 15.2 Mpps * Plus "dpif-set dpif_avx512" Only --- 10.1 Mpps * Plus "miniflow-parser-set study" --- Failed to converge * Plus all three --- 13.5 Mpps Open questions: 1) Is CPU frequency turbo enabled in any scenario, or always pinned to the 2.6 GHz base frequency? - A "perf top -C x,y" (where x,y are datapath hyperthread ids) would be interesting to compare with 3) below. See attached screentshoots for two samples --- master-0 and master-1 2) "plus Avx512 gather 3" (aka, DPCLS in AVX512), we see same performance. Is DPCLS in use, or is EMC doing all the work? - The output of " ovs-appctl dpif-netdev/pmd-perf-show" would be interesting to understand where packets are classified. EMC doing all the work --- see log below. This could explain why setting avx512 is not helping. NOTE: Our initial study showed that disabling EMC didn't help avx512 wining the case. [root@netqe29 jhsiao]# ovs-appctl dpif-netdev/subtable-lookup-prio-get Available lookup functions (priority : name) 0 : autovalidator *1 : generic* 0 : avx512_gather [root@netqe29 jhsiao]# sleep 60; ovs-appctl dpif-netdev/pmd-perf-show Time: 13:54:40.213 Measurement duration: 2242.679 s pmd thread numa_id 0 core_id 24: Iterations: 17531214131 (0.13 us/it) - Used TSC cycles: 5816810246080 (100.1 % of total cycles) - idle iterations: 17446464548 ( 84.1 % of used cycles) - busy iterations: 84749583 ( 15.9 % of used cycles) Rx packets: 2711982944 (1209 Kpps, 340 cycles/pkt) Datapath passes: 2711982944 (1.00 passes/pkt) - EMC hits: 2711677677 (100.0 %) - SMC hits: 0 ( 0.0 %) - Megaflow hits: 305261 ( 0.0 %, 1.00 subtbl lookups/hit) - Upcalls: 6 ( 0.0 %, 0.0 us/upcall) - Lost upcalls: 0 ( 0.0 %) Tx packets: 2711982944 (1209 Kpps) Tx batches: 84749583 (32.00 pkts/batch) Time: 13:54:40.213 Measurement duration: 2242.675 s pmd thread numa_id 0 core_id 52: Iterations: 17529480287 (0.13 us/it) - Used TSC cycles: 5816709563052 (100.1 % of total cycles) - idle iterations: 17444555421 ( 84.1 % of used cycles) - busy iterations: 84924866 ( 15.9 % of used cycles) Rx packets: 2717592640 (1212 Kpps, 340 cycles/pkt) Datapath passes: 2717592640 (1.00 passes/pkt) - EMC hits: 2717280240 (100.0 %) - SMC hits: 0 ( 0.0 %) - Megaflow hits: 312362 ( 0.0 %, 1.00 subtbl lookups/hit) - Upcalls: 6 ( 0.0 %, 0.0 us/upcall) - Lost upcalls: 0 ( 0.0 %) Tx packets: 2717592608 (1212 Kpps) Tx batches: 84924866 (32.00 pkts/batch) [root@netqe29 jhsiao]# 3) "dpif-set dpif_avx512" only. The performance here is very strange, with ~30% reduction, while our testing shows performance improvement. - A "perf top" here (compared vs step 1) would be helpful to see what is going on See avx512-0 and avx512-1 attachments. 4) "miniflow parser set study", I don't understand what is meant by "Failed to converge"? This is a 64-bytes 0-loss run. So, "Failed to converge" means the binary search fail to get a meaningful Mpps value. This could be the case
[ovs-dev] [PATCH net v2] openvswitch: meter: fix race when getting now_ms.
We have observed meters working unexpected if traffic is 3+Gbit/s with multiple connections. now_ms is not pretected by meter->lock, we may get a negative long_delta_ms when another cpu updated meter->used, then: delta_ms = (u32)long_delta_ms; which will be a large value. band->bucket += delta_ms * band->rate; then we get a wrong band->bucket. OpenVswitch userspace datapath has fixed the same issue[1] some time ago, and we port the implementation to kernel datapath. [1] https://patchwork.ozlabs.org/project/openvswitch/patch/20191025114436.9746-1-i.maxim...@ovn.org/ Fixes: 96fbc13d7e77 ("openvswitch: Add meter infrastructure") Signed-off-by: Tao Liu Suggested-by: Ilya Maximets --- Changelog: v2: just set negative long_delta_ms to zero in case of race for meter lock. v1: make now_ms protected by meter lock. --- net/openvswitch/meter.c | 8 1 file changed, 8 insertions(+) diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c index 96b524c..896b8f5 100644 --- a/net/openvswitch/meter.c +++ b/net/openvswitch/meter.c @@ -611,6 +611,14 @@ bool ovs_meter_execute(struct datapath *dp, struct sk_buff *skb, spin_lock(>lock); long_delta_ms = (now_ms - meter->used); /* ms */ + if (long_delta_ms < 0) { + /* This condition means that we have several threads fighting +* for a meter lock, and the one who received the packets a +* bit later wins. Assuming that all racing threads received +* packets at the same time to avoid overflow. +*/ + long_delta_ms = 0; + } /* Make sure delta_ms will not be too large, so that bucket will not * wrap around below. -- 1.8.3.1 ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
Re: [ovs-dev] [PATCH net] openvswitch: meter: fix race when getting now_ms.
From: Ilya Maximets Date: 2021-05-13 18:21:31 To: Tao Liu ,pshe...@ovn.org Cc: d...@openvswitch.org,net...@vger.kernel.org,linux-ker...@vger.kernel.org,i.maxim...@ovn.org,jean.tourril...@hpe.com,k...@kernel.org,da...@davemloft.net,Eelco Chaudron Subject: Re: [ovs-dev] [PATCH net] openvswitch: meter: fix race when getting now_ms.>On 5/13/21 12:03 PM, Tao Liu wrote: >> We have observed meters working unexpected if traffic is 3+Gbit/s >> with multiple connections. >> >> now_ms is not pretected by meter->lock, we may get a negative >> long_delta_ms when another cpu updated meter->used, then: >> delta_ms = (u32)long_delta_ms; >> which will be a large value. >> >> band->bucket += delta_ms * band->rate; >> then we get a wrong band->bucket. >> >> Fixes: 96fbc13d7e77 ("openvswitch: Add meter infrastructure") >> Signed-off-by: Tao Liu >> --- > >Hi. Thanks for the patch! >We fixed the same issue in userspace datapath some time ago and >we did that a bit differently by just setting negative long_delta_ms >to zero in assumption that all threads received their packets at >the same millisecond (which is most likely true if we have this >kind of race). This should be also cheaper from form the performance >point of view to not have an extra call and a division under the >spinlock. What do you think? Yes, I agree with you. The userspace implementation has same effection, and looks a bit more efficient. I will send a v2. >It's also a good thing to have more or less similar implementation >for all datapaths. > >Here is a userspace patch: > >commit acc5df0e3cb036524d49891fdb9ba89b609dd26a >Author: Ilya Maximets >Date: Thu Oct 24 15:15:07 2019 +0200 > >dpif-netdev: Fix time delta overflow in case of race for meter lock. > >There is a race window between getting the time and getting the meter >lock. This could lead to situation where the thread with larger >current time (this thread called time_{um}sec() later than others) >will acquire meter lock first and update meter->used to the large >value. Next threads will try to calculate time delta by subtracting >the large meter->used from their lower time getting the negative value >which will be converted to a big unsigned delta. > >Fix that by assuming that all these threads received packets in the >same time in this case, i.e. dropping negative delta to 0. > >CC: Jarno Rajahalme >Fixes: 4b27db644a8c ("dpif-netdev: Simple DROP meter implementation.") >Reported-at: > https://mail.openvswitch.org/pipermail/ovs-dev/2019-September/363126.html >Signed-off-by: Ilya Maximets >Acked-by: William Tu > >diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c >index c09b8fd95..4720ba1ab 100644 >--- a/lib/dpif-netdev.c >+++ b/lib/dpif-netdev.c >@@ -5646,6 +5646,14 @@ dp_netdev_run_meter(struct dp_netdev *dp, struct >dp_packet_batch *packets_, > /* All packets will hit the meter at the same time. */ > long_delta_t = now / 1000 - meter->used / 1000; /* msec */ > >+if (long_delta_t < 0) { >+/* This condition means that we have several threads fighting for a >+ meter lock, and the one who received the packets a bit later wins. >+ Assuming that all racing threads received packets at the same time >+ to avoid overflow. */ >+long_delta_t = 0; >+} >+ > /* Make sure delta_t will not be too large, so that bucket will not > * wrap around below. */ > delta_t = (long_delta_t > (long long int)meter->max_delta_t) >--- ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
Re: [ovs-dev] [PATCH 0/2] Enable support for non-contiguous NUMA nodes
On 5/12/21 6:27 PM, David Christensen wrote: > Systems such as the IBM POWER9 do not allocate contiguous NUMA nodes, > nor does the DPDK framework require that they be contiguous. This > patchset enables OVS support for systems with non-contiguous NUMA nodes > and adds additional tests using the "--dummy-numa" parameter to verify > the functionality. > > David Christensen (2): > dpdk: support non-contiguous NUMA nodes for IBM POWER systems > dpdk: add non-contiguous NUMA node support to auto tests > > lib/dpdk.c| 27 ++--- > lib/ovs-numa.c| 12 +--- > lib/ovs-numa.h| 1 + > tests/dpif-netdev.at | 64 +++- > tests/ofproto-dpif.at | 100 ++-- > tests/pmd.at | 132 ++ > 6 files changed, 186 insertions(+), 150 deletions(-) > Hi, David Christensen. Thanks for working on this, but there is already almost exactly the same patch-set from David Wilder here: https://patchwork.ozlabs.org/project/openvswitch/list/?series=157389 It didn't get enough attention review-wise, so it would be great if you can review it. That patch set also additionally accounts for offline cores. It needs a slight rebase, though. I was looking at oldest patches in our patchwork recently and had an intention to accept these, as they are in a relatively good shape. Best regards, Ilya Maximets. ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
Re: [ovs-dev] [PATCH net] openvswitch: meter: fix race when getting now_ms.
On 5/13/21 12:03 PM, Tao Liu wrote: > We have observed meters working unexpected if traffic is 3+Gbit/s > with multiple connections. > > now_ms is not pretected by meter->lock, we may get a negative > long_delta_ms when another cpu updated meter->used, then: > delta_ms = (u32)long_delta_ms; > which will be a large value. > > band->bucket += delta_ms * band->rate; > then we get a wrong band->bucket. > > Fixes: 96fbc13d7e77 ("openvswitch: Add meter infrastructure") > Signed-off-by: Tao Liu > --- Hi. Thanks for the patch! We fixed the same issue in userspace datapath some time ago and we did that a bit differently by just setting negative long_delta_ms to zero in assumption that all threads received their packets at the same millisecond (which is most likely true if we have this kind of race). This should be also cheaper from form the performance point of view to not have an extra call and a division under the spinlock. What do you think? It's also a good thing to have more or less similar implementation for all datapaths. Here is a userspace patch: commit acc5df0e3cb036524d49891fdb9ba89b609dd26a Author: Ilya Maximets Date: Thu Oct 24 15:15:07 2019 +0200 dpif-netdev: Fix time delta overflow in case of race for meter lock. There is a race window between getting the time and getting the meter lock. This could lead to situation where the thread with larger current time (this thread called time_{um}sec() later than others) will acquire meter lock first and update meter->used to the large value. Next threads will try to calculate time delta by subtracting the large meter->used from their lower time getting the negative value which will be converted to a big unsigned delta. Fix that by assuming that all these threads received packets in the same time in this case, i.e. dropping negative delta to 0. CC: Jarno Rajahalme Fixes: 4b27db644a8c ("dpif-netdev: Simple DROP meter implementation.") Reported-at: https://mail.openvswitch.org/pipermail/ovs-dev/2019-September/363126.html Signed-off-by: Ilya Maximets Acked-by: William Tu diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index c09b8fd95..4720ba1ab 100644 --- a/lib/dpif-netdev.c +++ b/lib/dpif-netdev.c @@ -5646,6 +5646,14 @@ dp_netdev_run_meter(struct dp_netdev *dp, struct dp_packet_batch *packets_, /* All packets will hit the meter at the same time. */ long_delta_t = now / 1000 - meter->used / 1000; /* msec */ +if (long_delta_t < 0) { +/* This condition means that we have several threads fighting for a + meter lock, and the one who received the packets a bit later wins. + Assuming that all racing threads received packets at the same time + to avoid overflow. */ +long_delta_t = 0; +} + /* Make sure delta_t will not be too large, so that bucket will not * wrap around below. */ delta_t = (long_delta_t > (long long int)meter->max_delta_t) --- ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
[ovs-dev] [PATCH net] openvswitch: meter: fix race when getting now_ms.
We have observed meters working unexpected if traffic is 3+Gbit/s with multiple connections. now_ms is not pretected by meter->lock, we may get a negative long_delta_ms when another cpu updated meter->used, then: delta_ms = (u32)long_delta_ms; which will be a large value. band->bucket += delta_ms * band->rate; then we get a wrong band->bucket. Fixes: 96fbc13d7e77 ("openvswitch: Add meter infrastructure") Signed-off-by: Tao Liu --- net/openvswitch/meter.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c index 96b524c..c50ab7f 100644 --- a/net/openvswitch/meter.c +++ b/net/openvswitch/meter.c @@ -593,7 +593,7 @@ static int ovs_meter_cmd_del(struct sk_buff *skb, struct genl_info *info) bool ovs_meter_execute(struct datapath *dp, struct sk_buff *skb, struct sw_flow_key *key, u32 meter_id) { - long long int now_ms = div_u64(ktime_get_ns(), 1000 * 1000); + long long int now_ms; long long int long_delta_ms; struct dp_meter_band *band; struct dp_meter *meter; @@ -610,6 +610,7 @@ bool ovs_meter_execute(struct datapath *dp, struct sk_buff *skb, /* Lock the meter while using it. */ spin_lock(>lock); + now_ms = div_u64(ktime_get_ns(), 1000 * 1000); long_delta_ms = (now_ms - meter->used); /* ms */ /* Make sure delta_ms will not be too large, so that bucket will not -- 1.8.3.1 ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
Re: [ovs-dev] [v2 v2 4/6] dpif-netdev: add avx512 miniflow extract for traffic ip/udp
> -Original Message- > From: dev On Behalf Of Timothy Redaelli > Sent: Thursday, April 29, 2021 2:52 PM > To: Amber, Kumar > Cc: d...@openvswitch.org; i.maxim...@ovn.org > Subject: Re: [ovs-dev] [v2 v2 4/6] dpif-netdev: add avx512 miniflow extract > for > traffic ip/udp > > On Wed, 28 Apr 2021 14:49:29 +0530 > Kumar Amber wrote: > > > This patch introduces avx512 optimized function > > pointer for IP/UDP traffic type and supporting > > functions in dpif-netdev-extract-avx512. > > > > Signed-off-by: Harry van Haaren > > Co-authored-by: Kumar Amber > > Signed-off-by: Kumar Amber > > --- > > lib/automake.mk | 1 + > > lib/dpdk.c| 1 + > > lib/dpif-netdev-extract-avx512.c | 218 ++ > > lib/dpif-netdev-private-extract.c | 5 + > > lib/dpif-netdev-private-extract.h | 11 ++ > > 5 files changed, 236 insertions(+) > > create mode 100644 lib/dpif-netdev-extract-avx512.c > > > > Hi, > unlucky this patch breaks compilation on non-x86 arches: > > libtool: compile: gcc -DHAVE_CONFIG_H -I. -I.. -I ../include -I ./include -I > ../lib -I ./lib > -Wstrict-prototypes -Wall -Wextra -Wno-sign-compare -Wpointer-arith -Wformat - > Wformat-security -Wswitch-enum -Wunused-parameter -Wbad-function-cast - > Wcast-align -Wstrict-prototypes -Wold-style-definition -Wmissing-prototypes - > Wmissing-field-initializers -fno-strict-aliasing -Wswitch-bool -Wlogical-not- > parentheses -Wsizeof-array-argument -Wbool-compare -Wshift-negative-value - > Wduplicated-cond -Wshadow -Wmultistatement-macros -Wcast-align=strict -O2 -g - > pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,- > D_GLIBCXX_ASSERTIONS -fexceptions -fstack-protector-strong -grecord-gcc- > switches -specs=/usr/lib/rpm/redhat/redhat-hardened-cc1 - > specs=/usr/lib/rpm/redhat/redhat-annobin-cc1 -m64 -march=z13 -mtune=z14 - > fasynchronous-unwind-tables -fstack-clash-protection -c > ../lib/dpif-netdev-extract- > avx512.c -fPIC -DPIC -o lib/.libs/dpif-netdev-extract-avx512.o > ../lib/dpif-netdev-extract-avx512.c:18:10: fatal error: immintrin.h: No such > file or > directory > #include > ^ > compilation terminated. > make[2]: *** [Makefile:4562: lib/dpif-netdev-extract-avx512.lo] Error 1 > make[2]: *** Waiting for unfinished jobs > > You should, probably, keep all the file content inside an #ifdef > __x86_64__, and probably also inside an #if !defined(__CHECKER__), like > dpif-netdev-lookup-avx512-gather.c and dpif-netdev-avx512.c. Hi Timothy, Apologies for response in delay - just saw your review on the patchwork: https://patchwork.ozlabs.org/project/openvswitch/patch/20210428091931.2090062-5-kumar.am...@intel.com/ You're absolutely right that the code here wasn't portable to other Archs, this was a known limitation of the v2, and is fixed in the V3 which we intend to send to the mailing list in the next days. Thanks for review & input, -Harry ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev