Felix Huettner via dev <ovs-dev@openvswitch.org> writes: > Previously the kernel did not provide a netlink interface to flush/list > only conntrack entries matching a specific zone. With [1] and [2] it is now > possible to flush and list conntrack entries filtered by zone. Older > kernels not yet supporting this feature will ignore the filter. > For the list request that means just returning all entries (which we can > then filter in userspace as before). > For the flush request that means deleting all conntrack entries. > > These significantly improves the performance of flushing conntrack zones > when the conntrack table is large. Since flushing a conntrack zone is > normally triggered via an openflow command it blocks the main ovs thread > and thereby also blocks new flows from being applied. Using this new > feature we can reduce the flushing time for zones by around 93%. > > In combination with OVN the creation of a Logical_Router (which causes > the flushing of a ct zone) could block other operations, e.g. the > failover of Logical_Routers (as they cause new flows to be created). > This is visible from a user perspective as a ovn-controller that is idle > (as it waits for vswitchd) and vswitchd reporting: > "blocked 1000 ms waiting for main to quiesce" (potentially with ever > increasing times). > > The following performance tests where run in a qemu vm with 500.000 > conntrack entries distributed evenly over 500 ct zones using `ovstest > test-netlink-conntrack flush zone=<zoneid>`. > > With this patch and kernel v6.8-rc4: > > ----------------------------------------------------------------------------------------------------------------------------------------------------- > Min (s) Median (s) 90%ile (s) > 99%ile (s) Max (s) Mean (s) Total (s) Count > ----------------------------------------------------------------------------------------------------------------------------------------------------- > flush zone with 1000 entries 0.260 0.319 0.335 > 0.348 0.362 0.320 80.02 250 > flush zone with no entry 0.228 0.298 0.325 > 0.340 0.348 0.296 73.93 250 > ----------------------------------------------------------------------------------------------------------------------------------------------------- > > With this patch and kernel v6.7.1: > > ----------------------------------------------------------------------------------------------------------------------------------------------------- > Min (s) Median (s) 90%ile (s) > 99%ile (s) Max (s) Mean (s) Total (s) Count > ----------------------------------------------------------------------------------------------------------------------------------------------------- > flush zone with 1000 entries 3.946 4.237 4.367 > 4.495 4.543 4.236 1058.992 250 > flush zone with no entry 3.462 4.460 4.662 > 4.931 5.390 4.430 1107.479 250 > ----------------------------------------------------------------------------------------------------------------------------------------------------- > > Without this patch and kernel v6.8-rc4: > > ----------------------------------------------------------------------------------------------------------------------------------------------------- > Min (s) Median (s) 90%ile (s) > 99%ile (s) Max (s) Mean (s) Total (s) Count > ----------------------------------------------------------------------------------------------------------------------------------------------------- > flush zone with 1000 entries 3.497 4.349 4.522 > 4.773 5.054 4.331 1082.802 250 > flush zone with no entry 3.212 4.010 4.572 > 6.003 6.396 4.071 1017.838 250 > ----------------------------------------------------------------------------------------------------------------------------------------------------- > > [1]: > https://github.com/torvalds/linux/commit/eff3c558bb7e61c41b53e4c8130e514a5a4df9ba > [2]: > https://github.com/torvalds/linux/commit/fa173a1b4e3fd1ab5451cbc57de6fc624c824b0a > > Co-Authored-By: Luca Czesla <luca.czesla@mail.schwarz> > Signed-off-by: Luca Czesla <luca.czesla@mail.schwarz> > Co-Authored-By: Max Lamprecht <max.lamprecht@mail.schwarz> > Signed-off-by: Max Lamprecht <max.lamprecht@mail.schwarz> > Signed-off-by: Felix Huettner <felix.huettner@mail.schwarz> > ---
Thanks so much - I haven't gotten a chance to test on a new kernel, but it's on my agenda. > v2->v3: > - update description to include upstream fix (Thanks to Ilya for finding > that issue) > v1->v2: > - fixed wrong signed-off-by > > lib/netlink-conntrack.c | 57 +++++++++++++++++++++++++++++++++++++++-- > tests/system-traffic.at | 8 ++++++ > 2 files changed, 63 insertions(+), 2 deletions(-) > > diff --git a/lib/netlink-conntrack.c b/lib/netlink-conntrack.c > index 492bfcffb..1b050894d 100644 > --- a/lib/netlink-conntrack.c > +++ b/lib/netlink-conntrack.c > @@ -25,6 +25,7 @@ > #include <linux/netfilter/nf_conntrack_tcp.h> > #include <linux/netfilter/nf_conntrack_ftp.h> > #include <linux/netfilter/nf_conntrack_sctp.h> > +#include <sys/utsname.h> > > #include "byte-order.h" > #include "compiler.h" > @@ -141,6 +142,9 @@ nl_ct_dump_start(struct nl_ct_dump_state **statep, const > uint16_t *zone, > > nl_msg_put_nfgenmsg(&state->buf, 0, AF_UNSPEC, NFNL_SUBSYS_CTNETLINK, > IPCTNL_MSG_CT_GET, NLM_F_REQUEST); > + if (zone) { > + nl_msg_put_be16(&state->buf, CTA_ZONE, htons(*zone)); > + } > nl_dump_start(&state->dump, NETLINK_NETFILTER, &state->buf); > ofpbuf_clear(&state->buf); > > @@ -283,23 +287,72 @@ nl_ct_flush_zone(uint16_t flush_zone) > return err; > } > #else > + > +static bool > +netlink_flush_supports_zone(void) > +{ > + static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER; > + static bool supported = false; > + > + if (ovsthread_once_start(&once)) { > + struct utsname utsname; > + int major, minor; > + > + if (uname(&utsname) == -1) { > + VLOG_WARN("uname failed (%s)", ovs_strerror(errno)); > + } else if (!ovs_scan(utsname.release, "%d.%d", &major, &minor)) { > + VLOG_WARN("uname reported bad OS release (%s)", utsname.release); Do these WARN calls need to be severe like this? There isn't much for the user to do about this, and it won't impact the functioning of the system in such a major way. Maybe they can be INFO instead? Otherwise, we may need to change most of the selftests to ignore the "uname failed" warnings. More of a nit, because it shouldn't generally fail on any systems. > + } else if (major < 6 || (major == 6 && minor < 8)) { > + VLOG_INFO("disabling conntrack flush by zone in Linux kernel %s", > + utsname.release); > + } else { > + supported = true; > + } > + ovsthread_once_done(&once); > + } > + return supported; > +} > + > int > nl_ct_flush_zone(uint16_t flush_zone) > { > - /* Apparently, there's no netlink interface to flush a specific zone. > + /* In older kernels, there was no netlink interface to flush a specific > + * conntrack zone. > * This code dumps every connection, checks the zone and eventually > * delete the entry. > + * In newer kernels there is the option to specifiy a zone for filtering > + * during dumps. Older kernels ignore this option. We set it here in the > + * hope we only get relevant entries back, but fall back to filtering > here > + * to keep compatibility. > * > - * This is race-prone, but it is better than using shell scripts. */ > + * This is race-prone, but it is better than using shell scripts. > + * > + * Additionally newer kernels also support flushing a zone without > listing > + * it first. */ > > struct nl_dump dump; > struct ofpbuf buf, reply, delete; > + int err; > + > + if (netlink_flush_supports_zone()) { > + ofpbuf_init(&buf, NL_DUMP_BUFSIZE); > + > + nl_msg_put_nfgenmsg(&buf, 0, AF_UNSPEC, NFNL_SUBSYS_CTNETLINK, > + IPCTNL_MSG_CT_DELETE, NLM_F_REQUEST); > + nl_msg_put_be16(&buf, CTA_ZONE, htons(flush_zone)); > + > + err = nl_transact(NETLINK_NETFILTER, &buf, NULL); > + ofpbuf_uninit(&buf); > + > + return err; > + } > > ofpbuf_init(&buf, NL_DUMP_BUFSIZE); > ofpbuf_init(&delete, NL_DUMP_BUFSIZE); > > nl_msg_put_nfgenmsg(&buf, 0, AF_UNSPEC, NFNL_SUBSYS_CTNETLINK, > IPCTNL_MSG_CT_GET, NLM_F_REQUEST); > + nl_msg_put_be16(&buf, CTA_ZONE, htons(flush_zone)); > nl_dump_start(&dump, NETLINK_NETFILTER, &buf); > ofpbuf_clear(&buf); > > diff --git a/tests/system-traffic.at b/tests/system-traffic.at > index 4fd5dbe59..14010dc41 100644 > --- a/tests/system-traffic.at > +++ b/tests/system-traffic.at > @@ -3297,6 +3297,14 @@ AT_CHECK([ovs-appctl dpctl/dump-conntrack | > FORMAT_CT(10.1.1.4)], [0], [dnl > > tcp,orig=(src=10.1.1.3,dst=10.1.1.4,sport=<cleared>,dport=<cleared>),reply=(src=10.1.1.4,dst=10.1.1.3,sport=<cleared>,dport=<cleared>),zone=2,protoinfo=(state=<cleared>) > ]) > > +dnl flushing one zone should leave the others intact > +AT_CHECK([ovs-appctl dpctl/flush-conntrack zone=2]) > +AT_CHECK([ovs-appctl dpctl/dump-conntrack zone=1 | FORMAT_CT(10.1.1.2)], > [0], [dnl > +tcp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=<cleared>,dport=<cleared>),reply=(src=10.1.1.2,dst=10.1.1.1,sport=<cleared>,dport=<cleared>),zone=1,protoinfo=(state=<cleared>) > +]) > +AT_CHECK([ovs-appctl dpctl/dump-conntrack zone=2 | FORMAT_CT(10.1.1.4)], > [0], [dnl > +]) > + > OVS_TRAFFIC_VSWITCHD_STOP > AT_CLEANUP > > > base-commit: b3fc822208e89c6ba04e1fc972da0bf4426a5847 _______________________________________________ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev