Logical_Flows are immutable, that is, when the match or action change
ovn-northd will actually remove the old logical flow and add a new one.
Commit ecc941c70f16 added flows to flood ARPs to routers for unreachable
addresses.
In the following topology (2 switches connected to a router with a load
balancer attached to it):
S1 --- (IP1/MASK1) R (IP2/MASK2) --- S2
LB1 (applied on R) with N VIPs.
The following logical flows were generated:
S1 (in_l2_lkup): if "arp.op == 1 && arp.tpa == { VIP1, .., VIPn, IP1 }" then ...
S2 (in_l2_lkup): if "arp.op == 1 && arp.tpa == { VIP1, .., VIPn, IP2 }" then ...
While this seemed a good idea initially because it would reduce the
total number of logical flows, this approach has a few major downsides
that severely affect scalability:
1. When logical datapath grouping is enabled (which is how high scale
deployments should use OVN) there is no chance that the flows for
reachable/unreachable router owned IPs are grouped together for all
datapaths that correspond to the logical switches they are generated
for.
2. Whenever a VIP (VIPn+1) is added to LB1 all these flows will be
removed and new ones will be added with the only difference being the
VIPn+1 aded to the flow match. As this is a new logical flow record,
ovn-controller will have to remove all previously installed openflows
(for the deleted logical flow) and add new ones. However, for most
of these openflows, the only change is the cookie value (the first 32
bits of the logical flow UUID). At scale this unnecessary openflow
churn will induce significant latency. Moreover, this also creates
unnecessary jsonrpc traffic from ovn-northd towards OVN_Southbound
because the old flow needs to be removed and a new one (with a large
match field) needs to be installed.
To avoid this, we now install individual flows, one per IP, for
managing ARP/ND traffic from logical switches towards router owned
IPs that are reachable/unreachable on the logical port connecting
the switch.
Fixes: ecc941c70f16 ("northd: Flood ARPs to routers for "unreachable"
addresses.")
Signed-off-by: Dumitru Ceara <[email protected]>
---
On a deployment based on an ovn-kubernetes cluster with 20 nodes, 2500
load balancer VIPs, 5000 pods, we:
- measure number of logical flows in the SB DB, size of the SB DB
- then create an additional 250 pods (logical ports), bind them, and add
a load balancer VIP (with a single backend) for each of the additional
pods, and measure how long it takes for all openflows corresponding to
each of the pods to be installed in OVS.
Results:
- without fix:
------------
SB lflows: 67976
SB size (uncompacted): 46 MiB
SB size (compacted): 40 MiB
after 250 ports + VIPs added:
SB lflows: 71479
SB size (uncompacted): 248 MiB
SB size (compacted): 42 MiB
Max time to install all relevant openflows for a single pod: ~6sec
- with fix:
---------
SB lflows: 70399
SB size (uncompacted): 46 MiB
SB size (compacted): 40 MiB
after 250 ports + VIPs added:
SB lflows: 74149
SB size (uncompacted): 165 MiB
SB size (compacted): 42 MiB
Max time to install all relevant openflows for a single pod: ~100msec
---
northd/ovn-northd.c | 94 ++++++++++++++++----------------------------
northd/ovn_northd.dl | 28 ++++++-------
tests/ovn-northd.at | 8 ++--
3 files changed, 50 insertions(+), 80 deletions(-)
diff --git a/northd/ovn-northd.c b/northd/ovn-northd.c
index ebe12cace..4b8554e8b 100644
--- a/northd/ovn-northd.c
+++ b/northd/ovn-northd.c
@@ -6651,7 +6651,7 @@ build_lswitch_rport_arp_req_self_orig_flow(struct
ovn_port *op,
}
static void
-arp_nd_ns_match(struct ds *ips, int addr_family, struct ds *match)
+arp_nd_ns_match(const char *ips, int addr_family, struct ds *match)
{
/* Packets received from VXLAN tunnels have already been through the
* router pipeline so we should skip them. Normally this is done by the
@@ -6661,13 +6661,10 @@ arp_nd_ns_match(struct ds *ips, int addr_family, struct
ds *match)
ds_put_cstr(match, FLAGBIT_NOT_VXLAN " && ");
if (addr_family == AF_INET) {
- ds_put_cstr(match, "arp.op == 1 && arp.tpa == {");
+ ds_put_format(match, "arp.op == 1 && arp.tpa == %s", ips);
} else {
- ds_put_cstr(match, "nd_ns && nd.target == {");
+ ds_put_format(match, "nd_ns && nd.target == %s", ips);
}
-
- ds_put_cstr(match, ds_cstr_ro(ips));
- ds_put_cstr(match, "}");
}
/*
@@ -6676,7 +6673,7 @@ arp_nd_ns_match(struct ds *ips, int addr_family, struct
ds *match)
* switching domain as regular broadcast.
*/
static void
-build_lswitch_rport_arp_req_flow_for_reachable_ip(struct ds *ips,
+build_lswitch_rport_arp_req_flow_for_reachable_ip(const char *ips,
int addr_family, struct ovn_port *patch_op, struct ovn_datapath *od,
uint32_t priority, struct hmap *lflows,
const struct ovsdb_idl_row *stage_hint)
@@ -6715,7 +6712,7 @@ build_lswitch_rport_arp_req_flow_for_reachable_ip(struct
ds *ips,
* broadcast.
*/
static void
-build_lswitch_rport_arp_req_flow_for_unreachable_ip(struct ds *ips,
+build_lswitch_rport_arp_req_flow_for_unreachable_ip(const char *ips,
int addr_family, struct ovn_datapath *od, uint32_t priority,
struct hmap *lflows, const struct ovsdb_idl_row *stage_hint)
{
@@ -6757,10 +6754,6 @@ build_lswitch_rport_arp_req_flows(struct ovn_port *op,
* router port.
* Priority: 80.
*/
- struct ds ips_v4_match_reachable = DS_EMPTY_INITIALIZER;
- struct ds ips_v6_match_reachable = DS_EMPTY_INITIALIZER;
- struct ds ips_v4_match_unreachable = DS_EMPTY_INITIALIZER;
- struct ds ips_v6_match_unreachable = DS_EMPTY_INITIALIZER;
const char *ip_addr;
SSET_FOR_EACH (ip_addr, &op->od->lb_ips_v4) {
@@ -6771,9 +6764,13 @@ build_lswitch_rport_arp_req_flows(struct ovn_port *op,
*/
if (ip_parse(ip_addr, &ipv4_addr)) {
if (lrouter_port_ipv4_reachable(op, ipv4_addr)) {
- ds_put_format(&ips_v4_match_reachable, "%s, ", ip_addr);
+ build_lswitch_rport_arp_req_flow_for_reachable_ip(
+ ip_addr, AF_INET, sw_op, sw_od, 80, lflows,
+ stage_hint);
} else {
- ds_put_format(&ips_v4_match_unreachable, "%s, ", ip_addr);
+ build_lswitch_rport_arp_req_flow_for_unreachable_ip(
+ ip_addr, AF_INET, sw_od, 90, lflows,
+ stage_hint);
}
}
}
@@ -6785,9 +6782,13 @@ build_lswitch_rport_arp_req_flows(struct ovn_port *op,
*/
if (ipv6_parse(ip_addr, &ipv6_addr)) {
if (lrouter_port_ipv6_reachable(op, &ipv6_addr)) {
- ds_put_format(&ips_v6_match_reachable, "%s, ", ip_addr);
+ build_lswitch_rport_arp_req_flow_for_reachable_ip(
+ ip_addr, AF_INET6, sw_op, sw_od, 80, lflows,
+ stage_hint);
} else {
- ds_put_format(&ips_v6_match_unreachable, "%s, ", ip_addr);
+ build_lswitch_rport_arp_req_flow_for_unreachable_ip(
+ ip_addr, AF_INET6, sw_od, 90, lflows,
+ stage_hint);
}
}
}
@@ -6812,65 +6813,42 @@ build_lswitch_rport_arp_req_flows(struct ovn_port *op,
if (!sset_contains(&op->od->lb_ips_v6, nat->external_ip)) {
if (lrouter_port_ipv6_reachable(op, addr)) {
- ds_put_format(&ips_v6_match_reachable, "%s, ",
- nat->external_ip);
+ build_lswitch_rport_arp_req_flow_for_reachable_ip(
+ nat->external_ip, AF_INET6, sw_op, sw_od, 80, lflows,
+ stage_hint);
} else {
- ds_put_format(&ips_v6_match_unreachable, "%s, ",
- nat->external_ip);
+ build_lswitch_rport_arp_req_flow_for_unreachable_ip(
+ nat->external_ip, AF_INET6, sw_od, 90, lflows,
+ stage_hint);
}
}
} else {
ovs_be32 addr = nat_entry->ext_addrs.ipv4_addrs[0].addr;
if (!sset_contains(&op->od->lb_ips_v4, nat->external_ip)) {
if (lrouter_port_ipv4_reachable(op, addr)) {
- ds_put_format(&ips_v4_match_reachable, "%s, ",
- nat->external_ip);
+ build_lswitch_rport_arp_req_flow_for_reachable_ip(
+ nat->external_ip, AF_INET, sw_op, sw_od, 80, lflows,
+ stage_hint);
} else {
- ds_put_format(&ips_v4_match_unreachable, "%s, ",
- nat->external_ip);
+ build_lswitch_rport_arp_req_flow_for_unreachable_ip(
+ nat->external_ip, AF_INET, sw_od, 90, lflows,
+ stage_hint);
}
}
}
}
for (size_t i = 0; i < op->lrp_networks.n_ipv4_addrs; i++) {
- ds_put_format(&ips_v4_match_reachable, "%s, ",
- op->lrp_networks.ipv4_addrs[i].addr_s);
- }
- for (size_t i = 0; i < op->lrp_networks.n_ipv6_addrs; i++) {
- ds_put_format(&ips_v6_match_reachable, "%s, ",
- op->lrp_networks.ipv6_addrs[i].addr_s);
- }
-
- if (ds_last(&ips_v4_match_reachable) != EOF) {
- ds_chomp(&ips_v4_match_reachable, ' ');
- ds_chomp(&ips_v4_match_reachable, ',');
build_lswitch_rport_arp_req_flow_for_reachable_ip(
- &ips_v4_match_reachable, AF_INET, sw_op, sw_od, 80, lflows,
- stage_hint);
+ op->lrp_networks.ipv4_addrs[i].addr_s, AF_INET, sw_op, sw_od, 80,
+ lflows, stage_hint);
}
- if (ds_last(&ips_v6_match_reachable) != EOF) {
- ds_chomp(&ips_v6_match_reachable, ' ');
- ds_chomp(&ips_v6_match_reachable, ',');
+ for (size_t i = 0; i < op->lrp_networks.n_ipv6_addrs; i++) {
build_lswitch_rport_arp_req_flow_for_reachable_ip(
- &ips_v6_match_reachable, AF_INET6, sw_op, sw_od, 80, lflows,
- stage_hint);
+ op->lrp_networks.ipv6_addrs[i].addr_s, AF_INET6, sw_op, sw_od, 80,
+ lflows, stage_hint);
}
- if (ds_last(&ips_v4_match_unreachable) != EOF) {
- ds_chomp(&ips_v4_match_unreachable, ' ');
- ds_chomp(&ips_v4_match_unreachable, ',');
- build_lswitch_rport_arp_req_flow_for_unreachable_ip(
- &ips_v4_match_unreachable, AF_INET, sw_od, 90, lflows,
- stage_hint);
- }
- if (ds_last(&ips_v6_match_unreachable) != EOF) {
- ds_chomp(&ips_v6_match_unreachable, ' ');
- ds_chomp(&ips_v6_match_unreachable, ',');
- build_lswitch_rport_arp_req_flow_for_unreachable_ip(
- &ips_v6_match_unreachable, AF_INET6, sw_od, 90, lflows,
- stage_hint);
- }
/* Self originated ARP requests/ND need to be flooded as usual.
*
* However, if the switch doesn't have any non-router ports we shouldn't
@@ -6881,10 +6859,6 @@ build_lswitch_rport_arp_req_flows(struct ovn_port *op,
if (sw_od->n_router_ports != sw_od->nbs->n_ports) {
build_lswitch_rport_arp_req_self_orig_flow(op, 75, sw_od, lflows);
}
- ds_destroy(&ips_v4_match_reachable);
- ds_destroy(&ips_v6_match_reachable);
- ds_destroy(&ips_v4_match_unreachable);
- ds_destroy(&ips_v6_match_unreachable);
}
static void
diff --git a/northd/ovn_northd.dl b/northd/ovn_northd.dl
index 7bfaae992..d82f43cc8 100644
--- a/northd/ovn_northd.dl
+++ b/northd/ovn_northd.dl
@@ -4375,8 +4375,7 @@ Flow(.logical_datapath = sw._uuid,
.stage = s_SWITCH_IN_L2_LKUP(),
.priority = 80,
.__match = fLAGBIT_NOT_VXLAN() ++
- " && arp.op == 1 && arp.tpa == { " ++
- ipv4.to_vec().join(", ") ++ "}",
+ " && arp.op == 1 && arp.tpa == " ++ ipv4,
.actions = if (sw.has_non_router_port) {
"clone {outport = ${sp.json_name}; output; }; "
"outport = ${mc_flood_l2}; output;"
@@ -4386,15 +4385,14 @@ Flow(.logical_datapath = sw._uuid,
.external_ids = stage_hint(sp.lsp._uuid)) :-
sp in &SwitchPort(.sw = sw, .peer = Some{rp}),
rp.is_enabled(),
- &SwitchPortARPForwards(.port = sp, .reachable_ips_v4 = ipv4),
- not ipv4.is_empty(),
+ &SwitchPortARPForwards(.port = sp, .reachable_ips_v4 = ips_v4),
+ var ipv4 = FlatMap(ips_v4),
var mc_flood_l2 = json_string_escape(mC_FLOOD_L2().0).
Flow(.logical_datapath = sw._uuid,
.stage = s_SWITCH_IN_L2_LKUP(),
.priority = 80,
.__match = fLAGBIT_NOT_VXLAN() ++
- " && nd_ns && nd.target == { " ++
- ipv6.to_vec().join(", ") ++ "}",
+ " && nd_ns && nd.target == " ++ ipv6,
.actions = if (sw.has_non_router_port) {
"clone {outport = ${sp.json_name}; output; }; "
"outport = ${mc_flood_l2}; output;"
@@ -4404,36 +4402,34 @@ Flow(.logical_datapath = sw._uuid,
.external_ids = stage_hint(sp.lsp._uuid)) :-
sp in &SwitchPort(.sw = sw, .peer = Some{rp}),
rp.is_enabled(),
- &SwitchPortARPForwards(.port = sp, .reachable_ips_v6 = ipv6),
- not ipv6.is_empty(),
+ &SwitchPortARPForwards(.port = sp, .reachable_ips_v6 = ips_v6),
+ var ipv6 = FlatMap(ips_v6),
var mc_flood_l2 = json_string_escape(mC_FLOOD_L2().0).
Flow(.logical_datapath = sw._uuid,
.stage = s_SWITCH_IN_L2_LKUP(),
.priority = 90,
.__match = fLAGBIT_NOT_VXLAN() ++
- " && arp.op == 1 && arp.tpa == {" ++
- ipv4.to_vec().join(", ") ++ "}",
+ " && arp.op == 1 && arp.tpa == " ++ ipv4,
.actions = "outport = ${flood}; output;",
.external_ids = stage_hint(sp.lsp._uuid)) :-
sp in &SwitchPort(.sw = sw, .peer = Some{rp}),
rp.is_enabled(),
- &SwitchPortARPForwards(.port = sp, .unreachable_ips_v4 = ipv4),
- not ipv4.is_empty(),
+ &SwitchPortARPForwards(.port = sp, .unreachable_ips_v4 = ips_v4),
+ var ipv4 = FlatMap(ips_v4),
var flood = json_string_escape(mC_FLOOD().0).
Flow(.logical_datapath = sw._uuid,
.stage = s_SWITCH_IN_L2_LKUP(),
.priority = 90,
.__match = fLAGBIT_NOT_VXLAN() ++
- " && nd_ns && nd.target == {" ++
- ipv6.to_vec().join(", ") ++ "}",
+ " && nd_ns && nd.target == " ++ ipv6,
.actions = "outport = ${flood}; output;",
.external_ids = stage_hint(sp.lsp._uuid)) :-
sp in &SwitchPort(.sw = sw, .peer = Some{rp}),
rp.is_enabled(),
- &SwitchPortARPForwards(.port = sp, .unreachable_ips_v6 = ipv6),
- not ipv6.is_empty(),
+ &SwitchPortARPForwards(.port = sp, .unreachable_ips_v6 = ips_v6),
+ var ipv6 = FlatMap(ips_v6),
var flood = json_string_escape(mC_FLOOD().0).
for (SwitchPortNewDynamicAddress(.port = &SwitchPort{.lsp = lsp, .json_name =
json_name, .sw = sw},
diff --git a/tests/ovn-northd.at b/tests/ovn-northd.at
index 94405349e..3df4bf615 100644
--- a/tests/ovn-northd.at
+++ b/tests/ovn-northd.at
@@ -4276,20 +4276,20 @@ AT_CHECK([ovn-sbctl lflow-list sw | grep ls_in_l2_lkup
| grep priority=90 -c], [
])
# We expect that the installed flows will match the unreachable DNAT addresses
only.
-AT_CHECK([ovn-sbctl lflow-list sw | grep ls_in_l2_lkup | grep priority=90 |
grep "arp.tpa == {30.0.0.100}" -c], [0], [dnl
+AT_CHECK([ovn-sbctl lflow-list sw | grep ls_in_l2_lkup | grep priority=90 |
grep "arp.tpa == 30.0.0.100" -c], [0], [dnl
1
])
-AT_CHECK([ovn-sbctl lflow-list sw | grep ls_in_l2_lkup | grep priority=90 |
grep "arp.tpa == {40.0.0.100}" -c], [0], [dnl
+AT_CHECK([ovn-sbctl lflow-list sw | grep ls_in_l2_lkup | grep priority=90 |
grep "arp.tpa == 40.0.0.100" -c], [0], [dnl
1
])
# Ensure that we do not create flows for SNAT addresses
-AT_CHECK([ovn-sbctl lflow-list sw | grep ls_in_l2_lkup | grep priority=90 |
grep "arp.tpa == {30.0.0.200}" -c], [1], [dnl
+AT_CHECK([ovn-sbctl lflow-list sw | grep ls_in_l2_lkup | grep priority=90 |
grep "arp.tpa == 30.0.0.200" -c], [1], [dnl
0
])
-AT_CHECK([ovn-sbctl lflow-list sw | grep ls_in_l2_lkup | grep priority=90 |
grep "arp.tpa == {40.0.0.200}" -c], [1], [dnl
+AT_CHECK([ovn-sbctl lflow-list sw | grep ls_in_l2_lkup | grep priority=90 |
grep "arp.tpa == 40.0.0.200" -c], [1], [dnl
0
])
--
2.27.0
_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev