Commit 506f7d4bcfbc ("northd: rely on new actions for ecmp-symmetric
routing") relied on commit_ecmp_nh() to learn the mapping between a
traffic session's 5-tuple and the MAC of the next-hop to be used for
that session. This logical action translates to OVS learn() action.
While in theory this is the most correct way of tracking the mapping
between ECMP symmetric reply sessions and their next hop's MAC address,
it also creates additional load in ovs-vswitchd (to learn the new flows)
and introduces latency and affects traffic throughput.
An alternative is to instead re-commit the 5-tuple (along with the
next-hop MAC and port information) to conntrack every time traffic
received from the next-hop side is forwarded by the OVN router.
Testing shows that in a scenario with 4 next-hops and ECMP symmetric
replies enabled with traffic running for 600 seconds latency, throughput
and ovs-vswitchd CPU usage are significantly better with this change:
- Before:
- ovs-vswitchd ~1200% CPU (distributed across 17 revalidator threads)
- Sent: 638.72MiB, 1.06MiB/s
- Recv: 7.17GiB, 12.24MiB/s
- After:
- ovs-vswitchd ~7% CPU (distributed across 17 revalidator threads)
- Sent: 892.69MiB, 1.49MiB/s
- Recv: 8.63GiB, 14.72MiB/s
The only downside of not using learn() flows is that OVN cannot
determine on its own when a next-hop goes away (without using BFD).
This scenario however can probably be handled by the CMS which has more
knowledge about the rest of the network (outside OVN), e.g.,
ovn-kubernetes currently flushes conntrack entries created for ECMP
symmetric reply sessions when it detects that the next-hop went away.
If a next-hops changes MAC address that's handled by OVN gracefully and
the conntrack entry corresponding to that session gets updated
accordingly.
NOTE: we don't remove the logical actions' implementation as
ovn-controller needs to be able to translate logical flows generated by
older versions of ovn-northd.
Fixes: 506f7d4bcfbc ("northd: rely on new actions for ecmp-symmetric routing")
Signed-off-by: Dumitru Ceara <[email protected]>
---
northd/northd.c | 52 ++++++++++++++++++---------------------------
tests/ovn-northd.at | 20 ++++++-----------
2 files changed, 27 insertions(+), 45 deletions(-)
diff --git a/northd/northd.c b/northd/northd.c
index 459aaafb1c..e67d34cd28 100644
--- a/northd/northd.c
+++ b/northd/northd.c
@@ -262,7 +262,6 @@ enum ovn_stage {
#define REGBIT_LOOKUP_NEIGHBOR_RESULT "reg9[2]"
#define REGBIT_LOOKUP_NEIGHBOR_IP_RESULT "reg9[3]"
#define REGBIT_DST_NAT_IP_LOCAL "reg9[4]"
-#define REGBIT_KNOWN_ECMP_NH "reg9[5]"
#define REGBIT_KNOWN_LB_SESSION "reg9[6]"
/* Register to store the eth address associated to a router port for packets
@@ -370,8 +369,7 @@ enum ovn_stage {
* | | EGRESS_LOOPBACK/ | G | UNUSED |
* | R9 | PKT_LARGER/ | 4 | |
* | | LOOKUP_NEIGHBOR_RESULT/ | | |
- * | | SKIP_LOOKUP_NEIGHBOR/ | | |
- * | | KNOWN_ECMP_NH} | | |
+ * | | SKIP_LOOKUP_NEIGHBOR} | | |
* | | | | |
* | | REG_ORIG_TP_DPORT_ROUTER | | |
* | | | | |
@@ -10846,15 +10844,13 @@ add_ecmp_symmetric_reply_flows(struct hmap *lflows,
cidr);
free(cidr);
ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_DEFRAG, 100,
- ds_cstr(&base_match),
- REGBIT_KNOWN_ECMP_NH" = chk_ecmp_nh_mac(); ct_next;",
- &st_route->header_);
+ ds_cstr(&base_match), "ct_next;",
+ &st_route->header_);
/* And packets that go out over an ECMP route need conntrack */
ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_DEFRAG, 100,
- ds_cstr(route_match),
- REGBIT_KNOWN_ECMP_NH" = chk_ecmp_nh(); ct_next;",
- &st_route->header_);
+ ds_cstr(route_match), "ct_next;",
+ &st_route->header_);
/* Save src eth and inport in ct_label for packets that arrive over
* an ECMP route.
@@ -10867,9 +10863,8 @@ add_ecmp_symmetric_reply_flows(struct hmap *lflows,
ds_put_format(&actions,
"ct_commit { ct_label.ecmp_reply_eth = eth.src; "
" %s = %" PRId64 ";}; "
- "commit_ecmp_nh(ipv6 = %s, proto = tcp); next;",
- ct_ecmp_reply_port_match, out_port->sb->tunnel_key,
- IN6_IS_ADDR_V4MAPPED(&route->prefix) ? "false" : "true");
+ "next;",
+ ct_ecmp_reply_port_match, out_port->sb->tunnel_key);
ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_ECMP_STATEFUL, 100,
ds_cstr(&match), ds_cstr(&actions),
&st_route->header_);
@@ -10880,9 +10875,8 @@ add_ecmp_symmetric_reply_flows(struct hmap *lflows,
ds_put_format(&actions,
"ct_commit { ct_label.ecmp_reply_eth = eth.src; "
" %s = %" PRId64 ";}; "
- "commit_ecmp_nh(ipv6 = %s, proto = udp); next;",
- ct_ecmp_reply_port_match, out_port->sb->tunnel_key,
- IN6_IS_ADDR_V4MAPPED(&route->prefix) ? "false" : "true");
+ "next;",
+ ct_ecmp_reply_port_match, out_port->sb->tunnel_key);
ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_ECMP_STATEFUL, 100,
ds_cstr(&match), ds_cstr(&actions),
&st_route->header_);
@@ -10893,53 +10887,49 @@ add_ecmp_symmetric_reply_flows(struct hmap *lflows,
ds_put_format(&actions,
"ct_commit { ct_label.ecmp_reply_eth = eth.src; "
" %s = %" PRId64 ";}; "
- "commit_ecmp_nh(ipv6 = %s, proto = sctp); next;",
- ct_ecmp_reply_port_match, out_port->sb->tunnel_key,
- IN6_IS_ADDR_V4MAPPED(&route->prefix) ? "false" : "true");
+ "next;",
+ ct_ecmp_reply_port_match, out_port->sb->tunnel_key);
ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_ECMP_STATEFUL, 100,
ds_cstr(&match), ds_cstr(&actions),
&st_route->header_);
ds_clear(&match);
ds_put_format(&match,
- "%s && (!ct.rpl && ct.est) && tcp && "REGBIT_KNOWN_ECMP_NH" == 0",
+ "%s && (!ct.rpl && ct.est) && tcp",
ds_cstr(&base_match));
ds_clear(&actions);
ds_put_format(&actions,
"ct_commit { ct_label.ecmp_reply_eth = eth.src; "
" %s = %" PRId64 ";}; "
- "commit_ecmp_nh(ipv6 = %s, proto = tcp); next;",
- ct_ecmp_reply_port_match, out_port->sb->tunnel_key,
- IN6_IS_ADDR_V4MAPPED(&route->prefix) ? "false" : "true");
+ "next;",
+ ct_ecmp_reply_port_match, out_port->sb->tunnel_key);
ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_ECMP_STATEFUL, 100,
ds_cstr(&match), ds_cstr(&actions),
&st_route->header_);
ds_clear(&match);
ds_put_format(&match,
- "%s && (!ct.rpl && ct.est) && udp && "REGBIT_KNOWN_ECMP_NH" == 0",
+ "%s && (!ct.rpl && ct.est) && udp",
ds_cstr(&base_match));
ds_clear(&actions);
ds_put_format(&actions,
"ct_commit { ct_label.ecmp_reply_eth = eth.src; "
" %s = %" PRId64 ";}; "
- "commit_ecmp_nh(ipv6 = %s, proto = udp); next;",
- ct_ecmp_reply_port_match, out_port->sb->tunnel_key,
- IN6_IS_ADDR_V4MAPPED(&route->prefix) ? "false" : "true");
+ "next;",
+ ct_ecmp_reply_port_match, out_port->sb->tunnel_key);
ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_ECMP_STATEFUL, 100,
ds_cstr(&match), ds_cstr(&actions),
&st_route->header_);
ds_clear(&match);
ds_put_format(&match,
- "%s && (!ct.rpl && ct.est) && sctp && "REGBIT_KNOWN_ECMP_NH" == 0",
+ "%s && (!ct.rpl && ct.est) && sctp",
ds_cstr(&base_match));
ds_clear(&actions);
ds_put_format(&actions,
"ct_commit { ct_label.ecmp_reply_eth = eth.src; "
" %s = %" PRId64 ";}; "
- "commit_ecmp_nh(ipv6 = %s, proto = sctp); next;",
- ct_ecmp_reply_port_match, out_port->sb->tunnel_key,
- IN6_IS_ADDR_V4MAPPED(&route->prefix) ? "false" : "true");
+ "next;",
+ ct_ecmp_reply_port_match, out_port->sb->tunnel_key);
ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_ECMP_STATEFUL, 100,
ds_cstr(&match), ds_cstr(&actions),
&st_route->header_);
@@ -10948,7 +10938,7 @@ add_ecmp_symmetric_reply_flows(struct hmap *lflows,
* for where to route the packet.
*/
ds_put_format(&ecmp_reply,
- "ct.rpl && "REGBIT_KNOWN_ECMP_NH" == 1 && %s == %"PRId64,
+ "ct.rpl && %s == %"PRId64,
ct_ecmp_reply_port_match, out_port->sb->tunnel_key);
ds_clear(&match);
ds_put_format(&match, "%s && %s", ds_cstr(&ecmp_reply),
diff --git a/tests/ovn-northd.at b/tests/ovn-northd.at
index 3ef92bb3ff..2e1a85c9ac 100644
--- a/tests/ovn-northd.at
+++ b/tests/ovn-northd.at
@@ -6264,24 +6264,16 @@ AT_CHECK([grep -e "lr_in_ip_routing_ecmp" lr0flows |
sed 's/192\.168\.0\..0/192.
table=??(lr_in_ip_routing_ecmp), priority=150 , match=(reg8[[0..15]] == 0),
action=(next;)
])
-AT_CHECK([grep -e "lr_in_ecmp_stateful".*commit_ecmp_nh lr0flows | sed
's/table=../table=??/' | sort], [0], [dnl
- table=??(lr_in_ecmp_stateful), priority=100 , match=(inport == "lr0-public"
&& ip4.src == 1.0.0.1 && (!ct.rpl && ct.est) && sctp && reg9[[5]] == 0),
action=(ct_commit { ct_label.ecmp_reply_eth = eth.src;
ct_label.ecmp_reply_port = 1;}; commit_ecmp_nh(ipv6 = false, proto = sctp);
next;)
- table=??(lr_in_ecmp_stateful), priority=100 , match=(inport == "lr0-public"
&& ip4.src == 1.0.0.1 && (!ct.rpl && ct.est) && tcp && reg9[[5]] == 0),
action=(ct_commit { ct_label.ecmp_reply_eth = eth.src;
ct_label.ecmp_reply_port = 1;}; commit_ecmp_nh(ipv6 = false, proto = tcp);
next;)
- table=??(lr_in_ecmp_stateful), priority=100 , match=(inport == "lr0-public"
&& ip4.src == 1.0.0.1 && (!ct.rpl && ct.est) && udp && reg9[[5]] == 0),
action=(ct_commit { ct_label.ecmp_reply_eth = eth.src;
ct_label.ecmp_reply_port = 1;}; commit_ecmp_nh(ipv6 = false, proto = udp);
next;)
- table=??(lr_in_ecmp_stateful), priority=100 , match=(inport == "lr0-public"
&& ip4.src == 1.0.0.1 && (ct.new && !ct.est) && sctp), action=(ct_commit {
ct_label.ecmp_reply_eth = eth.src; ct_label.ecmp_reply_port = 1;};
commit_ecmp_nh(ipv6 = false, proto = sctp); next;)
- table=??(lr_in_ecmp_stateful), priority=100 , match=(inport == "lr0-public"
&& ip4.src == 1.0.0.1 && (ct.new && !ct.est) && tcp), action=(ct_commit {
ct_label.ecmp_reply_eth = eth.src; ct_label.ecmp_reply_port = 1;};
commit_ecmp_nh(ipv6 = false, proto = tcp); next;)
- table=??(lr_in_ecmp_stateful), priority=100 , match=(inport == "lr0-public"
&& ip4.src == 1.0.0.1 && (ct.new && !ct.est) && udp), action=(ct_commit {
ct_label.ecmp_reply_eth = eth.src; ct_label.ecmp_reply_port = 1;};
commit_ecmp_nh(ipv6 = false, proto = udp); next;)
-])
-
-AT_CHECK([grep -e "lr_in_defrag".*chk_ecmp_nh* lr0flows | sed
's/table=../table=??/' | sort], [0], [dnl
- table=??(lr_in_defrag ), priority=100 , match=(inport == "lr0-public"
&& ip4.src == 1.0.0.1), action=(reg9[[5]] = chk_ecmp_nh_mac(); ct_next;)
- table=??(lr_in_defrag ), priority=100 , match=(reg7 == 0 && ip4.dst
== 1.0.0.1/32), action=(reg9[[5]] = chk_ecmp_nh(); ct_next;)
+AT_CHECK([grep -e "lr_in_defrag" lr0flows | sed 's/table=../table=??/' |
sort], [0], [dnl
+ table=??(lr_in_defrag ), priority=0 , match=(1), action=(next;)
+ table=??(lr_in_defrag ), priority=100 , match=(inport == "lr0-public"
&& ip4.src == 1.0.0.1), action=(ct_next;)
+ table=??(lr_in_defrag ), priority=100 , match=(reg7 == 0 && ip4.dst
== 1.0.0.1/32), action=(ct_next;)
])
dnl The chassis was created with other_config:ct-no-masked-label=false, the
flows
dnl should be using ct_label.ecmp_reply_port.
AT_CHECK([grep -e "lr_in_arp_resolve.*ecmp" lr0flows | sed
's/table=../table=??/'], [0], [dnl
- table=??(lr_in_arp_resolve ), priority=200 , match=(ct.rpl && reg9[[5]] ==
1 && ct_label.ecmp_reply_port == 1), action=(push(xxreg1); xxreg1 = ct_label;
eth.dst = xxreg1[[32..79]]; pop(xxreg1); next;)
+ table=??(lr_in_arp_resolve ), priority=200 , match=(ct.rpl &&
ct_label.ecmp_reply_port == 1), action=(push(xxreg1); xxreg1 = ct_label;
eth.dst = xxreg1[[32..79]]; pop(xxreg1); next;)
])
dnl Simulate an ovn-controller upgrade to a version that supports
@@ -6291,7 +6283,7 @@ check ovn-sbctl set chassis ch1
other_config:ct-no-masked-label=true
check ovn-nbctl --wait=sb sync
ovn-sbctl dump-flows lr0 > lr0flows
AT_CHECK([grep -e "lr_in_arp_resolve.*ecmp" lr0flows | sed
's/table=../table=??/'], [0], [dnl
- table=??(lr_in_arp_resolve ), priority=200 , match=(ct.rpl && reg9[[5]] ==
1 && ct_mark.ecmp_reply_port == 1), action=(push(xxreg1); xxreg1 = ct_label;
eth.dst = xxreg1[[32..79]]; pop(xxreg1); next;)
+ table=??(lr_in_arp_resolve ), priority=200 , match=(ct.rpl &&
ct_mark.ecmp_reply_port == 1), action=(push(xxreg1); xxreg1 = ct_label; eth.dst
= xxreg1[[32..79]]; pop(xxreg1); next;)
])
# add ecmp route with wrong nexthop
--
2.39.3
_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev