On Fri, Sep 1, 2023 at 12:56 PM Dumitru Ceara <[email protected]> wrote:
> Commit 506f7d4bcfbc ("northd: rely on new actions for ecmp-symmetric
> routing") relied on commit_ecmp_nh() to learn the mapping between a
> traffic session's 5-tuple and the MAC of the next-hop to be used for
> that session. This logical action translates to OVS learn() action.
>
> While in theory this is the most correct way of tracking the mapping
> between ECMP symmetric reply sessions and their next hop's MAC address,
> it also creates additional load in ovs-vswitchd (to learn the new flows)
> and introduces latency and affects traffic throughput.
>
> An alternative is to instead re-commit the 5-tuple (along with the
> next-hop MAC and port information) to conntrack every time traffic
> received from the next-hop side is forwarded by the OVN router.
>
> Testing shows that in a scenario with 4 next-hops and ECMP symmetric
> replies enabled with traffic running for 600 seconds latency, throughput
> and ovs-vswitchd CPU usage are significantly better with this change:
>
> - Before:
> - ovs-vswitchd ~1200% CPU (distributed across 17 revalidator threads)
> - Sent: 638.72MiB, 1.06MiB/s
> - Recv: 7.17GiB, 12.24MiB/s
>
> - After:
> - ovs-vswitchd ~7% CPU (distributed across 17 revalidator threads)
> - Sent: 892.69MiB, 1.49MiB/s
> - Recv: 8.63GiB, 14.72MiB/s
>
> The only downside of not using learn() flows is that OVN cannot
> determine on its own when a next-hop goes away (without using BFD).
> This scenario however can probably be handled by the CMS which has more
> knowledge about the rest of the network (outside OVN), e.g.,
> ovn-kubernetes currently flushes conntrack entries created for ECMP
> symmetric reply sessions when it detects that the next-hop went away.
>
> If a next-hops changes MAC address that's handled by OVN gracefully and
> the conntrack entry corresponding to that session gets updated
> accordingly.
>
> NOTE: we don't remove the logical actions' implementation as
> ovn-controller needs to be able to translate logical flows generated by
> older versions of ovn-northd.
>
> Fixes: 506f7d4bcfbc ("northd: rely on new actions for ecmp-symmetric
> routing")
> Signed-off-by: Dumitru Ceara <[email protected]>
> ---
> northd/northd.c | 52 ++++++++++++++++++---------------------------
> tests/ovn-northd.at | 20 ++++++-----------
> 2 files changed, 27 insertions(+), 45 deletions(-)
>
> diff --git a/northd/northd.c b/northd/northd.c
> index 459aaafb1c..e67d34cd28 100644
> --- a/northd/northd.c
> +++ b/northd/northd.c
> @@ -262,7 +262,6 @@ enum ovn_stage {
> #define REGBIT_LOOKUP_NEIGHBOR_RESULT "reg9[2]"
> #define REGBIT_LOOKUP_NEIGHBOR_IP_RESULT "reg9[3]"
> #define REGBIT_DST_NAT_IP_LOCAL "reg9[4]"
> -#define REGBIT_KNOWN_ECMP_NH "reg9[5]"
> #define REGBIT_KNOWN_LB_SESSION "reg9[6]"
>
> /* Register to store the eth address associated to a router port for
> packets
> @@ -370,8 +369,7 @@ enum ovn_stage {
> * | | EGRESS_LOOPBACK/ | G | UNUSED |
> * | R9 | PKT_LARGER/ | 4 | |
> * | | LOOKUP_NEIGHBOR_RESULT/ | | |
> - * | | SKIP_LOOKUP_NEIGHBOR/ | | |
> - * | | KNOWN_ECMP_NH} | | |
> + * | | SKIP_LOOKUP_NEIGHBOR} | | |
> * | | | | |
> * | | REG_ORIG_TP_DPORT_ROUTER | | |
> * | | | | |
> @@ -10846,15 +10844,13 @@ add_ecmp_symmetric_reply_flows(struct hmap
> *lflows,
> cidr);
> free(cidr);
> ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_DEFRAG, 100,
> - ds_cstr(&base_match),
> - REGBIT_KNOWN_ECMP_NH" = chk_ecmp_nh_mac(); ct_next;",
> - &st_route->header_);
> + ds_cstr(&base_match), "ct_next;",
> + &st_route->header_);
>
> /* And packets that go out over an ECMP route need conntrack */
> ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_DEFRAG, 100,
> - ds_cstr(route_match),
> - REGBIT_KNOWN_ECMP_NH" = chk_ecmp_nh(); ct_next;",
> - &st_route->header_);
> + ds_cstr(route_match), "ct_next;",
> + &st_route->header_);
>
> /* Save src eth and inport in ct_label for packets that arrive over
> * an ECMP route.
> @@ -10867,9 +10863,8 @@ add_ecmp_symmetric_reply_flows(struct hmap *lflows,
> ds_put_format(&actions,
> "ct_commit { ct_label.ecmp_reply_eth = eth.src; "
> " %s = %" PRId64 ";}; "
> - "commit_ecmp_nh(ipv6 = %s, proto = tcp); next;",
> - ct_ecmp_reply_port_match, out_port->sb->tunnel_key,
> - IN6_IS_ADDR_V4MAPPED(&route->prefix) ? "false" : "true");
> + "next;",
> + ct_ecmp_reply_port_match, out_port->sb->tunnel_key);
> ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_ECMP_STATEFUL, 100,
> ds_cstr(&match), ds_cstr(&actions),
> &st_route->header_);
> @@ -10880,9 +10875,8 @@ add_ecmp_symmetric_reply_flows(struct hmap *lflows,
> ds_put_format(&actions,
> "ct_commit { ct_label.ecmp_reply_eth = eth.src; "
> " %s = %" PRId64 ";}; "
> - "commit_ecmp_nh(ipv6 = %s, proto = udp); next;",
> - ct_ecmp_reply_port_match, out_port->sb->tunnel_key,
> - IN6_IS_ADDR_V4MAPPED(&route->prefix) ? "false" : "true");
> + "next;",
> + ct_ecmp_reply_port_match, out_port->sb->tunnel_key);
> ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_ECMP_STATEFUL, 100,
> ds_cstr(&match), ds_cstr(&actions),
> &st_route->header_);
> @@ -10893,53 +10887,49 @@ add_ecmp_symmetric_reply_flows(struct hmap
> *lflows,
> ds_put_format(&actions,
> "ct_commit { ct_label.ecmp_reply_eth = eth.src; "
> " %s = %" PRId64 ";}; "
> - "commit_ecmp_nh(ipv6 = %s, proto = sctp); next;",
> - ct_ecmp_reply_port_match, out_port->sb->tunnel_key,
> - IN6_IS_ADDR_V4MAPPED(&route->prefix) ? "false" : "true");
> + "next;",
> + ct_ecmp_reply_port_match, out_port->sb->tunnel_key);
> ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_ECMP_STATEFUL, 100,
> ds_cstr(&match), ds_cstr(&actions),
> &st_route->header_);
>
> ds_clear(&match);
> ds_put_format(&match,
> - "%s && (!ct.rpl && ct.est) && tcp && "REGBIT_KNOWN_ECMP_NH"
> == 0",
> + "%s && (!ct.rpl && ct.est) && tcp",
> ds_cstr(&base_match));
> ds_clear(&actions);
> ds_put_format(&actions,
> "ct_commit { ct_label.ecmp_reply_eth = eth.src; "
> " %s = %" PRId64 ";}; "
> - "commit_ecmp_nh(ipv6 = %s, proto = tcp); next;",
> - ct_ecmp_reply_port_match, out_port->sb->tunnel_key,
> - IN6_IS_ADDR_V4MAPPED(&route->prefix) ? "false" : "true");
> + "next;",
> + ct_ecmp_reply_port_match, out_port->sb->tunnel_key);
> ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_ECMP_STATEFUL, 100,
> ds_cstr(&match), ds_cstr(&actions),
> &st_route->header_);
>
> ds_clear(&match);
> ds_put_format(&match,
> - "%s && (!ct.rpl && ct.est) && udp && "REGBIT_KNOWN_ECMP_NH"
> == 0",
> + "%s && (!ct.rpl && ct.est) && udp",
> ds_cstr(&base_match));
> ds_clear(&actions);
> ds_put_format(&actions,
> "ct_commit { ct_label.ecmp_reply_eth = eth.src; "
> " %s = %" PRId64 ";}; "
> - "commit_ecmp_nh(ipv6 = %s, proto = udp); next;",
> - ct_ecmp_reply_port_match, out_port->sb->tunnel_key,
> - IN6_IS_ADDR_V4MAPPED(&route->prefix) ? "false" : "true");
> + "next;",
> + ct_ecmp_reply_port_match, out_port->sb->tunnel_key);
> ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_ECMP_STATEFUL, 100,
> ds_cstr(&match), ds_cstr(&actions),
> &st_route->header_);
> ds_clear(&match);
> ds_put_format(&match,
> - "%s && (!ct.rpl && ct.est) && sctp && "REGBIT_KNOWN_ECMP_NH"
> == 0",
> + "%s && (!ct.rpl && ct.est) && sctp",
> ds_cstr(&base_match));
> ds_clear(&actions);
> ds_put_format(&actions,
> "ct_commit { ct_label.ecmp_reply_eth = eth.src; "
> " %s = %" PRId64 ";}; "
> - "commit_ecmp_nh(ipv6 = %s, proto = sctp); next;",
> - ct_ecmp_reply_port_match, out_port->sb->tunnel_key,
> - IN6_IS_ADDR_V4MAPPED(&route->prefix) ? "false" : "true");
> + "next;",
> + ct_ecmp_reply_port_match, out_port->sb->tunnel_key);
> ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_ECMP_STATEFUL, 100,
> ds_cstr(&match), ds_cstr(&actions),
> &st_route->header_);
> @@ -10948,7 +10938,7 @@ add_ecmp_symmetric_reply_flows(struct hmap *lflows,
> * for where to route the packet.
> */
> ds_put_format(&ecmp_reply,
> - "ct.rpl && "REGBIT_KNOWN_ECMP_NH" == 1 && %s ==
> %"PRId64,
> + "ct.rpl && %s == %"PRId64,
> ct_ecmp_reply_port_match, out_port->sb->tunnel_key);
> ds_clear(&match);
> ds_put_format(&match, "%s && %s", ds_cstr(&ecmp_reply),
> diff --git a/tests/ovn-northd.at b/tests/ovn-northd.at
> index 3ef92bb3ff..2e1a85c9ac 100644
> --- a/tests/ovn-northd.at
> +++ b/tests/ovn-northd.at
> @@ -6264,24 +6264,16 @@ AT_CHECK([grep -e "lr_in_ip_routing_ecmp" lr0flows
> | sed 's/192\.168\.0\..0/192.
> table=??(lr_in_ip_routing_ecmp), priority=150 , match=(reg8[[0..15]]
> == 0), action=(next;)
> ])
>
> -AT_CHECK([grep -e "lr_in_ecmp_stateful".*commit_ecmp_nh lr0flows | sed
> 's/table=../table=??/' | sort], [0], [dnl
> - table=??(lr_in_ecmp_stateful), priority=100 , match=(inport ==
> "lr0-public" && ip4.src == 1.0.0.1 && (!ct.rpl && ct.est) && sctp &&
> reg9[[5]] == 0), action=(ct_commit { ct_label.ecmp_reply_eth = eth.src;
> ct_label.ecmp_reply_port = 1;}; commit_ecmp_nh(ipv6 = false, proto = sctp);
> next;)
> - table=??(lr_in_ecmp_stateful), priority=100 , match=(inport ==
> "lr0-public" && ip4.src == 1.0.0.1 && (!ct.rpl && ct.est) && tcp &&
> reg9[[5]] == 0), action=(ct_commit { ct_label.ecmp_reply_eth = eth.src;
> ct_label.ecmp_reply_port = 1;}; commit_ecmp_nh(ipv6 = false, proto = tcp);
> next;)
> - table=??(lr_in_ecmp_stateful), priority=100 , match=(inport ==
> "lr0-public" && ip4.src == 1.0.0.1 && (!ct.rpl && ct.est) && udp &&
> reg9[[5]] == 0), action=(ct_commit { ct_label.ecmp_reply_eth = eth.src;
> ct_label.ecmp_reply_port = 1;}; commit_ecmp_nh(ipv6 = false, proto = udp);
> next;)
> - table=??(lr_in_ecmp_stateful), priority=100 , match=(inport ==
> "lr0-public" && ip4.src == 1.0.0.1 && (ct.new && !ct.est) && sctp),
> action=(ct_commit { ct_label.ecmp_reply_eth = eth.src;
> ct_label.ecmp_reply_port = 1;}; commit_ecmp_nh(ipv6 = false, proto = sctp);
> next;)
> - table=??(lr_in_ecmp_stateful), priority=100 , match=(inport ==
> "lr0-public" && ip4.src == 1.0.0.1 && (ct.new && !ct.est) && tcp),
> action=(ct_commit { ct_label.ecmp_reply_eth = eth.src;
> ct_label.ecmp_reply_port = 1;}; commit_ecmp_nh(ipv6 = false, proto = tcp);
> next;)
> - table=??(lr_in_ecmp_stateful), priority=100 , match=(inport ==
> "lr0-public" && ip4.src == 1.0.0.1 && (ct.new && !ct.est) && udp),
> action=(ct_commit { ct_label.ecmp_reply_eth = eth.src;
> ct_label.ecmp_reply_port = 1;}; commit_ecmp_nh(ipv6 = false, proto = udp);
> next;)
> -])
> -
> -AT_CHECK([grep -e "lr_in_defrag".*chk_ecmp_nh* lr0flows | sed
> 's/table=../table=??/' | sort], [0], [dnl
> - table=??(lr_in_defrag ), priority=100 , match=(inport ==
> "lr0-public" && ip4.src == 1.0.0.1), action=(reg9[[5]] = chk_ecmp_nh_mac();
> ct_next;)
> - table=??(lr_in_defrag ), priority=100 , match=(reg7 == 0 &&
> ip4.dst == 1.0.0.1/32), action=(reg9[[5]] = chk_ecmp_nh(); ct_next;)
> +AT_CHECK([grep -e "lr_in_defrag" lr0flows | sed 's/table=../table=??/' |
> sort], [0], [dnl
> + table=??(lr_in_defrag ), priority=0 , match=(1), action=(next;)
> + table=??(lr_in_defrag ), priority=100 , match=(inport ==
> "lr0-public" && ip4.src == 1.0.0.1), action=(ct_next;)
> + table=??(lr_in_defrag ), priority=100 , match=(reg7 == 0 &&
> ip4.dst == 1.0.0.1/32), action=(ct_next;)
> ])
>
> dnl The chassis was created with other_config:ct-no-masked-label=false,
> the flows
> dnl should be using ct_label.ecmp_reply_port.
> AT_CHECK([grep -e "lr_in_arp_resolve.*ecmp" lr0flows | sed
> 's/table=../table=??/'], [0], [dnl
> - table=??(lr_in_arp_resolve ), priority=200 , match=(ct.rpl &&
> reg9[[5]] == 1 && ct_label.ecmp_reply_port == 1), action=(push(xxreg1);
> xxreg1 = ct_label; eth.dst = xxreg1[[32..79]]; pop(xxreg1); next;)
> + table=??(lr_in_arp_resolve ), priority=200 , match=(ct.rpl &&
> ct_label.ecmp_reply_port == 1), action=(push(xxreg1); xxreg1 = ct_label;
> eth.dst = xxreg1[[32..79]]; pop(xxreg1); next;)
> ])
>
> dnl Simulate an ovn-controller upgrade to a version that supports
> @@ -6291,7 +6283,7 @@ check ovn-sbctl set chassis ch1
> other_config:ct-no-masked-label=true
> check ovn-nbctl --wait=sb sync
> ovn-sbctl dump-flows lr0 > lr0flows
> AT_CHECK([grep -e "lr_in_arp_resolve.*ecmp" lr0flows | sed
> 's/table=../table=??/'], [0], [dnl
> - table=??(lr_in_arp_resolve ), priority=200 , match=(ct.rpl &&
> reg9[[5]] == 1 && ct_mark.ecmp_reply_port == 1), action=(push(xxreg1);
> xxreg1 = ct_label; eth.dst = xxreg1[[32..79]]; pop(xxreg1); next;)
> + table=??(lr_in_arp_resolve ), priority=200 , match=(ct.rpl &&
> ct_mark.ecmp_reply_port == 1), action=(push(xxreg1); xxreg1 = ct_label;
> eth.dst = xxreg1[[32..79]]; pop(xxreg1); next;)
> ])
>
> # add ecmp route with wrong nexthop
> --
> 2.39.3
>
> _______________________________________________
> dev mailing list
> [email protected]
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>
>
Looks good to me, thanks.
Reviewed-by: Ales Musil <[email protected]>
--
Ales Musil
Senior Software Engineer - OVN Core
Red Hat EMEA <https://www.redhat.com>
[email protected] IM: amusil
<https://red.ht/sig>
_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev