On 7/13/22 14:36, Ales Musil wrote:
> The ovn-controller had a race condition over MAC
> binding table with other controllers. When multiple
> controllers received GARP from single source usually
> the one who was able to win the race put it into SB.
> The others got transaction error which triggered
> full recompute even if it's not needed.
>
> In order to reduce the chance of multiple controllers
> trying to insert same row at the same time add slight
> delay to the MAC binding processing. This delay
> is random interval between 1-50 in ms. This greatly
> reduces the chance that multiple controllers will try
> to add MAC binding at exactly the same time. This applies
> only to multicast ARP request which applies only to GARP
> that is sent to broadcast address.
>
> Local testing with this delay vs without show significantly
> reduced chance of hitting the transaction error.
>
> During the testing 10 GARPs was sent to two controllers
> at the same time. Without proposed fix at least one controller
> had multiple transaction errors present every test iteration.
>
> With the proposed fix the transaction error was reduced to a
> single one when it happened which was usually in less than
> 10% of the iterations.
>
> As was mentioned before the race condition can still happen,
> but the chance is greatly reduced.
>
> Suggested-by: Daniel Alvarez Sanchez <[email protected]>
> Signed-off-by: Ales Musil <[email protected]>
> ---
Hi Ales,
Thanks for this revision! In general it makes sense but I have some
comments here and there.
> v3: Rebase on top of current main.
> Change the delay to be per ARP request instead of
> single delay for everything. This allows the possibility
> to delay only multicast/broadcast AP as suggested by Han.
> ---
> controller/mac-learn.c | 23 ++++++++++++++++++++++-
> controller/mac-learn.h | 7 ++++++-
> controller/pinctrl.c | 33 ++++++++++++++++++++++++---------
> tests/ovn.at | 7 +++++++
> 4 files changed, 59 insertions(+), 11 deletions(-)
>
> diff --git a/controller/mac-learn.c b/controller/mac-learn.c
> index 27634dca8..b7b392bb1 100644
> --- a/controller/mac-learn.c
> +++ b/controller/mac-learn.c
> @@ -20,12 +20,15 @@
> /* OpenvSwitch lib includes. */
> #include "openvswitch/vlog.h"
> #include "lib/packets.h"
> +#include "lib/random.h"
> #include "lib/smap.h"
> +#include "lib/timeval.h"
>
> VLOG_DEFINE_THIS_MODULE(mac_learn);
>
> #define MAX_MAC_BINDINGS 1000
> #define MAX_FDB_ENTRIES 1000
> +#define MAX_MAC_BINDING_DELAY_MSEC 50
>
> static size_t mac_binding_hash(uint32_t dp_key, uint32_t port_key,
> struct in6_addr *);
> @@ -64,7 +67,7 @@ ovn_mac_bindings_destroy(struct hmap *mac_bindings)
> struct mac_binding *
> ovn_mac_binding_add(struct hmap *mac_bindings, uint32_t dp_key,
> uint32_t port_key, struct in6_addr *ip,
> - struct eth_addr mac)
> + struct eth_addr mac, bool is_delayed)
What if we pass 'int max_delay_ms' instead of 'bool is_delayed'?
> {
> uint32_t hash = mac_binding_hash(dp_key, port_key, ip);
>
> @@ -79,6 +82,11 @@ ovn_mac_binding_add(struct hmap *mac_bindings, uint32_t
> dp_key,
> mb->dp_key = dp_key;
> mb->port_key = port_key;
> mb->ip = *ip;
> + mb->delay = 0;
> + mb->created = time_msec();
> + if (is_delayed) {
> + mb->delay = random_range(MAX_MAC_BINDING_DELAY_MSEC) + 1;
> + }
This could become:
mb->delay = max_delay_ms ? random_range(max_delay_ms) + 1 : 0;
> hmap_insert(mac_bindings, &mb->hmap_node, hash);
> }
> mb->mac = mac;
> @@ -86,6 +94,19 @@ ovn_mac_binding_add(struct hmap *mac_bindings, uint32_t
> dp_key,
> return mb;
> }
>
> +long long
> +ovn_mac_binding_shortest_delay(struct hmap *mac_bindings)
> +{
> + long long min = LLONG_MAX;
> +
> + struct mac_binding *mb;
> + HMAP_FOR_EACH (mb, hmap_node, mac_bindings) {
> + min = MIN(mb->delay, min);
> + }
> +
> + return min;
> +}
We don't really need to return the shortest delay. We can rewrite this as:
void
ovn_mac_binding_wait(struct hmap *mac_bindings)
{
struct mac_binding *mb;
HMAP_FOR_EACH (mb, hmap_node, mac_bindings) {
poll_timer_wait(mb->delay);
}
}
and call it in wait_put_mac_bindings().
> +
> /* fdb functions. */
> void
> ovn_fdb_init(struct hmap *fdbs)
> diff --git a/controller/mac-learn.h b/controller/mac-learn.h
> index e7e8ba2d3..d306bfa5a 100644
> --- a/controller/mac-learn.h
> +++ b/controller/mac-learn.h
> @@ -31,16 +31,21 @@ struct mac_binding {
>
> /* Value. */
> struct eth_addr mac;
> +
> + /* Delay for multicast ARP. */
This doesn't seem accurate. It's actually "delay before adding
mac_binding to the SB". It will be 0 for mac_bindings learned from
unicast replies.
> + long long created;
> + uint32_t delay;
Let's add the units please, something like:
long long created_ms;
uint32_t delay_ms;
Or maybe 'commit_delay_ms'?
> };
>
> void ovn_mac_bindings_init(struct hmap *mac_bindings);
> void ovn_mac_bindings_flush(struct hmap *mac_bindings);
> void ovn_mac_bindings_destroy(struct hmap *mac_bindings);
> +long long ovn_mac_binding_shortest_delay(struct hmap *mac_bindings);
>
> struct mac_binding *ovn_mac_binding_add(struct hmap *mac_bindings,
> uint32_t dp_key, uint32_t port_key,
> struct in6_addr *ip,
> - struct eth_addr mac);
> + struct eth_addr mac, bool
> is_delayed);
>
>
>
> diff --git a/controller/pinctrl.c b/controller/pinctrl.c
> index f954362b7..9c615ebd5 100644
> --- a/controller/pinctrl.c
> +++ b/controller/pinctrl.c
> @@ -4134,9 +4134,13 @@ pinctrl_handle_put_mac_binding(const struct flow *md,
> memcpy(&ip_key, &ip6, sizeof ip_key);
> }
>
> + /* If the ARP reply was unicast we should not delay it,
> + * there won't be any race. */
> + bool is_multicast = eth_addr_is_multicast(headers->dl_dst);
> struct mac_binding *mb = ovn_mac_binding_add(&put_mac_bindings, dp_key,
> port_key, &ip_key,
> - headers->dl_src);
> + headers->dl_src,
> + is_multicast);
It's mainly a personal preference but I'd rewrite this as:
uint32_t max_delay_ms = eth_addr_is_multicast(headers->dl_dst)
? MAX_MAC_BINDING_DELAY_MSEC
: 0;
truct mac_binding *mb = ovn_mac_binding_add(&put_mac_bindings, dp_key,
port_key, &ip_key,
headers->dl_src,
max_delay_ms);
> if (!mb) {
> COVERAGE_INC(pinctrl_drop_put_mac_binding);
> return;
> @@ -4296,14 +4300,23 @@ run_put_mac_bindings(struct ovsdb_idl_txn
> *ovnsb_idl_txn,
> return;
> }
>
> - const struct mac_binding *mb;
> - HMAP_FOR_EACH (mb, hmap_node, &put_mac_bindings) {
> - run_put_mac_binding(ovnsb_idl_txn, sbrec_datapath_binding_by_key,
> - sbrec_port_binding_by_key,
> - sbrec_mac_binding_by_lport_ip,
> - mb);
> + long long now = time_msec();
> +
> + struct mac_binding *mb;
> + HMAP_FOR_EACH_SAFE (mb, hmap_node, &put_mac_bindings) {
> + long long elapsed = now - mb->created;
> + if (elapsed >= mb->delay) {
> + run_put_mac_binding(ovnsb_idl_txn,
> + sbrec_datapath_binding_by_key,
> + sbrec_port_binding_by_key,
> + sbrec_mac_binding_by_lport_ip,
> + mb);
> + hmap_remove(&put_mac_bindings, &mb->hmap_node);
> + free(mb);
We used to call ovn_mac_bindings_flush() to remove all elements, now we
remove them in the caller. I think this breaks a bit the encapsulation
that mac-learn.[hc] tried to add. Can we add a new api, e.g.,
ovn_mac_binding_remove(), that will remove the entry from the map and
free it?
Now that we're here, what if the SB commit fails for some other
unrelated reason? We'll force a new ARP request for future packets. It
used to be like that before your patch too. I'm just wondering if we
need to flush 'put_mac_bindings' only after we know for sure that the SB
record was created. Definitely something that should be addressed (if
we go that way) in a different patch though.
> + } else {
> + mb->delay -= elapsed;
Hmm, isn't this wrong? We don't need the "else", do we?
> + }
> }
> - ovn_mac_bindings_flush(&put_mac_bindings);
ovn_mac_bindings_flush() is not used anymore anywhere else except
mac-learn.c, in ovn_mac_bindings_destroy(). We can just inline it there.
> }
>
> static void
> @@ -4352,9 +4365,11 @@ run_buffered_binding(struct ovsdb_idl_index
> *sbrec_mac_binding_by_lport_ip,
>
> static void
> wait_put_mac_bindings(struct ovsdb_idl_txn *ovnsb_idl_txn)
> + OVS_REQUIRES(pinctrl_mutex)
> {
> if (ovnsb_idl_txn && !hmap_is_empty(&put_mac_bindings)) {
> - poll_immediate_wake();
> + long long delay = ovn_mac_binding_shortest_delay(&put_mac_bindings);
> + poll_timer_wait(delay);
> }
> }
>
> diff --git a/tests/ovn.at b/tests/ovn.at
> index c346975e6..af82024ac 100644
> --- a/tests/ovn.at
> +++ b/tests/ovn.at
> @@ -19601,6 +19601,7 @@ data=0800bee4391a0001
>
> send_icmp_packet 1 1 $src_mac $router_mac0 $src_ip $dst_ip 0d1c $data
> send_arp_reply 2 1 $dst_mac $router_mac1 $dst_ip $router_ip
> +OVS_WAIT_UNTIL([ovn-sbctl list mac_binding | grep -q "172.16.1.10"])
> echo $(get_arp_req $router_mac1 $router_ip $dst_ip) > expected
> echo
> "${dst_mac}${router_mac1}08004500001c00004000fe0122b5${router_ip}${dst_ip}${data}"
> >> expected
>
> @@ -19641,6 +19642,7 @@ gw_router_mac=f00000010a0a
> send_icmp_packet 2 2 f00000110203 $router_mac0 $src_ip $dst_ip 0c1b $data
> echo $(get_arp_req f00000010204 $fip_ip $gw_router_ip) >> expected
> send_arp_reply 2 1 $gw_router_mac f00000010204 $gw_router_ip $fip_ip
> +OVS_WAIT_UNTIL([ovn-sbctl list mac_binding | grep -q "172.16.1.11"])
I'm not sure I understand why we need this.
> echo
> "${gw_router_mac}f0000001020408004500001c00004000fe0121b4${fip_ip}${dst_ip}${data}"
> >> expected
>
> AT_CHECK([as hv2 ovs-ofctl dump-flows br-int table=37 | grep pkt_mark=0x64 |
> grep -q n_packets=1],[0])
> @@ -23049,6 +23051,11 @@ grep table_id=10 | wc -l`])
>
> check_row_count MAC_Binding 1
>
> +OVS_WAIT_UNTIL(
> + [test 1 = `as hv1 ovs-ofctl dump-flows br-int table=67 | grep
> dl_src=50:54:00:00:00:13 \
> +| wc -l`]
> +)
> +
> AT_CHECK([ovn-sbctl --format=csv --bare --columns logical_port,ip,mac \
> list mac_binding], [0], [lr0-sw0
> 10.0.0.30
Thanks,
Dumitru
_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev