On 7/13/22 14:36, Ales Musil wrote:
> The ovn-controller had a race condition over MAC
> binding table with other controllers. When multiple
> controllers received GARP from single source usually
> the one who was able to win the race put it into SB.
> The others got transaction error which triggered
> full recompute even if it's not needed.
> 
> In order to reduce the chance of multiple controllers
> trying to insert same row at the same time add slight
> delay to the MAC binding processing. This delay
> is random interval between 1-50 in ms. This greatly
> reduces the chance that multiple controllers will try
> to add MAC binding at exactly the same time. This applies
> only to multicast ARP request which applies only to GARP
> that is sent to broadcast address.
> 
> Local testing with this delay vs without show significantly
> reduced chance of hitting the transaction error.
> 
> During the testing 10 GARPs was sent to two controllers
> at the same time. Without proposed fix at least one controller
> had multiple transaction errors present every test iteration.
> 
> With the proposed fix the transaction error was reduced to a
> single one when it happened which was usually in less than
> 10% of the iterations.
> 
> As was mentioned before the race condition can still happen,
> but the chance is greatly reduced.
> 
> Suggested-by: Daniel Alvarez Sanchez <[email protected]>
> Signed-off-by: Ales Musil <[email protected]>
> ---

Hi Ales,

Thanks for this revision!  In general it makes sense but I have some
comments here and there.

> v3: Rebase on top of current main.
>     Change the delay to be per ARP request instead of
>     single delay for everything. This allows the possibility
>     to delay only multicast/broadcast AP as suggested by Han.
> ---
>  controller/mac-learn.c | 23 ++++++++++++++++++++++-
>  controller/mac-learn.h |  7 ++++++-
>  controller/pinctrl.c   | 33 ++++++++++++++++++++++++---------
>  tests/ovn.at           |  7 +++++++
>  4 files changed, 59 insertions(+), 11 deletions(-)
> 
> diff --git a/controller/mac-learn.c b/controller/mac-learn.c
> index 27634dca8..b7b392bb1 100644
> --- a/controller/mac-learn.c
> +++ b/controller/mac-learn.c
> @@ -20,12 +20,15 @@
>  /* OpenvSwitch lib includes. */
>  #include "openvswitch/vlog.h"
>  #include "lib/packets.h"
> +#include "lib/random.h"
>  #include "lib/smap.h"
> +#include "lib/timeval.h"
>  
>  VLOG_DEFINE_THIS_MODULE(mac_learn);
>  
>  #define MAX_MAC_BINDINGS 1000
>  #define MAX_FDB_ENTRIES  1000
> +#define MAX_MAC_BINDING_DELAY_MSEC 50
>  
>  static size_t mac_binding_hash(uint32_t dp_key, uint32_t port_key,
>                                 struct in6_addr *);
> @@ -64,7 +67,7 @@ ovn_mac_bindings_destroy(struct hmap *mac_bindings)
>  struct mac_binding *
>  ovn_mac_binding_add(struct hmap *mac_bindings, uint32_t dp_key,
>                      uint32_t port_key, struct in6_addr *ip,
> -                    struct eth_addr mac)
> +                    struct eth_addr mac, bool is_delayed)

What if we pass 'int max_delay_ms' instead of 'bool is_delayed'?

>  {
>      uint32_t hash = mac_binding_hash(dp_key, port_key, ip);
>  
> @@ -79,6 +82,11 @@ ovn_mac_binding_add(struct hmap *mac_bindings, uint32_t 
> dp_key,
>          mb->dp_key = dp_key;
>          mb->port_key = port_key;
>          mb->ip = *ip;
> +        mb->delay = 0;
> +        mb->created = time_msec();
> +        if (is_delayed) {
> +            mb->delay = random_range(MAX_MAC_BINDING_DELAY_MSEC) + 1;
> +        }

This could become:

mb->delay = max_delay_ms ? random_range(max_delay_ms) + 1 : 0;

>          hmap_insert(mac_bindings, &mb->hmap_node, hash);
>      }
>      mb->mac = mac;
> @@ -86,6 +94,19 @@ ovn_mac_binding_add(struct hmap *mac_bindings, uint32_t 
> dp_key,
>      return mb;
>  }
>  
> +long long
> +ovn_mac_binding_shortest_delay(struct hmap *mac_bindings)
> +{
> +    long long min = LLONG_MAX;
> +
> +    struct mac_binding *mb;
> +    HMAP_FOR_EACH (mb, hmap_node, mac_bindings) {
> +        min = MIN(mb->delay, min);
> +    }
> +
> +    return min;
> +}

We don't really need to return the shortest delay.  We can rewrite this as:

void
ovn_mac_binding_wait(struct hmap *mac_bindings)
{
    struct mac_binding *mb;
    HMAP_FOR_EACH (mb, hmap_node, mac_bindings) {
        poll_timer_wait(mb->delay);
    }
}

and call it in wait_put_mac_bindings().

> +
>  /* fdb functions. */
>  void
>  ovn_fdb_init(struct hmap *fdbs)
> diff --git a/controller/mac-learn.h b/controller/mac-learn.h
> index e7e8ba2d3..d306bfa5a 100644
> --- a/controller/mac-learn.h
> +++ b/controller/mac-learn.h
> @@ -31,16 +31,21 @@ struct mac_binding {
>  
>      /* Value. */
>      struct eth_addr mac;
> +
> +    /* Delay for multicast ARP. */

This doesn't seem accurate.  It's actually "delay before adding
mac_binding to the SB".  It will be 0 for mac_bindings learned from
unicast replies.

> +    long long created;
> +    uint32_t delay;

Let's add the units please, something like:

long long created_ms;
uint32_t delay_ms;

Or maybe 'commit_delay_ms'?

>  };
>  
>  void ovn_mac_bindings_init(struct hmap *mac_bindings);
>  void ovn_mac_bindings_flush(struct hmap *mac_bindings);
>  void ovn_mac_bindings_destroy(struct hmap *mac_bindings);
> +long long ovn_mac_binding_shortest_delay(struct hmap *mac_bindings);
>  
>  struct mac_binding *ovn_mac_binding_add(struct hmap *mac_bindings,
>                                          uint32_t dp_key, uint32_t port_key,
>                                          struct in6_addr *ip,
> -                                        struct eth_addr mac);
> +                                        struct eth_addr mac, bool 
> is_delayed);
>  
>  
>  
> diff --git a/controller/pinctrl.c b/controller/pinctrl.c
> index f954362b7..9c615ebd5 100644
> --- a/controller/pinctrl.c
> +++ b/controller/pinctrl.c
> @@ -4134,9 +4134,13 @@ pinctrl_handle_put_mac_binding(const struct flow *md,
>          memcpy(&ip_key, &ip6, sizeof ip_key);
>      }
>  
> +    /* If the ARP reply was unicast we should not delay it,
> +     * there won't be any race. */
> +    bool is_multicast = eth_addr_is_multicast(headers->dl_dst);
>      struct mac_binding *mb = ovn_mac_binding_add(&put_mac_bindings, dp_key,
>                                                   port_key, &ip_key,
> -                                                 headers->dl_src);
> +                                                 headers->dl_src,
> +                                                 is_multicast);

It's mainly a personal preference but I'd rewrite this as:

uint32_t max_delay_ms = eth_addr_is_multicast(headers->dl_dst)
                        ? MAX_MAC_BINDING_DELAY_MSEC
                        : 0;
truct mac_binding *mb = ovn_mac_binding_add(&put_mac_bindings, dp_key,
                                            port_key, &ip_key,
                                            headers->dl_src,
                                            max_delay_ms);

>      if (!mb) {
>          COVERAGE_INC(pinctrl_drop_put_mac_binding);
>          return;
> @@ -4296,14 +4300,23 @@ run_put_mac_bindings(struct ovsdb_idl_txn 
> *ovnsb_idl_txn,
>          return;
>      }
>  
> -    const struct mac_binding *mb;
> -    HMAP_FOR_EACH (mb, hmap_node, &put_mac_bindings) {
> -        run_put_mac_binding(ovnsb_idl_txn, sbrec_datapath_binding_by_key,
> -                            sbrec_port_binding_by_key,
> -                            sbrec_mac_binding_by_lport_ip,
> -                            mb);
> +    long long now = time_msec();
> +
> +    struct mac_binding *mb;
> +    HMAP_FOR_EACH_SAFE (mb, hmap_node, &put_mac_bindings) {
> +            long long elapsed = now - mb->created;
> +            if (elapsed >= mb->delay) {
> +                run_put_mac_binding(ovnsb_idl_txn,
> +                                    sbrec_datapath_binding_by_key,
> +                                    sbrec_port_binding_by_key,
> +                                    sbrec_mac_binding_by_lport_ip,
> +                                    mb);
> +                hmap_remove(&put_mac_bindings, &mb->hmap_node);
> +                free(mb);

We used to call ovn_mac_bindings_flush() to remove all elements, now we
remove them in the caller.  I think this breaks a bit the encapsulation
that mac-learn.[hc] tried to add.  Can we add a new api, e.g.,
ovn_mac_binding_remove(), that will remove the entry from the map and
free it?

Now that we're here, what if the SB commit fails for some other
unrelated reason?  We'll force a new ARP request for future packets.  It
used to be like that before your patch too.  I'm just wondering if we
need to flush 'put_mac_bindings' only after we know for sure that the SB
record was created.  Definitely something that should be addressed (if
we go that way) in a different patch though.

> +            } else {
> +                mb->delay -= elapsed;

Hmm, isn't this wrong?  We don't need the "else", do we?

> +            }
>      }
> -    ovn_mac_bindings_flush(&put_mac_bindings);

ovn_mac_bindings_flush() is not used anymore anywhere else except
mac-learn.c, in ovn_mac_bindings_destroy().  We can just inline it there.

>  }
>  
>  static void
> @@ -4352,9 +4365,11 @@ run_buffered_binding(struct ovsdb_idl_index 
> *sbrec_mac_binding_by_lport_ip,
>  
>  static void
>  wait_put_mac_bindings(struct ovsdb_idl_txn *ovnsb_idl_txn)
> +    OVS_REQUIRES(pinctrl_mutex)
>  {
>      if (ovnsb_idl_txn && !hmap_is_empty(&put_mac_bindings)) {
> -        poll_immediate_wake();
> +        long long delay = ovn_mac_binding_shortest_delay(&put_mac_bindings);
> +        poll_timer_wait(delay);
>      }
>  }
>  
> diff --git a/tests/ovn.at b/tests/ovn.at
> index c346975e6..af82024ac 100644
> --- a/tests/ovn.at
> +++ b/tests/ovn.at
> @@ -19601,6 +19601,7 @@ data=0800bee4391a0001
>  
>  send_icmp_packet 1 1 $src_mac $router_mac0 $src_ip $dst_ip 0d1c $data
>  send_arp_reply 2 1 $dst_mac $router_mac1 $dst_ip $router_ip
> +OVS_WAIT_UNTIL([ovn-sbctl list mac_binding | grep -q "172.16.1.10"])
>  echo $(get_arp_req $router_mac1 $router_ip $dst_ip) > expected
>  echo 
> "${dst_mac}${router_mac1}08004500001c00004000fe0122b5${router_ip}${dst_ip}${data}"
>  >> expected
>  
> @@ -19641,6 +19642,7 @@ gw_router_mac=f00000010a0a
>  send_icmp_packet 2 2 f00000110203 $router_mac0 $src_ip $dst_ip 0c1b $data
>  echo $(get_arp_req f00000010204 $fip_ip $gw_router_ip) >> expected
>  send_arp_reply 2 1 $gw_router_mac f00000010204 $gw_router_ip $fip_ip
> +OVS_WAIT_UNTIL([ovn-sbctl list mac_binding | grep -q "172.16.1.11"])

I'm not sure I understand why we need this.

>  echo 
> "${gw_router_mac}f0000001020408004500001c00004000fe0121b4${fip_ip}${dst_ip}${data}"
>  >> expected
>  
>  AT_CHECK([as hv2 ovs-ofctl dump-flows br-int table=37 | grep pkt_mark=0x64 | 
> grep -q n_packets=1],[0])
> @@ -23049,6 +23051,11 @@ grep table_id=10 | wc -l`])
>  
>  check_row_count MAC_Binding 1
>  
> +OVS_WAIT_UNTIL(
> +    [test 1 = `as hv1 ovs-ofctl dump-flows br-int table=67 | grep 
> dl_src=50:54:00:00:00:13 \
> +| wc -l`]
> +)
> +
>  AT_CHECK([ovn-sbctl --format=csv --bare --columns logical_port,ip,mac \
>  list mac_binding], [0], [lr0-sw0
>  10.0.0.30

Thanks,
Dumitru

_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Reply via email to