Hi, thank you for the review, I have posted v4. See also my replies inline.
On Wed, Jul 13, 2022 at 10:02 PM Dumitru Ceara <[email protected]> wrote: > On 7/13/22 14:36, Ales Musil wrote: > > The ovn-controller had a race condition over MAC > > binding table with other controllers. When multiple > > controllers received GARP from single source usually > > the one who was able to win the race put it into SB. > > The others got transaction error which triggered > > full recompute even if it's not needed. > > > > In order to reduce the chance of multiple controllers > > trying to insert same row at the same time add slight > > delay to the MAC binding processing. This delay > > is random interval between 1-50 in ms. This greatly > > reduces the chance that multiple controllers will try > > to add MAC binding at exactly the same time. This applies > > only to multicast ARP request which applies only to GARP > > that is sent to broadcast address. > > > > Local testing with this delay vs without show significantly > > reduced chance of hitting the transaction error. > > > > During the testing 10 GARPs was sent to two controllers > > at the same time. Without proposed fix at least one controller > > had multiple transaction errors present every test iteration. > > > > With the proposed fix the transaction error was reduced to a > > single one when it happened which was usually in less than > > 10% of the iterations. > > > > As was mentioned before the race condition can still happen, > > but the chance is greatly reduced. > > > > Suggested-by: Daniel Alvarez Sanchez <[email protected]> > > Signed-off-by: Ales Musil <[email protected]> > > --- > > Hi Ales, > > Thanks for this revision! In general it makes sense but I have some > comments here and there. > > > v3: Rebase on top of current main. > > Change the delay to be per ARP request instead of > > single delay for everything. This allows the possibility > > to delay only multicast/broadcast AP as suggested by Han. > > --- > > controller/mac-learn.c | 23 ++++++++++++++++++++++- > > controller/mac-learn.h | 7 ++++++- > > controller/pinctrl.c | 33 ++++++++++++++++++++++++--------- > > tests/ovn.at | 7 +++++++ > > 4 files changed, 59 insertions(+), 11 deletions(-) > > > > diff --git a/controller/mac-learn.c b/controller/mac-learn.c > > index 27634dca8..b7b392bb1 100644 > > --- a/controller/mac-learn.c > > +++ b/controller/mac-learn.c > > @@ -20,12 +20,15 @@ > > /* OpenvSwitch lib includes. */ > > #include "openvswitch/vlog.h" > > #include "lib/packets.h" > > +#include "lib/random.h" > > #include "lib/smap.h" > > +#include "lib/timeval.h" > > > > VLOG_DEFINE_THIS_MODULE(mac_learn); > > > > #define MAX_MAC_BINDINGS 1000 > > #define MAX_FDB_ENTRIES 1000 > > +#define MAX_MAC_BINDING_DELAY_MSEC 50 > > > > static size_t mac_binding_hash(uint32_t dp_key, uint32_t port_key, > > struct in6_addr *); > > @@ -64,7 +67,7 @@ ovn_mac_bindings_destroy(struct hmap *mac_bindings) > > struct mac_binding * > > ovn_mac_binding_add(struct hmap *mac_bindings, uint32_t dp_key, > > uint32_t port_key, struct in6_addr *ip, > > - struct eth_addr mac) > > + struct eth_addr mac, bool is_delayed) > > What if we pass 'int max_delay_ms' instead of 'bool is_delayed'? > This kinda breaks the encapsulation a bit. Why would the code outside need to know what is the max delay? I am still inclined to have it as a bool. > > > { > > uint32_t hash = mac_binding_hash(dp_key, port_key, ip); > > > > @@ -79,6 +82,11 @@ ovn_mac_binding_add(struct hmap *mac_bindings, > uint32_t dp_key, > > mb->dp_key = dp_key; > > mb->port_key = port_key; > > mb->ip = *ip; > > + mb->delay = 0; > > + mb->created = time_msec(); > > + if (is_delayed) { > > + mb->delay = random_range(MAX_MAC_BINDING_DELAY_MSEC) + 1; > > + } > > This could become: > > mb->delay = max_delay_ms ? random_range(max_delay_ms) + 1 : 0; > Done > > > hmap_insert(mac_bindings, &mb->hmap_node, hash); > > } > > mb->mac = mac; > > @@ -86,6 +94,19 @@ ovn_mac_binding_add(struct hmap *mac_bindings, > uint32_t dp_key, > > return mb; > > } > > > > +long long > > +ovn_mac_binding_shortest_delay(struct hmap *mac_bindings) > > +{ > > + long long min = LLONG_MAX; > > + > > + struct mac_binding *mb; > > + HMAP_FOR_EACH (mb, hmap_node, mac_bindings) { > > + min = MIN(mb->delay, min); > > + } > > + > > + return min; > > +} > > We don't really need to return the shortest delay. We can rewrite this as: > > void > ovn_mac_binding_wait(struct hmap *mac_bindings) > { > struct mac_binding *mb; > HMAP_FOR_EACH (mb, hmap_node, mac_bindings) { > poll_timer_wait(mb->delay); > } > } > > and call it in wait_put_mac_bindings(). > That's actually a good trick, thanks. > > > + > > /* fdb functions. */ > > void > > ovn_fdb_init(struct hmap *fdbs) > > diff --git a/controller/mac-learn.h b/controller/mac-learn.h > > index e7e8ba2d3..d306bfa5a 100644 > > --- a/controller/mac-learn.h > > +++ b/controller/mac-learn.h > > @@ -31,16 +31,21 @@ struct mac_binding { > > > > /* Value. */ > > struct eth_addr mac; > > + > > + /* Delay for multicast ARP. */ > > This doesn't seem accurate. It's actually "delay before adding > mac_binding to the SB". It will be 0 for mac_bindings learned from > unicast replies. > Right, done. > > > + long long created; > > + uint32_t delay; > > Let's add the units please, something like: > > long long created_ms; > uint32_t delay_ms; > > Or maybe 'commit_delay_ms'? > Done > > > }; > > > > void ovn_mac_bindings_init(struct hmap *mac_bindings); > > void ovn_mac_bindings_flush(struct hmap *mac_bindings); > > void ovn_mac_bindings_destroy(struct hmap *mac_bindings); > > +long long ovn_mac_binding_shortest_delay(struct hmap *mac_bindings); > > > > struct mac_binding *ovn_mac_binding_add(struct hmap *mac_bindings, > > uint32_t dp_key, uint32_t > port_key, > > struct in6_addr *ip, > > - struct eth_addr mac); > > + struct eth_addr mac, bool > is_delayed); > > > > > > > > diff --git a/controller/pinctrl.c b/controller/pinctrl.c > > index f954362b7..9c615ebd5 100644 > > --- a/controller/pinctrl.c > > +++ b/controller/pinctrl.c > > @@ -4134,9 +4134,13 @@ pinctrl_handle_put_mac_binding(const struct flow > *md, > > memcpy(&ip_key, &ip6, sizeof ip_key); > > } > > > > + /* If the ARP reply was unicast we should not delay it, > > + * there won't be any race. */ > > + bool is_multicast = eth_addr_is_multicast(headers->dl_dst); > > struct mac_binding *mb = ovn_mac_binding_add(&put_mac_bindings, > dp_key, > > port_key, &ip_key, > > - headers->dl_src); > > + headers->dl_src, > > + is_multicast); > > It's mainly a personal preference but I'd rewrite this as: > > uint32_t max_delay_ms = eth_addr_is_multicast(headers->dl_dst) > ? MAX_MAC_BINDING_DELAY_MSEC > : 0; > truct mac_binding *mb = ovn_mac_binding_add(&put_mac_bindings, dp_key, > port_key, &ip_key, > headers->dl_src, > max_delay_ms); > > Same as the above, I would not expose the max_delay unless there is benefit of doing that. > > if (!mb) { > > COVERAGE_INC(pinctrl_drop_put_mac_binding); > > return; > > @@ -4296,14 +4300,23 @@ run_put_mac_bindings(struct ovsdb_idl_txn > *ovnsb_idl_txn, > > return; > > } > > > > - const struct mac_binding *mb; > > - HMAP_FOR_EACH (mb, hmap_node, &put_mac_bindings) { > > - run_put_mac_binding(ovnsb_idl_txn, > sbrec_datapath_binding_by_key, > > - sbrec_port_binding_by_key, > > - sbrec_mac_binding_by_lport_ip, > > - mb); > > + long long now = time_msec(); > > + > > + struct mac_binding *mb; > > + HMAP_FOR_EACH_SAFE (mb, hmap_node, &put_mac_bindings) { > > + long long elapsed = now - mb->created; > > + if (elapsed >= mb->delay) { > > + run_put_mac_binding(ovnsb_idl_txn, > > + sbrec_datapath_binding_by_key, > > + sbrec_port_binding_by_key, > > + sbrec_mac_binding_by_lport_ip, > > + mb); > > + hmap_remove(&put_mac_bindings, &mb->hmap_node); > > + free(mb); > > We used to call ovn_mac_bindings_flush() to remove all elements, now we > remove them in the caller. I think this breaks a bit the encapsulation > that mac-learn.[hc] tried to add. Can we add a new api, e.g., > ovn_mac_binding_remove(), that will remove the entry from the map and > free it? > Makes sense, done. > > Now that we're here, what if the SB commit fails for some other > unrelated reason? We'll force a new ARP request for future packets. It > used to be like that before your patch too. I'm just wondering if we > need to flush 'put_mac_bindings' only after we know for sure that the SB > record was created. Definitely something that should be addressed (if > we go that way) in a different patch though. > We could investigate that, however I am afraid that it would be hard to propagate as the transaction error happens in ovsdb_idl_loop_commit_and_wait, in the main context. > > > + } else { > > + mb->delay -= elapsed; > > Hmm, isn't this wrong? We don't need the "else", do we? > We actually do, consider having 2 mb waiting to be added one with 2 ms, second with 5 ms. Without removing what was elapsed the second would wait 7 ms instead, some of them could wait way longer than the max delay. > > > + } > > } > > - ovn_mac_bindings_flush(&put_mac_bindings); > > ovn_mac_bindings_flush() is not used anymore anywhere else except > mac-learn.c, in ovn_mac_bindings_destroy(). We can just inline it there. > Done. > > > } > > > > static void > > @@ -4352,9 +4365,11 @@ run_buffered_binding(struct ovsdb_idl_index > *sbrec_mac_binding_by_lport_ip, > > > > static void > > wait_put_mac_bindings(struct ovsdb_idl_txn *ovnsb_idl_txn) > > + OVS_REQUIRES(pinctrl_mutex) > > { > > if (ovnsb_idl_txn && !hmap_is_empty(&put_mac_bindings)) { > > - poll_immediate_wake(); > > + long long delay = > ovn_mac_binding_shortest_delay(&put_mac_bindings); > > + poll_timer_wait(delay); > > } > > } > > > > diff --git a/tests/ovn.at b/tests/ovn.at > > index c346975e6..af82024ac 100644 > > --- a/tests/ovn.at > > +++ b/tests/ovn.at > > @@ -19601,6 +19601,7 @@ data=0800bee4391a0001 > > > > send_icmp_packet 1 1 $src_mac $router_mac0 $src_ip $dst_ip 0d1c $data > > send_arp_reply 2 1 $dst_mac $router_mac1 $dst_ip $router_ip > > +OVS_WAIT_UNTIL([ovn-sbctl list mac_binding | grep -q "172.16.1.10"]) > > echo $(get_arp_req $router_mac1 $router_ip $dst_ip) > expected > > echo > "${dst_mac}${router_mac1}08004500001c00004000fe0122b5${router_ip}${dst_ip}${data}" > >> expected > > > > @@ -19641,6 +19642,7 @@ gw_router_mac=f00000010a0a > > send_icmp_packet 2 2 f00000110203 $router_mac0 $src_ip $dst_ip 0c1b > $data > > echo $(get_arp_req f00000010204 $fip_ip $gw_router_ip) >> expected > > send_arp_reply 2 1 $gw_router_mac f00000010204 $gw_router_ip $fip_ip > > +OVS_WAIT_UNTIL([ovn-sbctl list mac_binding | grep -q "172.16.1.11"]) > > I'm not sure I understand why we need this. > You are right, it was needed when we were delaying everything, removed. > > > echo > "${gw_router_mac}f0000001020408004500001c00004000fe0121b4${fip_ip}${dst_ip}${data}" > >> expected > > > > AT_CHECK([as hv2 ovs-ofctl dump-flows br-int table=37 | grep > pkt_mark=0x64 | grep -q n_packets=1],[0]) > > @@ -23049,6 +23051,11 @@ grep table_id=10 | wc -l`]) > > > > check_row_count MAC_Binding 1 > > > > +OVS_WAIT_UNTIL( > > + [test 1 = `as hv1 ovs-ofctl dump-flows br-int table=67 | grep > dl_src=50:54:00:00:00:13 \ > > +| wc -l`] > > +) > > + > > AT_CHECK([ovn-sbctl --format=csv --bare --columns logical_port,ip,mac \ > > list mac_binding], [0], [lr0-sw0 > > 10.0.0.30 > > Thanks, > Dumitru > > Thanks, Ales -- Ales Musil Senior Software Engineer - OVN Core Red Hat EMEA <https://www.redhat.com> [email protected] IM: amusil <https://red.ht/sig> _______________________________________________ dev mailing list [email protected] https://mail.openvswitch.org/mailman/listinfo/ovs-dev
