On 6/16/26 11:48 AM, Ales Musil via dev wrote:
> The put_ARP/ND/FDB were bound to pinctrl lock to prevent data races
> as they have used shared hmap to store the data that were later on
> added into SB. This is problematic in situations when main thread
> holds the lock and we have to process any put_ARP/ND/FDB packet. That
> contention would block the pinctrl thread from processing the
> incoming packets. The same can happen from the opposite side, pinctrl
> could be holding a lock, delaying main thread from processing other
> updates.
>
> To prevent that use the hmap only by main thread, this will ensure
> that we wait properly for the multicast timeout to expire and at the
> same time to do the deduplication. Once the entry arrives it will
> be shared by thread-safe ring buffer and consumed by main thread.
>
> Reported-at: https://redhat.atlassian.net/browse/FDP-3924
> Signed-off-by: Ales Musil <[email protected]>
> ---
Hi Ales,
Thanks for the patch!
> v3: Rebase on top of latest main.
> Reword the comment about entry expiration.
> Increase the spsc_ring size to be 4x larger than the hmap.
>
> v2: Rebase on top of latest main.
> Make sure the run_put_* and wait_put_* are running outside of
> the pinctrl mutex critical section.
> ---
> controller/pinctrl.c | 188 +++++++++++++++++++++++++------------------
> 1 file changed, 110 insertions(+), 78 deletions(-)
>
> diff --git a/controller/pinctrl.c b/controller/pinctrl.c
> index bb8d20e7f..782c7e2e2 100644
> --- a/controller/pinctrl.c
> +++ b/controller/pinctrl.c
> @@ -65,6 +65,7 @@
> #include "ovn-sb-idl.h"
> #include "ovn-dns.h"
> #include "garp_rarp.h"
> +#include "spsc-ring.h"
>
> VLOG_DEFINE_THIS_MODULE(pinctrl);
>
> @@ -83,24 +84,32 @@ VLOG_DEFINE_THIS_MODULE(pinctrl);
> * and pinctrl_run().
> *
> *
> - * - put_arp/put_nd - These actions stores the IPv4/IPv6 and MAC addresses
> + * - put_arp/put_nd - These actions store the IPv4/IPv6 and MAC addresses
> * in the 'MAC_Binding' table.
> * The function 'pinctrl_handle_put_mac_binding()'
> (which
> - * is called with in the pinctrl_handler thread), stores
> - * the IPv4/IPv6 and MAC addresses in the
> - * hmap - put_mac_bindings.
> + * is called within the pinctrl_handler thread), pushes
> + * the IPv4/IPv6 and MAC addresses directly into a
> + * lock-free SPSC ring buffer ('mac_bindings_ring').
> + * This does not require pinctrl_mutex. For multicast
> + * replies, a special cookie value
> + * (MAC_BINDINGS_MC_COOKIE) is set to signal the main
> + * thread to apply a random delay.
> *
> - * pinctrl_run(), reads these mac bindings from the hmap
> - * 'put_mac_bindings' and writes to the 'MAC_Binding'
> - * table in the Southbound DB.
> + * pinctrl_run() drains the ring in
> + * run_put_mac_bindings(), inserts entries into the
> + * main-thread-only hmap 'put_mac_bindings' (applying
> + * a random delay for multicast replies to avoid
> + * thundering herd), and writes entries whose delay
> + * has elapsed to the 'MAC_Binding' table in the
> + * Southbound DB.
> *
> * - arp/nd_ns - These actions generate an ARP/IPv6 Neighbor solicit
> * requests. The original packets are buffered and
> * injected back when put_arp/put_nd resolves
> * corresponding ARP/IPv6 Neighbor solicit requests.
> - * When pinctrl_run(), writes the mac bindings from the
> - * 'put_mac_bindings' hmap to the MAC_Binding table in
> - * SB DB, run_buffered_binding will add the buffered
> + * When pinctrl_run() writes the mac bindings from the
> + * ring buffer to the MAC_Binding table in SB DB,
> + * run_buffered_binding will add the buffered
> * packets to buffered_mac_bindings and notify
> * pinctrl_handler.
> *
> @@ -197,16 +206,14 @@ run_buffered_binding(const struct
> sbrec_mac_binding_table *mac_binding_table,
>
> static void pinctrl_handle_put_mac_binding(const struct flow *md,
> const struct flow *headers,
> - bool is_arp)
> - OVS_REQUIRES(pinctrl_mutex);
> -static void init_put_mac_bindings(void);
> -static void destroy_put_mac_bindings(void);
> + bool is_arp);
> +static void init_mac_bindings(void);
> +static void destroy_mac_bindings(void);
> static void run_put_mac_bindings(
> struct ovsdb_idl_txn *ovnsb_idl_txn,
> struct ovsdb_idl_index *sbrec_datapath_binding_by_key,
> struct ovsdb_idl_index *sbrec_port_binding_by_key,
> - struct ovsdb_idl_index *sbrec_mac_binding_by_lport_ip)
> - OVS_REQUIRES(pinctrl_mutex);
> + struct ovsdb_idl_index *sbrec_mac_binding_by_lport_ip);
> static void wait_put_mac_bindings(void);
> static void send_mac_binding_buffered_pkts(struct rconn *swconn);
>
> @@ -366,18 +373,15 @@ static void run_put_fdb(struct ovsdb_idl_txn
> *ovnsb_idl_txn,
> struct ovsdb_idl_index *sbrec_fdb_by_dp_key_mac,
> struct ovsdb_idl_index *sbrec_port_binding_by_key,
> struct ovsdb_idl_index *sbrec_datapath_binding_by_key,
> - struct fdb *fdb, uint64_t cur_cfg)
> - OVS_REQUIRES(pinctrl_mutex);
> + struct fdb *fdb, uint64_t cur_cfg);
> static void run_put_fdbs(struct ovsdb_idl_txn *ovnsb_idl_txn,
> struct ovsdb_idl_index *sbrec_port_binding_by_key,
> struct ovsdb_idl_index *sbrec_datapath_binding_by_key,
> struct ovsdb_idl_index *sbrec_fdb_by_dp_key_mac,
> - uint64_t cur_cfg)
> - OVS_REQUIRES(pinctrl_mutex);
> + uint64_t cur_cfg);
> static void wait_put_fdbs(void);
> static void pinctrl_handle_put_fdb(const struct flow *md,
> - const struct flow *headers)
> - OVS_REQUIRES(pinctrl_mutex);
> + const struct flow *headers);
>
> static void set_from_ctrl_flag_in_pkt_metadata(struct ofputil_packet_in *);
>
> @@ -556,7 +560,7 @@ out:
> void
> pinctrl_init(void)
> {
> - init_put_mac_bindings();
> + init_mac_bindings();
> init_send_arps_nds();
> init_ipv6_ras();
> init_ipv6_prefixd();
> @@ -3776,10 +3780,8 @@ process_packet_in(struct rconn *swconn, const struct
> ofp_header *msg)
> break;
>
> case ACTION_OPCODE_PUT_ARP:
> - ovs_mutex_lock(&pinctrl_mutex);
> pinctrl_handle_put_mac_binding(&pin.flow_metadata.flow, &headers,
> true);
> - ovs_mutex_unlock(&pinctrl_mutex);
> break;
>
> case ACTION_OPCODE_DHCP_RELAY_REQ_CHK:
> @@ -3808,16 +3810,12 @@ process_packet_in(struct rconn *swconn, const struct
> ofp_header *msg)
> break;
>
> case ACTION_OPCODE_PUT_ND:
> - ovs_mutex_lock(&pinctrl_mutex);
> pinctrl_handle_put_mac_binding(&pin.flow_metadata.flow, &headers,
> false);
> - ovs_mutex_unlock(&pinctrl_mutex);
> break;
>
> case ACTION_OPCODE_PUT_FDB:
> - ovs_mutex_lock(&pinctrl_mutex);
> pinctrl_handle_put_fdb(&pin.flow_metadata.flow, &headers);
> - ovs_mutex_unlock(&pinctrl_mutex);
> break;
>
> case ACTION_OPCODE_PUT_DHCPV6_OPTS:
> @@ -4205,9 +4203,6 @@ pinctrl_run(struct ovsdb_idl_txn *ovnsb_idl_txn,
>
> main_seq = seq_read(pinctrl_main_seq);
>
> - run_put_mac_bindings(ovnsb_idl_txn, sbrec_datapath_binding_by_key,
> - sbrec_port_binding_by_key,
> - sbrec_mac_binding_by_lport_ip);
> run_put_vport_bindings(ovnsb_idl_txn, sbrec_datapath_binding_by_key,
> sbrec_port_binding_by_key, chassis, cur_cfg);
> send_garp_rarp_prepare(ecmp_nh_table, chassis, ovs_table);
> @@ -4225,9 +4220,6 @@ pinctrl_run(struct ovsdb_idl_txn *ovnsb_idl_txn,
> chassis);
> bfd_monitor_run(ovnsb_idl_txn, bfd_table, sbrec_port_binding_by_name,
> chassis);
> - run_put_fdbs(ovnsb_idl_txn, sbrec_port_binding_by_key,
> - sbrec_datapath_binding_by_key, sbrec_fdb_by_dp_key_mac,
> - cur_cfg);
> run_activated_ports(ovnsb_idl_txn, sbrec_datapath_binding_by_key,
> sbrec_port_binding_by_key, chassis);
> ovs_mutex_unlock(&pinctrl_mutex);
> @@ -4237,6 +4229,12 @@ pinctrl_run(struct ovsdb_idl_txn *ovnsb_idl_txn,
> sbrec_datapath_binding_by_key,
> sbrec_port_binding_by_name,
> sbrec_mac_binding_by_lport_ip);
> + run_put_mac_bindings(ovnsb_idl_txn, sbrec_datapath_binding_by_key,
> + sbrec_port_binding_by_key,
> + sbrec_mac_binding_by_lport_ip);
> + run_put_fdbs(ovnsb_idl_txn, sbrec_port_binding_by_key,
> + sbrec_datapath_binding_by_key, sbrec_fdb_by_dp_key_mac,
> + cur_cfg);
> }
>
> /* Table of ipv6_ra_state structures, keyed on logical port name.
> @@ -4724,14 +4722,15 @@ pinctrl_wait(struct ovsdb_idl_txn *ovnsb_idl_txn)
> {
> ovs_mutex_lock(&pinctrl_mutex);
> if (ovnsb_idl_txn) {
Not related to this pach, but why did we ever need to do the check only
if ovnsb_idl_txn?
I think https://github.com/ovn-org/ovn/commit/d305d9f shouldn't have
needed to add it at all, right?
> - wait_put_mac_bindings();
> wait_controller_event();
> wait_put_vport_bindings();
> - wait_put_fdbs();
> seq_wait(pinctrl_main_seq, main_seq);
> }
> wait_activated_ports();
> ovs_mutex_unlock(&pinctrl_mutex);
> +
> + wait_put_mac_bindings();
> + wait_put_fdbs();
> }
>
> #define PINCTRL_CFG_INTERVAL 100
> @@ -4757,7 +4756,7 @@ pinctrl_destroy(void)
> destroy_buffered_packets_map();
> destroy_activated_ports();
> event_table_destroy();
> - destroy_put_mac_bindings();
> + destroy_mac_bindings();
> destroy_put_vport_bindings();
> ip_mcast_snoop_destroy();
> destroy_svc_monitors();
> @@ -4782,22 +4781,31 @@ pinctrl_destroy(void)
>
> #define MAX_MAC_BINDING_DELAY_MSEC 50
> #define MAX_FDB_DELAY_MSEC 50
> -#define MAX_MAC_BINDINGS 1000
> +#define MAX_MAC_BINDINGS 1024
> +#define MAC_BINDINGS_MC_COOKIE 0xff
>
> -/* Contains "struct mac_binding"s. */
> +/* Contains "struct mac_binding"s. This is used by main thread only. */
> static struct hmap put_mac_bindings;
> +/* Contains "struct mac_binding_data". This is populated by pinctrl thread
> + * and consumed by main thread. */
> +static struct spsc_ring mac_bindings_ring;
>
> static void
> -init_put_mac_bindings(void)
> +init_mac_bindings(void)
> {
> hmap_init(&put_mac_bindings);
> + /* Make the capacity larger than the hmap to account for some in-flight
> + * duplicates. */
> + spsc_ring_init(&mac_bindings_ring, MAX_MAC_BINDINGS * 4,
> + sizeof (struct mac_binding_data));
> }
>
> static void
> -destroy_put_mac_bindings(void)
> +destroy_mac_bindings(void)
> {
> mac_bindings_clear(&put_mac_bindings);
> hmap_destroy(&put_mac_bindings);
> + spsc_ring_destroy(&mac_bindings_ring);
> }
>
> /* Called with in the pinctrl_handler thread context. */
> @@ -4805,14 +4813,11 @@ static void
> pinctrl_handle_put_mac_binding(const struct flow *md,
> const struct flow *headers,
> bool is_arp)
> - OVS_REQUIRES(pinctrl_mutex)
> {
> - if (hmap_count(&put_mac_bindings) >= MAX_MAC_BINDINGS) {
> - COVERAGE_INC(pinctrl_drop_put_mac_binding);
> - return;
> - }
> -
> struct mac_binding_data mb_data = (struct mac_binding_data) {
> + .cookie = eth_addr_is_multicast(headers->dl_dst)
> + ? MAC_BINDINGS_MC_COOKIE
> + : 0,
> .dp_key = ntohll(md->metadata),
> .port_key = md->regs[MFF_LOG_INPORT - MFF_REG0],
> .mac = headers->dl_src,
> @@ -4825,18 +4830,13 @@ pinctrl_handle_put_mac_binding(const struct flow *md,
> memcpy(&mb_data.ip, &ip6, sizeof mb_data.ip);
> }
>
> - /* If the ARP reply was unicast we should not delay it,
> - * there won't be any race. */
> - uint32_t delay = eth_addr_is_multicast(headers->dl_dst)
> - ? random_range(MAX_MAC_BINDING_DELAY_MSEC) + 1
> - : 0;
> - long long timestamp = time_msec() + delay;
> - mac_binding_add(&put_mac_bindings, mb_data, NULL, timestamp);
> -
> - /* We can send the buffered packet once the main ovn-controller
> - * thread calls pinctrl_run() and it writes the mac_bindings stored
> - * in 'put_mac_bindings' hmap into the Southbound MAC_Binding table. */
> - notify_pinctrl_main();
> + if (!spsc_ring_push(&mac_bindings_ring, &mb_data)) {
> + /* This shouldn't happen, the main thread drains the ring buffer
> + * every iteration. */
> + COVERAGE_INC(pinctrl_drop_put_mac_binding);
While I agree that this shouldn't happen, if it does it would be nice to
have counters for it. But now, because we reuse this counter also to
track drops when the main thread hmap is full, we lose that information.
I'd add a pinctrl_ring_full_put_mac_binding coverage counter or
something along those lines.
> + } else {
> + notify_pinctrl_main();
> + }
> }
>
> /* Called with in the pinctrl_handler thread context. */
> @@ -4898,14 +4898,32 @@ run_put_mac_bindings(struct ovsdb_idl_txn
> *ovnsb_idl_txn,
> struct ovsdb_idl_index *sbrec_datapath_binding_by_key,
> struct ovsdb_idl_index *sbrec_port_binding_by_key,
> struct ovsdb_idl_index *sbrec_mac_binding_by_lport_ip)
> - OVS_REQUIRES(pinctrl_mutex)
> {
> + long long now = time_msec();
> +
> + struct mac_binding_data mb_data;
> + SPSC_RING_FOR_EACH_POP (&mac_bindings_ring, mb_data) {
> + if (hmap_count(&put_mac_bindings) >= MAX_MAC_BINDINGS) {
> + COVERAGE_INC(pinctrl_drop_put_mac_binding);
> + continue;
> + }
> +
> + /* If the ARP reply was unicast we should not delay it,
> + * there won't be any race. */
> + uint32_t delay = mb_data.cookie == MAC_BINDINGS_MC_COOKIE
> + ? random_range(MAX_MAC_BINDING_DELAY_MSEC) + 1
> + : 0;
> + /* Clear the cookie, it's needed just to know if it was delayed or
> + * not. */
> + mb_data.cookie = 0;
> +
> + mac_binding_add(&put_mac_bindings, mb_data, NULL, now + delay);
> + }
> +
> if (!ovnsb_idl_txn) {
> return;
> }
>
> - long long now = time_msec();
> -
> struct mac_binding *mb;
> HMAP_FOR_EACH_SAFE (mb, hmap_node, &put_mac_bindings) {
> if (now >= mb->timestamp) {
> @@ -4986,7 +5004,6 @@ run_buffered_binding(const struct
> sbrec_mac_binding_table *mac_binding_table,
>
> static void
> wait_put_mac_bindings(void)
> - OVS_REQUIRES(pinctrl_mutex)
> {
> struct mac_binding *mb;
> HMAP_FOR_EACH (mb, hmap_node, &put_mac_bindings) {
> @@ -8878,9 +8895,13 @@ pinctrl_split_buf_action_handler(struct rconn *swconn,
> struct dp_packet *pkt,
> ofpbuf_uninit(&ofpacts);
> }
>
> -#define MAX_FDB_ENTRIES 1000
> +#define MAX_FDB_ENTRIES 1024
>
> +/* Contains "struct fdb"s. This is used by main thread only. */
> static struct hmap put_fdbs;
> +/* Contains "struct fdb_data". This is populated by pinctrl thread
> + * and consumed by main thread. */
> +static struct spsc_ring fdbs_ring;
>
> /* MAC learning (fdb) related functions. Runs within the main
> * ovn-controller thread context. */
> @@ -8889,6 +8910,9 @@ static void
> init_fdb_entries(void)
> {
> hmap_init(&put_fdbs);
> + /* Make the capacity larger than the hmap to account for some in-flight
> + * duplicates. */
> + spsc_ring_init(&fdbs_ring, MAX_FDB_ENTRIES * 4, sizeof (struct
> fdb_data));
> }
>
> static void
> @@ -8896,6 +8920,7 @@ destroy_fdb_entries(void)
> {
> fdbs_clear(&put_fdbs);
> hmap_destroy(&put_fdbs);
> + spsc_ring_destroy(&fdbs_ring);
> }
>
> static const struct sbrec_fdb *
> @@ -8975,13 +9000,24 @@ run_put_fdbs(struct ovsdb_idl_txn *ovnsb_idl_txn,
> struct ovsdb_idl_index *sbrec_port_binding_by_key,
> struct ovsdb_idl_index *sbrec_datapath_binding_by_key,
> struct ovsdb_idl_index *sbrec_fdb_by_dp_key_mac, uint64_t
> cur_cfg)
> - OVS_REQUIRES(pinctrl_mutex)
> {
> + long long now = time_msec();
> +
> + struct fdb_data fdb_data;
> + SPSC_RING_FOR_EACH_POP (&fdbs_ring, fdb_data) {
> + if (hmap_count(&put_fdbs) >= MAX_FDB_ENTRIES) {
> + COVERAGE_INC(pinctrl_drop_put_fdb);
> + continue;
> + }
> +
> + uint32_t delay = random_range(MAX_FDB_DELAY_MSEC) + 1;
> + fdb_add(&put_fdbs, fdb_data, now + delay);
> + }
> +
> if (!ovnsb_idl_txn) {
> return;
> }
>
> - long long now = time_msec();
> struct fdb *fdb;
> HMAP_FOR_EACH_SAFE (fdb, hmap_node, &put_fdbs) {
> if (fdb->cfg >= 0 && pinctrl_is_sb_commited(fdb->cfg, cur_cfg)) {
> @@ -8997,7 +9033,6 @@ run_put_fdbs(struct ovsdb_idl_txn *ovnsb_idl_txn,
>
> static void
> wait_put_fdbs(void)
> - OVS_REQUIRES(pinctrl_mutex)
> {
> struct fdb *fdb;
> HMAP_FOR_EACH (fdb, hmap_node, &put_fdbs) {
> @@ -9008,23 +9043,20 @@ wait_put_fdbs(void)
> /* Called with in the pinctrl_handler thread context. */
> static void
> pinctrl_handle_put_fdb(const struct flow *md, const struct flow *headers)
> - OVS_REQUIRES(pinctrl_mutex)
> {
> - if (hmap_count(&put_fdbs) >= MAX_FDB_ENTRIES) {
> - COVERAGE_INC(pinctrl_drop_put_fdb);
> - return;
> - }
> -
> struct fdb_data fdb_data = (struct fdb_data) {
> .dp_key = ntohll(md->metadata),
> .port_key = md->regs[MFF_LOG_INPORT - MFF_REG0],
> .mac = headers->dl_src,
> };
>
> - uint32_t delay = random_range(MAX_FDB_DELAY_MSEC) + 1;
> - long long timestamp = time_msec() + delay;
> - fdb_add(&put_fdbs, fdb_data, timestamp);
> - notify_pinctrl_main();
> + if (!spsc_ring_push(&fdbs_ring, &fdb_data)) {
> + /* This shouldn't happen, the main thread drains the ring buffer
> + * every iteration. */
> + COVERAGE_INC(pinctrl_drop_put_fdb);
Same comment as for mac_bindings, I'd use a new coverage counter here.
> + } else {
> + notify_pinctrl_main();
> + }
> }
>
> /* This function sets the register bit 'MLF_FROM_CTRL_BIT'
I essentially only had comments on the lack of new ring-full drop
counters but that's minor. If you take care of that the rest looks good
to me:
Acked-by: Dumitru Ceara <[email protected]>
Regards,
Dumitru
_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev