On 11 Jun 2026, at 6:58, Adrian Moreno wrote:
> In order to protect flow operations from RTNL contention, this patch
> decouples flow_table modifications from ovs_mutex by means of the
> following:
>
> 1 - Create a new mutex inside the flow_table that protects it from
> concurrent modifications.
> Putting the mutex inside flow_table makes it easier to consume for
> functions inside flow_table.c that do not currently take pointers to the
> datapath.
> Some function signatures need to be changed to accept flow_table so that
> lockdep checks can be performed.
>
> 2 - Create a reference count to temporarily extend rcu protection from
> the datapath to the flow_table.
> One reference is held by the datapath, the other is temporarily
> increased during flow modifications.
>
> Signed-off-by: Adrian Moreno <[email protected]>
Hi Adrian,
Thanks for this patch. I did spend quite some time analyzing
the implications of decoupling the table from ovs_mutex, and
I think all the concerns I had are covered. I did not do any
extensive traffic-based testing, but if time allows I will do
so with your next revision.
Find some comments below.
Cheers,
Eelco
[...]
> @@ -1678,8 +1722,12 @@ static int ovs_dp_cmd_fill_info(struct datapath *dp,
> struct sk_buff *skb,
> if (nla_put_u32(skb, OVS_DP_ATTR_USER_FEATURES, dp->user_features))
> goto nla_put_failure;
>
> - if (nla_put_u32(skb, OVS_DP_ATTR_MASKS_CACHE_SIZE,
> - ovs_flow_tbl_masks_cache_size(table)))
> + rcu_read_lock();
> + table = rcu_dereference(dp->table);
> + err = table ? nla_put_u32(skb, OVS_DP_ATTR_MASKS_CACHE_SIZE,
> + ovs_flow_tbl_masks_cache_size(table)) : 0;
> + rcu_read_unlock();
> + if (err)
> goto nla_put_failure;
>
> if (dp->user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU && pids) {
> @@ -1817,7 +1865,9 @@ static int ovs_dp_change(struct datapath *dp, struct
> nlattr *a[])
> return -ENODEV;
>
> cache_size = nla_get_u32(a[OVS_DP_ATTR_MASKS_CACHE_SIZE]);
> + mutex_lock(&table->lock);
> err = ovs_flow_tbl_masks_cache_resize(table, cache_size);
> + mutex_unlock(&table->lock);
The locking schema in flow_table.h does not document that
ovs_mutex-held writers may skip the refcount steps. Should
this either be documented as an exception, or should the
full 7-step protocol be followed here for consistency?
> if (err)
> return err;
> }
[...]
> @@ -2656,9 +2701,12 @@ static void ovs_dp_masks_rebalance(struct work_struct
> *work)
> ovs_lock();
> list_for_each_entry(dp, &ovs_net->dps, list_node) {
> table = ovsl_dereference(dp->table);
> - if (!table)
> + if (!table || !ovs_flow_tbl_get(table))
> continue;
> + mutex_lock(&table->lock);
> ovs_flow_masks_rebalance(table);
> + mutex_unlock(&table->lock);
Same question as above, but here the RCU steps are skipped
instead of the refcount steps.
> + ovs_flow_tbl_put(table);
> }
> ovs_unlock();
>
>
[...]
> @@ -513,7 +528,7 @@ static void table_instance_destroy(struct table_instance
> *ti,
> }
>
> /* No need for locking this function is called from RCU callback. */
> -void ovs_flow_tbl_destroy_rcu(struct rcu_head *rcu)
> +static void ovs_flow_tbl_destroy_rcu(struct rcu_head *rcu)
> {
> struct flow_table *table = container_of(rcu, struct flow_table, rcu);
>
> @@ -525,9 +540,22 @@ void ovs_flow_tbl_destroy_rcu(struct rcu_head *rcu)
> call_rcu(&mc->rcu, mask_cache_rcu_cb);
> call_rcu(&ma->rcu, mask_array_rcu_cb);
> table_instance_destroy(ti, ufid_ti);
> + mutex_destroy(&table->lock);
> kfree(table);
> }
>
> +void ovs_flow_tbl_put(struct flow_table *table)
> +{
> + if (refcount_dec_and_test(&table->refcnt)) {
Do we need a comment explaining why the 7-step protocol is not
followed here? Something like:
/* Last reference dropped, no concurrent writers possible.
* Lock is only needed for lockdep assertions below.
*/
> + mutex_lock(&table->lock);
> + table_instance_flow_flush(table,
> + ovs_tbl_dereference(table->ti, table),
> + ovs_tbl_dereference(table->ufid_ti,
> table));
> + mutex_unlock(&table->lock);
> + call_rcu(&table->rcu, ovs_flow_tbl_destroy_rcu);
> + }
> +}
> +
> struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *ti,
> u32 *bucket, u32 *last)
> {
[...]
_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev