Currently the entire ovs module is write-protected using the global
ovs_mutex. While this simple approach works fine for control-plane
operations (such as vport configurations), requiring the global mutex
for flow modifications can be problematic.

During periods of high control-plane activity, e.g., netdevs (vports)
coming and going, RTNL can suffer contention. This contention is easily
transferred to the ovs_mutex as RTNL nests inside ovs_mutex. Flow
modifications, however, are done as part of packet processing and having
them wait for RTNL pressure to go away can lead to packet drops.

This patch decouples flow_table modifications from ovs_mutex by means of
the following:

1 - Make flow_table an rcu-protected pointer inside the datapath.
This allows both objects to be protected independently while reducing the
amount of changes required in "flow_table.c".

2 - Create a new mutex inside the flow_table that protects it from
concurrent modifications.
Putting the mutex inside flow_table makes it easier to consume for
functions inside flow_table.c that do not currently take pointers to the
datapath.
Some function signatures need to be changed to accept flow_table so that
lockdep checks can be performed.

3 - Create a reference count to temporarily extend rcu protection from
the datapath to the flow_table.
In order to use the flow_table without locking ovs_mutex, the flow_table
pointer must be first dereferenced within an rcu-protected region.
Next, the table->lock needs to be taken to protect the table from
concurrent writes, but mutexes must not be locked inside an rcu-protected
region, so the rcu-protected region must be exited first, at which point
the datapath can be concurrently freed.
To extend the protection beyond the rcu region, a reference count is used.
One reference is held by the datapath; another is temporarily
taken during flow modifications. For example:

Datapath deletion:

  ovs_lock();
  table = rcu_dereference_protected(dp->table, ...);
  rcu_assign_pointer(dp->table, NULL);
  ovs_flow_tbl_put(table);
  ovs_unlock();

Flow modification:

  rcu_read_lock();
  dp = get_dp(...);
  table = rcu_dereference(dp->table);
  ovs_flow_tbl_get(table);
  rcu_read_unlock();

  mutex_lock(&table->lock);
  /* Perform modifications on the flow_table */
  mutex_unlock(&table->lock);
  ovs_flow_tbl_put(table);

Signed-off-by: Adrian Moreno <[email protected]>
---
 net/openvswitch/datapath.c   | 284 ++++++++++++++++++++++++-----------
 net/openvswitch/datapath.h   |   2 +-
 net/openvswitch/flow.c       |  13 +-
 net/openvswitch/flow.h       |   9 +-
 net/openvswitch/flow_table.c | 180 ++++++++++++++--------
 net/openvswitch/flow_table.h |  51 ++++++-
 6 files changed, 379 insertions(+), 160 deletions(-)

diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index d5b6e2002bc1..133701fb0c77 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -88,13 +88,17 @@ static void ovs_notify(struct genl_family *family,
  * DOC: Locking:
  *
  * All writes e.g. Writes to device state (add/remove datapath, port, set
- * operations on vports, etc.), Writes to other state (flow table
- * modifications, set miscellaneous datapath parameters, etc.) are protected
- * by ovs_lock.
+ * operations on vports, etc.) and writes to other datapath parameters
+ * are protected by ovs_lock.
+ *
+ * Writes to the flow table are NOT protected by ovs_lock. Instead, a per-table
+ * mutex and reference count are used (see comment above "struct flow_table"
+ * definition). On a few occasions, the per-table mutex is nested
+ * inside ovs_mutex.
  *
  * Reads are protected by RCU.
  *
- * There are a few special cases (mostly stats) that have their own
+ * There are a few other special cases (mostly stats) that have their own
  * synchronization but they nest under all of above and don't interact with
  * each other.
  *
@@ -166,7 +170,6 @@ static void destroy_dp_rcu(struct rcu_head *rcu)
 {
        struct datapath *dp = container_of(rcu, struct datapath, rcu);
 
-       ovs_flow_tbl_destroy(&dp->table);
        free_percpu(dp->stats_percpu);
        kfree(dp->ports);
        ovs_meters_exit(dp);
@@ -247,6 +250,7 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct 
sw_flow_key *key)
        struct ovs_pcpu_storage *ovs_pcpu = this_cpu_ptr(ovs_pcpu_storage);
        const struct vport *p = OVS_CB(skb)->input_vport;
        struct datapath *dp = p->dp;
+       struct flow_table *table;
        struct sw_flow *flow;
        struct sw_flow_actions *sf_acts;
        struct dp_stats_percpu *stats;
@@ -257,9 +261,16 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct 
sw_flow_key *key)
        int error;
 
        stats = this_cpu_ptr(dp->stats_percpu);
+       table = rcu_dereference(dp->table);
+       if (!table) {
+               net_dbg_ratelimited("ovs: no flow table on datapath %s\n",
+                                   ovs_dp_name(dp));
+               kfree_skb(skb);
+               return;
+       }
 
        /* Look up flow. */
-       flow = ovs_flow_tbl_lookup_stats(&dp->table, key, skb_get_hash(skb),
+       flow = ovs_flow_tbl_lookup_stats(table, key, skb_get_hash(skb),
                                         &n_mask_hit, &n_cache_hit);
        if (unlikely(!flow)) {
                struct dp_upcall_info upcall;
@@ -752,12 +763,16 @@ static struct genl_family dp_packet_genl_family 
__ro_after_init = {
 static void get_dp_stats(const struct datapath *dp, struct ovs_dp_stats *stats,
                         struct ovs_dp_megaflow_stats *mega_stats)
 {
+       struct flow_table *table = ovsl_dereference(dp->table);
        int i;
 
        memset(mega_stats, 0, sizeof(*mega_stats));
 
-       stats->n_flows = ovs_flow_tbl_count(&dp->table);
-       mega_stats->n_masks = ovs_flow_tbl_num_masks(&dp->table);
+       if (table) {
+               stats->n_flows = ovs_flow_tbl_count(table);
+               mega_stats->n_masks = ovs_flow_tbl_num_masks(table);
+       }
+
 
        stats->n_hit = stats->n_missed = stats->n_lost = 0;
 
@@ -829,15 +844,16 @@ static size_t ovs_flow_cmd_msg_size(const struct 
sw_flow_actions *acts,
                + nla_total_size_64bit(8); /* OVS_FLOW_ATTR_USED */
 }
 
-/* Called with ovs_mutex or RCU read lock. */
+/* Called with table->lock or RCU read lock. */
 static int ovs_flow_cmd_fill_stats(const struct sw_flow *flow,
+                                  const struct flow_table *table,
                                   struct sk_buff *skb)
 {
        struct ovs_flow_stats stats;
        __be16 tcp_flags;
        unsigned long used;
 
-       ovs_flow_stats_get(flow, &stats, &used, &tcp_flags);
+       ovs_flow_stats_get(flow, table, &stats, &used, &tcp_flags);
 
        if (used &&
            nla_put_u64_64bit(skb, OVS_FLOW_ATTR_USED, ovs_flow_used_time(used),
@@ -857,8 +873,9 @@ static int ovs_flow_cmd_fill_stats(const struct sw_flow 
*flow,
        return 0;
 }
 
-/* Called with ovs_mutex or RCU read lock. */
+/* Called with RCU read lock or table->lock held. */
 static int ovs_flow_cmd_fill_actions(const struct sw_flow *flow,
+                                    const struct flow_table *table,
                                     struct sk_buff *skb, int skb_orig_len)
 {
        struct nlattr *start;
@@ -878,7 +895,7 @@ static int ovs_flow_cmd_fill_actions(const struct sw_flow 
*flow,
        if (start) {
                const struct sw_flow_actions *sf_acts;
 
-               sf_acts = rcu_dereference_ovsl(flow->sf_acts);
+               sf_acts = rcu_dereference_ovs_tbl(flow->sf_acts, table);
                err = ovs_nla_put_actions(sf_acts->actions,
                                          sf_acts->actions_len, skb);
 
@@ -897,8 +914,10 @@ static int ovs_flow_cmd_fill_actions(const struct sw_flow 
*flow,
        return 0;
 }
 
-/* Called with ovs_mutex or RCU read lock. */
-static int ovs_flow_cmd_fill_info(const struct sw_flow *flow, int dp_ifindex,
+/* Called with table->lock or RCU read lock. */
+static int ovs_flow_cmd_fill_info(const struct sw_flow *flow,
+                                 const struct flow_table *table,
+                                 int dp_ifindex,
                                  struct sk_buff *skb, u32 portid,
                                  u32 seq, u32 flags, u8 cmd, u32 ufid_flags)
 {
@@ -929,12 +948,12 @@ static int ovs_flow_cmd_fill_info(const struct sw_flow 
*flow, int dp_ifindex,
                        goto error;
        }
 
-       err = ovs_flow_cmd_fill_stats(flow, skb);
+       err = ovs_flow_cmd_fill_stats(flow, table, skb);
        if (err)
                goto error;
 
        if (should_fill_actions(ufid_flags)) {
-               err = ovs_flow_cmd_fill_actions(flow, skb, skb_orig_len);
+               err = ovs_flow_cmd_fill_actions(flow, table, skb, skb_orig_len);
                if (err)
                        goto error;
        }
@@ -968,8 +987,9 @@ static struct sk_buff *ovs_flow_cmd_alloc_info(const struct 
sw_flow_actions *act
        return skb;
 }
 
-/* Called with ovs_mutex. */
+/* Called with table->lock. */
 static struct sk_buff *ovs_flow_cmd_build_info(const struct sw_flow *flow,
+                                              const struct flow_table *table,
                                               int dp_ifindex,
                                               struct genl_info *info, u8 cmd,
                                               bool always, u32 ufid_flags)
@@ -977,12 +997,12 @@ static struct sk_buff *ovs_flow_cmd_build_info(const 
struct sw_flow *flow,
        struct sk_buff *skb;
        int retval;
 
-       skb = ovs_flow_cmd_alloc_info(ovsl_dereference(flow->sf_acts),
+       skb = ovs_flow_cmd_alloc_info(ovs_tbl_dereference(flow->sf_acts, table),
                                      &flow->id, info, always, ufid_flags);
        if (IS_ERR_OR_NULL(skb))
                return skb;
 
-       retval = ovs_flow_cmd_fill_info(flow, dp_ifindex, skb,
+       retval = ovs_flow_cmd_fill_info(flow, table, dp_ifindex, skb,
                                        info->snd_portid, info->snd_seq, 0,
                                        cmd, ufid_flags);
        if (WARN_ON_ONCE(retval < 0)) {
@@ -998,6 +1018,7 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct 
genl_info *info)
        struct nlattr **a = info->attrs;
        struct ovs_header *ovs_header = genl_info_userhdr(info);
        struct sw_flow *flow = NULL, *new_flow;
+       struct flow_table *table;
        struct sw_flow_mask mask;
        struct sk_buff *reply;
        struct datapath *dp;
@@ -1064,30 +1085,43 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct 
genl_info *info)
                goto err_kfree_acts;
        }
 
-       ovs_lock();
+       rcu_read_lock();
        dp = get_dp(net, ovs_header->dp_ifindex);
        if (unlikely(!dp)) {
                error = -ENODEV;
-               goto err_unlock_ovs;
+               rcu_read_unlock();
+               goto err_kfree_reply;
        }
+       table = rcu_dereference(dp->table);
+       if (!table || !ovs_flow_tbl_get(table)) {
+               error = -ENODEV;
+               rcu_read_unlock();
+               goto err_kfree_reply;
+       }
+       rcu_read_unlock();
+
+       /* It is safe to dereference "table" after leaving rcu read-protected
+        * region because it's pinned by refcount.
+        */
+       mutex_lock(&table->lock);
 
        /* Check if this is a duplicate flow */
        if (ovs_identifier_is_ufid(&new_flow->id))
-               flow = ovs_flow_tbl_lookup_ufid(&dp->table, &new_flow->id);
+               flow = ovs_flow_tbl_lookup_ufid(table, &new_flow->id);
        if (!flow)
-               flow = ovs_flow_tbl_lookup(&dp->table, key);
+               flow = ovs_flow_tbl_lookup(table, key);
        if (likely(!flow)) {
                rcu_assign_pointer(new_flow->sf_acts, acts);
 
                /* Put flow in bucket. */
-               error = ovs_flow_tbl_insert(&dp->table, new_flow, &mask);
+               error = ovs_flow_tbl_insert(table, new_flow, &mask);
                if (unlikely(error)) {
                        acts = NULL;
-                       goto err_unlock_ovs;
+                       goto err_unlock_tbl;
                }
 
                if (unlikely(reply)) {
-                       error = ovs_flow_cmd_fill_info(new_flow,
+                       error = ovs_flow_cmd_fill_info(new_flow, table,
                                                       ovs_header->dp_ifindex,
                                                       reply, info->snd_portid,
                                                       info->snd_seq, 0,
@@ -1095,7 +1129,8 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct 
genl_info *info)
                                                       ufid_flags);
                        BUG_ON(error < 0);
                }
-               ovs_unlock();
+               mutex_unlock(&table->lock);
+               ovs_flow_tbl_put(table);
        } else {
                struct sw_flow_actions *old_acts;
 
@@ -1108,28 +1143,28 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct 
genl_info *info)
                if (unlikely(info->nlhdr->nlmsg_flags & (NLM_F_CREATE
                                                         | NLM_F_EXCL))) {
                        error = -EEXIST;
-                       goto err_unlock_ovs;
+                       goto err_unlock_tbl;
                }
                /* The flow identifier has to be the same for flow updates.
                 * Look for any overlapping flow.
                 */
                if (unlikely(!ovs_flow_cmp(flow, &match))) {
                        if (ovs_identifier_is_key(&flow->id))
-                               flow = ovs_flow_tbl_lookup_exact(&dp->table,
+                               flow = ovs_flow_tbl_lookup_exact(table,
                                                                 &match);
                        else /* UFID matches but key is different */
                                flow = NULL;
                        if (!flow) {
                                error = -ENOENT;
-                               goto err_unlock_ovs;
+                               goto err_unlock_tbl;
                        }
                }
                /* Update actions. */
-               old_acts = ovsl_dereference(flow->sf_acts);
+               old_acts = ovs_tbl_dereference(flow->sf_acts, table);
                rcu_assign_pointer(flow->sf_acts, acts);
 
                if (unlikely(reply)) {
-                       error = ovs_flow_cmd_fill_info(flow,
+                       error = ovs_flow_cmd_fill_info(flow, table,
                                                       ovs_header->dp_ifindex,
                                                       reply, info->snd_portid,
                                                       info->snd_seq, 0,
@@ -1137,7 +1172,8 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct 
genl_info *info)
                                                       ufid_flags);
                        BUG_ON(error < 0);
                }
-               ovs_unlock();
+               mutex_unlock(&table->lock);
+               ovs_flow_tbl_put(table);
 
                ovs_nla_free_flow_actions_rcu(old_acts);
                ovs_flow_free(new_flow, false);
@@ -1149,8 +1185,10 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct 
genl_info *info)
        kfree(key);
        return 0;
 
-err_unlock_ovs:
-       ovs_unlock();
+err_unlock_tbl:
+       mutex_unlock(&table->lock);
+       ovs_flow_tbl_put(table);
+err_kfree_reply:
        kfree_skb(reply);
 err_kfree_acts:
        ovs_nla_free_flow_actions(acts);
@@ -1244,6 +1282,7 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct 
genl_info *info)
        struct net *net = sock_net(skb->sk);
        struct nlattr **a = info->attrs;
        struct ovs_header *ovs_header = genl_info_userhdr(info);
+       struct flow_table *table;
        struct sw_flow_key key;
        struct sw_flow *flow;
        struct sk_buff *reply = NULL;
@@ -1278,29 +1317,42 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct 
genl_info *info)
                }
        }
 
-       ovs_lock();
+       rcu_read_lock();
        dp = get_dp(net, ovs_header->dp_ifindex);
        if (unlikely(!dp)) {
                error = -ENODEV;
-               goto err_unlock_ovs;
+               goto err_free_reply;
        }
+       table = rcu_dereference(dp->table);
+       if (!table || !ovs_flow_tbl_get(table)) {
+               rcu_read_unlock();
+               error = -ENODEV;
+               goto err_free_reply;
+       }
+       rcu_read_unlock();
+
+       /* It is safe to dereference "table" after leaving rcu read-protected
+        * region because it's pinned by refcount.
+        */
+       mutex_lock(&table->lock);
+
        /* Check that the flow exists. */
        if (ufid_present)
-               flow = ovs_flow_tbl_lookup_ufid(&dp->table, &sfid);
+               flow = ovs_flow_tbl_lookup_ufid(table, &sfid);
        else
-               flow = ovs_flow_tbl_lookup_exact(&dp->table, &match);
+               flow = ovs_flow_tbl_lookup_exact(table, &match);
        if (unlikely(!flow)) {
                error = -ENOENT;
-               goto err_unlock_ovs;
+               goto err_unlock_tbl;
        }
 
        /* Update actions, if present. */
        if (likely(acts)) {
-               old_acts = ovsl_dereference(flow->sf_acts);
+               old_acts = ovs_tbl_dereference(flow->sf_acts, table);
                rcu_assign_pointer(flow->sf_acts, acts);
 
                if (unlikely(reply)) {
-                       error = ovs_flow_cmd_fill_info(flow,
+                       error = ovs_flow_cmd_fill_info(flow, table,
                                                       ovs_header->dp_ifindex,
                                                       reply, info->snd_portid,
                                                       info->snd_seq, 0,
@@ -1310,20 +1362,22 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct 
genl_info *info)
                }
        } else {
                /* Could not alloc without acts before locking. */
-               reply = ovs_flow_cmd_build_info(flow, ovs_header->dp_ifindex,
+               reply = ovs_flow_cmd_build_info(flow, table,
+                                               ovs_header->dp_ifindex,
                                                info, OVS_FLOW_CMD_SET, false,
                                                ufid_flags);
 
                if (IS_ERR(reply)) {
                        error = PTR_ERR(reply);
-                       goto err_unlock_ovs;
+                       goto err_unlock_tbl;
                }
        }
 
        /* Clear stats. */
        if (a[OVS_FLOW_ATTR_CLEAR])
-               ovs_flow_stats_clear(flow);
-       ovs_unlock();
+               ovs_flow_stats_clear(flow, table);
+       mutex_unlock(&table->lock);
+       ovs_flow_tbl_put(table);
 
        if (reply)
                ovs_notify(&dp_flow_genl_family, reply, info);
@@ -1332,8 +1386,10 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct 
genl_info *info)
 
        return 0;
 
-err_unlock_ovs:
-       ovs_unlock();
+err_unlock_tbl:
+       mutex_unlock(&table->lock);
+       ovs_flow_tbl_put(table);
+err_free_reply:
        kfree_skb(reply);
 err_kfree_acts:
        ovs_nla_free_flow_actions(acts);
@@ -1346,6 +1402,7 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct 
genl_info *info)
        struct nlattr **a = info->attrs;
        struct ovs_header *ovs_header = genl_info_userhdr(info);
        struct net *net = sock_net(skb->sk);
+       struct flow_table *table;
        struct sw_flow_key key;
        struct sk_buff *reply;
        struct sw_flow *flow;
@@ -1370,33 +1427,48 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct 
genl_info *info)
        if (err)
                return err;
 
-       ovs_lock();
+       rcu_read_lock();
        dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex);
        if (!dp) {
-               err = -ENODEV;
-               goto unlock;
+               rcu_read_unlock();
+               return -ENODEV;
        }
+       table = rcu_dereference(dp->table);
+       if (!table || !ovs_flow_tbl_get(table)) {
+               rcu_read_unlock();
+               return -ENODEV;
+       }
+       rcu_read_unlock();
+
+       /* It is safe to dereference "table" after leaving rcu read-protected
+        * region because it's pinned by refcount.
+        */
+       mutex_lock(&table->lock);
+
 
        if (ufid_present)
-               flow = ovs_flow_tbl_lookup_ufid(&dp->table, &ufid);
+               flow = ovs_flow_tbl_lookup_ufid(table, &ufid);
        else
-               flow = ovs_flow_tbl_lookup_exact(&dp->table, &match);
+               flow = ovs_flow_tbl_lookup_exact(table, &match);
        if (!flow) {
                err = -ENOENT;
                goto unlock;
        }
 
-       reply = ovs_flow_cmd_build_info(flow, ovs_header->dp_ifindex, info,
-                                       OVS_FLOW_CMD_GET, true, ufid_flags);
+       reply = ovs_flow_cmd_build_info(flow, table, ovs_header->dp_ifindex,
+                                       info, OVS_FLOW_CMD_GET, true,
+                                       ufid_flags);
        if (IS_ERR(reply)) {
                err = PTR_ERR(reply);
                goto unlock;
        }
 
-       ovs_unlock();
+       mutex_unlock(&table->lock);
+       ovs_flow_tbl_put(table);
        return genlmsg_reply(reply, info);
 unlock:
-       ovs_unlock();
+       mutex_unlock(&table->lock);
+       ovs_flow_tbl_put(table);
        return err;
 }
 
@@ -1405,6 +1477,7 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct 
genl_info *info)
        struct nlattr **a = info->attrs;
        struct ovs_header *ovs_header = genl_info_userhdr(info);
        struct net *net = sock_net(skb->sk);
+       struct flow_table *table;
        struct sw_flow_key key;
        struct sk_buff *reply;
        struct sw_flow *flow = NULL;
@@ -1425,36 +1498,49 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct 
genl_info *info)
                        return err;
        }
 
-       ovs_lock();
+       rcu_read_lock();
        dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex);
        if (unlikely(!dp)) {
-               err = -ENODEV;
-               goto unlock;
+               rcu_read_unlock();
+               return -ENODEV;
        }
+       table = rcu_dereference(dp->table);
+       if (!table || !ovs_flow_tbl_get(table)) {
+               rcu_read_unlock();
+               return -ENODEV;
+       }
+       rcu_read_unlock();
+
+       /* It is safe to dereference "table" after leaving rcu read-protected
+        * region because it's pinned by refcount.
+        */
+       mutex_lock(&table->lock);
+
 
        if (unlikely(!a[OVS_FLOW_ATTR_KEY] && !ufid_present)) {
-               err = ovs_flow_tbl_flush(&dp->table);
+               err = ovs_flow_tbl_flush(table);
                goto unlock;
        }
 
        if (ufid_present)
-               flow = ovs_flow_tbl_lookup_ufid(&dp->table, &ufid);
+               flow = ovs_flow_tbl_lookup_ufid(table, &ufid);
        else
-               flow = ovs_flow_tbl_lookup_exact(&dp->table, &match);
+               flow = ovs_flow_tbl_lookup_exact(table, &match);
        if (unlikely(!flow)) {
                err = -ENOENT;
                goto unlock;
        }
 
-       ovs_flow_tbl_remove(&dp->table, flow);
-       ovs_unlock();
+       ovs_flow_tbl_remove(table, flow);
+       mutex_unlock(&table->lock);
 
        reply = ovs_flow_cmd_alloc_info((const struct sw_flow_actions __force 
*) flow->sf_acts,
                                        &flow->id, info, false, ufid_flags);
        if (likely(reply)) {
                if (!IS_ERR(reply)) {
                        rcu_read_lock();        /*To keep RCU checker happy. */
-                       err = ovs_flow_cmd_fill_info(flow, 
ovs_header->dp_ifindex,
+                       err = ovs_flow_cmd_fill_info(flow, table,
+                                                    ovs_header->dp_ifindex,
                                                     reply, info->snd_portid,
                                                     info->snd_seq, 0,
                                                     OVS_FLOW_CMD_DEL,
@@ -1473,10 +1559,12 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct 
genl_info *info)
        }
 
 out_free:
+       ovs_flow_tbl_put(table);
        ovs_flow_free(flow, true);
        return 0;
 unlock:
-       ovs_unlock();
+       mutex_unlock(&table->lock);
+       ovs_flow_tbl_put(table);
        return err;
 }
 
@@ -1485,6 +1573,7 @@ static int ovs_flow_cmd_dump(struct sk_buff *skb, struct 
netlink_callback *cb)
        struct nlattr *a[__OVS_FLOW_ATTR_MAX];
        struct ovs_header *ovs_header = genlmsg_data(nlmsg_data(cb->nlh));
        struct table_instance *ti;
+       struct flow_table *table;
        struct datapath *dp;
        u32 ufid_flags;
        int err;
@@ -1501,8 +1590,13 @@ static int ovs_flow_cmd_dump(struct sk_buff *skb, struct 
netlink_callback *cb)
                rcu_read_unlock();
                return -ENODEV;
        }
+       table = rcu_dereference(dp->table);
+       if (!table) {
+               rcu_read_unlock();
+               return -ENODEV;
+       }
 
-       ti = rcu_dereference(dp->table.ti);
+       ti = rcu_dereference(table->ti);
        for (;;) {
                struct sw_flow *flow;
                u32 bucket, obj;
@@ -1513,8 +1607,8 @@ static int ovs_flow_cmd_dump(struct sk_buff *skb, struct 
netlink_callback *cb)
                if (!flow)
                        break;
 
-               if (ovs_flow_cmd_fill_info(flow, ovs_header->dp_ifindex, skb,
-                                          NETLINK_CB(cb->skb).portid,
+               if (ovs_flow_cmd_fill_info(flow, table, ovs_header->dp_ifindex,
+                                          skb, NETLINK_CB(cb->skb).portid,
                                           cb->nlh->nlmsg_seq, NLM_F_MULTI,
                                           OVS_FLOW_CMD_GET, ufid_flags) < 0)
                        break;
@@ -1598,8 +1692,13 @@ static int ovs_dp_cmd_fill_info(struct datapath *dp, 
struct sk_buff *skb,
        struct ovs_dp_stats dp_stats;
        struct ovs_dp_megaflow_stats dp_megaflow_stats;
        struct dp_nlsk_pids *pids = ovsl_dereference(dp->upcall_portids);
+       struct flow_table *table;
        int err, pids_len;
 
+       table = ovsl_dereference(dp->table);
+       if (!table)
+               return -ENODEV;
+
        ovs_header = genlmsg_put(skb, portid, seq, &dp_datapath_genl_family,
                                 flags, cmd);
        if (!ovs_header)
@@ -1625,7 +1724,7 @@ static int ovs_dp_cmd_fill_info(struct datapath *dp, 
struct sk_buff *skb,
                goto nla_put_failure;
 
        if (nla_put_u32(skb, OVS_DP_ATTR_MASKS_CACHE_SIZE,
-                       ovs_flow_tbl_masks_cache_size(&dp->table)))
+                       ovs_flow_tbl_masks_cache_size(table)))
                goto nla_put_failure;
 
        if (dp->user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU && pids) {
@@ -1736,6 +1835,7 @@ u32 ovs_dp_get_upcall_portid(const struct datapath *dp, 
uint32_t cpu_id)
 static int ovs_dp_change(struct datapath *dp, struct nlattr *a[])
 {
        u32 user_features = 0, old_features = dp->user_features;
+       struct flow_table *table;
        int err;
 
        if (a[OVS_DP_ATTR_USER_FEATURES]) {
@@ -1757,8 +1857,12 @@ static int ovs_dp_change(struct datapath *dp, struct 
nlattr *a[])
                int err;
                u32 cache_size;
 
+               table = ovsl_dereference(dp->table);
+               if (!table)
+                       return -ENODEV;
+
                cache_size = nla_get_u32(a[OVS_DP_ATTR_MASKS_CACHE_SIZE]);
-               err = ovs_flow_tbl_masks_cache_resize(&dp->table, cache_size);
+               err = ovs_flow_tbl_masks_cache_resize(table, cache_size);
                if (err)
                        return err;
        }
@@ -1812,6 +1916,7 @@ static int ovs_dp_vport_init(struct datapath *dp)
 static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info)
 {
        struct nlattr **a = info->attrs;
+       struct flow_table *table;
        struct vport_parms parms;
        struct sk_buff *reply;
        struct datapath *dp;
@@ -1835,9 +1940,12 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct 
genl_info *info)
        ovs_dp_set_net(dp, sock_net(skb->sk));
 
        /* Allocate table. */
-       err = ovs_flow_tbl_init(&dp->table);
-       if (err)
+       table = ovs_flow_tbl_alloc();
+       if (IS_ERR(table)) {
+               err = PTR_ERR(table);
                goto err_destroy_dp;
+       }
+       rcu_assign_pointer(dp->table, table);
 
        err = ovs_dp_stats_init(dp);
        if (err)
@@ -1907,7 +2015,7 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct 
genl_info *info)
 err_destroy_stats:
        free_percpu(dp->stats_percpu);
 err_destroy_table:
-       ovs_flow_tbl_destroy(&dp->table);
+       ovs_flow_tbl_put(dp->table);
 err_destroy_dp:
        kfree(dp);
 err_destroy_reply:
@@ -1919,7 +2027,8 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct 
genl_info *info)
 /* Called with ovs_mutex. */
 static void __dp_destroy(struct datapath *dp)
 {
-       struct flow_table *table = &dp->table;
+       struct flow_table *table = rcu_dereference_protected(dp->table,
+                                       lockdep_ovsl_is_held());
        int i;
 
        if (dp->user_features & OVS_DP_F_TC_RECIRC_SHARING)
@@ -1941,14 +2050,10 @@ static void __dp_destroy(struct datapath *dp)
         */
        ovs_dp_detach_port(ovs_vport_ovsl(dp, OVSP_LOCAL));
 
-       /* Flush sw_flow in the tables. RCU cb only releases resource
-        * such as dp, ports and tables. That may avoid some issues
-        * such as RCU usage warning.
-        */
-       table_instance_flow_flush(table, ovsl_dereference(table->ti),
-                                 ovsl_dereference(table->ufid_ti));
+       rcu_assign_pointer(dp->table, NULL);
+       ovs_flow_tbl_put(table);
 
-       /* RCU destroy the ports, meters and flow tables. */
+       /* RCU destroy the ports and meters. */
        call_rcu(&dp->rcu, destroy_dp_rcu);
 }
 
@@ -2556,13 +2661,18 @@ static void ovs_dp_masks_rebalance(struct work_struct 
*work)
 {
        struct ovs_net *ovs_net = container_of(work, struct ovs_net,
                                               masks_rebalance.work);
+       struct flow_table *table;
        struct datapath *dp;
 
        ovs_lock();
-
-       list_for_each_entry(dp, &ovs_net->dps, list_node)
-               ovs_flow_masks_rebalance(&dp->table);
-
+       list_for_each_entry_rcu(dp, &ovs_net->dps, list_node) {
+               table = ovsl_dereference(dp->table);
+               if (!table)
+                       continue;
+               mutex_lock(&table->lock);
+               ovs_flow_masks_rebalance(table);
+               mutex_unlock(&table->lock);
+       }
        ovs_unlock();
 
        schedule_delayed_work(&ovs_net->masks_rebalance,
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index db0c3e69d66c..44773bf9f645 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -90,7 +90,7 @@ struct datapath {
        struct list_head list_node;
 
        /* Flow table. */
-       struct flow_table table;
+       struct flow_table __rcu *table;
 
        /* Switch ports. */
        struct hlist_head *ports;
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index 66366982f604..0a748cf20f53 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -124,8 +124,9 @@ void ovs_flow_stats_update(struct sw_flow *flow, __be16 tcp_flags,
        spin_unlock(&stats->lock);
 }
 
-/* Must be called with rcu_read_lock or ovs_mutex. */
+/* Must be called with rcu_read_lock or table->lock held. */
 void ovs_flow_stats_get(const struct sw_flow *flow,
+                       const struct flow_table *table,
                        struct ovs_flow_stats *ovs_stats,
                        unsigned long *used, __be16 *tcp_flags)
 {
@@ -136,7 +137,8 @@ void ovs_flow_stats_get(const struct sw_flow *flow,
        memset(ovs_stats, 0, sizeof(*ovs_stats));
 
        for_each_cpu(cpu, flow->cpu_used_mask) {
-               struct sw_flow_stats *stats = rcu_dereference_ovsl(flow->stats[cpu]);
+               struct sw_flow_stats *stats =
+                       rcu_dereference_ovs_tbl(flow->stats[cpu], table);
 
                if (stats) {
                        /* Local CPU may write on non-local stats, so we must
@@ -153,13 +155,14 @@ void ovs_flow_stats_get(const struct sw_flow *flow,
        }
 }
 
-/* Called with ovs_mutex. */
-void ovs_flow_stats_clear(struct sw_flow *flow)
+/* Called with table->lock held. */
+void ovs_flow_stats_clear(struct sw_flow *flow, struct flow_table *table)
 {
        unsigned int cpu;
 
        for_each_cpu(cpu, flow->cpu_used_mask) {
-               struct sw_flow_stats *stats = ovsl_dereference(flow->stats[cpu]);
+               struct sw_flow_stats *stats =
+                       ovs_tbl_dereference(flow->stats[cpu], table);
 
                if (stats) {
                        spin_lock_bh(&stats->lock);
diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
index b5711aff6e76..e05ed6796e4e 100644
--- a/net/openvswitch/flow.h
+++ b/net/openvswitch/flow.h
@@ -23,6 +23,7 @@
 #include <net/dst_metadata.h>
 #include <net/nsh.h>
 
+struct flow_table;
 struct sk_buff;
 
 enum sw_flow_mac_proto {
@@ -280,9 +281,11 @@ static inline bool ovs_identifier_is_key(const struct 
sw_flow_id *sfid)
 
 void ovs_flow_stats_update(struct sw_flow *, __be16 tcp_flags,
                           const struct sk_buff *);
-void ovs_flow_stats_get(const struct sw_flow *, struct ovs_flow_stats *,
-                       unsigned long *used, __be16 *tcp_flags);
-void ovs_flow_stats_clear(struct sw_flow *);
+void ovs_flow_stats_get(const struct sw_flow *flow,
+                       const struct flow_table *table,
+                       struct ovs_flow_stats *stats, unsigned long *used,
+                       __be16 *tcp_flags);
+void ovs_flow_stats_clear(struct sw_flow *flow, struct flow_table *table);
 u64 ovs_flow_used_time(unsigned long flow_jiffies);
 
 int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key);
diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c
index ffc72a741a50..188e9d39ce00 100644
--- a/net/openvswitch/flow_table.c
+++ b/net/openvswitch/flow_table.c
@@ -45,6 +45,16 @@
 static struct kmem_cache *flow_cache;
 struct kmem_cache *flow_stats_cache __read_mostly;
 
+#ifdef CONFIG_LOCKDEP
+int lockdep_ovs_tbl_is_held(const struct flow_table *table)
+{
+       if (debug_locks)
+               return lockdep_is_held(&table->lock);
+       else
+               return 1;
+}
+#endif
+
 static u16 range_n_bytes(const struct sw_flow_key_range *range)
 {
        return range->end - range->start;
@@ -250,12 +260,12 @@ static int tbl_mask_array_realloc(struct flow_table *tbl, 
int size)
        if (!new)
                return -ENOMEM;
 
-       old = ovsl_dereference(tbl->mask_array);
+       old = ovs_tbl_dereference(tbl->mask_array, tbl);
        if (old) {
                int i;
 
                for (i = 0; i < old->max; i++) {
-                       if (ovsl_dereference(old->masks[i]))
+                       if (ovs_tbl_dereference(old->masks[i], tbl))
                                new->masks[new->count++] = old->masks[i];
                }
                call_rcu(&old->rcu, mask_array_rcu_cb);
@@ -269,7 +279,7 @@ static int tbl_mask_array_realloc(struct flow_table *tbl, 
int size)
 static int tbl_mask_array_add_mask(struct flow_table *tbl,
                                   struct sw_flow_mask *new)
 {
-       struct mask_array *ma = ovsl_dereference(tbl->mask_array);
+       struct mask_array *ma = ovs_tbl_dereference(tbl->mask_array, tbl);
        int err, ma_count = READ_ONCE(ma->count);
 
        if (ma_count >= ma->max) {
@@ -278,7 +288,7 @@ static int tbl_mask_array_add_mask(struct flow_table *tbl,
                if (err)
                        return err;
 
-               ma = ovsl_dereference(tbl->mask_array);
+               ma = ovs_tbl_dereference(tbl->mask_array, tbl);
        } else {
                /* On every add or delete we need to reset the counters so
                 * every new mask gets a fair chance of being prioritized.
@@ -286,7 +296,7 @@ static int tbl_mask_array_add_mask(struct flow_table *tbl,
                tbl_mask_array_reset_counters(ma);
        }
 
-       BUG_ON(ovsl_dereference(ma->masks[ma_count]));
+       WARN_ON_ONCE(ovs_tbl_dereference(ma->masks[ma_count], tbl));
 
        rcu_assign_pointer(ma->masks[ma_count], new);
        WRITE_ONCE(ma->count, ma_count + 1);
@@ -297,12 +307,12 @@ static int tbl_mask_array_add_mask(struct flow_table *tbl,
 static void tbl_mask_array_del_mask(struct flow_table *tbl,
                                    struct sw_flow_mask *mask)
 {
-       struct mask_array *ma = ovsl_dereference(tbl->mask_array);
+       struct mask_array *ma = ovs_tbl_dereference(tbl->mask_array, tbl);
        int i, ma_count = READ_ONCE(ma->count);
 
        /* Remove the deleted mask pointers from the array */
        for (i = 0; i < ma_count; i++) {
-               if (mask == ovsl_dereference(ma->masks[i]))
+               if (mask == ovs_tbl_dereference(ma->masks[i], tbl))
                        goto found;
        }
 
@@ -330,10 +340,10 @@ static void tbl_mask_array_del_mask(struct flow_table 
*tbl,
 static void flow_mask_remove(struct flow_table *tbl, struct sw_flow_mask *mask)
 {
        if (mask) {
-               /* ovs-lock is required to protect mask-refcount and
+               /* table lock is required to protect mask-refcount and
                 * mask list.
                 */
-               ASSERT_OVSL();
+               ASSERT_OVS_TBL(tbl);
                BUG_ON(!mask->ref_count);
                mask->ref_count--;
 
@@ -387,7 +397,8 @@ static struct mask_cache *tbl_mask_cache_alloc(u32 size)
 }
 int ovs_flow_tbl_masks_cache_resize(struct flow_table *table, u32 size)
 {
-       struct mask_cache *mc = rcu_dereference_ovsl(table->mask_cache);
+       struct mask_cache *mc = rcu_dereference_ovs_tbl(table->mask_cache,
+                                                       table);
        struct mask_cache *new;
 
        if (size == mc->cache_size)
@@ -407,15 +418,23 @@ int ovs_flow_tbl_masks_cache_resize(struct flow_table 
*table, u32 size)
        return 0;
 }
 
-int ovs_flow_tbl_init(struct flow_table *table)
+struct flow_table *ovs_flow_tbl_alloc(void)
 {
        struct table_instance *ti, *ufid_ti;
+       struct flow_table *table;
        struct mask_cache *mc;
        struct mask_array *ma;
 
+       table = kzalloc_obj(*table, GFP_KERNEL);
+       if (!table)
+               return ERR_PTR(-ENOMEM);
+
+       mutex_init(&table->lock);
+       refcount_set(&table->refcnt, 1);
+
        mc = tbl_mask_cache_alloc(MC_DEFAULT_HASH_ENTRIES);
        if (!mc)
-               return -ENOMEM;
+               goto free_table;
 
        ma = tbl_mask_array_alloc(MASK_ARRAY_SIZE_MIN);
        if (!ma)
@@ -436,7 +455,7 @@ int ovs_flow_tbl_init(struct flow_table *table)
        table->last_rehash = jiffies;
        table->count = 0;
        table->ufid_count = 0;
-       return 0;
+       return table;
 
 free_ti:
        __table_instance_destroy(ti);
@@ -444,7 +463,10 @@ int ovs_flow_tbl_init(struct flow_table *table)
        __mask_array_destroy(ma);
 free_mask_cache:
        __mask_cache_destroy(mc);
-       return -ENOMEM;
+free_table:
+       mutex_destroy(&table->lock);
+       kfree(table);
+       return ERR_PTR(-ENOMEM);
 }
 
 static void flow_tbl_destroy_rcu_cb(struct rcu_head *rcu)
@@ -471,7 +493,7 @@ static void table_instance_flow_free(struct flow_table 
*table,
        flow_mask_remove(table, flow->mask);
 }
 
-/* Must be called with OVS mutex held. */
+/* Must be called with table mutex held. */
 void table_instance_flow_flush(struct flow_table *table,
                               struct table_instance *ti,
                               struct table_instance *ufid_ti)
@@ -506,11 +528,11 @@ static void table_instance_destroy(struct table_instance 
*ti,
        call_rcu(&ufid_ti->rcu, flow_tbl_destroy_rcu_cb);
 }
 
-/* No need for locking this function is called from RCU callback or
- * error path.
- */
-void ovs_flow_tbl_destroy(struct flow_table *table)
+/* No locking needed: this function is only called from an RCU callback. */
+static void ovs_flow_tbl_destroy_rcu(struct rcu_head *rcu)
 {
+       struct flow_table *table = container_of(rcu, struct flow_table, rcu);
+
        struct table_instance *ti = rcu_dereference_raw(table->ti);
        struct table_instance *ufid_ti = rcu_dereference_raw(table->ufid_ti);
        struct mask_cache *mc = rcu_dereference_raw(table->mask_cache);
@@ -519,6 +541,20 @@ void ovs_flow_tbl_destroy(struct flow_table *table)
        call_rcu(&mc->rcu, mask_cache_rcu_cb);
        call_rcu(&ma->rcu, mask_array_rcu_cb);
        table_instance_destroy(ti, ufid_ti);
+       mutex_destroy(&table->lock);
+       kfree(table);
+}
+
+void ovs_flow_tbl_put(struct flow_table *table)
+{
+       if (refcount_dec_and_test(&table->refcnt)) {
+               mutex_lock(&table->lock);
+               table_instance_flow_flush(table,
+                                         ovs_tbl_dereference(table->ti, table),
+                                         ovs_tbl_dereference(table->ufid_ti, table));
+               mutex_unlock(&table->lock);
+               call_rcu(&table->rcu, ovs_flow_tbl_destroy_rcu);
+       }
 }
 
 struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *ti,
@@ -572,7 +608,8 @@ static void ufid_table_instance_insert(struct 
table_instance *ti,
        hlist_add_head_rcu(&flow->ufid_table.node[ti->node_ver], head);
 }
 
-static void flow_table_copy_flows(struct table_instance *old,
+static void flow_table_copy_flows(struct flow_table *table,
+                                 struct table_instance *old,
                                  struct table_instance *new, bool ufid)
 {
        int old_ver;
@@ -589,17 +626,18 @@ static void flow_table_copy_flows(struct table_instance 
*old,
                if (ufid)
                        hlist_for_each_entry_rcu(flow, head,
                                                 ufid_table.node[old_ver],
-                                                lockdep_ovsl_is_held())
+                                                lockdep_ovs_tbl_is_held(table))
                                ufid_table_instance_insert(new, flow);
                else
                        hlist_for_each_entry_rcu(flow, head,
                                                 flow_table.node[old_ver],
-                                                lockdep_ovsl_is_held())
+                                                lockdep_ovs_tbl_is_held(table))
                                table_instance_insert(new, flow);
        }
 }
 
-static struct table_instance *table_instance_rehash(struct table_instance *ti,
+static struct table_instance *table_instance_rehash(struct flow_table *table,
+                                                   struct table_instance *ti,
                                                    int n_buckets, bool ufid)
 {
        struct table_instance *new_ti;
@@ -608,16 +646,19 @@ static struct table_instance 
*table_instance_rehash(struct table_instance *ti,
        if (!new_ti)
                return NULL;
 
-       flow_table_copy_flows(ti, new_ti, ufid);
+       flow_table_copy_flows(table, ti, new_ti, ufid);
 
        return new_ti;
 }
 
+/* Must be called with flow_table->lock held. */
 int ovs_flow_tbl_flush(struct flow_table *flow_table)
 {
        struct table_instance *old_ti, *new_ti;
        struct table_instance *old_ufid_ti, *new_ufid_ti;
 
+       ASSERT_OVS_TBL(flow_table);
+
        new_ti = table_instance_alloc(TBL_MIN_BUCKETS);
        if (!new_ti)
                return -ENOMEM;
@@ -625,8 +666,8 @@ int ovs_flow_tbl_flush(struct flow_table *flow_table)
        if (!new_ufid_ti)
                goto err_free_ti;
 
-       old_ti = ovsl_dereference(flow_table->ti);
-       old_ufid_ti = ovsl_dereference(flow_table->ufid_ti);
+       old_ti = ovs_tbl_dereference(flow_table->ti, flow_table);
+       old_ufid_ti = ovs_tbl_dereference(flow_table->ufid_ti, flow_table);
 
        rcu_assign_pointer(flow_table->ti, new_ti);
        rcu_assign_pointer(flow_table->ufid_ti, new_ufid_ti);
@@ -694,7 +735,8 @@ static bool ovs_flow_cmp_unmasked_key(const struct sw_flow 
*flow,
        return cmp_key(flow->id.unmasked_key, key, key_start, key_end);
 }
 
-static struct sw_flow *masked_flow_lookup(struct table_instance *ti,
+static struct sw_flow *masked_flow_lookup(struct flow_table *tbl,
+                                         struct table_instance *ti,
                                          const struct sw_flow_key *unmasked,
                                          const struct sw_flow_mask *mask,
                                          u32 *n_mask_hit)
@@ -710,7 +752,7 @@ static struct sw_flow *masked_flow_lookup(struct 
table_instance *ti,
        (*n_mask_hit)++;
 
        hlist_for_each_entry_rcu(flow, head, flow_table.node[ti->node_ver],
-                                lockdep_ovsl_is_held()) {
+                                lockdep_ovs_tbl_is_held(tbl)) {
                if (flow->mask == mask && flow->flow_table.hash == hash &&
                    flow_cmp_masked_key(flow, &masked_key, &mask->range))
                        return flow;
@@ -737,9 +779,9 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl,
        int i;
 
        if (likely(*index < ma->max)) {
-               mask = rcu_dereference_ovsl(ma->masks[*index]);
+               mask = rcu_dereference_ovs_tbl(ma->masks[*index], tbl);
                if (mask) {
-                       flow = masked_flow_lookup(ti, key, mask, n_mask_hit);
+                       flow = masked_flow_lookup(tbl, ti, key, mask, 
n_mask_hit);
                        if (flow) {
                                u64_stats_update_begin(&stats->syncp);
                                stats->usage_cntrs[*index]++;
@@ -755,11 +797,11 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl,
                if (i == *index)
                        continue;
 
-               mask = rcu_dereference_ovsl(ma->masks[i]);
+               mask = rcu_dereference_ovs_tbl(ma->masks[i], tbl);
                if (unlikely(!mask))
                        break;
 
-               flow = masked_flow_lookup(ti, key, mask, n_mask_hit);
+               flow = masked_flow_lookup(tbl, ti, key, mask, n_mask_hit);
                if (flow) { /* Found */
                        *index = i;
                        u64_stats_update_begin(&stats->syncp);
@@ -846,8 +888,8 @@ struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table 
*tbl,
 struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *tbl,
                                    const struct sw_flow_key *key)
 {
-       struct table_instance *ti = rcu_dereference_ovsl(tbl->ti);
-       struct mask_array *ma = rcu_dereference_ovsl(tbl->mask_array);
+       struct table_instance *ti = rcu_dereference_ovs_tbl(tbl->ti, tbl);
+       struct mask_array *ma = rcu_dereference_ovs_tbl(tbl->mask_array, tbl);
        u32 __always_unused n_mask_hit;
        u32 __always_unused n_cache_hit;
        struct sw_flow *flow;
@@ -866,21 +908,22 @@ struct sw_flow *ovs_flow_tbl_lookup(struct flow_table 
*tbl,
 struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl,
                                          const struct sw_flow_match *match)
 {
-       struct mask_array *ma = ovsl_dereference(tbl->mask_array);
+       struct mask_array *ma = ovs_tbl_dereference(tbl->mask_array, tbl);
        int i;
 
-       /* Always called under ovs-mutex. */
+       /* Always called under tbl->lock. */
        for (i = 0; i < ma->max; i++) {
-               struct table_instance *ti = rcu_dereference_ovsl(tbl->ti);
+               struct table_instance *ti =
+                               rcu_dereference_ovs_tbl(tbl->ti, tbl);
                u32 __always_unused n_mask_hit;
                struct sw_flow_mask *mask;
                struct sw_flow *flow;
 
-               mask = ovsl_dereference(ma->masks[i]);
+               mask = ovs_tbl_dereference(ma->masks[i], tbl);
                if (!mask)
                        continue;
 
-               flow = masked_flow_lookup(ti, match->key, mask, &n_mask_hit);
+               flow = masked_flow_lookup(tbl, ti, match->key, mask, &n_mask_hit);
                if (flow && ovs_identifier_is_key(&flow->id) &&
                    ovs_flow_cmp_unmasked_key(flow, match)) {
                        return flow;
@@ -916,7 +959,7 @@ bool ovs_flow_cmp(const struct sw_flow *flow,
 struct sw_flow *ovs_flow_tbl_lookup_ufid(struct flow_table *tbl,
                                         const struct sw_flow_id *ufid)
 {
-       struct table_instance *ti = rcu_dereference_ovsl(tbl->ufid_ti);
+       struct table_instance *ti = rcu_dereference_ovs_tbl(tbl->ufid_ti, tbl);
        struct sw_flow *flow;
        struct hlist_head *head;
        u32 hash;
@@ -924,7 +967,7 @@ struct sw_flow *ovs_flow_tbl_lookup_ufid(struct flow_table 
*tbl,
        hash = ufid_hash(ufid);
        head = find_bucket(ti, hash);
        hlist_for_each_entry_rcu(flow, head, ufid_table.node[ti->node_ver],
-                                lockdep_ovsl_is_held()) {
+                                lockdep_ovs_tbl_is_held(tbl)) {
                if (flow->ufid_table.hash == hash &&
                    ovs_flow_cmp_ufid(flow, ufid))
                        return flow;
@@ -934,28 +977,33 @@ struct sw_flow *ovs_flow_tbl_lookup_ufid(struct 
flow_table *tbl,
 
 int ovs_flow_tbl_num_masks(const struct flow_table *table)
 {
-       struct mask_array *ma = rcu_dereference_ovsl(table->mask_array);
+       struct mask_array *ma = rcu_dereference_ovs_tbl(table->mask_array,
+                                                       table);
        return READ_ONCE(ma->count);
 }
 
 u32 ovs_flow_tbl_masks_cache_size(const struct flow_table *table)
 {
-       struct mask_cache *mc = rcu_dereference_ovsl(table->mask_cache);
+       struct mask_cache *mc = rcu_dereference_ovs_tbl(table->mask_cache,
+                                                       table);
 
        return READ_ONCE(mc->cache_size);
 }
 
-static struct table_instance *table_instance_expand(struct table_instance *ti,
+static struct table_instance *table_instance_expand(struct flow_table *table,
+                                                   struct table_instance *ti,
                                                    bool ufid)
 {
-       return table_instance_rehash(ti, ti->n_buckets * 2, ufid);
+       return table_instance_rehash(table, ti, ti->n_buckets * 2, ufid);
 }
 
-/* Must be called with OVS mutex held. */
+/* Must be called with table mutex held. */
 void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow)
 {
-       struct table_instance *ti = ovsl_dereference(table->ti);
-       struct table_instance *ufid_ti = ovsl_dereference(table->ufid_ti);
+       struct table_instance *ti = ovs_tbl_dereference(table->ti,
+                                                       table);
+       struct table_instance *ufid_ti = ovs_tbl_dereference(table->ufid_ti,
+                                                            table);
 
        BUG_ON(table->count == 0);
        table_instance_flow_free(table, ti, ufid_ti, flow);
@@ -989,10 +1037,10 @@ static struct sw_flow_mask *flow_mask_find(const struct 
flow_table *tbl,
        struct mask_array *ma;
        int i;
 
-       ma = ovsl_dereference(tbl->mask_array);
+       ma = ovs_tbl_dereference(tbl->mask_array, tbl);
        for (i = 0; i < ma->max; i++) {
                struct sw_flow_mask *t;
-               t = ovsl_dereference(ma->masks[i]);
+               t = ovs_tbl_dereference(ma->masks[i], tbl);
 
                if (t && mask_equal(mask, t))
                        return t;
@@ -1030,22 +1078,25 @@ static int flow_mask_insert(struct flow_table *tbl, 
struct sw_flow *flow,
        return 0;
 }
 
-/* Must be called with OVS mutex held. */
+/* Must be called with table mutex held. */
 static void flow_key_insert(struct flow_table *table, struct sw_flow *flow)
 {
        struct table_instance *new_ti = NULL;
        struct table_instance *ti;
 
+       ASSERT_OVS_TBL(table);
+
        flow->flow_table.hash = flow_hash(&flow->key, &flow->mask->range);
-       ti = ovsl_dereference(table->ti);
+       ti = ovs_tbl_dereference(table->ti, table);
        table_instance_insert(ti, flow);
        table->count++;
 
        /* Expand table, if necessary, to make room. */
        if (table->count > ti->n_buckets)
-               new_ti = table_instance_expand(ti, false);
+               new_ti = table_instance_expand(table, ti, false);
        else if (time_after(jiffies, table->last_rehash + REHASH_INTERVAL))
-               new_ti = table_instance_rehash(ti, ti->n_buckets, false);
+               new_ti = table_instance_rehash(table, ti, ti->n_buckets,
+                                              false);
 
        if (new_ti) {
                rcu_assign_pointer(table->ti, new_ti);
@@ -1054,13 +1105,15 @@ static void flow_key_insert(struct flow_table *table, 
struct sw_flow *flow)
        }
 }
 
-/* Must be called with OVS mutex held. */
+/* Must be called with table mutex held. */
 static void flow_ufid_insert(struct flow_table *table, struct sw_flow *flow)
 {
        struct table_instance *ti;
 
+       ASSERT_OVS_TBL(table);
+
        flow->ufid_table.hash = ufid_hash(&flow->id);
-       ti = ovsl_dereference(table->ufid_ti);
+       ti = ovs_tbl_dereference(table->ufid_ti, table);
        ufid_table_instance_insert(ti, flow);
        table->ufid_count++;
 
@@ -1068,7 +1121,7 @@ static void flow_ufid_insert(struct flow_table *table, 
struct sw_flow *flow)
        if (table->ufid_count > ti->n_buckets) {
                struct table_instance *new_ti;
 
-               new_ti = table_instance_expand(ti, true);
+               new_ti = table_instance_expand(table, ti, true);
                if (new_ti) {
                        rcu_assign_pointer(table->ufid_ti, new_ti);
                        call_rcu(&ti->rcu, flow_tbl_destroy_rcu_cb);
@@ -1076,12 +1129,14 @@ static void flow_ufid_insert(struct flow_table *table, 
struct sw_flow *flow)
        }
 }
 
-/* Must be called with OVS mutex held. */
+/* Must be called with table mutex held. */
 int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow,
                        const struct sw_flow_mask *mask)
 {
        int err;
 
+       ASSERT_OVS_TBL(table);
+
        err = flow_mask_insert(table, flow, mask);
        if (err)
                return err;
@@ -1100,10 +1155,11 @@ static int compare_mask_and_count(const void *a, const 
void *b)
        return (s64)mc_b->counter - (s64)mc_a->counter;
 }
 
-/* Must be called with OVS mutex held. */
+/* Must be called with table->lock held. */
 void ovs_flow_masks_rebalance(struct flow_table *table)
 {
-       struct mask_array *ma = rcu_dereference_ovsl(table->mask_array);
+       struct mask_array *ma = rcu_dereference_ovs_tbl(table->mask_array,
+                                                       table);
        struct mask_count *masks_and_count;
        struct mask_array *new;
        int masks_entries = 0;
@@ -1119,7 +1175,7 @@ void ovs_flow_masks_rebalance(struct flow_table *table)
                struct sw_flow_mask *mask;
                int cpu;
 
-               mask = rcu_dereference_ovsl(ma->masks[i]);
+               mask = rcu_dereference_ovs_tbl(ma->masks[i], table);
                if (unlikely(!mask))
                        break;
 
@@ -1173,7 +1229,7 @@ void ovs_flow_masks_rebalance(struct flow_table *table)
        for (i = 0; i < masks_entries; i++) {
                int index = masks_and_count[i].index;
 
-               if (ovsl_dereference(ma->masks[index]))
+               if (ovs_tbl_dereference(ma->masks[index], table))
                        new->masks[new->count++] = ma->masks[index];
        }
 
diff --git a/net/openvswitch/flow_table.h b/net/openvswitch/flow_table.h
index f524dc3e4862..cffd412c9045 100644
--- a/net/openvswitch/flow_table.h
+++ b/net/openvswitch/flow_table.h
@@ -59,7 +59,29 @@ struct table_instance {
        u32 hash_seed;
 };
 
+/* Locking:
+ *
+ * flow_table is _not_ protected by ovs_lock (see comment above ovs_mutex
+ * in datapath.c).
+ *
+ * All writes to flow_table are protected by the embedded "lock".
+ * In order to ensure datapath destruction does not trigger the destruction
+ * of the flow_table, "refcnt" is used. Therefore, writers must:
+ * 1 - Enter rcu read-protected section
+ * 2 - Increase "table->refcnt"
+ * 3 - Leave rcu read-protected section (to avoid using mutexes inside rcu)
+ * 4 - Lock "table->lock"
+ * 5 - Perform modifications
+ * 6 - Release "table->lock"
+ * 7 - Decrease "table->refcnt"
+ *
+ * Reads are protected by RCU.
+ */
 struct flow_table {
+       /* Locks flow table writes. */
+       struct mutex lock;
+       refcount_t refcnt;
+       struct rcu_head rcu;
        struct table_instance __rcu *ti;
        struct table_instance __rcu *ufid_ti;
        struct mask_cache __rcu *mask_cache;
@@ -71,15 +93,40 @@ struct flow_table {
 
 extern struct kmem_cache *flow_stats_cache;
 
+#ifdef CONFIG_LOCKDEP
+int lockdep_ovs_tbl_is_held(const struct flow_table *table);
+#else
+static inline int lockdep_ovs_tbl_is_held(const struct flow_table *table)
+{
+       (void)table;
+       return 1;
+}
+#endif
+
+#define ASSERT_OVS_TBL(tbl)   WARN_ON(!lockdep_ovs_tbl_is_held(tbl))
+
+/* Dereferences for update (writer) paths, protected by table->lock. */
+#define ovs_tbl_dereference(p, tbl)    \
+       rcu_dereference_protected(p, lockdep_ovs_tbl_is_held(tbl))
+
+/* Read dereferences can be protected by either RCU, table lock or ovs_mutex. */
+#define rcu_dereference_ovs_tbl(p, tbl) \
+       rcu_dereference_check(p,                \
+               lockdep_ovs_tbl_is_held(tbl) || lockdep_ovsl_is_held())
+
 int ovs_flow_init(void);
 void ovs_flow_exit(void);
 
 struct sw_flow *ovs_flow_alloc(void);
 void ovs_flow_free(struct sw_flow *, bool deferred);
 
-int ovs_flow_tbl_init(struct flow_table *);
+struct flow_table *ovs_flow_tbl_alloc(void);
+void ovs_flow_tbl_put(struct flow_table *table);
+static inline bool ovs_flow_tbl_get(struct flow_table *table)
+{
+       return refcount_inc_not_zero(&table->refcnt);
+}
 int ovs_flow_tbl_count(const struct flow_table *table);
-void ovs_flow_tbl_destroy(struct flow_table *table);
 int ovs_flow_tbl_flush(struct flow_table *flow_table);
 
 int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow,
-- 
2.53.0

_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Reply via email to