The current lock acquisition order takes the global 'ct_lock' first,
then 'conn->lock'. This is an issue: operations on separate connections
end up serialized, as their threads all contend on the single global
'ct_lock'.

This ordering was previously necessary because of how the expiration
lists, timeout policies and zone limits were managed. These now use
RCU-friendly structures that allow concurrent readers, so mutual
exclusion is only needed during writes.
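
For illustration, a minimal reader-side sketch (condensed from the
lib/conntrack.c hunks below): lookups in the RCU-protected 'conns' cmap
no longer need to hold 'ct_lock':

    /* Lookup runs concurrently with writers; no 'ct_lock' taken.
     * conn_clean() acquires the locks it needs internally. */
    if (conn_lookup(ct, &conn->key, now, NULL, NULL)) {
        conn_clean(ct, conn);
    }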

This allows lowering the 'ct_lock' precedence and taking it only when
writing the relevant structures. Doing so reduces contention on
'ct_lock', which currently impairs scalability when the connection
tracker is used by many threads.
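
On the writer side, a condensed sketch of conn_clean() as modified
below shows the new acquisition order: 'conn->lock' first, with
'ct_lock' held only around the updates to the shared structures:

    ovs_mutex_lock(&conn->lock);      /* 1. Per-connection lock. */
    ovs_mutex_lock(&ct->ct_lock);     /* 2. Global lock, writes only. */
    /* ... remove the entry from 'ct->conns' and related maps ... */
    ovs_mutex_unlock(&ct->ct_lock);
    /* ... per-connection bookkeeping, still under 'conn->lock' ... */
    ovs_mutex_unlock(&conn->lock);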

Signed-off-by: Gaetan Rivet <[email protected]>
Reviewed-by: Eli Britstein <[email protected]>
---
 lib/conntrack-private.h |  7 ++++--
 lib/conntrack-tp.c      | 30 +------------------------
 lib/conntrack.c         | 49 ++++++++++++++++++++++++++---------------
 3 files changed, 37 insertions(+), 49 deletions(-)

diff --git a/lib/conntrack-private.h b/lib/conntrack-private.h
index 87107a05d..183785d8a 100644
--- a/lib/conntrack-private.h
+++ b/lib/conntrack-private.h
@@ -134,6 +134,9 @@ struct conn {
     uint16_t nat_action;
     char *alg;
     struct conn *nat_conn; /* The NAT 'conn' context, if there is one. */
+    atomic_flag reclaimed; /* False during the lifetime of the connection,
+                            * True as soon as a thread has started freeing
+                            * its memory. */
 
     /* Inserted once by a PMD, then managed by the 'ct_clean' thread. */
     struct conn_expire exp;
@@ -229,8 +232,8 @@ struct conntrack {
 };
 
 /* Lock acquisition order:
- *    1. 'ct_lock'
- *    2. 'conn->lock'
+ *    1. 'conn->lock'
+ *    2. 'ct_lock'
  *    3. 'resources_lock'
  */
 
diff --git a/lib/conntrack-tp.c b/lib/conntrack-tp.c
index 592e10c6f..22363e7fe 100644
--- a/lib/conntrack-tp.c
+++ b/lib/conntrack-tp.c
@@ -245,58 +245,30 @@ conn_schedule_expiration(struct conn *conn, enum ct_timeout tm, long long now,
     ignore(atomic_flag_test_and_set(&conn->exp.reschedule));
 }
 
-static void
-conn_update_expiration__(struct conntrack *ct, struct conn *conn,
-                         enum ct_timeout tm, long long now,
-                         uint32_t tp_value)
-    OVS_REQUIRES(conn->lock)
-{
-    ovs_mutex_unlock(&conn->lock);
-
-    ovs_mutex_lock(&ct->ct_lock);
-    ovs_mutex_lock(&conn->lock);
-    conn_schedule_expiration(conn, tm, now, tp_value);
-    ovs_mutex_unlock(&conn->lock);
-    ovs_mutex_unlock(&ct->ct_lock);
-
-    ovs_mutex_lock(&conn->lock);
-}
-
 /* The conn entry lock must be held on entry and exit. */
 void
 conn_update_expiration(struct conntrack *ct, struct conn *conn,
                        enum ct_timeout tm, long long now)
-    OVS_REQUIRES(conn->lock)
 {
     struct timeout_policy *tp;
     uint32_t val;
 
-    ovs_mutex_unlock(&conn->lock);
-
-    ovs_mutex_lock(&ct->ct_lock);
-    ovs_mutex_lock(&conn->lock);
     tp = timeout_policy_lookup(ct, conn->tp_id);
     if (tp) {
         val = tp->policy.attrs[tm_to_ct_dpif_tp(tm)];
     } else {
         val = ct_dpif_netdev_tp_def[tm_to_ct_dpif_tp(tm)];
     }
-    ovs_mutex_unlock(&conn->lock);
-    ovs_mutex_unlock(&ct->ct_lock);
-
-    ovs_mutex_lock(&conn->lock);
     VLOG_DBG_RL(&rl, "Update timeout %s zone=%u with policy id=%d "
                 "val=%u sec.",
                 ct_timeout_str[tm], conn->key.zone, conn->tp_id, val);
 
-    conn_update_expiration__(ct, conn, tm, now, val);
+    conn_schedule_expiration(conn, tm, now, val);
 }
 
-/* ct_lock must be held. */
 void
 conn_init_expiration(struct conntrack *ct, struct conn *conn,
                      enum ct_timeout tm, long long now)
-    OVS_REQUIRES(ct->ct_lock)
 {
     struct timeout_policy *tp;
     uint32_t val;
diff --git a/lib/conntrack.c b/lib/conntrack.c
index 81322e405..e4262fdf3 100644
--- a/lib/conntrack.c
+++ b/lib/conntrack.c
@@ -468,7 +468,7 @@ zone_limit_delete(struct conntrack *ct, uint16_t zone)
 
 static void
 conn_clean_cmn(struct conntrack *ct, struct conn *conn)
-    OVS_REQUIRES(ct->ct_lock)
+    OVS_REQUIRES(conn->lock, ct->ct_lock)
 {
     if (conn->alg) {
         expectation_clean(ct, &conn->key);
@@ -498,18 +498,29 @@ conn_unref(struct conn *conn)
  * removes the associated nat 'conn' from the lookup datastructures. */
 static void
 conn_clean(struct conntrack *ct, struct conn *conn)
-    OVS_REQUIRES(ct->ct_lock)
+    OVS_EXCLUDED(conn->lock, ct->ct_lock)
 {
     ovs_assert(conn->conn_type == CT_CONN_TYPE_DEFAULT);
 
+    if (atomic_flag_test_and_set(&conn->reclaimed)) {
+        return;
+    }
+
+    ovs_mutex_lock(&conn->lock);
+
+    ovs_mutex_lock(&ct->ct_lock);
     conn_clean_cmn(ct, conn);
     if (conn->nat_conn) {
         uint32_t hash = conn_key_hash(&conn->nat_conn->key, ct->hash_basis);
         cmap_remove(&ct->conns, &conn->nat_conn->cm_node, hash);
     }
+    ovs_mutex_unlock(&ct->ct_lock);
+
     conn->cleaned = true;
     conn_unref(conn);
     atomic_count_dec(&ct->n_conn);
+
+    ovs_mutex_unlock(&conn->lock);
 }
 
 static inline bool
@@ -524,14 +535,25 @@ conn_unref_one(struct conn *conn)
 
 static void
 conn_clean_one(struct conntrack *ct, struct conn *conn)
-    OVS_REQUIRES(ct->ct_lock)
+    OVS_EXCLUDED(conn->lock, ct->ct_lock)
 {
+    if (atomic_flag_test_and_set(&conn->reclaimed)) {
+        return;
+    }
+
+    ovs_mutex_lock(&conn->lock);
+
+    ovs_mutex_lock(&ct->ct_lock);
     conn_clean_cmn(ct, conn);
+    ovs_mutex_unlock(&ct->ct_lock);
+
     if (conn->conn_type == CT_CONN_TYPE_DEFAULT) {
         conn->cleaned = true;
         atomic_count_dec(&ct->n_conn);
     }
     conn_unref_one(conn);
+
+    ovs_mutex_unlock(&conn->lock);
 }
 
 /* Destroys the connection tracker 'ct' and frees all the allocated memory.
@@ -545,8 +567,6 @@ conntrack_destroy(struct conntrack *ct)
     pthread_join(ct->clean_thread, NULL);
     latch_destroy(&ct->clean_thread_exit);
 
-    ovs_mutex_lock(&ct->ct_lock);
-
     for (unsigned i = 0; i < N_CT_TM; i++) {
         struct mpsc_queue_node *node;
 
@@ -562,7 +582,6 @@ conntrack_destroy(struct conntrack *ct)
     CMAP_FOR_EACH (conn, cm_node, &ct->conns) {
         conn_clean_one(ct, conn);
     }
-    cmap_destroy(&ct->conns);
 
     struct zone_limit *zl;
     CMAP_FOR_EACH (zl, node, &ct->zone_limits) {
@@ -571,7 +590,6 @@ conntrack_destroy(struct conntrack *ct)
         cmap_remove(&ct->zone_limits, &zl->node, hash);
         ovsrcu_postpone(free, zl);
     }
-    cmap_destroy(&ct->zone_limits);
 
     struct timeout_policy *tp;
     CMAP_FOR_EACH (tp, node, &ct->timeout_policies) {
@@ -580,6 +598,11 @@ conntrack_destroy(struct conntrack *ct)
         cmap_remove(&ct->timeout_policies, &tp->node, hash);
         ovsrcu_postpone(free, tp);
     }
+
+    ovs_mutex_lock(&ct->ct_lock);
+
+    cmap_destroy(&ct->conns);
+    cmap_destroy(&ct->zone_limits);
     cmap_destroy(&ct->timeout_policies);
 
     ovs_mutex_unlock(&ct->ct_lock);
@@ -1119,6 +1142,7 @@ conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
         nc->nat_conn = nat_conn;
         ovs_mutex_init_adaptive(&nc->lock);
         nc->conn_type = CT_CONN_TYPE_DEFAULT;
+        atomic_flag_clear(&nc->reclaimed);
         cmap_insert(&ct->conns, &nc->cm_node, ctx->hash);
         conn_expire_push_back(ct, nc);
         atomic_count_inc(&ct->n_conn);
@@ -1180,11 +1204,9 @@ conn_update_state(struct conntrack *ct, struct dp_packet *pkt,
             pkt->md.ct_state = CS_INVALID;
             break;
         case CT_UPDATE_NEW:
-            ovs_mutex_lock(&ct->ct_lock);
             if (conn_lookup(ct, &conn->key, now, NULL, NULL)) {
                 conn_clean(ct, conn);
             }
-            ovs_mutex_unlock(&ct->ct_lock);
             create_new_conn = true;
             break;
         case CT_UPDATE_VALID_NEW:
@@ -1395,11 +1417,9 @@ process_one(struct conntrack *ct, struct dp_packet *pkt,
 
     /* Delete found entry if in wrong direction. 'force' implies commit. */
     if (OVS_UNLIKELY(force && ctx->reply && conn)) {
-        ovs_mutex_lock(&ct->ct_lock);
         if (conn_lookup(ct, &conn->key, now, NULL, NULL)) {
             conn_clean(ct, conn);
         }
-        ovs_mutex_unlock(&ct->ct_lock);
         conn = NULL;
     }
 
@@ -1593,8 +1613,6 @@ ct_sweep(struct conntrack *ct, long long now, size_t limit)
     struct mpsc_queue_node *node;
     size_t count = 0;
 
-    ovs_mutex_lock(&ct->ct_lock);
-
     for (unsigned i = 0; i < N_CT_TM; i++) {
         struct conn *end_of_queue = NULL;
 
@@ -1659,7 +1677,6 @@ ct_sweep(struct conntrack *ct, long long now, size_t limit)
 out:
     VLOG_DBG("conntrack cleanup %"PRIuSIZE" entries in %lld msec", count,
              time_msec() - now);
-    ovs_mutex_unlock(&ct->ct_lock);
     return min_expiration;
 }
 
@@ -2858,13 +2875,11 @@ conntrack_flush(struct conntrack *ct, const uint16_t *zone)
 {
     struct conn *conn;
 
-    ovs_mutex_lock(&ct->ct_lock);
     CMAP_FOR_EACH (conn, cm_node, &ct->conns) {
         if (!zone || *zone == conn->key.zone) {
             conn_clean_one(ct, conn);
         }
     }
-    ovs_mutex_unlock(&ct->ct_lock);
 
     return 0;
 }
@@ -2879,7 +2894,6 @@ conntrack_flush_tuple(struct conntrack *ct, const struct ct_dpif_tuple *tuple,
 
     memset(&key, 0, sizeof(key));
     tuple_to_conn_key(tuple, zone, &key);
-    ovs_mutex_lock(&ct->ct_lock);
     conn_lookup(ct, &key, time_msec(), &conn, NULL);
 
     if (conn && conn->conn_type == CT_CONN_TYPE_DEFAULT) {
@@ -2889,7 +2903,6 @@ conntrack_flush_tuple(struct conntrack *ct, const struct ct_dpif_tuple *tuple,
         error = ENOENT;
     }
 
-    ovs_mutex_unlock(&ct->ct_lock);
     return error;
 }
 
-- 
2.31.1
