The lock priority order is for the global 'ct_lock' to be taken first and then 'conn->lock'. This is an issue, as multiple operations on connections are thus blocked between threads contending on the global 'ct_lock'.
This was previously necessary due to how the expiration lists, timeout policies and zone limits were managed. They are now using RCU-friendly structures that allow concurrent readers. The mutual exclusion now only needs to happen during writes. This allows reducing the 'ct_lock' precedence, and to only take it when writing the relevant structures. This will reduce contention on 'ct_lock', which impairs scalability when the connection tracker is used by many threads. Signed-off-by: Gaetan Rivet <[email protected]> Reviewed-by: Eli Britstein <[email protected]> --- lib/conntrack-private.h | 7 ++++-- lib/conntrack-tp.c | 30 +------------------------ lib/conntrack.c | 49 ++++++++++++++++++++++++++--------------- 3 files changed, 37 insertions(+), 49 deletions(-) diff --git a/lib/conntrack-private.h b/lib/conntrack-private.h index 87107a05d..183785d8a 100644 --- a/lib/conntrack-private.h +++ b/lib/conntrack-private.h @@ -134,6 +134,9 @@ struct conn { uint16_t nat_action; char *alg; struct conn *nat_conn; /* The NAT 'conn' context, if there is one. */ + atomic_flag reclaimed; /* False during the lifetime of the connection, + * True as soon as a thread has started freeing + * its memory. */ /* Inserted once by a PMD, then managed by the 'ct_clean' thread. */ struct conn_expire exp; @@ -229,8 +232,8 @@ struct conntrack { }; /* Lock acquisition order: - * 1. 'ct_lock' - * 2. 'conn->lock' + * 1. 'conn->lock' + * 2. 'ct_lock' * 3. 'resources_lock' */ diff --git a/lib/conntrack-tp.c b/lib/conntrack-tp.c index 592e10c6f..22363e7fe 100644 --- a/lib/conntrack-tp.c +++ b/lib/conntrack-tp.c @@ -245,58 +245,30 @@ conn_schedule_expiration(struct conn *conn, enum ct_timeout tm, long long now, ignore(atomic_flag_test_and_set(&conn->exp.reschedule)); } -static void -conn_update_expiration__(struct conntrack *ct, struct conn *conn, - enum ct_timeout tm, long long now, - uint32_t tp_value) - OVS_REQUIRES(conn->lock) -{ - ovs_mutex_unlock(&conn->lock); - - ovs_mutex_lock(&ct->ct_lock); - ovs_mutex_lock(&conn->lock); - conn_schedule_expiration(conn, tm, now, tp_value); - ovs_mutex_unlock(&conn->lock); - ovs_mutex_unlock(&ct->ct_lock); - - ovs_mutex_lock(&conn->lock); -} - /* The conn entry lock must be held on entry and exit. */ void conn_update_expiration(struct conntrack *ct, struct conn *conn, enum ct_timeout tm, long long now) - OVS_REQUIRES(conn->lock) { struct timeout_policy *tp; uint32_t val; - ovs_mutex_unlock(&conn->lock); - - ovs_mutex_lock(&ct->ct_lock); - ovs_mutex_lock(&conn->lock); tp = timeout_policy_lookup(ct, conn->tp_id); if (tp) { val = tp->policy.attrs[tm_to_ct_dpif_tp(tm)]; } else { val = ct_dpif_netdev_tp_def[tm_to_ct_dpif_tp(tm)]; } - ovs_mutex_unlock(&conn->lock); - ovs_mutex_unlock(&ct->ct_lock); - - ovs_mutex_lock(&conn->lock); VLOG_DBG_RL(&rl, "Update timeout %s zone=%u with policy id=%d " "val=%u sec.", ct_timeout_str[tm], conn->key.zone, conn->tp_id, val); - conn_update_expiration__(ct, conn, tm, now, val); + conn_schedule_expiration(conn, tm, now, val); } -/* ct_lock must be held. */ void conn_init_expiration(struct conntrack *ct, struct conn *conn, enum ct_timeout tm, long long now) - OVS_REQUIRES(ct->ct_lock) { struct timeout_policy *tp; uint32_t val; diff --git a/lib/conntrack.c b/lib/conntrack.c index 81322e405..e4262fdf3 100644 --- a/lib/conntrack.c +++ b/lib/conntrack.c @@ -468,7 +468,7 @@ zone_limit_delete(struct conntrack *ct, uint16_t zone) static void conn_clean_cmn(struct conntrack *ct, struct conn *conn) - OVS_REQUIRES(ct->ct_lock) + OVS_REQUIRES(conn->lock, ct->ct_lock) { if (conn->alg) { expectation_clean(ct, &conn->key); @@ -498,18 +498,29 @@ conn_unref(struct conn *conn) * removes the associated nat 'conn' from the lookup datastructures. */ static void conn_clean(struct conntrack *ct, struct conn *conn) - OVS_REQUIRES(ct->ct_lock) + OVS_EXCLUDED(conn->lock, ct->ct_lock) { ovs_assert(conn->conn_type == CT_CONN_TYPE_DEFAULT); + if (atomic_flag_test_and_set(&conn->reclaimed)) { + return; + } + + ovs_mutex_lock(&conn->lock); + + ovs_mutex_lock(&ct->ct_lock); conn_clean_cmn(ct, conn); if (conn->nat_conn) { uint32_t hash = conn_key_hash(&conn->nat_conn->key, ct->hash_basis); cmap_remove(&ct->conns, &conn->nat_conn->cm_node, hash); } + ovs_mutex_unlock(&ct->ct_lock); + conn->cleaned = true; conn_unref(conn); atomic_count_dec(&ct->n_conn); + + ovs_mutex_unlock(&conn->lock); } static inline bool @@ -524,14 +535,25 @@ conn_unref_one(struct conn *conn) static void conn_clean_one(struct conntrack *ct, struct conn *conn) - OVS_REQUIRES(ct->ct_lock) + OVS_EXCLUDED(conn->lock, ct->ct_lock) { + if (atomic_flag_test_and_set(&conn->reclaimed)) { + return; + } + + ovs_mutex_lock(&conn->lock); + + ovs_mutex_lock(&ct->ct_lock); conn_clean_cmn(ct, conn); + ovs_mutex_unlock(&ct->ct_lock); + if (conn->conn_type == CT_CONN_TYPE_DEFAULT) { conn->cleaned = true; atomic_count_dec(&ct->n_conn); } conn_unref_one(conn); + + ovs_mutex_unlock(&conn->lock); } /* Destroys the connection tracker 'ct' and frees all the allocated memory. @@ -545,8 +567,6 @@ conntrack_destroy(struct conntrack *ct) pthread_join(ct->clean_thread, NULL); latch_destroy(&ct->clean_thread_exit); - ovs_mutex_lock(&ct->ct_lock); - for (unsigned i = 0; i < N_CT_TM; i++) { struct mpsc_queue_node *node; @@ -562,7 +582,6 @@ conntrack_destroy(struct conntrack *ct) CMAP_FOR_EACH (conn, cm_node, &ct->conns) { conn_clean_one(ct, conn); } - cmap_destroy(&ct->conns); struct zone_limit *zl; CMAP_FOR_EACH (zl, node, &ct->zone_limits) { @@ -571,7 +590,6 @@ conntrack_destroy(struct conntrack *ct) cmap_remove(&ct->zone_limits, &zl->node, hash); ovsrcu_postpone(free, zl); } - cmap_destroy(&ct->zone_limits); struct timeout_policy *tp; CMAP_FOR_EACH (tp, node, &ct->timeout_policies) { @@ -580,6 +598,11 @@ conntrack_destroy(struct conntrack *ct) cmap_remove(&ct->timeout_policies, &tp->node, hash); ovsrcu_postpone(free, tp); } + + ovs_mutex_lock(&ct->ct_lock); + + cmap_destroy(&ct->conns); + cmap_destroy(&ct->zone_limits); cmap_destroy(&ct->timeout_policies); ovs_mutex_unlock(&ct->ct_lock); @@ -1119,6 +1142,7 @@ conn_not_found(struct conntrack *ct, struct dp_packet *pkt, nc->nat_conn = nat_conn; ovs_mutex_init_adaptive(&nc->lock); nc->conn_type = CT_CONN_TYPE_DEFAULT; + atomic_flag_clear(&nc->reclaimed); cmap_insert(&ct->conns, &nc->cm_node, ctx->hash); conn_expire_push_back(ct, nc); atomic_count_inc(&ct->n_conn); @@ -1180,11 +1204,9 @@ conn_update_state(struct conntrack *ct, struct dp_packet *pkt, pkt->md.ct_state = CS_INVALID; break; case CT_UPDATE_NEW: - ovs_mutex_lock(&ct->ct_lock); if (conn_lookup(ct, &conn->key, now, NULL, NULL)) { conn_clean(ct, conn); } - ovs_mutex_unlock(&ct->ct_lock); create_new_conn = true; break; case CT_UPDATE_VALID_NEW: @@ -1395,11 +1417,9 @@ process_one(struct conntrack *ct, struct dp_packet *pkt, /* Delete found entry if in wrong direction. 'force' implies commit. */ if (OVS_UNLIKELY(force && ctx->reply && conn)) { - ovs_mutex_lock(&ct->ct_lock); if (conn_lookup(ct, &conn->key, now, NULL, NULL)) { conn_clean(ct, conn); } - ovs_mutex_unlock(&ct->ct_lock); conn = NULL; } @@ -1593,8 +1613,6 @@ ct_sweep(struct conntrack *ct, long long now, size_t limit) struct mpsc_queue_node *node; size_t count = 0; - ovs_mutex_lock(&ct->ct_lock); - for (unsigned i = 0; i < N_CT_TM; i++) { struct conn *end_of_queue = NULL; @@ -1659,7 +1677,6 @@ ct_sweep(struct conntrack *ct, long long now, size_t limit) out: VLOG_DBG("conntrack cleanup %"PRIuSIZE" entries in %lld msec", count, time_msec() - now); - ovs_mutex_unlock(&ct->ct_lock); return min_expiration; } @@ -2858,13 +2875,11 @@ conntrack_flush(struct conntrack *ct, const uint16_t *zone) { struct conn *conn; - ovs_mutex_lock(&ct->ct_lock); CMAP_FOR_EACH (conn, cm_node, &ct->conns) { if (!zone || *zone == conn->key.zone) { conn_clean_one(ct, conn); } } - ovs_mutex_unlock(&ct->ct_lock); return 0; } @@ -2879,7 +2894,6 @@ conntrack_flush_tuple(struct conntrack *ct, const struct ct_dpif_tuple *tuple, memset(&key, 0, sizeof(key)); tuple_to_conn_key(tuple, zone, &key); - ovs_mutex_lock(&ct->ct_lock); conn_lookup(ct, &key, time_msec(), &conn, NULL); if (conn && conn->conn_type == CT_CONN_TYPE_DEFAULT) { @@ -2889,7 +2903,6 @@ conntrack_flush_tuple(struct conntrack *ct, const struct ct_dpif_tuple *tuple, error = ENOENT; } - ovs_mutex_unlock(&ct->ct_lock); return error; } -- 2.31.1 _______________________________________________ dev mailing list [email protected] https://mail.openvswitch.org/mailman/listinfo/ovs-dev
