Sashiko reports for races and possible crash around the usage of est_cpulist_valid and sysctl_est_cpulist. The problem is that we do not lock est_mutex in some places which can lead to wrong write ordering and as result problems when calling cpumask_weight() and cpumask_empty().
Fix them by moving the est_max_threads read/write under locked est_mutex. Do the same for one ip_vs_est_reload_start() call to protect the cpumask_empty() usage of sysctl_est_cpulist. To remove the chance of deadlock while stopping the estimation kthreads, use the service_mutex to walk the array with kthreads and hold it during kthread_stop(). OTOH, est_mutex is needed only while starting the kthreads, not for stopping. Reorganize the code in kthread 0 to use mutex_trylock() for the service_mutex to ensure kthread_should_stop() is not true while we attempt to lock the mutex. Link: https://sashiko.dev/#/patchset/20260331165015.2777765-1-longman%40redhat.com Link: https://sashiko.dev/#/patchset/20260420171308.87192-1-ja%40ssi.bg Fixes: f0be83d54217 ("ipvs: add est_cpulist and est_nice sysctl vars") Signed-off-by: Julian Anastasov <[email protected]> --- net/netfilter/ipvs/ip_vs_ctl.c | 11 +++++-- net/netfilter/ipvs/ip_vs_est.c | 59 ++++++++++++++++++++++------------ 2 files changed, 47 insertions(+), 23 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index caec516856e9..fda166aca4fb 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -250,7 +250,7 @@ static void est_reload_work_handler(struct work_struct *work) int genid; int id; - mutex_lock(&ipvs->est_mutex); + mutex_lock(&ipvs->service_mutex); genid = atomic_read(&ipvs->est_genid); for (id = 0; id < ipvs->est_kt_count; id++) { struct ip_vs_est_kt_data *kd = ipvs->est_kt_arr[id]; @@ -263,12 +263,14 @@ static void est_reload_work_handler(struct work_struct *work) /* New config ? Stop kthread tasks */ if (genid != genid_done) ip_vs_est_kthread_stop(kd); + mutex_lock(&ipvs->est_mutex); if (!kd->task && !ip_vs_est_stopped(ipvs)) { /* Do not start kthreads above 0 in calc phase */ if ((!id || !ipvs->est_calc_phase) && ip_vs_est_kthread_start(ipvs, kd) < 0) repeat = true; } + mutex_unlock(&ipvs->est_mutex); } atomic_set(&ipvs->est_genid_done, genid); @@ -278,7 +280,7 @@ static void est_reload_work_handler(struct work_struct *work) delay); unlock: - mutex_unlock(&ipvs->est_mutex); + mutex_unlock(&ipvs->service_mutex); } static int get_conn_tab_size(struct netns_ipvs *ipvs) @@ -1812,11 +1814,16 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, *svc_p = svc; if (!READ_ONCE(ipvs->enable)) { + mutex_lock(&ipvs->est_mutex); + /* Now there is a service - full throttle */ WRITE_ONCE(ipvs->enable, 1); + ipvs->est_max_threads = ip_vs_est_max_threads(ipvs); + /* Start estimation for first time */ ip_vs_est_reload_start(ipvs); + mutex_unlock(&ipvs->est_mutex); } return 0; diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c index 433ba3cab58c..216e3c354125 100644 --- a/net/netfilter/ipvs/ip_vs_est.c +++ b/net/netfilter/ipvs/ip_vs_est.c @@ -68,6 +68,10 @@ and the limit of estimators per kthread - est_add_ktid: ktid where to add new ests, can point to empty slot where we should add kt data + - data protected by service_mutex: est_temp_list, est_add_ktid, + est_kt_count, est_kt_arr, est_genid_done, kd->task + - data protected by est_mutex: est_genid, est_max_threads, sysctl_est_cpulist, + est_cpulist_valid, sysctl_est_nice, est_stopped, sysctl_run_estimation */ static struct lock_class_key __ipvs_est_key; @@ -229,6 +233,8 @@ static int ip_vs_estimation_kthread(void *data) /* Schedule stop/start for kthread tasks */ void ip_vs_est_reload_start(struct netns_ipvs *ipvs) { + lockdep_assert_held(&ipvs->est_mutex); + /* Ignore reloads before first service is added */ if (!READ_ONCE(ipvs->enable)) return; @@ -304,12 +310,17 @@ static int ip_vs_est_add_kthread(struct netns_ipvs *ipvs) void *arr = NULL; int i; - if ((unsigned long)ipvs->est_kt_count >= ipvs->est_max_threads && - READ_ONCE(ipvs->enable) && ipvs->est_max_threads) - return -EINVAL; - mutex_lock(&ipvs->est_mutex); + /* Allow kt 0 data to be created before the services are added + * and limit the kthreads when services are present. + */ + if ((unsigned long)ipvs->est_kt_count >= ipvs->est_max_threads && + READ_ONCE(ipvs->enable) && ipvs->est_max_threads) { + ret = -EINVAL; + goto out; + } + for (i = 0; i < id; i++) { if (!ipvs->est_kt_arr[i]) break; @@ -485,9 +496,6 @@ int ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats) struct ip_vs_estimator *est = &stats->est; int ret; - if (!ipvs->est_max_threads && READ_ONCE(ipvs->enable)) - ipvs->est_max_threads = ip_vs_est_max_threads(ipvs); - est->ktid = -1; est->ktrow = IPVS_EST_NTICKS - 1; /* Initial delay */ @@ -561,7 +569,6 @@ void ip_vs_stop_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats) } if (ktid > 0) { - mutex_lock(&ipvs->est_mutex); ip_vs_est_kthread_destroy(kd); ipvs->est_kt_arr[ktid] = NULL; if (ktid == ipvs->est_kt_count - 1) { @@ -570,7 +577,6 @@ void ip_vs_stop_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats) !ipvs->est_kt_arr[ipvs->est_kt_count - 1]) ipvs->est_kt_count--; } - mutex_unlock(&ipvs->est_mutex); /* This slot is now empty, prefer another available kt slot */ if (ktid == ipvs->est_add_ktid) @@ -582,13 +588,11 @@ void ip_vs_stop_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats) if (ipvs->est_kt_count == 1 && hlist_empty(&ipvs->est_temp_list)) { kd = ipvs->est_kt_arr[0]; if (!kd || !kd->est_count) { - mutex_lock(&ipvs->est_mutex); if (kd) { ip_vs_est_kthread_destroy(kd); ipvs->est_kt_arr[0] = NULL; } ipvs->est_kt_count--; - mutex_unlock(&ipvs->est_mutex); ipvs->est_add_ktid = 0; } } @@ -602,13 +606,17 @@ static void ip_vs_est_drain_temp_list(struct netns_ipvs *ipvs) while (1) { int max = 16; - mutex_lock(&ipvs->service_mutex); + while (!mutex_trylock(&ipvs->service_mutex)) { + if (kthread_should_stop() || !READ_ONCE(ipvs->enable)) + return; + cond_resched(); + } while (max-- > 0) { est = hlist_entry_safe(ipvs->est_temp_list.first, struct ip_vs_estimator, list); if (est) { - if (kthread_should_stop()) + if (!READ_ONCE(ipvs->enable)) goto unlock; hlist_del_init(&est->list); if (ip_vs_enqueue_estimator(ipvs, est) >= 0) @@ -647,7 +655,11 @@ static int ip_vs_est_calc_limits(struct netns_ipvs *ipvs, int *chain_max) u64 val; INIT_HLIST_HEAD(&chain); - mutex_lock(&ipvs->service_mutex); + while (!mutex_trylock(&ipvs->service_mutex)) { + if (kthread_should_stop() || !READ_ONCE(ipvs->enable)) + return 0; + cond_resched(); + } kd = ipvs->est_kt_arr[0]; mutex_unlock(&ipvs->service_mutex); s = kd ? kd->calc_stats : NULL; @@ -748,22 +760,24 @@ static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs) if (!ip_vs_est_calc_limits(ipvs, &chain_max)) return; - mutex_lock(&ipvs->service_mutex); + while (!mutex_trylock(&ipvs->service_mutex)) { + if (kthread_should_stop() || !READ_ONCE(ipvs->enable)) + return; + cond_resched(); + } /* Stop all other tasks, so that we can immediately move the * estimators to est_temp_list without RCU grace period */ - mutex_lock(&ipvs->est_mutex); for (id = 1; id < ipvs->est_kt_count; id++) { /* netns clean up started, abort */ - if (!READ_ONCE(ipvs->enable)) - goto unlock2; + if (kthread_should_stop() || !READ_ONCE(ipvs->enable)) + goto unlock; kd = ipvs->est_kt_arr[id]; if (!kd) continue; ip_vs_est_kthread_stop(kd); } - mutex_unlock(&ipvs->est_mutex); /* Move all estimators to est_temp_list but carefully, * all estimators and kthread data can be released while @@ -817,7 +831,11 @@ static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs) */ mutex_unlock(&ipvs->service_mutex); cond_resched(); - mutex_lock(&ipvs->service_mutex); + while (!mutex_trylock(&ipvs->service_mutex)) { + if (kthread_should_stop() || !READ_ONCE(ipvs->enable)) + return; + cond_resched(); + } /* Current kt released ? */ if (id >= ipvs->est_kt_count) @@ -889,7 +907,6 @@ static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs) if (genid == atomic_read(&ipvs->est_genid)) ipvs->est_calc_phase = 0; -unlock2: mutex_unlock(&ipvs->est_mutex); unlock: -- 2.53.0
