Hi,

With the removal of the kernel lock from timeout_barrier(9), softclock()
and the timeout thread themselves no longer need the kernel lock.
However, many timeouts implicitly rely on the kernel lock, so to unlock
softclock() and the timeout thread I propose adding a new flag,
TIMEOUT_MPSAFE.  The flag signifies that a given timeout doesn't need
the kernel lock at all: we run every such timeout without taking the
kernel lock.

Taking and releasing the kernel lock around timeout_run() on a
per-timeout basis is way too slow.  Instead, during softclock() we
aggregate TIMEOUT_MPSAFE timeouts onto secondary queues and process
them all at once after dropping the kernel lock.  This minimizes the
locking overhead.

Normal (softclock-context) timeouts are put onto timeout_todo_mpsafe
and run during softclock().  Process-context timeouts are put onto
timeout_proc_mpsafe and run from the timeout thread.

The funky locking dance in softclock() and softclock_thread() is needed
to keep from violating the locking hierarchy: timeout_mutex is above
the kernel lock, so we have to leave timeout_mutex, drop the kernel
lock, and then reenter timeout_mutex before running the TIMEOUT_MPSAFE
timeouts.
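As a strawman, here is roughly how a consumer could opt in once this
goes in.  The "foo" driver below is hypothetical and I'm assuming the
flag would be passed via timeout_set_flags(9); only TIMEOUT_MPSAFE
itself is new in this diff.

	#include <sys/param.h>
	#include <sys/systm.h>
	#include <sys/mutex.h>
	#include <sys/timeout.h>

	struct foo_softc {
		struct mutex	sc_mtx;		/* protects sc_ticks */
		struct timeout	sc_tmo;
		unsigned int	sc_ticks;
	};

	void
	foo_tick(void *arg)
	{
		struct foo_softc *sc = arg;

		/* Runs without the kernel lock: use our own mutex. */
		mtx_enter(&sc->sc_mtx);
		sc->sc_ticks++;
		mtx_leave(&sc->sc_mtx);

		timeout_add_msec(&sc->sc_tmo, 1000);
	}

	void
	foo_attach(struct foo_softc *sc)
	{
		mtx_init(&sc->sc_mtx, IPL_SOFTCLOCK);
		timeout_set_flags(&sc->sc_tmo, foo_tick, sc, TIMEOUT_MPSAFE);
		timeout_add_msec(&sc->sc_tmo, 1000);
	}

The point is that a TIMEOUT_MPSAFE handler can't lean on the kernel
lock for anything, so it has to serialize access to its own state with
its own lock (or be lock-free).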
Thoughts?

Index: kern/kern_timeout.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_timeout.c,v
retrieving revision 1.84
diff -u -p -r1.84 kern_timeout.c
--- kern/kern_timeout.c	11 May 2021 13:29:25 -0000	1.84
+++ kern/kern_timeout.c	13 May 2021 05:01:53 -0000
@@ -74,7 +74,9 @@ struct circq timeout_wheel[BUCKETS];	/*
 struct circq timeout_wheel_kc[BUCKETS];	/* [T] Clock-based timeouts */
 struct circq timeout_new;		/* [T] New, unscheduled timeouts */
 struct circq timeout_todo;		/* [T] Due or needs rescheduling */
+struct circq timeout_todo_mpsafe;	/* [T] Due + no kernel lock */
 struct circq timeout_proc;		/* [T] Due + needs process context */
+struct circq timeout_proc_mpsafe;	/* [T] Due, proc ctx, no kernel lock */
 
 time_t timeout_level_width[WHEELCOUNT];	/* [I] Wheel level width (seconds) */
 struct timespec tick_ts;		/* [I] Length of a tick (1/hz secs) */
@@ -228,7 +230,9 @@ timeout_startup(void)
 
 	CIRCQ_INIT(&timeout_new);
 	CIRCQ_INIT(&timeout_todo);
+	CIRCQ_INIT(&timeout_todo_mpsafe);
 	CIRCQ_INIT(&timeout_proc);
+	CIRCQ_INIT(&timeout_proc_mpsafe);
 	for (b = 0; b < nitems(timeout_wheel); b++)
 		CIRCQ_INIT(&timeout_wheel[b]);
 	for (b = 0; b < nitems(timeout_wheel_kc); b++)
@@ -697,7 +701,13 @@ softclock_process_kclock_timeout(struct 
 	if (!new && timespeccmp(&to->to_abstime, &kc->kc_late, <=))
 		tostat.tos_late++;
 	if (ISSET(to->to_flags, TIMEOUT_PROC)) {
-		CIRCQ_INSERT_TAIL(&timeout_proc, &to->to_list);
+		if (ISSET(to->to_flags, TIMEOUT_MPSAFE))
+			CIRCQ_INSERT_TAIL(&timeout_proc_mpsafe, &to->to_list);
+		else
+			CIRCQ_INSERT_TAIL(&timeout_proc, &to->to_list);
+		return;
+	} else if (ISSET(to->to_flags, TIMEOUT_MPSAFE)) {
+		CIRCQ_INSERT_TAIL(&timeout_todo_mpsafe, &to->to_list);
 		return;
 	}
 	timeout_run(to);
@@ -719,7 +729,13 @@ softclock_process_tick_timeout(struct ti
 	if (!new && delta < 0)
 		tostat.tos_late++;
 	if (ISSET(to->to_flags, TIMEOUT_PROC)) {
-		CIRCQ_INSERT_TAIL(&timeout_proc, &to->to_list);
+		if (ISSET(to->to_flags, TIMEOUT_MPSAFE))
+			CIRCQ_INSERT_TAIL(&timeout_proc_mpsafe, &to->to_list);
+		else
+			CIRCQ_INSERT_TAIL(&timeout_proc, &to->to_list);
+		return;
+	} else if (ISSET(to->to_flags, TIMEOUT_MPSAFE)) {
+		CIRCQ_INSERT_TAIL(&timeout_todo_mpsafe, &to->to_list);
 		return;
 	}
 	timeout_run(to);
@@ -735,11 +751,14 @@ softclock_process_tick_timeout(struct ti
 void
 softclock(void *arg)
 {
+	struct circq *cq;
 	struct timeout *first_new, *to;
-	int needsproc, new;
+	int needsproc, new, unlocked;
 
 	first_new = NULL;
+	needsproc = 1;
 	new = 0;
+	unlocked = 0;
 
 	mtx_enter(&timeout_mutex);
 	if (!CIRCQ_EMPTY(&timeout_new))
@@ -755,10 +774,27 @@ softclock(void *arg)
 		else
 			softclock_process_tick_timeout(to, new);
 	}
+	if (!CIRCQ_EMPTY(&timeout_todo_mpsafe)) {
+		mtx_leave(&timeout_mutex);
+		KERNEL_UNLOCK();
+		unlocked = 1;
+		mtx_enter(&timeout_mutex);
+		while (!CIRCQ_EMPTY(&timeout_todo_mpsafe)) {
+			cq = CIRCQ_FIRST(&timeout_todo_mpsafe);
+			to = timeout_from_circq(cq);
+			CIRCQ_REMOVE(&to->to_list);
+			timeout_run(to);
+			tostat.tos_run_softclock++;
+		}
+	}
 	tostat.tos_softclocks++;
-	needsproc = !CIRCQ_EMPTY(&timeout_proc);
+	if (CIRCQ_EMPTY(&timeout_proc) && CIRCQ_EMPTY(&timeout_proc_mpsafe))
+		needsproc = 0;
 	mtx_leave(&timeout_mutex);
 
+	if (unlocked)
+		KERNEL_LOCK();
+
 	if (needsproc)
 		wakeup(&timeout_proc);
 }
@@ -775,9 +811,10 @@ softclock_thread(void *arg)
 {
 	CPU_INFO_ITERATOR cii;
 	struct cpu_info *ci;
+	struct circq *cq;
 	struct sleep_state sls;
 	struct timeout *to;
-	int s;
+	int do_sleep, s, unlocked;
 
 	KERNEL_ASSERT_LOCKED();
 
@@ -792,7 +829,12 @@ softclock_thread(void *arg)
 	s = splsoftclock();
 	for (;;) {
 		sleep_setup(&sls, &timeout_proc, PSWP, "bored", 0);
-		sleep_finish(&sls, CIRCQ_EMPTY(&timeout_proc));
+		if (CIRCQ_EMPTY(&timeout_proc) &&
+		    CIRCQ_EMPTY(&timeout_proc_mpsafe))
+			do_sleep = 1;
+		else
+			do_sleep = 0;
+		sleep_finish(&sls, do_sleep);
 
 		mtx_enter(&timeout_mutex);
 		while (!CIRCQ_EMPTY(&timeout_proc)) {
@@ -801,8 +843,24 @@ softclock_thread(void *arg)
 			timeout_run(to);
 			tostat.tos_run_thread++;
 		}
+		if (!CIRCQ_EMPTY(&timeout_proc_mpsafe)) {
+			mtx_leave(&timeout_mutex);
+			KERNEL_UNLOCK();
+			unlocked = 1;
+			mtx_enter(&timeout_mutex);
+			while (!CIRCQ_EMPTY(&timeout_proc_mpsafe)) {
+				cq = CIRCQ_FIRST(&timeout_proc_mpsafe);
+				to = timeout_from_circq(cq);
+				CIRCQ_REMOVE(&to->to_list);
+				timeout_run(to);
+				tostat.tos_run_thread++;
+			}
+		} else
+			unlocked = 0;
 		tostat.tos_thread_wakeups++;
 		mtx_leave(&timeout_mutex);
+		if (unlocked)
+			KERNEL_LOCK();
 	}
 	splx(s);
 }
Index: sys/timeout.h
===================================================================
RCS file: /cvs/src/sys/sys/timeout.h,v
retrieving revision 1.40
diff -u -p -r1.40 timeout.h
--- sys/timeout.h	15 Oct 2020 20:03:44 -0000	1.40
+++ sys/timeout.h	13 May 2021 05:01:53 -0000
@@ -79,6 +79,7 @@ struct timeout {
 #define TIMEOUT_INITIALIZED	0x04	/* initialized */
 #define TIMEOUT_TRIGGERED	0x08	/* running or ran */
 #define TIMEOUT_KCLOCK		0x10	/* clock-based timeout */
+#define TIMEOUT_MPSAFE		0x20	/* doesn't need kernel lock */
 
 struct timeoutstat {
 	uint64_t tos_added;		/* timeout_add*(9) calls */