Hi,

With the removal of the kernel lock from timeout_barrier(9),
softclock() and the timeout thread no longer need the kernel lock
themselves.

However, many timeouts implicitly rely on the kernel lock.

So, to unlock softclock() and the timeout thread, I propose adding a
new flag, TIMEOUT_MPSAFE.  The flag signifies that a given timeout
doesn't need the kernel lock at all: we'll run all such timeouts
without taking the kernel lock.
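
For instance, a driver could opt in by protecting its own state with a
per-softc mutex and setting the flag with timeout_set_flags().  This is
just an illustrative sketch, not part of the diff; "foo_softc" and
friends are made up:

#include <sys/mutex.h>
#include <sys/timeout.h>

struct foo_softc {
	struct mutex	sc_mtx;		/* protects sc_count */
	struct timeout	sc_tick;
	unsigned int	sc_count;
};

void
foo_tick(void *arg)
{
	struct foo_softc *sc = arg;

	/* Runs without the kernel lock; serialize with our own mutex. */
	mtx_enter(&sc->sc_mtx);
	sc->sc_count++;
	mtx_leave(&sc->sc_mtx);

	timeout_add_sec(&sc->sc_tick, 1);	/* reschedule ourselves */
}

void
foo_attach(struct foo_softc *sc)
{
	mtx_init(&sc->sc_mtx, IPL_SOFTCLOCK);
	timeout_set_flags(&sc->sc_tick, foo_tick, sc, TIMEOUT_MPSAFE);
	timeout_add_sec(&sc->sc_tick, 1);
}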

Taking and releasing the kernel lock around timeout_run() on a
per-timeout basis is far too slow.  Instead, during softclock() we can
aggregate TIMEOUT_MPSAFE timeouts onto secondary queues and process
them all at once after dropping the kernel lock, which minimizes the
locking overhead.

Normal (non-process) TIMEOUT_MPSAFE timeouts are put onto
timeout_todo_mpsafe and are run during softclock().  Process-context
TIMEOUT_MPSAFE timeouts are put onto timeout_proc_mpsafe and are run
from the timeout thread.

The funky locking dance in softclock() and softclock_thread() is
needed to avoid violating the lock hierarchy: timeout_mutex is above
the kernel lock, so we need to leave timeout_mutex, then drop the
kernel lock, and then reenter timeout_mutex before running the
TIMEOUT_MPSAFE timeouts.
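
Condensed, the dance in both functions looks like this (paraphrasing
the diff below):

	mtx_leave(&timeout_mutex);	/* timeout_mutex is above the kernel lock */
	KERNEL_UNLOCK();
	mtx_enter(&timeout_mutex);
	/* ... run everything on the TIMEOUT_MPSAFE queue ... */
	mtx_leave(&timeout_mutex);
	KERNEL_LOCK();			/* restore the lock before returning */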

Thoughts?

Index: kern/kern_timeout.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_timeout.c,v
retrieving revision 1.84
diff -u -p -r1.84 kern_timeout.c
--- kern/kern_timeout.c 11 May 2021 13:29:25 -0000      1.84
+++ kern/kern_timeout.c 13 May 2021 05:01:53 -0000
@@ -74,7 +74,9 @@ struct circq timeout_wheel[BUCKETS];  /* 
 struct circq timeout_wheel_kc[BUCKETS];        /* [T] Clock-based timeouts */
 struct circq timeout_new;              /* [T] New, unscheduled timeouts */
 struct circq timeout_todo;             /* [T] Due or needs rescheduling */
+struct circq timeout_todo_mpsafe;      /* [T] Due + no kernel lock */
 struct circq timeout_proc;             /* [T] Due + needs process context */
+struct circq timeout_proc_mpsafe;      /* [T] Due, proc ctx, no kernel lock */
 
 time_t timeout_level_width[WHEELCOUNT];        /* [I] Wheel level width (seconds) */
 struct timespec tick_ts;               /* [I] Length of a tick (1/hz secs) */
@@ -228,7 +230,9 @@ timeout_startup(void)
 
        CIRCQ_INIT(&timeout_new);
        CIRCQ_INIT(&timeout_todo);
+       CIRCQ_INIT(&timeout_todo_mpsafe);
        CIRCQ_INIT(&timeout_proc);
+       CIRCQ_INIT(&timeout_proc_mpsafe);
        for (b = 0; b < nitems(timeout_wheel); b++)
                CIRCQ_INIT(&timeout_wheel[b]);
        for (b = 0; b < nitems(timeout_wheel_kc); b++)
@@ -697,7 +701,13 @@ softclock_process_kclock_timeout(struct 
        if (!new && timespeccmp(&to->to_abstime, &kc->kc_late, <=))
                tostat.tos_late++;
        if (ISSET(to->to_flags, TIMEOUT_PROC)) {
-               CIRCQ_INSERT_TAIL(&timeout_proc, &to->to_list);
+               if (ISSET(to->to_flags, TIMEOUT_MPSAFE))
+                       CIRCQ_INSERT_TAIL(&timeout_proc_mpsafe, &to->to_list);
+               else
+                       CIRCQ_INSERT_TAIL(&timeout_proc, &to->to_list);
+               return;
+       } else if (ISSET(to->to_flags, TIMEOUT_MPSAFE)) {
+               CIRCQ_INSERT_TAIL(&timeout_todo_mpsafe, &to->to_list);
                return;
        }
        timeout_run(to);
@@ -719,7 +729,13 @@ softclock_process_tick_timeout(struct ti
        if (!new && delta < 0)
                tostat.tos_late++;
        if (ISSET(to->to_flags, TIMEOUT_PROC)) {
-               CIRCQ_INSERT_TAIL(&timeout_proc, &to->to_list);
+               if (ISSET(to->to_flags, TIMEOUT_MPSAFE))
+                       CIRCQ_INSERT_TAIL(&timeout_proc_mpsafe, &to->to_list);
+               else
+                       CIRCQ_INSERT_TAIL(&timeout_proc, &to->to_list);
+               return;
+       } else if (ISSET(to->to_flags, TIMEOUT_MPSAFE)) {
+               CIRCQ_INSERT_TAIL(&timeout_todo_mpsafe, &to->to_list);
                return;
        }
        timeout_run(to);
@@ -735,11 +751,14 @@ softclock_process_tick_timeout(struct ti
 void
 softclock(void *arg)
 {
+       struct circq *cq;
        struct timeout *first_new, *to;
-       int needsproc, new;
+       int needsproc, new, unlocked;
 
        first_new = NULL;
+       needsproc = 1;
        new = 0;
+       unlocked = 0;
 
        mtx_enter(&timeout_mutex);
        if (!CIRCQ_EMPTY(&timeout_new))
@@ -755,10 +774,27 @@ softclock(void *arg)
                else
                        softclock_process_tick_timeout(to, new);
        }
+       if (!CIRCQ_EMPTY(&timeout_todo_mpsafe)) {
+               mtx_leave(&timeout_mutex);
+               KERNEL_UNLOCK();
+               unlocked = 1;
+               mtx_enter(&timeout_mutex);
+               while (!CIRCQ_EMPTY(&timeout_todo_mpsafe)) {
+                       cq = CIRCQ_FIRST(&timeout_todo_mpsafe);
+                       to = timeout_from_circq(cq);
+                       CIRCQ_REMOVE(&to->to_list);
+                       timeout_run(to);
+                       tostat.tos_run_softclock++;
+               }
+       }
        tostat.tos_softclocks++;
-       needsproc = !CIRCQ_EMPTY(&timeout_proc);
+       if (CIRCQ_EMPTY(&timeout_proc) && CIRCQ_EMPTY(&timeout_proc_mpsafe))
+               needsproc = 0;
        mtx_leave(&timeout_mutex);
 
+       if (unlocked)
+               KERNEL_LOCK();
+
        if (needsproc)
                wakeup(&timeout_proc);
 }
@@ -775,9 +811,10 @@ softclock_thread(void *arg)
 {
        CPU_INFO_ITERATOR cii;
        struct cpu_info *ci;
+       struct circq *cq;
        struct sleep_state sls;
        struct timeout *to;
-       int s;
+       int do_sleep, s, unlocked;
 
        KERNEL_ASSERT_LOCKED();
 
@@ -792,7 +829,12 @@ softclock_thread(void *arg)
        s = splsoftclock();
        for (;;) {
                sleep_setup(&sls, &timeout_proc, PSWP, "bored", 0);
-               sleep_finish(&sls, CIRCQ_EMPTY(&timeout_proc));
+               if (CIRCQ_EMPTY(&timeout_proc) &&
+                   CIRCQ_EMPTY(&timeout_proc_mpsafe))
+                       do_sleep = 1;
+               else
+                       do_sleep = 0;
+               sleep_finish(&sls, do_sleep);
 
                mtx_enter(&timeout_mutex);
                while (!CIRCQ_EMPTY(&timeout_proc)) {
@@ -801,8 +843,24 @@ softclock_thread(void *arg)
                        timeout_run(to);
                        tostat.tos_run_thread++;
                }
+               if (!CIRCQ_EMPTY(&timeout_proc_mpsafe)) {
+                       mtx_leave(&timeout_mutex);
+                       KERNEL_UNLOCK();
+                       unlocked = 1;
+                       mtx_enter(&timeout_mutex);
+                       while (!CIRCQ_EMPTY(&timeout_proc_mpsafe)) {
+                               cq = CIRCQ_FIRST(&timeout_proc_mpsafe);
+                               to = timeout_from_circq(cq);
+                               CIRCQ_REMOVE(&to->to_list);
+                               timeout_run(to);
+                               tostat.tos_run_thread++;
+                       }
+               } else
+                       unlocked = 0;
                tostat.tos_thread_wakeups++;
                mtx_leave(&timeout_mutex);
+               if (unlocked)
+                       KERNEL_LOCK();
        }
        splx(s);
 }
Index: sys/timeout.h
===================================================================
RCS file: /cvs/src/sys/sys/timeout.h,v
retrieving revision 1.40
diff -u -p -r1.40 timeout.h
--- sys/timeout.h       15 Oct 2020 20:03:44 -0000      1.40
+++ sys/timeout.h       13 May 2021 05:01:53 -0000
@@ -79,6 +79,7 @@ struct timeout {
 #define TIMEOUT_INITIALIZED    0x04    /* initialized */
 #define TIMEOUT_TRIGGERED      0x08    /* running or ran */
 #define TIMEOUT_KCLOCK         0x10    /* clock-based timeout */
+#define TIMEOUT_MPSAFE         0x20    /* doesn't need kernel lock */
 
 struct timeoutstat {
        uint64_t tos_added;             /* timeout_add*(9) calls */
