Userland threads are preempt()'d when hogging a CPU or when processing
an AST. Currently, when such a thread is preempted, the scheduler looks
for an idle CPU and puts the thread on that CPU's run queue. That means
an involuntary context switch often results in a migration.
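For reference, the relevant part of the current preempt() path looks
roughly like this (a simplified sketch of the pre-diff behavior; see the
sched_bsd.c hunk below for the exact context):

	/* Simplified sketch of the current (pre-diff) preempt() behavior. */
	SCHED_LOCK(s);
	p->p_priority = p->p_usrpri;
	p->p_stat = SRUN;
	p->p_cpu = sched_choosecpu(p);	/* may pick another (idle) CPU */
	setrunqueue(p);			/* queued on p->p_cpu's run queue */
	p->p_ru.ru_nivcsw++;
	mi_switch();
	SCHED_UNLOCK(s);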
This is not a problem per se, and one could argue that if another CPU
is idle it makes sense to move. However, with the KERNEL_LOCK(), moving
to another CPU won't necessarily allow the preempt()'d thread to run.
Worse, it increases contention.
Add to this the fact that sched_choosecpu() prefers idle CPUs in a
linear order, meaning CPU0 > CPU1 > .. > CPUN, and you'll see that the
set of idle CPUs changes every time preempt() is called.
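To make the linear preference concrete, here is a minimal sketch of that
selection logic. This is a hypothetical helper, not the real
sched_choosecpu(), which also weighs cost and affinity; it assumes the
sched_idle_cpus cpuset from kern_sched.c is visible:

	/*
	 * Hypothetical helper illustrating the linear idle-CPU
	 * preference: CPUs are scanned in index order, so CPU0 is
	 * always picked before CPU1, and so on.
	 */
	struct cpu_info *
	choose_idle_cpu(void)
	{
		CPU_INFO_ITERATOR cii;
		struct cpu_info *ci;

		CPU_INFO_FOREACH(cii, ci) {
			if (cpuset_isset(&sched_idle_cpus, ci))
				return (ci);
		}

		return (curcpu());	/* no idle CPU, stay where we are */
	}

Since the scan always restarts at the lowest-numbered CPUs, whichever
low-numbered CPU just went idle attracts the next preempted thread,
which is why the idle set keeps shifting around.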
I believe this behavior affects kernel threads as a side effect, since
the set of idle CPUs changes every time a thread is preempted. With this
diff the 'softnet' thread didn't move on a 2-CPU machine during simple
benchmarks. Without it, it plays ping-pong between CPUs.
The goal of this diff is to reduce the number of migrations. You
can compare the values of 'sched_nomigrations' and 'sched_nmigrations'
with and without it.
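If you want to read those counters from userland, something along these
lines should work through kvm(3), since they are plain kernel globals.
The symbol names, the leading underscore, and the kmem privileges needed
are assumptions on my side:

	/* Rough sketch: dump the migration counters via kvm(3). */
	#include <sys/types.h>
	#include <err.h>
	#include <fcntl.h>
	#include <kvm.h>
	#include <limits.h>
	#include <nlist.h>
	#include <stdint.h>
	#include <stdio.h>

	int
	main(void)
	{
		char errbuf[_POSIX2_LINE_MAX];
		struct nlist nl[] = {
			{ "_sched_nmigrations" },	/* assumed symbol names */
			{ "_sched_nomigrations" },
			{ NULL }
		};
		uint64_t nmig, nomig;
		kvm_t *kd;

		kd = kvm_openfiles(NULL, NULL, NULL, O_RDONLY, errbuf);
		if (kd == NULL)
			errx(1, "kvm_openfiles: %s", errbuf);
		if (kvm_nlist(kd, nl) != 0)
			errx(1, "kvm_nlist: symbols not found");
		if (kvm_read(kd, nl[0].n_value, &nmig, sizeof(nmig)) < 0 ||
		    kvm_read(kd, nl[1].n_value, &nomig, sizeof(nomig)) < 0)
			errx(1, "kvm_read: %s", kvm_geterr(kd));
		printf("sched_nmigrations=%llu sched_nomigrations=%llu\n",
		    (unsigned long long)nmig, (unsigned long long)nomig);
		kvm_close(kd);
		return (0);
	}

Link with -lkvm and run it with enough privilege to read /dev/kmem;
comparing the numbers from a run with and without the diff shows whether
migrations actually went down.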
As usual, I'd like to know the impact of this diff on your
favorite benchmark. Please test and report back.
Index: kern/kern_sched.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_sched.c,v
retrieving revision 1.44
diff -u -p -r1.44 kern_sched.c
--- kern/kern_sched.c 21 Jan 2017 05:42:03 -0000 1.44
+++ kern/kern_sched.c 24 Jan 2017 03:08:23 -0000
@@ -51,6 +51,8 @@ uint64_t sched_noidle; /* Times we didn
uint64_t sched_stolen; /* Times we stole proc from other cpus */
uint64_t sched_choose; /* Times we chose a cpu */
uint64_t sched_wasidle; /* Times we came out of idle */
+uint64_t sched_nvcsw; /* voluntary context switches */
+uint64_t sched_nivcsw; /* involuntary context switches */
#ifdef MULTIPROCESSOR
struct taskq *sbartq;
Index: kern/kern_synch.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_synch.c,v
retrieving revision 1.136
diff -u -p -r1.136 kern_synch.c
--- kern/kern_synch.c 21 Jan 2017 05:42:03 -0000 1.136
+++ kern/kern_synch.c 24 Jan 2017 03:08:23 -0000
@@ -296,6 +296,7 @@ sleep_finish(struct sleep_state *sls, in
if (sls->sls_do_sleep && do_sleep) {
p->p_stat = SSLEEP;
p->p_ru.ru_nvcsw++;
+ sched_nvcsw++;
SCHED_ASSERT_LOCKED();
mi_switch();
} else if (!do_sleep) {
@@ -481,6 +482,7 @@ sys_sched_yield(struct proc *p, void *v,
p->p_stat = SRUN;
setrunqueue(p);
p->p_ru.ru_nvcsw++;
+ sched_nvcsw++;
mi_switch();
SCHED_UNLOCK(s);
Index: kern/sched_bsd.c
===================================================================
RCS file: /cvs/src/sys/kern/sched_bsd.c,v
retrieving revision 1.43
diff -u -p -r1.43 sched_bsd.c
--- kern/sched_bsd.c 9 Mar 2016 13:38:50 -0000 1.43
+++ kern/sched_bsd.c 24 Jan 2017 03:18:24 -0000
@@ -302,6 +302,7 @@ yield(void)
p->p_stat = SRUN;
setrunqueue(p);
p->p_ru.ru_nvcsw++;
+ sched_nvcsw++;
mi_switch();
SCHED_UNLOCK(s);
}
@@ -327,9 +328,12 @@ preempt(struct proc *newp)
SCHED_LOCK(s);
p->p_priority = p->p_usrpri;
p->p_stat = SRUN;
+#if 0
p->p_cpu = sched_choosecpu(p);
+#endif
setrunqueue(p);
p->p_ru.ru_nivcsw++;
+ sched_nivcsw++;
mi_switch();
SCHED_UNLOCK(s);
}
Index: sys/sched.h
===================================================================
RCS file: /cvs/src/sys/sys/sched.h,v
retrieving revision 1.41
diff -u -p -r1.41 sched.h
--- sys/sched.h 17 Mar 2016 13:18:47 -0000 1.41
+++ sys/sched.h 24 Jan 2017 02:10:41 -0000
@@ -134,6 +134,9 @@ struct schedstate_percpu {
extern int schedhz; /* ideally: 16 */
extern int rrticks_init; /* ticks per roundrobin() */
+extern uint64_t sched_nvcsw; /* voluntary context switches */
+extern uint64_t sched_nivcsw; /* involuntary context switches */
+
struct proc;
void schedclock(struct proc *);
struct cpu_info;