Module Name:    src
Committed By:   pooka
Date:           Fri May 28 16:44:14 UTC 2010
Modified Files:
        src/sys/rump/librump/rumpkern: rump.c scheduler.c threads.c

Log Message:
Improve the CPU scheduler for a host MP system with multithreaded
access.  The old scheduler had a global freelist which caused a cache
crisis with multiple host threads trying to schedule a virtual CPU
simultaneously.

The rump scheduler is different from a normal thread scheduler, so it
has different requirements.  First, we schedule a CPU for a thread
(which we get from the host scheduler) instead of scheduling a thread
onto a CPU.  Second, scheduling points are at every entry to and exit
from the rump kernel, including (but not limited to) syscall entry
points and hypercalls.  This means scheduling happens a lot more
frequently than in a normal kernel.

For every lwp, cache the previously used CPU.  When scheduling, attempt
to reuse the same CPU.  If we get it, we can use it directly without
any memory barriers or expensive locks.  If the CPU is taken, migrate.
Use a lock/wait only in the slowpath.  Be very wary of walking the
entire CPU array, because that does not lead to a happy cacher.

The migration algorithm could probably benefit from improved heuristics
and tuning.  Even as such, with the new scheduler an application which
has two threads making rlimit syscalls in a tight loop experiences an
almost 400% speedup.  The exact speedup is difficult to pinpoint,
though, since the old scheduler caused very jittery results due to
cache contention.  Also, the rump version is now 70% faster than the
counterpart which calls the host kernel.


To generate a diff of this commit:
cvs rdiff -u -r1.171 -r1.172 src/sys/rump/librump/rumpkern/rump.c
cvs rdiff -u -r1.14 -r1.15 src/sys/rump/librump/rumpkern/scheduler.c
cvs rdiff -u -r1.8 -r1.9 src/sys/rump/librump/rumpkern/threads.c

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
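As a rough standalone sketch of the scheme described in the log message
above -- not the committed code; it uses C11 atomics and pthreads in
place of the rumpuser hypercalls, and the names struct vcpu,
vcpu_acquire(), vcpu_release(), NVCPU and VCPU_BUSY are invented for
illustration -- the fastpath/slowpath split looks roughly like this.
The sketch deliberately drops the RCPULWP_WANTED sentinel that lets the
real release path skip the lock, drops bound-thread handling, and makes
no attempt to avoid memory barriers on the fastpath the way the commit
does.

/*
 * Sketch only: per-thread CPU caching with a CAS fastpath and a
 * lock/wait slowpath.  All names here are hypothetical.
 */
#include <pthread.h>
#include <stdatomic.h>

#define NVCPU           4
#define VCPU_BUSY       ((void *)-1)    /* slot is currently taken */

struct vcpu {
        _Atomic(void *) prevthr;        /* last owner, or VCPU_BUSY */
        pthread_mutex_t mtx;            /* slowpath only */
        pthread_cond_t cv;              /* slowpath only */
        int wanted;                     /* threads queued in the slowpath */
};

static struct vcpu vcpus[NVCPU];
static _Thread_local struct vcpu *mycpu = &vcpus[0];    /* cached CPU */
static atomic_uint nextcpu;             /* round-robin migration target */

static void
vcpu_init(void)
{
        for (int i = 0; i < NVCPU; i++) {
                pthread_mutex_init(&vcpus[i].mtx, NULL);
                pthread_cond_init(&vcpus[i].cv, NULL);
        }
}

/* "self" is any pointer unique to the calling thread (the lwp, roughly). */
static struct vcpu *
vcpu_acquire(void *self)
{
        struct vcpu *vc = mycpu;
        void *expected = self;

        /* Fastpath: we were the previous user, so the CPU is cache-hot. */
        if (atomic_compare_exchange_strong(&vc->prevthr, &expected, VCPU_BUSY))
                return vc;

        /* Slowpath: migrate once (round robin), then wait for that CPU. */
        vc = &vcpus[atomic_fetch_add(&nextcpu, 1) % NVCPU];
        pthread_mutex_lock(&vc->mtx);
        while (atomic_exchange(&vc->prevthr, VCPU_BUSY) == VCPU_BUSY) {
                vc->wanted++;
                pthread_cond_wait(&vc->cv, &vc->mtx);
                vc->wanted--;
        }
        pthread_mutex_unlock(&vc->mtx);

        mycpu = vc;             /* remember for the next fastpath attempt */
        return vc;
}

static void
vcpu_release(struct vcpu *vc, void *self)
{
        /* Publish ourselves as the previous user; this frees the CPU. */
        atomic_store(&vc->prevthr, self);

        /* Wake anyone parked in the slowpath. */
        pthread_mutex_lock(&vc->mtx);
        if (vc->wanted)
                pthread_cond_broadcast(&vc->cv);
        pthread_mutex_unlock(&vc->mtx);
}

A host thread would bracket each entry into the "kernel" with
vc = vcpu_acquire(self) and vcpu_release(vc, self); those brackets
correspond to the scheduling points mentioned in the log message.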
Modified files:

Index: src/sys/rump/librump/rumpkern/rump.c
diff -u src/sys/rump/librump/rumpkern/rump.c:1.171 src/sys/rump/librump/rumpkern/rump.c:1.172
--- src/sys/rump/librump/rumpkern/rump.c:1.171  Tue May 11 14:57:20 2010
+++ src/sys/rump/librump/rumpkern/rump.c        Fri May 28 16:44:14 2010
@@ -1,4 +1,4 @@
-/* $NetBSD: rump.c,v 1.171 2010/05/11 14:57:20 pooka Exp $ */
+/* $NetBSD: rump.c,v 1.172 2010/05/28 16:44:14 pooka Exp $ */
 
 /*
  * Copyright (c) 2007 Antti Kantee. All Rights Reserved.
@@ -28,7 +28,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: rump.c,v 1.171 2010/05/11 14:57:20 pooka Exp $");
+__KERNEL_RCSID(0, "$NetBSD: rump.c,v 1.172 2010/05/28 16:44:14 pooka Exp $");
 
 #include <sys/systm.h>
 #define ELFSIZE ARCH_ELFSIZE
@@ -269,7 +269,7 @@
         /* init minimal lwp/cpu context */
         l = &lwp0;
         l->l_lid = 1;
-        l->l_cpu = rump_cpu;
+        l->l_cpu = l->l_target_cpu = rump_cpu;
         rumpuser_set_curlwp(l);
 
         mutex_init(&tty_lock, MUTEX_DEFAULT, IPL_NONE);
@@ -533,6 +533,7 @@
         l->l_lid = lid;
         l->l_fd = p->p_fd;
         l->l_cpu = NULL;
+        l->l_target_cpu = rump_cpu;
         lwp_initspecific(l);
 
         LIST_INSERT_HEAD(&alllwp, l, l_list);
@@ -545,7 +546,7 @@
         struct lwp *l = curlwp;
 
         rumpuser_set_curlwp(NULL);
-        newlwp->l_cpu = l->l_cpu;
+        newlwp->l_cpu = newlwp->l_target_cpu = l->l_cpu;
         newlwp->l_mutex = l->l_mutex;
         l->l_mutex = NULL;
         l->l_cpu = NULL;

Index: src/sys/rump/librump/rumpkern/scheduler.c
diff -u src/sys/rump/librump/rumpkern/scheduler.c:1.14 src/sys/rump/librump/rumpkern/scheduler.c:1.15
--- src/sys/rump/librump/rumpkern/scheduler.c:1.14      Tue May 18 14:58:42 2010
+++ src/sys/rump/librump/rumpkern/scheduler.c   Fri May 28 16:44:14 2010
@@ -1,10 +1,7 @@
-/* $NetBSD: scheduler.c,v 1.14 2010/05/18 14:58:42 pooka Exp $ */
+/* $NetBSD: scheduler.c,v 1.15 2010/05/28 16:44:14 pooka Exp $ */
 
 /*
- * Copyright (c) 2009 Antti Kantee. All Rights Reserved.
- *
- * Development of this software was supported by
- * The Finnish Cultural Foundation.
+ * Copyright (c) 2010 Antti Kantee. All Rights Reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -29,7 +26,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: scheduler.c,v 1.14 2010/05/18 14:58:42 pooka Exp $");
+__KERNEL_RCSID(0, "$NetBSD: scheduler.c,v 1.15 2010/05/28 16:44:14 pooka Exp $");
 
 #include <sys/param.h>
 #include <sys/cpu.h>
@@ -44,28 +41,62 @@
 #include "rump_private.h"
 
-/* should go for MAXCPUS at some point */
 static struct cpu_info rump_cpus[MAXCPUS];
 static struct rumpcpu {
+        /* needed in fastpath */
         struct cpu_info *rcpu_ci;
-        int rcpu_flags;
+        void *rcpu_prevlwp;
+
+        /* needed in slowpath */
+        struct rumpuser_mtx *rcpu_mtx;
         struct rumpuser_cv *rcpu_cv;
-        LIST_ENTRY(rumpcpu) rcpu_entries;
+        int rcpu_wanted;
+
+        /* offset 20 (P=4) or 36 (P=8) here */
+
+        /*
+         * Some stats.  Not really that necessary, but we should
+         * have room.  Note that these overflow quite fast, so need
+         * to be collected often.
+         */
+        unsigned int rcpu_fastpath;
+        unsigned int rcpu_slowpath;
+        unsigned int rcpu_migrated;
+
+        /* offset 32 (P=4) or 50 (P=8) */
+
+        int rcpu_align[0] __aligned(CACHE_LINE_SIZE);
 } rcpu_storage[MAXCPUS];
 struct cpu_info *rump_cpu = &rump_cpus[0];
 int ncpu;
 
-#define RCPU_WANTED 0x01 /* someone wants this specific CPU */
-#define RCPU_BUSY 0x02 /* CPU is busy */
-#define RCPU_FREELIST 0x04 /* CPU is on freelist */
-
-static LIST_HEAD(,rumpcpu) cpu_freelist = LIST_HEAD_INITIALIZER(cpu_freelist);
+#define RCPULWP_BUSY ((void *)-1)
+#define RCPULWP_WANTED ((void *)-2)
 
-static struct rumpuser_mtx *schedmtx;
-static struct rumpuser_cv *schedcv, *lwp0cv;
+static struct rumpuser_mtx *lwp0mtx;
+static struct rumpuser_cv *lwp0cv;
+static unsigned nextcpu;
 static bool lwp0busy = false;
 
+/*
+ * Keep some stats.
+ *
+ * Keeping track of there is not really critical for speed, unless
+ * stats happen to be on a different cache line (CACHE_LINE_SIZE is
+ * really just a coarse estimate), so default for the performant case
+ * (i.e. no stats).
+ */
+#ifdef RUMPSCHED_STATS
+#define SCHED_FASTPATH(rcpu) rcpu->rcpu_fastpath++;
+#define SCHED_SLOWPATH(rcpu) rcpu->rcpu_slowpath++;
+#define SCHED_MIGRATED(rcpu) rcpu->rcpu_migrated++;
+#else
+#define SCHED_FASTPATH(rcpu)
+#define SCHED_SLOWPATH(rcpu)
+#define SCHED_MIGRATED(rcpu)
+#endif
+
 struct cpu_info *
 cpu_lookup(u_int index)
 {
@@ -73,6 +104,19 @@
         return &rump_cpus[index];
 }
 
+static inline struct rumpcpu *
+getnextcpu(void)
+{
+        unsigned newcpu;
+
+        newcpu = atomic_inc_uint_nv(&nextcpu);
+        if (__predict_false(ncpu > UINT_MAX/2))
+                atomic_and_uint(&nextcpu, 0);
+        newcpu = newcpu % ncpu;
+
+        return &rcpu_storage[newcpu];
+}
+
 /* this could/should be mi_attach_cpu? */
 void
 rump_cpus_bootstrap(int num)
@@ -103,8 +147,7 @@
         struct cpu_info *ci;
         int i;
 
-        rumpuser_mutex_init(&schedmtx);
-        rumpuser_cv_init(&schedcv);
+        rumpuser_mutex_init(&lwp0mtx);
         rumpuser_cv_init(&lwp0cv);
         for (i = 0; i < ncpu; i++) {
                 rcpu = &rcpu_storage[i];
@@ -113,9 +156,9 @@
                 ci->ci_schedstate.spc_mutex =
                     mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
                 ci->ci_schedstate.spc_flags = SPCF_RUNNING;
-                LIST_INSERT_HEAD(&cpu_freelist, rcpu, rcpu_entries);
-                rcpu->rcpu_flags = RCPU_FREELIST;
+                rcpu->rcpu_wanted = 0;
                 rumpuser_cv_init(&rcpu->rcpu_cv);
+                rumpuser_mutex_init(&rcpu->rcpu_mtx);
         }
 }
@@ -125,15 +168,22 @@
 void
 rump_schedlock_cv_wait(struct rumpuser_cv *cv)
 {
+        struct lwp *l = curlwp;
+        struct rumpcpu *rcpu = &rcpu_storage[l->l_cpu-&rump_cpus[0]];
 
-        rumpuser_cv_wait(cv, schedmtx);
+        /* mutex will be taken and released in cpu schedule/unschedule */
+        rumpuser_cv_wait(cv, rcpu->rcpu_mtx);
 }
 
 int
 rump_schedlock_cv_timedwait(struct rumpuser_cv *cv, const struct timespec *ts)
 {
+        struct lwp *l = curlwp;
+        struct rumpcpu *rcpu = &rcpu_storage[l->l_cpu-&rump_cpus[0]];
 
-        return rumpuser_cv_timedwait(cv, schedmtx, ts->tv_sec, ts->tv_nsec);
+        /* mutex will be taken and released in cpu schedule/unschedule */
+        return rumpuser_cv_timedwait(cv, rcpu->rcpu_mtx,
+            ts->tv_sec, ts->tv_nsec);
 }
 
 void
@@ -144,16 +194,18 @@
         /*
          * If there is no dedicated lwp, allocate a temp one and
          * set it to be free'd upon unschedule().  Use lwp0 context
-         * for reserving the necessary resources.
+         * for reserving the necessary resources.  Don't optimize
+         * for this case -- anyone who cares about performance will
+         * start a real thread.
          */
         l = rumpuser_get_curlwp();
         if (l == NULL) {
                 /* busy lwp0 */
-                rumpuser_mutex_enter_nowrap(schedmtx);
+                rumpuser_mutex_enter_nowrap(lwp0mtx);
                 while (lwp0busy)
-                        rumpuser_cv_wait_nowrap(lwp0cv, schedmtx);
+                        rumpuser_cv_wait_nowrap(lwp0cv, lwp0mtx);
                 lwp0busy = true;
-                rumpuser_mutex_exit(schedmtx);
+                rumpuser_mutex_exit(lwp0mtx);
 
                 /* schedule cpu and use lwp0 */
                 rump_schedule_cpu(&lwp0);
@@ -162,10 +214,10 @@
                 /* release lwp0 */
                 rump_lwp_switch(l);
 
-                rumpuser_mutex_enter_nowrap(schedmtx);
+                rumpuser_mutex_enter_nowrap(lwp0mtx);
                 lwp0busy = false;
                 rumpuser_cv_signal(lwp0cv);
-                rumpuser_mutex_exit(schedmtx);
+                rumpuser_mutex_exit(lwp0mtx);
 
                 /* mark new lwp as dead-on-exit */
                 rump_lwp_release(l);
@@ -181,45 +233,87 @@
         rump_schedule_cpu_interlock(l, NULL);
 }
 
+/*
+ * Schedule a CPU.  This optimizes for the case where we schedule
+ * the same thread often, and we have nCPU >= nFrequently-Running-Thread
+ * (where CPU is virtual rump cpu, not host CPU).
+ */
 void
 rump_schedule_cpu_interlock(struct lwp *l, void *interlock)
 {
         struct rumpcpu *rcpu;
-        bool schedlock = interlock == schedmtx;
+        void *old;
+        bool domigrate;
+        bool bound = l->l_pflag & LP_BOUND;
+
+        /*
+         * First, try fastpath: if we were the previous user of the
+         * CPU, everything is in order cachewise and we can just
+         * proceed to use it.
+         *
+         * If we are a different thread (i.e. CAS fails), we must go
+         * through a memory barrier to ensure we get a truthful
+         * view of the world.
+         */
 
-        if (!schedlock)
-                rumpuser_mutex_enter_nowrap(schedmtx);
+        rcpu = &rcpu_storage[l->l_target_cpu-&rump_cpus[0]];
+        if (atomic_cas_ptr(&rcpu->rcpu_prevlwp, l, RCPULWP_BUSY) == l) {
+                if (__predict_true(interlock == rcpu->rcpu_mtx))
+                        rumpuser_mutex_exit(rcpu->rcpu_mtx);
+                SCHED_FASTPATH(rcpu);
+                /* jones, you're the man */
+                goto fastlane;
+        }
 
-        if (l->l_pflag & LP_BOUND) {
-                KASSERT(l->l_cpu != NULL);
-                rcpu = &rcpu_storage[l->l_cpu-&rump_cpus[0]];
-                if (rcpu->rcpu_flags & RCPU_BUSY) {
-                        KASSERT((rcpu->rcpu_flags & RCPU_FREELIST) == 0);
-                        while (rcpu->rcpu_flags & RCPU_BUSY) {
-                                rcpu->rcpu_flags |= RCPU_WANTED;
-                                rumpuser_cv_wait_nowrap(rcpu->rcpu_cv,
-                                    schedmtx);
+        /*
+         * Else, it's the slowpath for us.  First, determine if we
+         * can migrate.
+         */
+        if (ncpu == 1)
+                domigrate = false;
+        else
+                domigrate = true;
+
+        /* Take lock.  This acts as a load barrier too. */
+        if (__predict_true(interlock != rcpu->rcpu_mtx))
+                rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);
+
+        for (;;) {
+                SCHED_SLOWPATH(rcpu);
+                old = atomic_swap_ptr(&rcpu->rcpu_prevlwp, RCPULWP_WANTED);
+
+                /* CPU is free? */
+                if (old != RCPULWP_BUSY && old != RCPULWP_WANTED) {
+                        if (atomic_cas_ptr(&rcpu->rcpu_prevlwp,
+                            RCPULWP_WANTED, RCPULWP_BUSY) == RCPULWP_WANTED) {
+                                break;
                         }
-                        rcpu->rcpu_flags &= ~RCPU_WANTED;
-                } else {
-                        KASSERT(rcpu->rcpu_flags & (RCPU_FREELIST|RCPU_WANTED));
-                }
-                if (rcpu->rcpu_flags & RCPU_FREELIST) {
-                        LIST_REMOVE(rcpu, rcpu_entries);
-                        rcpu->rcpu_flags &= ~RCPU_FREELIST;
                 }
-        } else {
-                while ((rcpu = LIST_FIRST(&cpu_freelist)) == NULL) {
-                        rumpuser_cv_wait_nowrap(schedcv, schedmtx);
+
+                /*
+                 * Do we want to migrate once?
+                 * This may need a slightly better algorithm, or we
+                 * might cache pingpong eternally for non-frequent
+                 * threads.
+                 */
+                if (domigrate && !bound) {
+                        domigrate = false;
+                        SCHED_MIGRATED(rcpu);
+                        rumpuser_mutex_exit(rcpu->rcpu_mtx);
+                        rcpu = getnextcpu();
+                        rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);
+                        continue;
                 }
-                KASSERT(rcpu->rcpu_flags & RCPU_FREELIST);
-                LIST_REMOVE(rcpu, rcpu_entries);
-                rcpu->rcpu_flags &= ~RCPU_FREELIST;
-                KASSERT(l->l_cpu == NULL);
-                l->l_cpu = rcpu->rcpu_ci;
+
+                /* Want CPU, wait until it's released an retry */
+                rcpu->rcpu_wanted++;
+                rumpuser_cv_wait_nowrap(rcpu->rcpu_cv, rcpu->rcpu_mtx);
+                rcpu->rcpu_wanted--;
         }
-        rcpu->rcpu_flags |= RCPU_BUSY;
-        rumpuser_mutex_exit(schedmtx);
+        rumpuser_mutex_exit(rcpu->rcpu_mtx);
+
+ fastlane:
+        l->l_cpu = l->l_target_cpu = rcpu->rcpu_ci;
         l->l_mutex = rcpu->rcpu_ci->ci_schedstate.spc_mutex;
 }
@@ -241,11 +335,11 @@
                 rumpuser_set_curlwp(NULL);
 
                 /* busy lwp0 */
-                rumpuser_mutex_enter_nowrap(schedmtx);
+                rumpuser_mutex_enter_nowrap(lwp0mtx);
                 while (lwp0busy)
-                        rumpuser_cv_wait_nowrap(lwp0cv, schedmtx);
+                        rumpuser_cv_wait_nowrap(lwp0cv, lwp0mtx);
                 lwp0busy = true;
-                rumpuser_mutex_exit(schedmtx);
+                rumpuser_mutex_exit(lwp0mtx);
 
                 rump_schedule_cpu(&lwp0);
                 rumpuser_set_curlwp(&lwp0);
@@ -253,10 +347,10 @@
                 rump_unschedule_cpu(&lwp0);
                 rumpuser_set_curlwp(NULL);
 
-                rumpuser_mutex_enter_nowrap(schedmtx);
+                rumpuser_mutex_enter_nowrap(lwp0mtx);
                 lwp0busy = false;
                 rumpuser_cv_signal(lwp0cv);
-                rumpuser_mutex_exit(schedmtx);
+                rumpuser_mutex_exit(lwp0mtx);
         }
 }
@@ -281,35 +375,47 @@
 {
         struct rumpcpu *rcpu;
         struct cpu_info *ci;
-        bool schedlock = interlock == schedmtx;
+        void *old;
 
         ci = l->l_cpu;
-        if ((l->l_pflag & LP_BOUND) == 0) {
-                l->l_cpu = NULL;
-        }
+        l->l_cpu = NULL;
         rcpu = &rcpu_storage[ci-&rump_cpus[0]];
+        KASSERT(rcpu->rcpu_ci == ci);
-        KASSERT(rcpu->rcpu_flags & RCPU_BUSY);
-        rumpuser_mutex_enter_nowrap(schedmtx);
+        /*
+         * Make sure all stores are seen before the CPU release.  This
+         * is relevant only in the non-fastpath scheduling case, but
+         * we don't know here if that's going to happen, so need to
+         * expect the worst.
+         */
+        membar_exit();
 
-        if (rcpu->rcpu_flags & RCPU_WANTED) {
-                /*
-                 * The assumption is that there will usually be max 1
-                 * thread waiting on the rcpu_cv, so broadcast is fine.
-                 * (and the current structure requires it because of
-                 * only a bitmask being used for wanting).
-                 */
-                rumpuser_cv_broadcast(rcpu->rcpu_cv);
-        } else {
-                LIST_INSERT_HEAD(&cpu_freelist, rcpu, rcpu_entries);
-                rcpu->rcpu_flags |= RCPU_FREELIST;
-                rumpuser_cv_signal(schedcv);
+        /* Release the CPU. */
+        old = atomic_swap_ptr(&rcpu->rcpu_prevlwp, l);
+
+        /* No waiters?  No problems.  We're outta here. */
+        if (old == RCPULWP_BUSY) {
+                /* Was the scheduler interlock requested? */
+                if (__predict_false(interlock == rcpu->rcpu_mtx))
+                        rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);
+                return;
         }
-        rcpu->rcpu_flags &= ~RCPU_BUSY;
-        if (!schedlock)
-                rumpuser_mutex_exit(schedmtx);
+
+        KASSERT(old == RCPULWP_WANTED);
+
+        /*
+         * Ok, things weren't so snappy.
+         *
+         * Snailpath: take lock and signal anyone waiting for this CPU.
+         */
+
+        rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);
+        if (rcpu->rcpu_wanted)
+                rumpuser_cv_broadcast(rcpu->rcpu_cv);
+
+        if (__predict_true(interlock != rcpu->rcpu_mtx))
+                rumpuser_mutex_exit(rcpu->rcpu_mtx);
 }
 
 /* Give up and retake CPU (perhaps a different one) */

Index: src/sys/rump/librump/rumpkern/threads.c
diff -u src/sys/rump/librump/rumpkern/threads.c:1.8 src/sys/rump/librump/rumpkern/threads.c:1.9
--- src/sys/rump/librump/rumpkern/threads.c:1.8 Tue Feb 9 16:53:13 2010
+++ src/sys/rump/librump/rumpkern/threads.c     Fri May 28 16:44:14 2010
@@ -1,4 +1,4 @@
-/* $NetBSD: threads.c,v 1.8 2010/02/09 16:53:13 pooka Exp $ */
+/* $NetBSD: threads.c,v 1.9 2010/05/28 16:44:14 pooka Exp $ */
 
 /*
  * Copyright (c) 2007-2009 Antti Kantee. All Rights Reserved.
@@ -29,7 +29,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: threads.c,v 1.8 2010/02/09 16:53:13 pooka Exp $");
+__KERNEL_RCSID(0, "$NetBSD: threads.c,v 1.9 2010/05/28 16:44:14 pooka Exp $");
 
 #include <sys/param.h>
 #include <sys/kmem.h>
@@ -136,13 +136,14 @@
         k->f = func;
         k->arg = arg;
         k->mylwp = l = rump_lwp_alloc(0, rump_nextlid());
+        l->l_flag |= LW_SYSTEM;
         if (flags & KTHREAD_MPSAFE)
                 l->l_pflag |= LP_MPSAFE;
         if (flags & KTHREAD_INTR)
                 l->l_pflag |= LP_INTR;
         if (ci) {
                 l->l_pflag |= LP_BOUND;
-                l->l_cpu = ci;
+                l->l_target_cpu = ci;
         }
         if (thrname) {
                 l->l_name = kmem_alloc(MAXCOMLEN, KM_SLEEP);
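A note on the struct rumpcpu layout in the scheduler.c hunk above: the
"offset ..." comments and the zero-length rcpu_align[0]
__aligned(CACHE_LINE_SIZE) member exist to pad each virtual CPU's
bookkeeping out to its own cache line, so host threads running on
different virtual CPUs do not false-share.  A minimal standalone
illustration of the same idea follows; it is not from the commit, the
names percpu_slot and NSLOT plus the 64-byte line size are assumptions,
and the kernel itself uses CACHE_LINE_SIZE with the zero-length array
trick rather than C11 alignas.

/*
 * Sketch: pad per-CPU state to a cache line to avoid false sharing.
 */
#include <assert.h>
#include <stdalign.h>

#define CACHE_LINE      64      /* assumed line size for this sketch */
#define NSLOT           8

struct percpu_slot {
        /* aligning the first member pads the whole struct to a line */
        alignas(CACHE_LINE) void *owner;        /* hot fastpath word */
        unsigned fastpath, slowpath, migrated;  /* colder stats */
};

static struct percpu_slot slots[NSLOT];

/* Each slots[] element therefore begins on its own cache line. */
static_assert(sizeof(struct percpu_slot) % CACHE_LINE == 0,
    "percpu_slot straddles a cache line");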