Hi,

Please find attached an updated timeout patch, which also updates clients in the kernel, such as cv_timedwait(), to use the callout API properly. Previously the callout subsystem contained some custom sleepqueue code; all of that has now been removed, and callouts may now be protected by spinlocks. This allows the callback to be torn down the same way as with regular mutexes, and a "td_slpmutex" has been added to "struct thread" to atomically tear down the "td_slpcallout". Further, the "TDF_TIMOFAIL" and "SWT_SLEEPQTIMO" states can now be removed completely.
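
For illustration, here is a minimal sketch (not part of the patch) of how a client can now combine a spin mutex with a callout to get an atomic teardown, mirroring what the patch does for "td_slpcallout" and "td_slpmutex". The "foo_" names are made up:

/*
 * Hypothetical example, not from the patch. Needs <sys/param.h>,
 * <sys/systm.h>, <sys/lock.h>, <sys/mutex.h> and <sys/callout.h>.
 */
struct foo_softc {
	struct mtx	sc_mtx;		/* spin mutex protecting the callout */
	struct callout	sc_timer;
};

static void
foo_init(struct foo_softc *sc)
{
	mtx_init(&sc->sc_mtx, "foo_timer", NULL, MTX_SPIN);
	callout_init_mtx(&sc->sc_timer, &sc->sc_mtx, 0);
}

static void
foo_stop(struct foo_softc *sc)
{
	/* stopping under the spin mutex is atomic wrt. the callback */
	mtx_lock_spin(&sc->sc_mtx);
	callout_stop(&sc->sc_timer);
	mtx_unlock_spin(&sc->sc_mtx);
}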

Summary of changes:

1) Make the callout API consistent and add support for protecting the callback function with a spinlock. This allows an atomic callout stop of "td_slpcallout" without the need for the many kernel threading quirks used before.

2) CPU migration is no longer allowed when a timeout is restarted while its callback is executing. Callouts must be stopped, and optionally drained, before CPU migration is allowed.

3) Shared-lock support has been removed, because it prevents the callback function from being stopped atomically.

4) A new API, "callout_drain_async()", has been added to drain callouts asynchronously. See the usage sketch below.
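
A rough usage sketch, as I read the patch ("foo_softc", "foo_drained" and "foo_detach" are made-up names): the drain function is called once the currently executing callback, if any, has finished; a return value of zero means no callback was executing and the drain function will not be called:

/*
 * Hypothetical example, not from the patch. Assumes "sc" was
 * allocated with malloc(9) using M_DEVBUF and needs <sys/malloc.h>
 * in addition to the headers above.
 */
static void
foo_drained(void *arg)
{
	struct foo_softc *sc = arg;

	/* the last callback invocation has completed */
	free(sc, M_DEVBUF);
}

static void
foo_detach(struct foo_softc *sc)
{
	if (callout_drain_async(&sc->sc_timer, &foo_drained, sc) == 0) {
		/* no callback was running; safe to free right away */
		free(sc, M_DEVBUF);
	}
	/* otherwise "foo_drained" will free "sc" later */
}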

Please test and report any errors!

The patch applies to FreeBSD 11-current as of today.

Thank you!

--HPS

Index: sys/ofed/include/linux/completion.h
===================================================================
--- sys/ofed/include/linux/completion.h	(revision 276531)
+++ sys/ofed/include/linux/completion.h	(working copy)
@@ -105,7 +105,9 @@
 		if (c->done)
 			break;
 		sleepq_add(c, NULL, "completion", flags, 0);
+		sleepq_release(c);
 		sleepq_set_timeout(c, end - ticks);
+		sleepq_lock(c);
 		if (flags & SLEEPQ_INTERRUPTIBLE) {
 			if (sleepq_timedwait_sig(c, 0) != 0)
 				return (-ERESTARTSYS);
Index: sys/kern/init_main.c
===================================================================
--- sys/kern/init_main.c	(revision 276531)
+++ sys/kern/init_main.c	(working copy)
@@ -504,7 +504,8 @@
 
 	callout_init_mtx(&p->p_itcallout, &p->p_mtx, 0);
 	callout_init_mtx(&p->p_limco, &p->p_mtx, 0);
-	callout_init(&td->td_slpcallout, CALLOUT_MPSAFE);
+	mtx_init(&td->td_slpmutex, "td_slpmutex", NULL, MTX_SPIN);
+	callout_init_mtx(&td->td_slpcallout, &td->td_slpmutex, 0);
 
 	/* Create credentials. */
 	p->p_ucred = crget();
Index: sys/kern/kern_condvar.c
===================================================================
--- sys/kern/kern_condvar.c	(revision 276531)
+++ sys/kern/kern_condvar.c	(working copy)
@@ -313,15 +313,13 @@
 	DROP_GIANT();
 
 	sleepq_add(cvp, lock, cvp->cv_description, SLEEPQ_CONDVAR, 0);
+	sleepq_release(cvp);
 	sleepq_set_timeout_sbt(cvp, sbt, pr, flags);
 	if (lock != &Giant.lock_object) {
-		if (class->lc_flags & LC_SLEEPABLE)
-			sleepq_release(cvp);
 		WITNESS_SAVE(lock, lock_witness);
 		lock_state = class->lc_unlock(lock);
-		if (class->lc_flags & LC_SLEEPABLE)
-			sleepq_lock(cvp);
 	}
+	sleepq_lock(cvp);
 	rval = sleepq_timedwait(cvp, 0);
 
 #ifdef KTRACE
@@ -383,15 +381,13 @@
 
 	sleepq_add(cvp, lock, cvp->cv_description, SLEEPQ_CONDVAR |
 	    SLEEPQ_INTERRUPTIBLE, 0);
+	sleepq_release(cvp);
 	sleepq_set_timeout_sbt(cvp, sbt, pr, flags);
 	if (lock != &Giant.lock_object) {
-		if (class->lc_flags & LC_SLEEPABLE)
-			sleepq_release(cvp);
 		WITNESS_SAVE(lock, lock_witness);
 		lock_state = class->lc_unlock(lock);
-		if (class->lc_flags & LC_SLEEPABLE)
-			sleepq_lock(cvp);
 	}
+	sleepq_lock(cvp);
 	rval = sleepq_timedwait_sig(cvp, 0);
 
 #ifdef KTRACE
Index: sys/kern/kern_lock.c
===================================================================
--- sys/kern/kern_lock.c	(revision 276531)
+++ sys/kern/kern_lock.c	(working copy)
@@ -210,9 +210,11 @@
 	GIANT_SAVE();
 	sleepq_add(&lk->lock_object, NULL, wmesg, SLEEPQ_LK | (catch ?
 	    SLEEPQ_INTERRUPTIBLE : 0), queue);
-	if ((flags & LK_TIMELOCK) && timo)
+	if ((flags & LK_TIMELOCK) && timo) {
+		sleepq_release(&lk->lock_object);
 		sleepq_set_timeout(&lk->lock_object, timo);
-
+		sleepq_lock(&lk->lock_object);
+	}
 	/*
 	 * Decisional switch for real sleeping.
 	 */
Index: sys/kern/kern_switch.c
===================================================================
--- sys/kern/kern_switch.c	(revision 276531)
+++ sys/kern/kern_switch.c	(working copy)
@@ -93,8 +93,6 @@
     &DPCPU_NAME(sched_switch_stats[SWT_TURNSTILE]), "");
 SCHED_STAT_DEFINE_VAR(sleepq,
     &DPCPU_NAME(sched_switch_stats[SWT_SLEEPQ]), "");
-SCHED_STAT_DEFINE_VAR(sleepqtimo,
-    &DPCPU_NAME(sched_switch_stats[SWT_SLEEPQTIMO]), "");
 SCHED_STAT_DEFINE_VAR(relinquish, 
     &DPCPU_NAME(sched_switch_stats[SWT_RELINQUISH]), "");
 SCHED_STAT_DEFINE_VAR(needresched,
Index: sys/kern/kern_synch.c
===================================================================
--- sys/kern/kern_synch.c	(revision 276531)
+++ sys/kern/kern_synch.c	(working copy)
@@ -236,13 +236,17 @@
 	 * return from cursig().
 	 */
 	sleepq_add(ident, lock, wmesg, sleepq_flags, 0);
-	if (sbt != 0)
-		sleepq_set_timeout_sbt(ident, sbt, pr, flags);
 	if (lock != NULL && class->lc_flags & LC_SLEEPABLE) {
 		sleepq_release(ident);
 		WITNESS_SAVE(lock, lock_witness);
 		lock_state = class->lc_unlock(lock);
+		if (sbt != 0)
+			sleepq_set_timeout_sbt(ident, sbt, pr, flags);
 		sleepq_lock(ident);
+	} else if (sbt != 0) {
+		sleepq_release(ident);
+		sleepq_set_timeout_sbt(ident, sbt, pr, flags);
+		sleepq_lock(ident);
 	}
 	if (sbt != 0 && catch)
 		rval = sleepq_timedwait_sig(ident, pri);
@@ -306,8 +310,11 @@
 	 * We put ourselves on the sleep queue and start our timeout.
 	 */
 	sleepq_add(ident, &mtx->lock_object, wmesg, SLEEPQ_SLEEP, 0);
-	if (sbt != 0)
+	if (sbt != 0) {
+		sleepq_release(ident);
 		sleepq_set_timeout_sbt(ident, sbt, pr, flags);
+		sleepq_lock(ident);
+	}
 
 	/*
 	 * Can't call ktrace with any spin locks held so it can lock the
Index: sys/kern/kern_thread.c
===================================================================
--- sys/kern/kern_thread.c	(revision 276531)
+++ sys/kern/kern_thread.c	(working copy)
@@ -149,6 +149,9 @@
 	audit_thread_alloc(td);
 #endif
 	umtx_thread_alloc(td);
+
+	mtx_init(&td->td_slpmutex, "td_slpmutex", NULL, MTX_SPIN);
+	callout_init_mtx(&td->td_slpcallout, &td->td_slpmutex, 0);
 	return (0);
 }
 
@@ -162,6 +165,10 @@
 
 	td = (struct thread *)mem;
 
+	/* make sure to drain any use of the "td->td_slpcallout" */
+	callout_drain(&td->td_slpcallout);
+	mtx_destroy(&td->td_slpmutex);
+
 #ifdef INVARIANTS
 	/* Verify that this thread is in a safe state to free. */
 	switch (td->td_state) {
@@ -544,7 +551,6 @@
 	LIST_INIT(&td->td_lprof[0]);
 	LIST_INIT(&td->td_lprof[1]);
 	sigqueue_init(&td->td_sigqueue, p);
-	callout_init(&td->td_slpcallout, CALLOUT_MPSAFE);
 	TAILQ_INSERT_TAIL(&p->p_threads, td, td_plist);
 	p->p_numthreads++;
 }
Index: sys/kern/kern_timeout.c
===================================================================
--- sys/kern/kern_timeout.c	(revision 276531)
+++ sys/kern/kern_timeout.c	(working copy)
@@ -54,6 +54,8 @@
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
+#include <sys/rmlock.h>
+#include <sys/rwlock.h>
 #include <sys/proc.h>
 #include <sys/sdt.h>
 #include <sys/sleepqueue.h>
@@ -124,37 +126,216 @@
  */
 u_int callwheelsize, callwheelmask;
 
+typedef void callout_mutex_op_t(struct lock_object *);
+typedef int callout_owned_op_t(struct lock_object *);
+
+struct callout_mutex_ops {
+	callout_mutex_op_t *lock;
+	callout_mutex_op_t *unlock;
+	callout_owned_op_t *owned;
+};
+
+enum {
+	CALLOUT_LC_UNUSED_0,
+	CALLOUT_LC_UNUSED_1,
+	CALLOUT_LC_UNUSED_2,
+	CALLOUT_LC_UNUSED_3,
+	CALLOUT_LC_SPIN,
+	CALLOUT_LC_MUTEX,
+	CALLOUT_LC_RW,
+	CALLOUT_LC_RM,
+};
+
+static void
+callout_mutex_op_none(struct lock_object *lock)
+{
+}
+
+static int
+callout_owned_op_none(struct lock_object *lock)
+{
+	return (0);
+}
+
+static void
+callout_mutex_lock(struct lock_object *lock)
+{
+	mtx_lock((struct mtx *)lock);
+}
+
+static void
+callout_mutex_unlock(struct lock_object *lock)
+{
+	mtx_unlock((struct mtx *)lock);
+}
+
+static void
+callout_mutex_lock_spin(struct lock_object *lock)
+{
+	mtx_lock_spin((struct mtx *)lock);
+}
+
+static void
+callout_mutex_unlock_spin(struct lock_object *lock)
+{
+	mtx_unlock_spin((struct mtx *)lock);
+}
+
+static int
+callout_mutex_owned(struct lock_object *lock)
+{
+	return (mtx_owned((struct mtx *)lock));
+}
+
+static void
+callout_rm_wlock(struct lock_object *lock)
+{
+	rm_wlock((struct rmlock *)lock);
+}
+
+static void
+callout_rm_wunlock(struct lock_object *lock)
+{
+	rm_wunlock((struct rmlock *)lock);
+}
+
+static int
+callout_rm_owned(struct lock_object *lock)
+{
+	return (rm_wowned((struct rmlock *)lock));
+}
+
+static void
+callout_rw_wlock(struct lock_object *lock)
+{
+	rw_wlock((struct rwlock *)lock);
+}
+
+static void
+callout_rw_wunlock(struct lock_object *lock)
+{
+	rw_wunlock((struct rwlock *)lock);
+}
+
+static int
+callout_rw_owned(struct lock_object *lock)
+{
+	return (rw_wowned((struct rwlock *)lock));
+}
+
+static const struct callout_mutex_ops callout_mutex_ops[8] = {
+	[CALLOUT_LC_UNUSED_0] = {
+		.lock = callout_mutex_op_none,
+		.unlock = callout_mutex_op_none,
+		.owned = callout_owned_op_none,
+	},
+	[CALLOUT_LC_UNUSED_1] = {
+		.lock = callout_mutex_op_none,
+		.unlock = callout_mutex_op_none,
+		.owned = callout_owned_op_none,
+	},
+	[CALLOUT_LC_UNUSED_2] = {
+		.lock = callout_mutex_op_none,
+		.unlock = callout_mutex_op_none,
+		.owned = callout_owned_op_none,
+	},
+	[CALLOUT_LC_UNUSED_3] = {
+		.lock = callout_mutex_op_none,
+		.unlock = callout_mutex_op_none,
+		.owned = callout_owned_op_none,
+	},
+	[CALLOUT_LC_SPIN] = {
+		.lock = callout_mutex_lock_spin,
+		.unlock = callout_mutex_unlock_spin,
+		.owned = callout_mutex_owned,
+	},
+	[CALLOUT_LC_MUTEX] = {
+		.lock = callout_mutex_lock,
+		.unlock = callout_mutex_unlock,
+		.owned = callout_mutex_owned,
+	},
+	[CALLOUT_LC_RW] = {
+		.lock = callout_rw_wlock,
+		.unlock = callout_rw_wunlock,
+		.owned = callout_rw_owned,
+	},
+	[CALLOUT_LC_RM] = {
+		.lock = callout_rm_wlock,
+		.unlock = callout_rm_wunlock,
+		.owned = callout_rm_owned,
+	},
+};
+
+static void
+callout_lock_client(int c_flags, struct lock_object *c_lock)
+{
+	callout_mutex_ops[CALLOUT_GET_LC(c_flags)].lock(c_lock);
+}
+
+static void
+callout_unlock_client(int c_flags, struct lock_object *c_lock)
+{
+	callout_mutex_ops[CALLOUT_GET_LC(c_flags)].unlock(c_lock);
+}
+
+#ifdef SMP
+static int
+callout_lock_owned_client(int c_flags, struct lock_object *c_lock)
+{
+	return (callout_mutex_ops[CALLOUT_GET_LC(c_flags)].owned(c_lock));
+}
+#endif
+
 /*
- * The callout cpu exec entities represent informations necessary for
- * describing the state of callouts currently running on the CPU and the ones
- * necessary for migrating callouts to the new callout cpu. In particular,
- * the first entry of the array cc_exec_entity holds informations for callout
- * running in SWI thread context, while the second one holds informations
- * for callout running directly from hardware interrupt context.
- * The cached informations are very important for deferring migration when
- * the migrating callout is already running.
+ * The callout CPU exec structure represent information necessary for
+ * describing the state of callouts currently running on the CPU and
+ * for handling deferred callout restarts.
+ *
+ * In particular, the first entry of the array cc_exec_entity holds
+ * information for callouts running from the SWI thread context, while
+ * the second one holds information for callouts running directly from
+ * the hardware interrupt context.
  */
 struct cc_exec {
-	struct callout		*cc_next;
+	/*
+	 * The "cc_curr" points to the currently executing callout and
+	 * is protected by the "cc_lock" spinlock. If no callback is
+	 * currently executing it is equal to "NULL".
+	 */
 	struct callout		*cc_curr;
-#ifdef SMP
-	void			(*ce_migration_func)(void *);
-	void			*ce_migration_arg;
-	int			ce_migration_cpu;
-	sbintime_t		ce_migration_time;
-	sbintime_t		ce_migration_prec;
-#endif
-	bool			cc_cancel;
-	bool			cc_waiting;
+	/*
+	 * The "cc_restart_args" structure holds the argument for a
+	 * deferred callback restart and is protected by the "cc_lock"
+	 * spinlock. The structure is only valid if "cc_restart" is
+	 * "true". If "cc_restart" is "false" the information in the
+	 * "cc_restart_args" structure shall be ignored.
+	 */
+	struct callout_args	cc_restart_args;
+	bool			cc_restart;
+	/*
+	 * The "cc_cancel" variable allows the currently pending
+	 * callback to be atomically cancelled. This field is write
+	 * protected by the "cc_lock" spinlock.
+	 */
+	bool cc_cancel;
+	/*
+	 * The "cc_drain_fn" points to a function which shall be
+	 * called with the argument stored in "cc_drain_arg" when an
+	 * asynchronous drain is performed. This field is write
+	 * protected by the "cc_lock" spinlock.
+	 */
+	callout_func_t *cc_drain_fn;
+	void *cc_drain_arg;
 };
 
 /*
- * There is one struct callout_cpu per cpu, holding all relevant
+ * There is one "struct callout_cpu" per CPU, holding all relevant
  * state for the callout processing thread on the individual CPU.
  */
 struct callout_cpu {
 	struct mtx_padalign	cc_lock;
 	struct cc_exec 		cc_exec_entity[2];
+	struct callout		*cc_exec_next_dir;
 	struct callout		*cc_callout;
 	struct callout_list	*cc_callwheel;
 	struct callout_tailq	cc_expireq;
@@ -166,27 +347,7 @@
 	char			cc_ktr_event_name[20];
 };
 
-#define	cc_exec_curr		cc_exec_entity[0].cc_curr
-#define	cc_exec_next		cc_exec_entity[0].cc_next
-#define	cc_exec_cancel		cc_exec_entity[0].cc_cancel
-#define	cc_exec_waiting		cc_exec_entity[0].cc_waiting
-#define	cc_exec_curr_dir	cc_exec_entity[1].cc_curr
-#define	cc_exec_next_dir	cc_exec_entity[1].cc_next
-#define	cc_exec_cancel_dir	cc_exec_entity[1].cc_cancel
-#define	cc_exec_waiting_dir	cc_exec_entity[1].cc_waiting
-
 #ifdef SMP
-#define	cc_migration_func	cc_exec_entity[0].ce_migration_func
-#define	cc_migration_arg	cc_exec_entity[0].ce_migration_arg
-#define	cc_migration_cpu	cc_exec_entity[0].ce_migration_cpu
-#define	cc_migration_time	cc_exec_entity[0].ce_migration_time
-#define	cc_migration_prec	cc_exec_entity[0].ce_migration_prec
-#define	cc_migration_func_dir	cc_exec_entity[1].ce_migration_func
-#define	cc_migration_arg_dir	cc_exec_entity[1].ce_migration_arg
-#define	cc_migration_cpu_dir	cc_exec_entity[1].ce_migration_cpu
-#define	cc_migration_time_dir	cc_exec_entity[1].ce_migration_time
-#define	cc_migration_prec_dir	cc_exec_entity[1].ce_migration_prec
-
 struct callout_cpu cc_cpu[MAXCPU];
 #define	CPUBLOCK	MAXCPU
 #define	CC_CPU(cpu)	(&cc_cpu[(cpu)])
@@ -211,62 +372,11 @@
 
 static MALLOC_DEFINE(M_CALLOUT, "callout", "Callout datastructures");
 
-/**
- * Locked by cc_lock:
- *   cc_curr         - If a callout is in progress, it is cc_curr.
- *                     If cc_curr is non-NULL, threads waiting in
- *                     callout_drain() will be woken up as soon as the
- *                     relevant callout completes.
- *   cc_cancel       - Changing to 1 with both callout_lock and cc_lock held
- *                     guarantees that the current callout will not run.
- *                     The softclock() function sets this to 0 before it
- *                     drops callout_lock to acquire c_lock, and it calls
- *                     the handler only if curr_cancelled is still 0 after
- *                     cc_lock is successfully acquired.
- *   cc_waiting      - If a thread is waiting in callout_drain(), then
- *                     callout_wait is nonzero.  Set only when
- *                     cc_curr is non-NULL.
- */
-
 /*
- * Resets the execution entity tied to a specific callout cpu.
+ * Kernel low level callwheel initialization called from cpu0 during
+ * kernel startup:
  */
 static void
-cc_cce_cleanup(struct callout_cpu *cc, int direct)
-{
-
-	cc->cc_exec_entity[direct].cc_curr = NULL;
-	cc->cc_exec_entity[direct].cc_next = NULL;
-	cc->cc_exec_entity[direct].cc_cancel = false;
-	cc->cc_exec_entity[direct].cc_waiting = false;
-#ifdef SMP
-	cc->cc_exec_entity[direct].ce_migration_cpu = CPUBLOCK;
-	cc->cc_exec_entity[direct].ce_migration_time = 0;
-	cc->cc_exec_entity[direct].ce_migration_prec = 0;
-	cc->cc_exec_entity[direct].ce_migration_func = NULL;
-	cc->cc_exec_entity[direct].ce_migration_arg = NULL;
-#endif
-}
-
-/*
- * Checks if migration is requested by a specific callout cpu.
- */
-static int
-cc_cce_migrating(struct callout_cpu *cc, int direct)
-{
-
-#ifdef SMP
-	return (cc->cc_exec_entity[direct].ce_migration_cpu != CPUBLOCK);
-#else
-	return (0);
-#endif
-}
-
-/*
- * Kernel low level callwheel initialization
- * called on cpu0 during kernel startup.
- */
-static void
 callout_callwheel_init(void *dummy)
 {
 	struct callout_cpu *cc;
@@ -324,8 +434,6 @@
 		LIST_INIT(&cc->cc_callwheel[i]);
 	TAILQ_INIT(&cc->cc_expireq);
 	cc->cc_firstevent = SBT_MAX;
-	for (i = 0; i < 2; i++)
-		cc_cce_cleanup(cc, i);
 	snprintf(cc->cc_ktr_event_name, sizeof(cc->cc_ktr_event_name),
 	    "callwheel cpu %d", cpu);
 	if (cc->cc_callout == NULL)	/* Only cpu0 handles timeout(9) */
@@ -333,42 +441,12 @@
 	for (i = 0; i < ncallout; i++) {
 		c = &cc->cc_callout[i];
 		callout_init(c, 0);
-		c->c_flags = CALLOUT_LOCAL_ALLOC;
+		c->c_flags |= CALLOUT_LOCAL_ALLOC;
 		SLIST_INSERT_HEAD(&cc->cc_callfree, c, c_links.sle);
 	}
 }
 
-#ifdef SMP
 /*
- * Switches the cpu tied to a specific callout.
- * The function expects a locked incoming callout cpu and returns with
- * locked outcoming callout cpu.
- */
-static struct callout_cpu *
-callout_cpu_switch(struct callout *c, struct callout_cpu *cc, int new_cpu)
-{
-	struct callout_cpu *new_cc;
-
-	MPASS(c != NULL && cc != NULL);
-	CC_LOCK_ASSERT(cc);
-
-	/*
-	 * Avoid interrupts and preemption firing after the callout cpu
-	 * is blocked in order to avoid deadlocks as the new thread
-	 * may be willing to acquire the callout cpu lock.
-	 */
-	c->c_cpu = CPUBLOCK;
-	spinlock_enter();
-	CC_UNLOCK(cc);
-	new_cc = CC_CPU(new_cpu);
-	CC_LOCK(new_cc);
-	spinlock_exit();
-	c->c_cpu = new_cpu;
-	return (new_cc);
-}
-#endif
-
-/*
  * Start standard softclock thread.
  */
 static void
@@ -444,9 +522,8 @@
 #ifdef CALLOUT_PROFILING
 	int depth_dir = 0, mpcalls_dir = 0, lockcalls_dir = 0;
 #endif
-
 	cc = CC_SELF();
-	mtx_lock_spin_flags(&cc->cc_lock, MTX_QUIET);
+	CC_LOCK(cc);
 
 	/* Compute the buckets of the last scan and present times. */
 	firstb = callout_hash(cc->cc_lastscan);
@@ -549,7 +626,7 @@
 	avg_mpcalls_dir += (mpcalls_dir * 1000 - avg_mpcalls_dir) >> 8;
 	avg_lockcalls_dir += (lockcalls_dir * 1000 - avg_lockcalls_dir) >> 8;
 #endif
-	mtx_unlock_spin_flags(&cc->cc_lock, MTX_QUIET);
+	CC_UNLOCK(cc);
 	/*
 	 * swi_sched acquires the thread lock, so we don't want to call it
 	 * with cc_lock held; incorrect locking order.
@@ -562,49 +639,55 @@
 callout_lock(struct callout *c)
 {
 	struct callout_cpu *cc;
-	int cpu;
-
-	for (;;) {
-		cpu = c->c_cpu;
-#ifdef SMP
-		if (cpu == CPUBLOCK) {
-			while (c->c_cpu == CPUBLOCK)
-				cpu_spinwait();
-			continue;
-		}
-#endif
-		cc = CC_CPU(cpu);
-		CC_LOCK(cc);
-		if (cpu == c->c_cpu)
-			break;
-		CC_UNLOCK(cc);
-	}
+	cc = CC_CPU(c->c_cpu);
+	CC_LOCK(cc);
 	return (cc);
 }
 
-static void
-callout_cc_add(struct callout *c, struct callout_cpu *cc,
-    sbintime_t sbt, sbintime_t precision, void (*func)(void *),
-    void *arg, int cpu, int flags)
+static struct callout_cpu *
+callout_cc_add_locked(struct callout *c, struct callout_cpu *cc,
+    struct callout_args *coa, bool can_swap_cpu)
 {
+#ifndef NO_EVENTTIMERS
+	sbintime_t sbt;
+#endif
 	int bucket;
 
 	CC_LOCK_ASSERT(cc);
-	if (sbt < cc->cc_lastscan)
-		sbt = cc->cc_lastscan;
-	c->c_arg = arg;
-	c->c_flags |= (CALLOUT_ACTIVE | CALLOUT_PENDING);
-	if (flags & C_DIRECT_EXEC)
-		c->c_flags |= CALLOUT_DIRECT;
-	c->c_flags &= ~CALLOUT_PROCESSED;
-	c->c_func = func;
-	c->c_time = sbt;
-	c->c_precision = precision;
+
+	/* update flags before swapping locks, if any */
+	c->c_flags &= ~(CALLOUT_PROCESSED | CALLOUT_DIRECT | CALLOUT_DEFRESTART);
+	if (coa->flags & C_DIRECT_EXEC)
+		c->c_flags |= (CALLOUT_ACTIVE | CALLOUT_PENDING | CALLOUT_DIRECT);
+	else
+		c->c_flags |= (CALLOUT_ACTIVE | CALLOUT_PENDING);
+
+#ifdef SMP
+	/*
+	 * Check if we are changing the CPU on which the callback
+	 * should be executed and if we have a lock protecting us:
+	 */
+	if (can_swap_cpu != false && coa->cpu != c->c_cpu &&
+	    callout_lock_owned_client(c->c_flags, c->c_lock) != 0) {
+		CC_UNLOCK(cc);
+		c->c_cpu = coa->cpu;
+		cc = callout_lock(c);
+	}
+#endif
+	if (coa->time < cc->cc_lastscan)
+		coa->time = cc->cc_lastscan;
+	c->c_arg = coa->arg;
+	c->c_func = coa->func;
+	c->c_time = coa->time;
+	c->c_precision = coa->precision;
+
 	bucket = callout_get_bucket(c->c_time);
 	CTR3(KTR_CALLOUT, "precision set for %p: %d.%08x",
 	    c, (int)(c->c_precision >> 32),
 	    (u_int)(c->c_precision & 0xffffffff));
 	LIST_INSERT_HEAD(&cc->cc_callwheel[bucket], c, c_links.le);
+
+	/* Ensure we are first to be scanned, if called via a callback */
 	if (cc->cc_bucket == bucket)
 		cc->cc_exec_next_dir = c;
 #ifndef NO_EVENTTIMERS
@@ -617,9 +700,10 @@
 	sbt = c->c_time + c->c_precision;
 	if (sbt < cc->cc_firstevent) {
 		cc->cc_firstevent = sbt;
-		cpu_new_callout(cpu, sbt, c->c_time);
+		cpu_new_callout(coa->cpu, sbt, c->c_time);
 	}
 #endif
+	return (cc);
 }
 
 static void
@@ -626,8 +710,6 @@
 callout_cc_del(struct callout *c, struct callout_cpu *cc)
 {
 
-	if ((c->c_flags & CALLOUT_LOCAL_ALLOC) == 0)
-		return;
 	c->c_func = NULL;
 	SLIST_INSERT_HEAD(&cc->cc_callfree, c, c_links.sle);
 }
@@ -639,20 +721,10 @@
 #endif
     int direct)
 {
-	struct rm_priotracker tracker;
-	void (*c_func)(void *);
+	callout_func_t *c_func;
 	void *c_arg;
-	struct lock_class *class;
 	struct lock_object *c_lock;
-	uintptr_t lock_status;
 	int c_flags;
-#ifdef SMP
-	struct callout_cpu *new_cc;
-	void (*new_func)(void *);
-	void *new_arg;
-	int flags, new_cpu;
-	sbintime_t new_prec, new_time;
-#endif
 #if defined(DIAGNOSTIC) || defined(CALLOUT_PROFILING) 
 	sbintime_t sbt1, sbt2;
 	struct timespec ts2;
@@ -663,37 +735,43 @@
 	KASSERT((c->c_flags & (CALLOUT_PENDING | CALLOUT_ACTIVE)) ==
 	    (CALLOUT_PENDING | CALLOUT_ACTIVE),
 	    ("softclock_call_cc: pend|act %p %x", c, c->c_flags));
-	class = (c->c_lock != NULL) ? LOCK_CLASS(c->c_lock) : NULL;
-	lock_status = 0;
-	if (c->c_flags & CALLOUT_SHAREDLOCK) {
-		if (class == &lock_class_rm)
-			lock_status = (uintptr_t)&tracker;
-		else
-			lock_status = 1;
-	}
 	c_lock = c->c_lock;
 	c_func = c->c_func;
 	c_arg = c->c_arg;
 	c_flags = c->c_flags;
-	if (c->c_flags & CALLOUT_LOCAL_ALLOC)
-		c->c_flags = CALLOUT_LOCAL_ALLOC;
-	else
-		c->c_flags &= ~CALLOUT_PENDING;
+
+	/* remove pending bit */
+	c->c_flags &= ~CALLOUT_PENDING;
+
+	/* reset our local state */
 	cc->cc_exec_entity[direct].cc_curr = c;
 	cc->cc_exec_entity[direct].cc_cancel = false;
-	CC_UNLOCK(cc);
+	cc->cc_exec_entity[direct].cc_restart = false;
+	cc->cc_exec_entity[direct].cc_drain_fn = NULL;
+	cc->cc_exec_entity[direct].cc_drain_arg = NULL;
+
 	if (c_lock != NULL) {
-		class->lc_lock(c_lock, lock_status);
+		CC_UNLOCK(cc);
+
+		/* unlocked region for switching locks */
+
+		callout_lock_client(c_flags, c_lock);
+
 		/*
-		 * The callout may have been cancelled
-		 * while we switched locks.
+		 * Check if the callout may have been cancelled while
+		 * we were switching locks. Even though the callout is
+		 * specifying a lock, it might not be certain this
+		 * lock is locked when starting and stopping callouts.
 		 */
+		CC_LOCK(cc);
 		if (cc->cc_exec_entity[direct].cc_cancel) {
-			class->lc_unlock(c_lock);
-			goto skip;
+			callout_unlock_client(c_flags, c_lock);
+			goto skip_cc_locked;
 		}
-		/* The callout cannot be stopped now. */
+		/* The callout cannot be stopped now! */
 		cc->cc_exec_entity[direct].cc_cancel = true;
+		CC_UNLOCK(cc);
+
 		if (c_lock == &Giant.lock_object) {
 #ifdef CALLOUT_PROFILING
 			(*gcalls)++;
@@ -708,6 +786,8 @@
 			    c, c_func, c_arg);
 		}
 	} else {
+		CC_UNLOCK(cc);
+		/* unlocked region */
 #ifdef CALLOUT_PROFILING
 		(*mpcalls)++;
 #endif
@@ -740,85 +820,40 @@
 #endif
 	KTR_STATE0(KTR_SCHED, "callout", cc->cc_ktr_event_name, "idle");
 	CTR1(KTR_CALLOUT, "callout %p finished", c);
+
+	/*
+	 * At this point the callback structure might have been freed,
+	 * so we need to check the previously copied value of
+	 * "c->c_flags":
+	 */
 	if ((c_flags & CALLOUT_RETURNUNLOCKED) == 0)
-		class->lc_unlock(c_lock);
-skip:
+		callout_unlock_client(c_flags, c_lock);
+
 	CC_LOCK(cc);
+
+skip_cc_locked:
 	KASSERT(cc->cc_exec_entity[direct].cc_curr == c, ("mishandled cc_curr"));
 	cc->cc_exec_entity[direct].cc_curr = NULL;
-	if (cc->cc_exec_entity[direct].cc_waiting) {
+
+	/* Check if there is anything which needs draining */
+	if (cc->cc_exec_entity[direct].cc_drain_fn != NULL) {
 		/*
-		 * There is someone waiting for the
-		 * callout to complete.
-		 * If the callout was scheduled for
-		 * migration just cancel it.
+		 * Unlock the CPU callout last, so that any use of
+		 * structures belonging to the callout are complete:
 		 */
-		if (cc_cce_migrating(cc, direct)) {
-			cc_cce_cleanup(cc, direct);
-
-			/*
-			 * It should be assert here that the callout is not
-			 * destroyed but that is not easy.
-			 */
-			c->c_flags &= ~CALLOUT_DFRMIGRATION;
-		}
-		cc->cc_exec_entity[direct].cc_waiting = false;
 		CC_UNLOCK(cc);
-		wakeup(&cc->cc_exec_entity[direct].cc_waiting);
+		/* call drain function unlocked */
+		cc->cc_exec_entity[direct].cc_drain_fn(
+		    cc->cc_exec_entity[direct].cc_drain_arg);
 		CC_LOCK(cc);
-	} else if (cc_cce_migrating(cc, direct)) {
-		KASSERT((c_flags & CALLOUT_LOCAL_ALLOC) == 0,
-		    ("Migrating legacy callout %p", c));
-#ifdef SMP
-		/*
-		 * If the callout was scheduled for
-		 * migration just perform it now.
-		 */
-		new_cpu = cc->cc_exec_entity[direct].ce_migration_cpu;
-		new_time = cc->cc_exec_entity[direct].ce_migration_time;
-		new_prec = cc->cc_exec_entity[direct].ce_migration_prec;
-		new_func = cc->cc_exec_entity[direct].ce_migration_func;
-		new_arg = cc->cc_exec_entity[direct].ce_migration_arg;
-		cc_cce_cleanup(cc, direct);
-
-		/*
-		 * It should be assert here that the callout is not destroyed
-		 * but that is not easy.
-		 *
-		 * As first thing, handle deferred callout stops.
-		 */
-		if ((c->c_flags & CALLOUT_DFRMIGRATION) == 0) {
-			CTR3(KTR_CALLOUT,
-			     "deferred cancelled %p func %p arg %p",
-			     c, new_func, new_arg);
-			callout_cc_del(c, cc);
-			return;
-		}
-		c->c_flags &= ~CALLOUT_DFRMIGRATION;
-
-		new_cc = callout_cpu_switch(c, cc, new_cpu);
-		flags = (direct) ? C_DIRECT_EXEC : 0;
-		callout_cc_add(c, new_cc, new_time, new_prec, new_func,
-		    new_arg, new_cpu, flags);
-		CC_UNLOCK(new_cc);
-		CC_LOCK(cc);
-#else
-		panic("migration should not happen");
-#endif
+	} else if (c_flags & CALLOUT_LOCAL_ALLOC) {
+		/* return callout back to freelist */
+		callout_cc_del(c, cc);
+	} else if (cc->cc_exec_entity[direct].cc_restart) {
+		/* [re-]schedule callout, if any */
+		cc = callout_cc_add_locked(c, cc,
+		    &cc->cc_exec_entity[direct].cc_restart_args, false);
 	}
-	/*
-	 * If the current callout is locally allocated (from
-	 * timeout(9)) then put it on the freelist.
-	 *
-	 * Note: we need to check the cached copy of c_flags because
-	 * if it was not local, then it's not safe to deref the
-	 * callout pointer.
-	 */
-	KASSERT((c_flags & CALLOUT_LOCAL_ALLOC) == 0 ||
-	    c->c_flags == CALLOUT_LOCAL_ALLOC,
-	    ("corrupted callout"));
-	if (c_flags & CALLOUT_LOCAL_ALLOC)
-		callout_cc_del(c, cc);
 }
 
 /*
@@ -899,10 +934,11 @@
 		/* XXX Attempt to malloc first */
 		panic("timeout table full");
 	SLIST_REMOVE_HEAD(&cc->cc_callfree, c_links.sle);
-	callout_reset(new, to_ticks, ftn, arg);
 	handle.callout = new;
 	CC_UNLOCK(cc);
 
+	callout_reset(new, to_ticks, ftn, arg);
+
 	return (handle);
 }
 
@@ -910,6 +946,7 @@
 untimeout(timeout_t *ftn, void *arg, struct callout_handle handle)
 {
 	struct callout_cpu *cc;
+	bool match;
 
 	/*
 	 * Check for a handle that was initialized
@@ -920,9 +957,11 @@
 		return;
 
 	cc = callout_lock(handle.callout);
-	if (handle.callout->c_func == ftn && handle.callout->c_arg == arg)
+	match = (handle.callout->c_func == ftn && handle.callout->c_arg == arg);
+	CC_UNLOCK(cc);
+
+	if (match)
 		callout_stop(handle.callout);
-	CC_UNLOCK(cc);
 }
 
 void
@@ -931,6 +970,119 @@
 	handle->callout = NULL;
 }
 
+static int
+callout_restart_async(struct callout *c, struct callout_args *coa,
+    callout_func_t *drain_fn, void *drain_arg)
+{
+	struct callout_cpu *cc;
+	int cancelled;
+	int direct;
+
+	cc = callout_lock(c);
+
+	/* Figure out if the callout is direct or not */
+	direct = ((c->c_flags & CALLOUT_DIRECT) != 0);
+
+	/*
+	 * Check if the callback is currently scheduled for
+	 * completion:
+	 */
+	if (cc->cc_exec_entity[direct].cc_curr == c) {
+		/*
+		 * Try to prevent the callback from running by setting
+		 * the "cc_cancel" variable to "true". Also check if
+		 * the callout was previously subject to a deferred
+		 * callout restart:
+		 */
+		if (cc->cc_exec_entity[direct].cc_cancel == false ||
+		    (c->c_flags & CALLOUT_DEFRESTART) != 0) {
+			cc->cc_exec_entity[direct].cc_cancel = true;
+			cancelled = 1;
+		} else {
+			cancelled = 0;
+		}
+
+		/*
+		 * Prevent callback restart if "callout_drain_xxx()"
+		 * is being called or we are stopping the callout or
+		 * the callback was preallocated by us:
+		 */
+		if (cc->cc_exec_entity[direct].cc_drain_fn != NULL ||
+		    coa == NULL || (c->c_flags & CALLOUT_LOCAL_ALLOC) != 0) {
+			CTR4(KTR_CALLOUT, "%s %p func %p arg %p",
+			    cancelled ? "cancelled and draining" : "draining",
+			    c, c->c_func, c->c_arg);
+
+			/* clear old flags, if any */
+			c->c_flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING |
+			    CALLOUT_DEFRESTART | CALLOUT_PROCESSED);
+
+			/* clear restart flag, if any */
+			cc->cc_exec_entity[direct].cc_restart = false;
+
+			/* set drain function, if any */
+			if (drain_fn != NULL) {
+				cc->cc_exec_entity[direct].cc_drain_fn = drain_fn;
+				cc->cc_exec_entity[direct].cc_drain_arg = drain_arg;
+				cancelled |= 2;		/* XXX define the value */
+			}
+		} else {
+			CTR4(KTR_CALLOUT, "%s %p func %p arg %p",
+			    cancelled ? "cancelled and restarting" : "restarting",
+			    c, c->c_func, c->c_arg);
+
+			/* get us back into the game */
+			c->c_flags |= (CALLOUT_ACTIVE | CALLOUT_PENDING |
+			    CALLOUT_DEFRESTART);
+			c->c_flags &= ~CALLOUT_PROCESSED;
+
+			/* enable deferred restart */
+			cc->cc_exec_entity[direct].cc_restart = true;
+
+			/* store arguments for the deferred restart, if any */
+			cc->cc_exec_entity[direct].cc_restart_args = *coa;
+		}
+	} else {
+		/* stop callout */
+		if (c->c_flags & CALLOUT_PENDING) {
+			/*
+			 * The callback has not yet been executed, and
+			 * we simply just need to unlink it:
+			 */
+			if ((c->c_flags & CALLOUT_PROCESSED) == 0) {
+				if (cc->cc_exec_next_dir == c)
+					cc->cc_exec_next_dir = LIST_NEXT(c, c_links.le);
+				LIST_REMOVE(c, c_links.le);
+			} else {
+				TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe);
+			}
+			cancelled = 1;
+		} else {
+			cancelled = 0;
+		}
+
+		/* [re-]schedule callout, if any */
+		if (coa != NULL) {
+			cc = callout_cc_add_locked(c, cc, coa, true);
+		} else {
+			/* clear old flags, if any */
+			c->c_flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING |
+			    CALLOUT_DEFRESTART | CALLOUT_PROCESSED);
+
+			/* return callback to pre-allocated list, if any */
+			if ((c->c_flags & CALLOUT_LOCAL_ALLOC) && cancelled != 0) {
+				callout_cc_del(c, cc);
+			}
+		}
+
+		CTR4(KTR_CALLOUT, "%s %p func %p arg %p",
+		    cancelled ? "rescheduled" : "scheduled",
+		    c, c->c_func, c->c_arg);
+	}
+	CC_UNLOCK(cc);
+	return (cancelled);
+}
+
 /*
  * New interface; clients allocate their own callout structures.
  *
@@ -949,25 +1101,32 @@
  */
 int
 callout_reset_sbt_on(struct callout *c, sbintime_t sbt, sbintime_t precision,
-    void (*ftn)(void *), void *arg, int cpu, int flags)
+    callout_func_t *ftn, void *arg, int cpu, int flags)
 {
-	sbintime_t to_sbt, pr;
-	struct callout_cpu *cc;
-	int cancelled, direct;
+	struct callout_args coa;
 
-	cancelled = 0;
-	if (flags & C_ABSOLUTE) {
-		to_sbt = sbt;
+	/* store arguments for callout add function */
+	coa.func = ftn;
+	coa.arg = arg;
+	coa.precision = precision;
+	coa.flags = flags;
+	coa.cpu = cpu;
+
+	/* compute the rest of the arguments needed */
+	if (coa.flags & C_ABSOLUTE) {
+		coa.time = sbt;
 	} else {
-		if ((flags & C_HARDCLOCK) && (sbt < tick_sbt))
+		sbintime_t pr;
+
+		if ((coa.flags & C_HARDCLOCK) && (sbt < tick_sbt))
 			sbt = tick_sbt;
-		if ((flags & C_HARDCLOCK) ||
+		if ((coa.flags & C_HARDCLOCK) ||
 #ifdef NO_EVENTTIMERS
 		    sbt >= sbt_timethreshold) {
-			to_sbt = getsbinuptime();
+			coa.time = getsbinuptime();
 
 			/* Add safety belt for the case of hz > 1000. */
-			to_sbt += tc_tick_sbt - tick_sbt;
+			coa.time += tc_tick_sbt - tick_sbt;
 #else
 		    sbt >= sbt_tickthreshold) {
 			/*
@@ -977,101 +1136,29 @@
 			 * active ones.
 			 */
 #ifdef __LP64__
-			to_sbt = DPCPU_GET(hardclocktime);
+			coa.time = DPCPU_GET(hardclocktime);
 #else
 			spinlock_enter();
-			to_sbt = DPCPU_GET(hardclocktime);
+			coa.time = DPCPU_GET(hardclocktime);
 			spinlock_exit();
 #endif
 #endif
-			if ((flags & C_HARDCLOCK) == 0)
-				to_sbt += tick_sbt;
+			if ((coa.flags & C_HARDCLOCK) == 0)
+				coa.time += tick_sbt;
 		} else
-			to_sbt = sbinuptime();
-		if (SBT_MAX - to_sbt < sbt)
-			to_sbt = SBT_MAX;
+			coa.time = sbinuptime();
+		if (SBT_MAX - coa.time < sbt)
+			coa.time = SBT_MAX;
 		else
-			to_sbt += sbt;
-		pr = ((C_PRELGET(flags) < 0) ? sbt >> tc_precexp :
-		    sbt >> C_PRELGET(flags));
-		if (pr > precision)
-			precision = pr;
+			coa.time += sbt;
+		pr = ((C_PRELGET(coa.flags) < 0) ? sbt >> tc_precexp :
+		    sbt >> C_PRELGET(coa.flags));
+		if (pr > coa.precision)
+			coa.precision = pr;
 	}
-	/*
-	 * Don't allow migration of pre-allocated callouts lest they
-	 * become unbalanced.
-	 */
-	if (c->c_flags & CALLOUT_LOCAL_ALLOC)
-		cpu = c->c_cpu;
-	direct = (c->c_flags & CALLOUT_DIRECT) != 0;
-	KASSERT(!direct || c->c_lock == NULL,
-	    ("%s: direct callout %p has lock", __func__, c));
-	cc = callout_lock(c);
-	if (cc->cc_exec_entity[direct].cc_curr == c) {
-		/*
-		 * We're being asked to reschedule a callout which is
-		 * currently in progress.  If there is a lock then we
-		 * can cancel the callout if it has not really started.
-		 */
-		if (c->c_lock != NULL && !cc->cc_exec_entity[direct].cc_cancel)
-			cancelled = cc->cc_exec_entity[direct].cc_cancel = true;
-		if (cc->cc_exec_entity[direct].cc_waiting) {
-			/*
-			 * Someone has called callout_drain to kill this
-			 * callout.  Don't reschedule.
-			 */
-			CTR4(KTR_CALLOUT, "%s %p func %p arg %p",
-			    cancelled ? "cancelled" : "failed to cancel",
-			    c, c->c_func, c->c_arg);
-			CC_UNLOCK(cc);
-			return (cancelled);
-		}
-	}
-	if (c->c_flags & CALLOUT_PENDING) {
-		if ((c->c_flags & CALLOUT_PROCESSED) == 0) {
-			if (cc->cc_exec_next_dir == c)
-				cc->cc_exec_next_dir = LIST_NEXT(c, c_links.le);
-			LIST_REMOVE(c, c_links.le);
-		} else
-			TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe);
-		cancelled = 1;
-		c->c_flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING);
-	}
 
-#ifdef SMP
-	/*
-	 * If the callout must migrate try to perform it immediately.
-	 * If the callout is currently running, just defer the migration
-	 * to a more appropriate moment.
-	 */
-	if (c->c_cpu != cpu) {
-		if (cc->cc_exec_entity[direct].cc_curr == c) {
-			cc->cc_exec_entity[direct].ce_migration_cpu = cpu;
-			cc->cc_exec_entity[direct].ce_migration_time
-			    = to_sbt;
-			cc->cc_exec_entity[direct].ce_migration_prec 
-			    = precision;
-			cc->cc_exec_entity[direct].ce_migration_func = ftn;
-			cc->cc_exec_entity[direct].ce_migration_arg = arg;
-			c->c_flags |= CALLOUT_DFRMIGRATION;
-			CTR6(KTR_CALLOUT,
-		    "migration of %p func %p arg %p in %d.%08x to %u deferred",
-			    c, c->c_func, c->c_arg, (int)(to_sbt >> 32),
-			    (u_int)(to_sbt & 0xffffffff), cpu);
-			CC_UNLOCK(cc);
-			return (cancelled);
-		}
-		cc = callout_cpu_switch(c, cc, cpu);
-	}
-#endif
-
-	callout_cc_add(c, cc, to_sbt, precision, ftn, arg, cpu, flags);
-	CTR6(KTR_CALLOUT, "%sscheduled %p func %p arg %p in %d.%08x",
-	    cancelled ? "re" : "", c, c->c_func, c->c_arg, (int)(to_sbt >> 32),
-	    (u_int)(to_sbt & 0xffffffff));
-	CC_UNLOCK(cc);
-
-	return (cancelled);
+	/* get callback started, if any */
+	return (callout_restart_async(c, &coa, NULL, NULL));
 }
 
 /*
@@ -1090,189 +1177,79 @@
 }
 
 int
-_callout_stop_safe(struct callout *c, int safe)
+callout_stop(struct callout *c)
 {
-	struct callout_cpu *cc, *old_cc;
-	struct lock_class *class;
-	int direct, sq_locked, use_lock;
+	/* get callback stopped, if any */
+	return (callout_restart_async(c, NULL, NULL, NULL));
+}
 
-	/*
-	 * Some old subsystems don't hold Giant while running a callout_stop(),
-	 * so just discard this check for the moment.
-	 */
-	if (!safe && c->c_lock != NULL) {
-		if (c->c_lock == &Giant.lock_object)
-			use_lock = mtx_owned(&Giant);
-		else {
-			use_lock = 1;
-			class = LOCK_CLASS(c->c_lock);
-			class->lc_assert(c->c_lock, LA_XLOCKED);
-		}
-	} else
-		use_lock = 0;
-	direct = (c->c_flags & CALLOUT_DIRECT) != 0;
-	sq_locked = 0;
-	old_cc = NULL;
-again:
-	cc = callout_lock(c);
+static void
+callout_drain_function(void *arg)
+{
+	wakeup(arg);
+}
 
-	/*
-	 * If the callout was migrating while the callout cpu lock was
-	 * dropped,  just drop the sleepqueue lock and check the states
-	 * again.
-	 */
-	if (sq_locked != 0 && cc != old_cc) {
-#ifdef SMP
-		CC_UNLOCK(cc);
-		sleepq_release(&old_cc->cc_exec_entity[direct].cc_waiting);
-		sq_locked = 0;
-		old_cc = NULL;
-		goto again;
-#else
-		panic("migration should not happen");
-#endif
-	}
+int
+callout_drain_async(struct callout *c, callout_func_t *fn, void *arg)
+{
+	/* get callback stopped, if any */
+	return (callout_restart_async(c, NULL, fn, arg) & 2);
+}
 
-	/*
-	 * If the callout isn't pending, it's not on the queue, so
-	 * don't attempt to remove it from the queue.  We can try to
-	 * stop it by other means however.
-	 */
-	if (!(c->c_flags & CALLOUT_PENDING)) {
-		c->c_flags &= ~CALLOUT_ACTIVE;
+int
+callout_drain(struct callout *c)
+{
+	int cancelled;
 
+	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
+	    "Draining callout");
+
+	callout_lock_client(c->c_flags, c->c_lock);
+
+	/* at this point the "c->c_cpu" field is not changing */
+
+	cancelled = callout_drain_async(c, &callout_drain_function, c);
+
+	if (cancelled != 0) {
+		struct callout_cpu *cc;
+		int direct;
+
+		CTR3(KTR_CALLOUT, "need to drain %p func %p arg %p",
+		    c, c->c_func, c->c_arg);
+
+		cc = callout_lock(c);
+		direct = ((c->c_flags & CALLOUT_DIRECT) != 0);
+
 		/*
-		 * If it wasn't on the queue and it isn't the current
-		 * callout, then we can't stop it, so just bail.
+		 * We've gotten our callout CPU lock, it is safe to
+		 * drop the initial lock:
 		 */
-		if (cc->cc_exec_entity[direct].cc_curr != c) {
-			CTR3(KTR_CALLOUT, "failed to stop %p func %p arg %p",
-			    c, c->c_func, c->c_arg);
-			CC_UNLOCK(cc);
-			if (sq_locked)
-				sleepq_release(
-				    &cc->cc_exec_entity[direct].cc_waiting);
-			return (0);
-		}
+		callout_unlock_client(c->c_flags, c->c_lock);
 
-		if (safe) {
-			/*
-			 * The current callout is running (or just
-			 * about to run) and blocking is allowed, so
-			 * just wait for the current invocation to
-			 * finish.
-			 */
-			while (cc->cc_exec_entity[direct].cc_curr == c) {
-				/*
-				 * Use direct calls to sleepqueue interface
-				 * instead of cv/msleep in order to avoid
-				 * a LOR between cc_lock and sleepqueue
-				 * chain spinlocks.  This piece of code
-				 * emulates a msleep_spin() call actually.
-				 *
-				 * If we already have the sleepqueue chain
-				 * locked, then we can safely block.  If we
-				 * don't already have it locked, however,
-				 * we have to drop the cc_lock to lock
-				 * it.  This opens several races, so we
-				 * restart at the beginning once we have
-				 * both locks.  If nothing has changed, then
-				 * we will end up back here with sq_locked
-				 * set.
-				 */
-				if (!sq_locked) {
-					CC_UNLOCK(cc);
-					sleepq_lock(
-					&cc->cc_exec_entity[direct].cc_waiting);
-					sq_locked = 1;
-					old_cc = cc;
-					goto again;
-				}
+		/* Wait for drain to complete */
 
-				/*
-				 * Migration could be cancelled here, but
-				 * as long as it is still not sure when it
-				 * will be packed up, just let softclock()
-				 * take care of it.
-				 */
-				cc->cc_exec_entity[direct].cc_waiting = true;
-				DROP_GIANT();
-				CC_UNLOCK(cc);
-				sleepq_add(
-				    &cc->cc_exec_entity[direct].cc_waiting,
-				    &cc->cc_lock.lock_object, "codrain",
-				    SLEEPQ_SLEEP, 0);
-				sleepq_wait(
-				    &cc->cc_exec_entity[direct].cc_waiting,
-					     0);
-				sq_locked = 0;
-				old_cc = NULL;
+		while (cc->cc_exec_entity[direct].cc_curr == c)
+			msleep_spin(c, (struct mtx *)&cc->cc_lock, "codrain", 0);
 
-				/* Reacquire locks previously released. */
-				PICKUP_GIANT();
-				CC_LOCK(cc);
-			}
-		} else if (use_lock &&
-			    !cc->cc_exec_entity[direct].cc_cancel) {
-			/*
-			 * The current callout is waiting for its
-			 * lock which we hold.  Cancel the callout
-			 * and return.  After our caller drops the
-			 * lock, the callout will be skipped in
-			 * softclock().
-			 */
-			cc->cc_exec_entity[direct].cc_cancel = true;
-			CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p",
-			    c, c->c_func, c->c_arg);
-			KASSERT(!cc_cce_migrating(cc, direct),
-			    ("callout wrongly scheduled for migration"));
-			CC_UNLOCK(cc);
-			KASSERT(!sq_locked, ("sleepqueue chain locked"));
-			return (1);
-		} else if ((c->c_flags & CALLOUT_DFRMIGRATION) != 0) {
-			c->c_flags &= ~CALLOUT_DFRMIGRATION;
-			CTR3(KTR_CALLOUT, "postponing stop %p func %p arg %p",
-			    c, c->c_func, c->c_arg);
-			CC_UNLOCK(cc);
-			return (1);
-		}
-		CTR3(KTR_CALLOUT, "failed to stop %p func %p arg %p",
-		    c, c->c_func, c->c_arg);
 		CC_UNLOCK(cc);
-		KASSERT(!sq_locked, ("sleepqueue chain still locked"));
-		return (0);
+	} else {
+		callout_unlock_client(c->c_flags, c->c_lock);
 	}
-	if (sq_locked)
-		sleepq_release(&cc->cc_exec_entity[direct].cc_waiting);
 
-	c->c_flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING);
-
 	CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p",
 	    c, c->c_func, c->c_arg);
-	if ((c->c_flags & CALLOUT_PROCESSED) == 0) {
-		if (cc->cc_exec_next_dir == c)
-			cc->cc_exec_next_dir = LIST_NEXT(c, c_links.le);
-		LIST_REMOVE(c, c_links.le);
-	} else
-		TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe);
-	callout_cc_del(c, cc);
 
-	CC_UNLOCK(cc);
-	return (1);
+	return (cancelled & 1);
 }
 
 void
 callout_init(struct callout *c, int mpsafe)
 {
-	bzero(c, sizeof *c);
 	if (mpsafe) {
-		c->c_lock = NULL;
-		c->c_flags = CALLOUT_RETURNUNLOCKED;
+		_callout_init_lock(c, NULL, CALLOUT_RETURNUNLOCKED);
 	} else {
-		c->c_lock = &Giant.lock_object;
-		c->c_flags = 0;
+		_callout_init_lock(c, &Giant.lock_object, 0);
 	}
-	c->c_cpu = timeout_cpu;
 }
 
 void
@@ -1279,15 +1256,26 @@
 _callout_init_lock(struct callout *c, struct lock_object *lock, int flags)
 {
 	bzero(c, sizeof *c);
+	KASSERT((flags & ~CALLOUT_RETURNUNLOCKED) == 0,
+	    ("callout_init_lock: bad flags 0x%08x", flags));
+	flags &= CALLOUT_RETURNUNLOCKED;
+	if (lock != NULL) {
+		struct lock_class *class = LOCK_CLASS(lock);
+		if (class == &lock_class_mtx_sleep)
+			flags |= CALLOUT_SET_LC(CALLOUT_LC_MUTEX);
+		else if (class == &lock_class_mtx_spin)
+			flags |= CALLOUT_SET_LC(CALLOUT_LC_SPIN);
+		else if (class == &lock_class_rm)
+			flags |= CALLOUT_SET_LC(CALLOUT_LC_RM);
+		else if (class == &lock_class_rw)
+			flags |= CALLOUT_SET_LC(CALLOUT_LC_RW);
+		else
+			panic("callout_init_lock: Unsupported lock class '%s'\n", class->lc_name);
+	} else {
+		flags |= CALLOUT_SET_LC(CALLOUT_LC_UNUSED_0);
+	}
 	c->c_lock = lock;
-	KASSERT((flags & ~(CALLOUT_RETURNUNLOCKED | CALLOUT_SHAREDLOCK)) == 0,
-	    ("callout_init_lock: bad flags %d", flags));
-	KASSERT(lock != NULL || (flags & CALLOUT_RETURNUNLOCKED) == 0,
-	    ("callout_init_lock: CALLOUT_RETURNUNLOCKED with no lock"));
-	KASSERT(lock == NULL || !(LOCK_CLASS(lock)->lc_flags &
-	    (LC_SPINLOCK | LC_SLEEPABLE)), ("%s: invalid lock class",
-	    __func__));
-	c->c_flags = flags & (CALLOUT_RETURNUNLOCKED | CALLOUT_SHAREDLOCK);
+	c->c_flags = flags;
 	c->c_cpu = timeout_cpu;
 }
 
Index: sys/kern/subr_sleepqueue.c
===================================================================
--- sys/kern/subr_sleepqueue.c	(revision 276531)
+++ sys/kern/subr_sleepqueue.c	(working copy)
@@ -152,7 +152,8 @@
  */
 static int	sleepq_catch_signals(void *wchan, int pri);
 static int	sleepq_check_signals(void);
-static int	sleepq_check_timeout(void);
+static int	sleepq_check_timeout(struct thread *);
+static void	sleepq_stop_timeout(struct thread *);
 #ifdef INVARIANTS
 static void	sleepq_dtor(void *mem, int size, void *arg);
 #endif
@@ -373,17 +374,14 @@
 sleepq_set_timeout_sbt(void *wchan, sbintime_t sbt, sbintime_t pr,
     int flags)
 {
-	struct sleepqueue_chain *sc;
 	struct thread *td;
 
 	td = curthread;
-	sc = SC_LOOKUP(wchan);
-	mtx_assert(&sc->sc_lock, MA_OWNED);
-	MPASS(TD_ON_SLEEPQ(td));
-	MPASS(td->td_sleepqueue == NULL);
-	MPASS(wchan != NULL);
+
+	mtx_lock_spin(&td->td_slpmutex);
 	callout_reset_sbt_on(&td->td_slpcallout, sbt, pr,
 	    sleepq_timeout, td, PCPU_GET(cpuid), flags | C_DIRECT_EXEC);
+	mtx_unlock_spin(&td->td_slpmutex);
 }
 
 /*
@@ -559,11 +557,8 @@
  * Check to see if we timed out.
  */
 static int
-sleepq_check_timeout(void)
+sleepq_check_timeout(struct thread *td)
 {
-	struct thread *td;
-
-	td = curthread;
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 
 	/*
@@ -573,28 +568,21 @@
 		td->td_flags &= ~TDF_TIMEOUT;
 		return (EWOULDBLOCK);
 	}
-
-	/*
-	 * If TDF_TIMOFAIL is set, the timeout ran after we had
-	 * already been woken up.
-	 */
-	if (td->td_flags & TDF_TIMOFAIL)
-		td->td_flags &= ~TDF_TIMOFAIL;
-
-	/*
-	 * If callout_stop() fails, then the timeout is running on
-	 * another CPU, so synchronize with it to avoid having it
-	 * accidentally wake up a subsequent sleep.
-	 */
-	else if (callout_stop(&td->td_slpcallout) == 0) {
-		td->td_flags |= TDF_TIMEOUT;
-		TD_SET_SLEEPING(td);
-		mi_switch(SW_INVOL | SWT_SLEEPQTIMO, NULL);
-	}
 	return (0);
 }
 
 /*
+ * Atomically stop the timeout by using a mutex.
+ */
+static void
+sleepq_stop_timeout(struct thread *td)
+{
+	mtx_lock_spin(&td->td_slpmutex);
+	callout_stop(&td->td_slpcallout);
+	mtx_unlock_spin(&td->td_slpmutex);
+}
+
+/*
  * Check to see if we were awoken by a signal.
  */
 static int
@@ -664,9 +652,11 @@
 	MPASS(!(td->td_flags & TDF_SINTR));
 	thread_lock(td);
 	sleepq_switch(wchan, pri);
-	rval = sleepq_check_timeout();
+	rval = sleepq_check_timeout(td);
 	thread_unlock(td);
 
+	sleepq_stop_timeout(td);
+
 	return (rval);
 }
 
@@ -677,12 +667,18 @@
 int
 sleepq_timedwait_sig(void *wchan, int pri)
 {
+	struct thread *td;
 	int rcatch, rvalt, rvals;
 
+	td = curthread;
+
 	rcatch = sleepq_catch_signals(wchan, pri);
-	rvalt = sleepq_check_timeout();
+	rvalt = sleepq_check_timeout(td);
 	rvals = sleepq_check_signals();
-	thread_unlock(curthread);
+	thread_unlock(td);
+
+	sleepq_stop_timeout(td);
+
 	if (rcatch)
 		return (rcatch);
 	if (rvals)
@@ -889,64 +885,49 @@
 static void
 sleepq_timeout(void *arg)
 {
-	struct sleepqueue_chain *sc;
-	struct sleepqueue *sq;
-	struct thread *td;
-	void *wchan;
-	int wakeup_swapper;
+	struct thread *td = arg;
+	int wakeup_swapper = 0;
 
-	td = arg;
-	wakeup_swapper = 0;
 	CTR3(KTR_PROC, "sleepq_timeout: thread %p (pid %ld, %s)",
 	    (void *)td, (long)td->td_proc->p_pid, (void *)td->td_name);
 
-	/*
-	 * First, see if the thread is asleep and get the wait channel if
-	 * it is.
-	 */
+	/* Handle the three cases which can happen */
+
 	thread_lock(td);
-	if (TD_IS_SLEEPING(td) && TD_ON_SLEEPQ(td)) {
-		wchan = td->td_wchan;
-		sc = SC_LOOKUP(wchan);
-		THREAD_LOCKPTR_ASSERT(td, &sc->sc_lock);
-		sq = sleepq_lookup(wchan);
-		MPASS(sq != NULL);
-		td->td_flags |= TDF_TIMEOUT;
-		wakeup_swapper = sleepq_resume_thread(sq, td, 0);
-		thread_unlock(td);
-		if (wakeup_swapper)
-			kick_proc0();
-		return;
-	}
+	if (TD_ON_SLEEPQ(td)) {
+		if (TD_IS_SLEEPING(td)) {
+			struct sleepqueue_chain *sc;
+			struct sleepqueue *sq;
+			void *wchan;
 
-	/*
-	 * If the thread is on the SLEEPQ but isn't sleeping yet, it
-	 * can either be on another CPU in between sleepq_add() and
-	 * one of the sleepq_*wait*() routines or it can be in
-	 * sleepq_catch_signals().
-	 */
-	if (TD_ON_SLEEPQ(td)) {
-		td->td_flags |= TDF_TIMEOUT;
-		thread_unlock(td);
-		return;
+			/*
+			 * Case I - thread is asleep and needs to be
+			 * awoken:
+			 */
+			wchan = td->td_wchan;
+			sc = SC_LOOKUP(wchan);
+			THREAD_LOCKPTR_ASSERT(td, &sc->sc_lock);
+			sq = sleepq_lookup(wchan);
+			MPASS(sq != NULL);
+			td->td_flags |= TDF_TIMEOUT;
+			wakeup_swapper = sleepq_resume_thread(sq, td, 0);
+		} else {
+			/*
+			 * Case II - cancel going to sleep by setting
+			 * the timeout flag because the target thread
+			 * is not asleep yet. It can be on another CPU
+			 * in between sleepq_add() and one of the
+			 * sleepq_*wait*() routines or it can be in
+			 * sleepq_catch_signals().
+			 */
+			td->td_flags |= TDF_TIMEOUT;
+		}
+	} else {
+		/*
+		 * Case III - thread is already woken up by a wakeup
+		 * call and should not timeout. Nothing to do!
+		 */
 	}
-
-	/*
-	 * Now check for the edge cases.  First, if TDF_TIMEOUT is set,
-	 * then the other thread has already yielded to us, so clear
-	 * the flag and resume it.  If TDF_TIMEOUT is not set, then the
-	 * we know that the other thread is not on a sleep queue, but it
-	 * hasn't resumed execution yet.  In that case, set TDF_TIMOFAIL
-	 * to let it know that the timeout has already run and doesn't
-	 * need to be canceled.
-	 */
-	if (td->td_flags & TDF_TIMEOUT) {
-		MPASS(TD_IS_SLEEPING(td));
-		td->td_flags &= ~TDF_TIMEOUT;
-		TD_CLR_SLEEPING(td);
-		wakeup_swapper = setrunnable(td);
-	} else
-		td->td_flags |= TDF_TIMOFAIL;
 	thread_unlock(td);
 	if (wakeup_swapper)
 		kick_proc0();
Index: sys/sys/_callout.h
===================================================================
--- sys/sys/_callout.h	(revision 276531)
+++ sys/sys/_callout.h	(working copy)
@@ -46,6 +46,17 @@
 SLIST_HEAD(callout_slist, callout);
 TAILQ_HEAD(callout_tailq, callout);
 
+typedef void callout_func_t(void *);
+
+struct callout_args {
+	sbintime_t time;		/* absolute time for the event */
+	sbintime_t precision;		/* delta allowed wrt opt */
+	void	*arg;			/* function argument */
+	callout_func_t *func;		/* function to call */
+	int	flags;			/* flags passed to callout_reset() */
+	int	cpu;			/* CPU we're scheduled on */
+};
+
 struct callout {
 	union {
 		LIST_ENTRY(callout) le;
@@ -52,13 +63,13 @@
 		SLIST_ENTRY(callout) sle;
 		TAILQ_ENTRY(callout) tqe;
 	} c_links;
-	sbintime_t c_time;			/* ticks to the event */
+	sbintime_t c_time;			/* absolute time for the event */
 	sbintime_t c_precision;			/* delta allowed wrt opt */
 	void	*c_arg;				/* function argument */
-	void	(*c_func)(void *);		/* function to call */
-	struct lock_object *c_lock;		/* lock to handle */
+	callout_func_t *c_func;			/* function to call */
+	struct lock_object *c_lock;		/* callback lock */
 	int	c_flags;			/* state of this entry */
-	volatile int c_cpu;			/* CPU we're scheduled on */
+	int	c_cpu;				/* CPU we're scheduled on */
 };
 
 #endif
Index: sys/sys/callout.h
===================================================================
--- sys/sys/callout.h	(revision 276531)
+++ sys/sys/callout.h	(working copy)
@@ -45,10 +45,12 @@
 #define	CALLOUT_PENDING		0x0004 /* callout is waiting for timeout */
 #define	CALLOUT_MPSAFE		0x0008 /* callout handler is mp safe */
 #define	CALLOUT_RETURNUNLOCKED	0x0010 /* handler returns with mtx unlocked */
-#define	CALLOUT_SHAREDLOCK	0x0020 /* callout lock held in shared mode */
-#define	CALLOUT_DFRMIGRATION	0x0040 /* callout in deferred migration mode */
+#define	CALLOUT_UNUSED_5	0x0020 /* --available-- */
+#define	CALLOUT_DEFRESTART	0x0040 /* callout restart is deferred */
 #define	CALLOUT_PROCESSED	0x0080 /* callout in wheel or processing list? */
 #define	CALLOUT_DIRECT 		0x0100 /* allow exec from hw int context */
+#define	CALLOUT_SET_LC(x)	(((x) & 7) << 16) /* set lock class */
+#define	CALLOUT_GET_LC(x)	(((x) >> 16) & 7) /* get lock class */
 
 #define	C_DIRECT_EXEC		0x0001 /* direct execution of callout */
 #define	C_PRELBITS		7
@@ -65,7 +67,8 @@
 #ifdef _KERNEL
 #define	callout_active(c)	((c)->c_flags & CALLOUT_ACTIVE)
 #define	callout_deactivate(c)	((c)->c_flags &= ~CALLOUT_ACTIVE)
-#define	callout_drain(c)	_callout_stop_safe(c, 1)
+int	callout_drain(struct callout *);
+int	callout_drain_async(struct callout *, callout_func_t *, void *);
 void	callout_init(struct callout *, int);
 void	_callout_init_lock(struct callout *, struct lock_object *, int);
 #define	callout_init_mtx(c, mtx, flags)					\
@@ -79,7 +82,7 @@
 	   NULL, (flags))
 #define	callout_pending(c)	((c)->c_flags & CALLOUT_PENDING)
 int	callout_reset_sbt_on(struct callout *, sbintime_t, sbintime_t,
-	    void (*)(void *), void *, int, int);
+	    callout_func_t *, void *, int, int);
 #define	callout_reset_sbt(c, sbt, pr, fn, arg, flags)			\
     callout_reset_sbt_on((c), (sbt), (pr), (fn), (arg), (c)->c_cpu, (flags))
 #define	callout_reset_sbt_curcpu(c, sbt, pr, fn, arg, flags)		\
@@ -103,8 +106,7 @@
 int	callout_schedule_on(struct callout *, int, int);
 #define	callout_schedule_curcpu(c, on_tick)				\
     callout_schedule_on((c), (on_tick), PCPU_GET(cpuid))
-#define	callout_stop(c)		_callout_stop_safe(c, 0)
-int	_callout_stop_safe(struct callout *, int);
+int	callout_stop(struct callout *);
 void	callout_process(sbintime_t now);
 
 #endif
Index: sys/sys/proc.h
===================================================================
--- sys/sys/proc.h	(revision 276531)
+++ sys/sys/proc.h	(working copy)
@@ -308,6 +308,7 @@
 	} td_uretoff;			/* (k) Syscall aux returns. */
 #define td_retval	td_uretoff.tdu_retval
 	struct callout	td_slpcallout;	/* (h) Callout for sleep. */
+	struct mtx	td_slpmutex;	/* (h) Mutex for sleep callout */
 	struct trapframe *td_frame;	/* (k) */
 	struct vm_object *td_kstack_obj;/* (a) Kstack object. */
 	vm_offset_t	td_kstack;	/* (a) Kernel VA of kstack. */
@@ -364,7 +365,7 @@
 #define	TDF_ALLPROCSUSP	0x00000200 /* suspended by SINGLE_ALLPROC */
 #define	TDF_BOUNDARY	0x00000400 /* Thread suspended at user boundary */
 #define	TDF_ASTPENDING	0x00000800 /* Thread has some asynchronous events. */
-#define	TDF_TIMOFAIL	0x00001000 /* Timeout from sleep after we were awake. */
+#define	TDF_UNUSED12	0x00001000 /* --available-- */
 #define	TDF_SBDRY	0x00002000 /* Stop only on usermode boundary. */
 #define	TDF_UPIBLOCKED	0x00004000 /* Thread blocked on user PI mutex. */
 #define	TDF_NEEDSUSPCHK	0x00008000 /* Thread may need to suspend. */
@@ -704,7 +705,7 @@
 #define	SWT_OWEPREEMPT		2	/* Switching due to opepreempt. */
 #define	SWT_TURNSTILE		3	/* Turnstile contention. */
 #define	SWT_SLEEPQ		4	/* Sleepq wait. */
-#define	SWT_SLEEPQTIMO		5	/* Sleepq timeout wait. */
+#define	SWT_UNUSED5		5	/* --available-- */
 #define	SWT_RELINQUISH		6	/* yield call. */
 #define	SWT_NEEDRESCHED		7	/* NEEDRESCHED was set. */
 #define	SWT_IDLE		8	/* Switching from the idle thread. */