On Thu, Jan 14, 2010 at 01:11:33AM +0100, Jan Kiszka wrote:
> Paul E. McKenney wrote:
> > On Tue, Jan 12, 2010 at 09:28:15AM +0100, Jan Kiszka wrote:
> >> If so, I will try to write something like this the next days. Will
> >> surely appreciate your review afterwards!
> >
> > Sounds good!
> >
>
> Here we go, find the commit inlined below. You may be primarily
> interested in kvm_synchronize_sched_expedited() and the helper thread
> function kvm_rcu_sync_thread. As I use a dedicated thread I simplified
> some parts compared to upstream. Light testing indicates that it works.
Cool!!!

Have you had a chance to run rcutorture on this implementation?

Thanx, Paul
> The full set of changes (specifically interesting for anyone who wants
> to test it) is available on kernel.org. There is now also a
> commit [1] included for CPU hotplug support, but it's untested as I don't
> have the required hardware at hand.
>
> Thanks in advance,
> Jan
>
> [1]
> http://git.kernel.org/?p=virt/kvm/kvm-kmod.git;a=commitdiff;h=81a3ddcd8ea343a9570f3164bb31160ff4be0ab7
>
> --------->
>
> From fe13d0785586623176879a87949ad626dace192c Mon Sep 17 00:00:00 2001
> From: Jan Kiszka <[email protected]>
> Date: Wed, 13 Jan 2010 09:32:50 +0100
> Subject: [PATCH] Compat support for synchronize_srcu_expedited
>
> This service was first introduced in 2.6.32 and further optimized in
> 2.6.33. KVM makes use of it to replace the rw-sem based slot lock
> with SRCU.
>
> TODO: Add CPU-hotplug support to the sync kthreads.
>
> Signed-off-by: Jan Kiszka <[email protected]>
> ---
> external-module-compat-comm.h | 17 ++
> srcu.c | 382 ++++++++++++++++++++++++++++++-----------
> sync | 27 ++-
> 3 files changed, 316 insertions(+), 110 deletions(-)
>
> diff --git a/external-module-compat-comm.h b/external-module-compat-comm.h
> index ca57567..eabf8a1 100644
> --- a/external-module-compat-comm.h
> +++ b/external-module-compat-comm.h
> @@ -1053,3 +1053,20 @@ static inline void kvm_fire_urn(void)
> static inline void kvm_fire_urn(void) {}
>
> #endif /* CONFIG_USER_RETURN_NOTIFIER */
> +
> +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33)
> +
> +#ifdef CONFIG_SMP
> +void kvm_synchronize_srcu_expedited(struct srcu_struct *sp);
> +#else
> +static inline void kvm_synchronize_srcu_expedited(struct srcu_struct *sp) { }
> +#endif
> +
> +#else
> +
> +#define kvm_synchronize_srcu_expedited synchronize_srcu_expedited
> +
> +#endif
> +
> +int kvm_init_srcu(void);
> +void kvm_exit_srcu(void);
> diff --git a/srcu.c b/srcu.c
> index e9734bc..985d627 100644
> --- a/srcu.c
> +++ b/srcu.c
> @@ -24,8 +24,6 @@
> *
> */
>
> -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19)
> -
> #include <linux/module.h>
> #include <linux/mutex.h>
> #include <linux/percpu.h>
> @@ -35,6 +33,118 @@
> #include <linux/slab.h>
> #include <linux/smp.h>
> #include <linux/srcu.h>
> +#include <linux/kthread.h>
> +
> +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19) || \
> + (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33) && defined(CONFIG_SMP))
> +
> +/*
> + * srcu_readers_active_idx -- returns approximate number of readers
> + * active on the specified rank of per-CPU counters.
> + */
> +
> +static int srcu_readers_active_idx(struct srcu_struct *sp, int idx)
> +{
> + int cpu;
> + int sum;
> +
> + sum = 0;
> + for_each_possible_cpu(cpu)
> + sum += per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx];
> + return sum;
> +}
> +
> +/*
> + * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
> + */
> +static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
> +{
> + int idx;
> +
> + idx = sp->completed;
> + mutex_lock(&sp->mutex);
> +
> + /*
> + * Check to see if someone else did the work for us while we were
> + * waiting to acquire the lock. We need -two- advances of
> + * the counter, not just one. If there was but one, we might have
> + * shown up -after- our helper's first synchronize_sched(), thus
> + * having failed to prevent CPU-reordering races with concurrent
> + * srcu_read_unlock()s on other CPUs (see comment below). So we
> + * either (1) wait for two or (2) supply the second ourselves.
> + */
> +
> + if ((sp->completed - idx) >= 2) {
> + mutex_unlock(&sp->mutex);
> + return;
> + }
> +
> + sync_func(); /* Force memory barrier on all CPUs. */
> +
> + /*
> + * The preceding synchronize_sched() ensures that any CPU that
> + * sees the new value of sp->completed will also see any preceding
> + * changes to data structures made by this CPU. This prevents
> + * some other CPU from reordering the accesses in its SRCU
> + * read-side critical section to precede the corresponding
> + * srcu_read_lock() -- ensuring that such references will in
> + * fact be protected.
> + *
> + * So it is now safe to do the flip.
> + */
> +
> + idx = sp->completed & 0x1;
> + sp->completed++;
> +
> + sync_func(); /* Force memory barrier on all CPUs. */
> +
> + /*
> + * At this point, because of the preceding synchronize_sched(),
> + * all srcu_read_lock() calls using the old counters have completed.
> + * Their corresponding critical sections might well be still
> + * executing, but the srcu_read_lock() primitives themselves
> + * will have finished executing.
> + */
> +
> + while (srcu_readers_active_idx(sp, idx))
> + schedule_timeout_interruptible(1);
> +
> + sync_func(); /* Force memory barrier on all CPUs. */
> +
> + /*
> + * The preceding synchronize_sched() forces all srcu_read_unlock()
> + * primitives that were executing concurrently with the preceding
> + * for_each_possible_cpu() loop to have completed by this point.
> + * More importantly, it also forces the corresponding SRCU read-side
> + * critical sections to have also completed, and the corresponding
> + * references to SRCU-protected data items to be dropped.
> + *
> + * Note:
> + *
> + * Despite what you might think at first glance, the
> + * preceding synchronize_sched() -must- be within the
> + * critical section ended by the following mutex_unlock().
> + * Otherwise, a task taking the early exit can race
> + * with a srcu_read_unlock(), which might have executed
> + * just before the preceding srcu_readers_active() check,
> + * and whose CPU might have reordered the srcu_read_unlock()
> + * with the preceding critical section. In this case, there
> + * is nothing preventing the synchronize_sched() task that is
> + * taking the early exit from freeing a data structure that
> + * is still being referenced (out of order) by the task
> + * doing the srcu_read_unlock().
> + *
> + * Alternatively, the comparison with "2" on the early exit
> + * could be changed to "3", but this increases synchronize_srcu()
> + * latency for bulk loads. So the current code is preferred.
> + */
> +
> + mutex_unlock(&sp->mutex);
> +}
> +
> +#endif
> +
> +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19)
>
> #undef kvm_init_srcu_struct
> #undef kvm_cleanup_srcu_struct
> @@ -42,6 +152,11 @@
> #undef kvm_srcu_read_unlock
> #undef kvm_synchronize_srcu
> #undef kvm_srcu_batches_completed
> +
> +static int srcu_readers_active_idx(struct srcu_struct *sp, int idx);
> +static void
> +__synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void));
> +
> /**
> * init_srcu_struct - initialize a sleep-RCU structure
> * @sp: structure to initialize.
> @@ -58,22 +173,6 @@ int kvm_init_srcu_struct(struct srcu_struct *sp)
> return (sp->per_cpu_ref ? 0 : -ENOMEM);
> }
>
> -/*
> - * srcu_readers_active_idx -- returns approximate number of readers
> - * active on the specified rank of per-CPU counters.
> - */
> -
> -static int srcu_readers_active_idx(struct srcu_struct *sp, int idx)
> -{
> - int cpu;
> - int sum;
> -
> - sum = 0;
> - for_each_possible_cpu(cpu)
> - sum += per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx];
> - return sum;
> -}
> -
> /**
> * srcu_readers_active - returns approximate number of readers.
> * @sp: which srcu_struct to count active readers (holding srcu_read_lock).
> @@ -154,94 +253,14 @@ void kvm_srcu_read_unlock(struct srcu_struct *sp, int idx)
> * synchronizing concurrent updates. Can block; must be called from
> * process context.
> *
> - * Note that it is illegal to call synchornize_srcu() from the corresponding
> + * Note that it is illegal to call synchronize_srcu() from the corresponding
> * SRCU read-side critical section; doing so will result in deadlock.
> * However, it is perfectly legal to call synchronize_srcu() on one
> * srcu_struct from some other srcu_struct's read-side critical section.
> */
> void kvm_synchronize_srcu(struct srcu_struct *sp)
> {
> - int idx;
> -
> - idx = sp->completed;
> - mutex_lock(&sp->mutex);
> -
> - /*
> - * Check to see if someone else did the work for us while we were
> - * waiting to acquire the lock. We need -two- advances of
> - * the counter, not just one. If there was but one, we might have
> - * shown up -after- our helper's first synchronize_sched(), thus
> - * having failed to prevent CPU-reordering races with concurrent
> - * srcu_read_unlock()s on other CPUs (see comment below). So we
> - * either (1) wait for two or (2) supply the second ourselves.
> - */
> -
> - if ((sp->completed - idx) >= 2) {
> - mutex_unlock(&sp->mutex);
> - return;
> - }
> -
> - synchronize_sched(); /* Force memory barrier on all CPUs. */
> -
> - /*
> - * The preceding synchronize_sched() ensures that any CPU that
> - * sees the new value of sp->completed will also see any preceding
> - * changes to data structures made by this CPU. This prevents
> - * some other CPU from reordering the accesses in its SRCU
> - * read-side critical section to precede the corresponding
> - * srcu_read_lock() -- ensuring that such references will in
> - * fact be protected.
> - *
> - * So it is now safe to do the flip.
> - */
> -
> - idx = sp->completed & 0x1;
> - sp->completed++;
> -
> - synchronize_sched(); /* Force memory barrier on all CPUs. */
> -
> - /*
> - * At this point, because of the preceding synchronize_sched(),
> - * all srcu_read_lock() calls using the old counters have completed.
> - * Their corresponding critical sections might well be still
> - * executing, but the srcu_read_lock() primitives themselves
> - * will have finished executing.
> - */
> -
> - while (srcu_readers_active_idx(sp, idx))
> - schedule_timeout_interruptible(1);
> -
> - synchronize_sched(); /* Force memory barrier on all CPUs. */
> -
> - /*
> - * The preceding synchronize_sched() forces all srcu_read_unlock()
> - * primitives that were executing concurrently with the preceding
> - * for_each_possible_cpu() loop to have completed by this point.
> - * More importantly, it also forces the corresponding SRCU read-side
> - * critical sections to have also completed, and the corresponding
> - * references to SRCU-protected data items to be dropped.
> - *
> - * Note:
> - *
> - * Despite what you might think at first glance, the
> - * preceding synchronize_sched() -must- be within the
> - * critical section ended by the following mutex_unlock().
> - * Otherwise, a task taking the early exit can race
> - * with a srcu_read_unlock(), which might have executed
> - * just before the preceding srcu_readers_active() check,
> - * and whose CPU might have reordered the srcu_read_unlock()
> - * with the preceding critical section. In this case, there
> - * is nothing preventing the synchronize_sched() task that is
> - * taking the early exit from freeing a data structure that
> - * is still being referenced (out of order) by the task
> - * doing the srcu_read_unlock().
> - *
> - * Alternatively, the comparison with "2" on the early exit
> - * could be changed to "3", but this increases synchronize_srcu()
> - * latency for bulk loads. So the current code is preferred.
> - */
> -
> - mutex_unlock(&sp->mutex);
> + __synchronize_srcu(sp, synchronize_sched);
> }
>
> /**
> @@ -265,3 +284,166 @@ EXPORT_SYMBOL_GPL(kvm_synchronize_srcu);
> EXPORT_SYMBOL_GPL(kvm_srcu_batches_completed);
>
> #endif
> +
> +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33) && defined(CONFIG_SMP)
> +
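> +/*
> + * One request slot and one dedicated helper thread per CPU; requests
> + * are posted by kvm_synchronize_sched_expedited() and acknowledged
> + * through the ->done completion.
> + */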
> +struct sync_req {
> + struct list_head list;
> + bool pending;
> + bool success;
> + struct completion done;
> +};
> +
> +static DEFINE_PER_CPU(struct sync_req, sync_req);
> +static DEFINE_PER_CPU(struct task_struct *, sync_thread);
> +static DEFINE_MUTEX(rcu_sched_expedited_mutex);
> +
> +static long synchronize_sched_expedited_count;
> +
> +static int kvm_rcu_sync_thread(void *data)
> +{
> + int badcpu;
> + int cpu = (long)data;
> + struct sync_req *req = &per_cpu(sync_req, cpu);
> +
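> + /*
> + * Sleep until kvm_synchronize_sched_expedited() posts a request.
> + * Getting scheduled here is the whole point: it forces a context
> + * switch on this CPU, so any RCU-sched read-side critical section
> + * that was running on it has completed.
> + */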
> + set_current_state(TASK_INTERRUPTIBLE);
> + while (!kthread_should_stop()) {
> + if (!req->pending) {
> + schedule();
> + set_current_state(TASK_INTERRUPTIBLE);
> + continue;
> + }
> + req->pending = false;
> +
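> + /*
> + * Double-check that we really ran on the CPU this thread is bound
> + * to; if not, report failure so the caller falls back to a full
> + * synchronize_sched().
> + */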
> + preempt_disable();
> + badcpu = smp_processor_id();
> + if (likely(cpu == badcpu)) {
> + req->success = true;
> + } else {
> + req->success = false;
> + WARN_ONCE(1, "kvm_rcu_sync_thread() on CPU %d, "
> + "expected %d\n", badcpu, cpu);
> + }
> + preempt_enable();
> +
> + complete(&req->done);
> + }
> + __set_current_state(TASK_RUNNING);
> +
> + return 0;
> +}
> +
> +static void kvm_synchronize_sched_expedited(void)
> +{
> + int cpu;
> + bool need_full_sync = 0;
> + struct sync_req *req;
> + long snap;
> + int trycount = 0;
> +
> + smp_mb(); /* ensure prior mod happens before capturing snap. */
> + snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1;
> + get_online_cpus();
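> + /*
> + * Someone else may already be running an expedited grace period.
> + * Retry the lock a few times with short delays, fall back to
> + * synchronize_sched() if it stays contended, or return early if a
> + * full expedited pass completed after we took our snapshot.
> + */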
> + while (!mutex_trylock(&rcu_sched_expedited_mutex)) {
> + put_online_cpus();
> + if (trycount++ < 10)
> + udelay(trycount * num_online_cpus());
> + else {
> + synchronize_sched();
> + return;
> + }
> + if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) {
> + smp_mb(); /* ensure test happens before caller kfree */
> + return;
> + }
> + get_online_cpus();
> + }
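> + /*
> + * Post a request to the high-priority sync thread of every online
> + * CPU and wait for each to run; a context switch on every CPU is
> + * what stands in for synchronize_sched() here.
> + */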
> + for_each_online_cpu(cpu) {
> + req = &per_cpu(sync_req, cpu);
> + init_completion(&req->done);
> + smp_wmb();
> + req->pending = true;
> + wake_up_process(per_cpu(sync_thread, cpu));
> + }
> + for_each_online_cpu(cpu) {
> + req = &per_cpu(sync_req, cpu);
> + wait_for_completion(&req->done);
> + if (unlikely(!req->success))
> + need_full_sync = 1;
> + }
> + synchronize_sched_expedited_count++;
> + mutex_unlock(&rcu_sched_expedited_mutex);
> + put_online_cpus();
> + if (need_full_sync)
> + synchronize_sched();
> +}
> +
> +/**
> + * synchronize_srcu_expedited - like synchronize_srcu, but less patient
> + * @sp: srcu_struct with which to synchronize.
> + *
> + * Flip the completed counter, and wait for the old count to drain to zero.
> + * As with classic RCU, the updater must use some separate means of
> + * synchronizing concurrent updates. Can block; must be called from
> + * process context.
> + *
> + * Note that it is illegal to call synchronize_srcu_expedited()
> + * from the corresponding SRCU read-side critical section; doing so
> + * will result in deadlock. However, it is perfectly legal to call
> + * synchronize_srcu_expedited() on one srcu_struct from some other
> + * srcu_struct's read-side critical section.
> + */
> +void kvm_synchronize_srcu_expedited(struct srcu_struct *sp)
> +{
> + __synchronize_srcu(sp, kvm_synchronize_sched_expedited);
> +}
> +EXPORT_SYMBOL_GPL(kvm_synchronize_srcu_expedited);
> +
> +int kvm_init_srcu(void)
> +{
> + struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
> + struct task_struct *p;
> + int cpu;
> + int err;
> +
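> + /*
> + * Create one max-priority SCHED_FIFO helper thread per online CPU,
> + * bound to that CPU, so that sync requests are serviced promptly.
> + */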
> + for_each_online_cpu(cpu) {
> + p = kthread_create(kvm_rcu_sync_thread, (void *)(long)cpu,
> + "kvmsrcusync/%d", cpu);
> + if (IS_ERR(p))
> + goto error_out;
> +
> + kthread_bind(p, cpu);
> + sched_setscheduler(p, SCHED_FIFO, &param);
> + per_cpu(sync_thread, cpu) = p;
> + wake_up_process(p);
> + }
> + return 0;
> +
> +error_out:
> + printk(KERN_ERR "kvm: kvmsrcusync for %d failed\n", cpu);
> + err = PTR_ERR(p);
> + kvm_exit_srcu();
> + return err;
> +}
> +
> +void kvm_exit_srcu(void)
> +{
> + int cpu;
> +
> + for_each_online_cpu(cpu)
> + if (per_cpu(sync_thread, cpu))
> + kthread_stop(per_cpu(sync_thread, cpu));
> +}
> +
> +#else
> +
> +int kvm_init_srcu(void)
> +{
> + return 0;
> +}
> +
> +void kvm_exit_srcu(void)
> +{
> +}
> +
> +#endif
> diff --git a/sync b/sync
> index 0eaff31..d67469e 100755
> --- a/sync
> +++ b/sync
> @@ -40,8 +40,9 @@ def __hack(data):
> 'do_machine_check eventfd_signal get_desc_base get_desc_limit '
> 'vma_kernel_pagesize native_read_tsc user_return_notifier '
> 'user_return_notifier_register user_return_notifier_unregister '
> + 'synchronize_srcu_expedited '
> )
> - anon_inodes = anon_inodes_exit = False
> + kvm_init = kvm_exit = False
> mce = False
> result = []
> pr_fmt = ''
> @@ -58,24 +59,30 @@ def __hack(data):
> pr_fmt = sub(r'#define pr_fmt\([^)]*\) ("[^"]*").*', r'\1',
> line) + ' '
> line = ''
> line = sub(r'pr_debug\(([^),]*)', r'pr_debug(' + pr_fmt + r'\1',
> line)
> - if match(r'^int kvm_init\('): anon_inodes = True
> - if match(r'r = kvm_arch_init\(opaque\);') and anon_inodes:
> + if match(r'^int kvm_init\('): kvm_init = True
> + if match(r'r = kvm_arch_init\(opaque\);') and kvm_init:
> w('\tr = kvm_init_anon_inodes();')
> w('\tif (r)')
> w('\t\treturn r;\n')
> + w('\tr = kvm_init_srcu();')
> + w('\tif (r)')
> + w('\t\tgoto cleanup_anon_inodes;\n')
> w('\tpreempt_notifier_sys_init();')
> w('\thrtimer_kallsyms_resolve();\n')
> - if match(r'return 0;') and anon_inodes:
> + if match(r'return 0;') and kvm_init:
> w('\tprintk("loaded kvm module (%s)\\n");\n' % (version,))
> - if match(r'return r;') and anon_inodes:
> - w('\tkvm_exit_anon_inodes();')
> + if match(r'return r;') and kvm_init:
> w('\tpreempt_notifier_sys_exit();')
> - anon_inodes = False
> - if match(r'^void kvm_exit'): anon_inodes_exit = True
> - if match(r'\}') and anon_inodes_exit:
> + w('\tkvm_exit_srcu();')
> + w('cleanup_anon_inodes:')
> w('\tkvm_exit_anon_inodes();')
> + kvm_init = False
> + if match(r'^void kvm_exit'): kvm_exit = True
> + if match(r'\}') and kvm_exit:
> w('\tpreempt_notifier_sys_exit();')
> - anon_inodes_exit = False
> + w('\tkvm_exit_srcu();')
> + w('\tkvm_exit_anon_inodes();')
> + kvm_exit = False
> if match(r'^int kvm_arch_init'): kvm_arch_init = True
> if match(r'\btsc_khz\b') and kvm_arch_init:
> line = sub(r'\btsc_khz\b', 'kvm_tsc_khz', line)
> --
> 1.6.0.2
>