Re: [mm PATCH 4/6] RCU: preemptible RCU

Dipankar Sarma Mon, 15 Jan 2007 11:39:42 -0800

From: Paul McKenney <[EMAIL PROTECTED]>

This patch implements a new version of RCU which allows its read-side
critical sections to be preempted. It uses a set of counter pairs
to keep track of the read-side critical sections and flips them
when all tasks exit read-side critical section. The details
of this implementation can be found in this paper -


http://www.rdrop.com/users/paulmck/RCU/OLSrtRCU.2006.08.11a.pdf

This patch was developed as a part of the -rt kernel
development and meant to provide better latencies when
read-side critical sections of RCU don't disable preemption.
As a consequence of keeping track of RCU readers, the readers
have a slight overhead (optimizations in the paper).
This implementation co-exists with the "classic" RCU
implementations and can be switched to at compiler.

Signed-off-by: Paul McKenney <[EMAIL PROTECTED]>
Signed-off-by: Dipankar Sarma <[EMAIL PROTECTED]>



diff -puN include/linux/init_task.h~rcu-preempt include/linux/init_task.h
--- linux-2.6.20-rc3-mm1-rcu/include/linux/init_task.h~rcu-preempt      
2007-01-15 15:36:51.000000000 +0530
+++ linux-2.6.20-rc3-mm1-rcu-dipankar/include/linux/init_task.h 2007-01-15 
15:36:51.000000000 +0530
@@ -90,6 +90,14 @@ extern struct nsproxy init_nsproxy;
 
 extern struct group_info init_groups;
 
+#ifdef CONFIG_PREEMPT_RCU
+#define INIT_PREEMPT_RCU                                               \
+       .rcu_read_lock_nesting = 0,                                     \
+       .rcu_flipctr_idx = 0,
+#else
+#define INIT_PREEMPT_RCU
+#endif
+
 /*
  *  INIT_TASK is used to set up the first task table, touch at
  * your own risk!. Base=0, limit=0x1fffff (=2MB)
@@ -111,6 +119,7 @@ extern struct group_info init_groups;
        .run_list       = LIST_HEAD_INIT(tsk.run_list),                 \
        .ioprio         = 0,                                            \
        .time_slice     = HZ,                                           \
+       INIT_PREEMPT_RCU                                                \
        .tasks          = LIST_HEAD_INIT(tsk.tasks),                    \
        .ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children),          \
        .ptrace_list    = LIST_HEAD_INIT(tsk.ptrace_list),              \
diff -puN include/linux/rcuclassic.h~rcu-preempt include/linux/rcuclassic.h
--- linux-2.6.20-rc3-mm1-rcu/include/linux/rcuclassic.h~rcu-preempt     
2007-01-15 15:36:51.000000000 +0530
+++ linux-2.6.20-rc3-mm1-rcu-dipankar/include/linux/rcuclassic.h        
2007-01-15 15:36:51.000000000 +0530
@@ -142,7 +142,6 @@ extern int rcu_needs_cpu(int cpu);
 extern void __rcu_init(void);
 extern void rcu_check_callbacks(int cpu, int user);
 extern void rcu_restart_cpu(int cpu);
-extern long rcu_batches_completed(void);
 
 #endif /* __KERNEL__ */
 #endif /* __LINUX_RCUCLASSIC_H */
diff -puN include/linux/rcupdate.h~rcu-preempt include/linux/rcupdate.h
--- linux-2.6.20-rc3-mm1-rcu/include/linux/rcupdate.h~rcu-preempt       
2007-01-15 15:36:51.000000000 +0530
+++ linux-2.6.20-rc3-mm1-rcu-dipankar/include/linux/rcupdate.h  2007-01-15 
15:36:51.000000000 +0530
@@ -41,7 +41,11 @@
 #include <linux/percpu.h>
 #include <linux/cpumask.h>
 #include <linux/seqlock.h>
+#ifdef CONFIG_CLASSIC_RCU
 #include <linux/rcuclassic.h>
+#else
+#include <linux/rcupreempt.h>
+#endif
 
 /**
  * struct rcu_head - callback structure for use with RCU
@@ -216,10 +220,13 @@ extern void FASTCALL(call_rcu_bh(struct 
 /* Exported common interfaces */
 extern void synchronize_rcu(void);
 extern void rcu_barrier(void);
+extern long rcu_batches_completed(void);
+extern long rcu_batches_completed_bh(void);
   
 /* Internal to kernel */
 extern void rcu_init(void);
 extern void rcu_check_callbacks(int cpu, int user);
+extern int rcu_needs_cpu(int cpu);
 
 #endif /* __KERNEL__ */
 #endif /* __LINUX_RCUPDATE_H */
diff -puN /dev/null include/linux/rcupreempt.h
--- /dev/null   2006-03-26 18:34:52.000000000 +0530
+++ linux-2.6.20-rc3-mm1-rcu-dipankar/include/linux/rcupreempt.h        
2007-01-15 15:36:51.000000000 +0530
@@ -0,0 +1,65 @@
+/*
+ * Read-Copy Update mechanism for mutual exclusion (RT implementation)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright IBM Corporation, 2006
+ *
+ * Author:  Paul McKenney <[EMAIL PROTECTED]>
+ * 
+ * Based on the original work by Paul McKenney <[EMAIL PROTECTED]>
+ * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
+ * Papers:
+ * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
+ * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
+ *
+ * For detailed explanation of Read-Copy Update mechanism see -
+ *             http://lse.sourceforge.net/locking/rcupdate.html
+ *
+ */
+
+#ifndef __LINUX_RCUPREEMPT_H
+#define __LINUX_RCUPREEMPT_H
+
+#ifdef __KERNEL__
+
+#include <linux/cache.h>
+#include <linux/spinlock.h>
+#include <linux/threads.h>
+#include <linux/percpu.h>
+#include <linux/cpumask.h>
+#include <linux/seqlock.h>
+
+#define rcu_qsctr_inc(cpu)
+#define rcu_bh_qsctr_inc(cpu)
+#define call_rcu_bh(head, rcu) call_rcu(head, rcu)
+
+extern void __rcu_read_lock(void);
+extern void __rcu_read_unlock(void);
+extern int rcu_pending(int cpu);
+
+#define __rcu_read_lock_bh()   { rcu_read_lock(); local_bh_disable(); }
+#define __rcu_read_unlock_bh() { local_bh_enable(); rcu_read_unlock(); }
+
+#define __rcu_read_lock_nesting()      (current->rcu_read_lock_nesting)
+
+extern void __synchronize_sched(void);
+
+extern void __rcu_init(void);
+extern void rcu_check_callbacks(int cpu, int user);
+extern void rcu_restart_cpu(int cpu);
+
+#endif /* __KERNEL__ */
+#endif /* __LINUX_RCUPREEMPT_H */
diff -puN include/linux/sched.h~rcu-preempt include/linux/sched.h
--- linux-2.6.20-rc3-mm1-rcu/include/linux/sched.h~rcu-preempt  2007-01-15 
15:36:51.000000000 +0530
+++ linux-2.6.20-rc3-mm1-rcu-dipankar/include/linux/sched.h     2007-01-15 
15:36:51.000000000 +0530
@@ -848,6 +848,11 @@ struct task_struct {
        cpumask_t cpus_allowed;
        unsigned int time_slice, first_time_slice;
 
+#ifdef CONFIG_PREEMPT_RCU
+        int rcu_read_lock_nesting;
+        int rcu_flipctr_idx;
+#endif
+
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
        struct sched_info sched_info;
 #endif
diff -puN kernel/fork.c~rcu-preempt kernel/fork.c
--- linux-2.6.20-rc3-mm1-rcu/kernel/fork.c~rcu-preempt  2007-01-15 
15:36:51.000000000 +0530
+++ linux-2.6.20-rc3-mm1-rcu-dipankar/kernel/fork.c     2007-01-15 
15:36:51.000000000 +0530
@@ -941,6 +941,16 @@ static inline void rt_mutex_init_task(st
 #endif
 }
 
+#ifdef CONFIG_PREEMPT_RCU
+static inline void rcu_task_init(struct task_struct *p)
+{
+       p->rcu_read_lock_nesting = 0;
+       p->rcu_flipctr_idx = 0;
+}
+#else
+static inline void rcu_task_init(struct task_struct *p) {}
+#endif
+
 /*
  * This creates a new process as a copy of the old one,
  * but does not actually start it yet.
@@ -1026,6 +1036,7 @@ static struct task_struct *copy_process(
 
        INIT_LIST_HEAD(&p->children);
        INIT_LIST_HEAD(&p->sibling);
+       rcu_task_init(p);
        p->vfork_done = NULL;
        spin_lock_init(&p->alloc_lock);
 
diff -puN kernel/Kconfig.preempt~rcu-preempt kernel/Kconfig.preempt
--- linux-2.6.20-rc3-mm1-rcu/kernel/Kconfig.preempt~rcu-preempt 2007-01-15 
15:36:51.000000000 +0530
+++ linux-2.6.20-rc3-mm1-rcu-dipankar/kernel/Kconfig.preempt    2007-01-15 
15:36:51.000000000 +0530
@@ -63,3 +63,29 @@ config PREEMPT_BKL
          Say Y here if you are building a kernel for a desktop system.
          Say N if you are unsure.
 
+choice
+       prompt "RCU implementation type:"
+       default CLASSIC_RCU
+
+config CLASSIC_RCU
+       bool "Classic RCU"
+       help
+         This option selects the classic RCU implementation that is
+         designed for best read-side performance on non-realtime
+         systems.
+
+         Say Y if you are unsure.
+
+config PREEMPT_RCU
+       bool "Preemptible RCU"
+       help
+         This option reduces the latency of the kernel by making certain
+         RCU sections preemptible. Normally RCU code is non-preemptible, if
+         this option is selected then read-only RCU sections become
+         preemptible. This helps latency, but may expose bugs due to
+         now-naive assumptions about each RCU read-side critical section
+         remaining on a given CPU through its execution.
+
+         Say N if you are unsure.
+
+endchoice
diff -puN kernel/Makefile~rcu-preempt kernel/Makefile
--- linux-2.6.20-rc3-mm1-rcu/kernel/Makefile~rcu-preempt        2007-01-15 
15:36:51.000000000 +0530
+++ linux-2.6.20-rc3-mm1-rcu-dipankar/kernel/Makefile   2007-01-15 
15:36:51.000000000 +0530
@@ -6,9 +6,9 @@ obj-y     = sched.o fork.o exec_domain.o
            exit.o itimer.o time.o softirq.o resource.o \
            sysctl.o capability.o ptrace.o timer.o user.o user_namespace.o \
            signal.o sys.o kmod.o workqueue.o pid.o \
-           rcupdate.o rcuclassic.o extable.o params.o posix-timers.o \
+           extable.o params.o posix-timers.o \
            kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
-           hrtimer.o rwsem.o latency.o nsproxy.o srcu.o
+           hrtimer.o rwsem.o latency.o nsproxy.o rcupdate.o srcu.o
 
 obj-$(CONFIG_STACKTRACE) += stacktrace.o
 obj-y += time/
@@ -46,6 +46,8 @@ obj-$(CONFIG_DETECT_SOFTLOCKUP) += softl
 obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
+obj-$(CONFIG_CLASSIC_RCU) += rcuclassic.o
+obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o
 obj-$(CONFIG_DEBUG_SYNCHRO_TEST) += synchro-test.o
 obj-$(CONFIG_RELAY) += relay.o
 obj-$(CONFIG_UTS_NS) += utsname.o
diff -puN /dev/null kernel/rcupreempt.c
--- /dev/null   2006-03-26 18:34:52.000000000 +0530
+++ linux-2.6.20-rc3-mm1-rcu-dipankar/kernel/rcupreempt.c       2007-01-15 
15:36:51.000000000 +0530
@@ -0,0 +1,595 @@
+/*
+ * Read-Copy Update mechanism for mutual exclusion, realtime implementation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright IBM Corporation, 2006
+ *
+ * Authors: Paul E. McKenney <[EMAIL PROTECTED]>
+ *             With thanks to Esben Nielsen, Bill Huey, and Ingo Molnar
+ *             for pushing me away from locks and towards counters, and
+ *             to Suparna Bhattacharya for pushing me completely away
+ *             from atomic instructions on the read side.
+ *
+ * Papers:  http://www.rdrop.com/users/paulmck/RCU
+ *
+ * For detailed explanation of Read-Copy Update mechanism see -
+ *             Documentation/RCU/ *.txt
+ *
+ */
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/rcupdate.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <asm/atomic.h>
+#include <linux/bitops.h>
+#include <linux/module.h>
+#include <linux/completion.h>
+#include <linux/moduleparam.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/rcupdate.h>
+#include <linux/cpu.h>
+#include <linux/random.h>
+#include <linux/delay.h>
+#include <linux/byteorder/swabb.h>
+#include <linux/cpumask.h>
+
+/*
+ * PREEMPT_RCU data structures.
+ */
+
+struct rcu_data {
+       spinlock_t      lock;
+       long            completed;      /* Number of last completed batch. */
+       struct rcu_head *nextlist;
+       struct rcu_head **nexttail;
+       struct rcu_head *waitlist;
+       struct rcu_head **waittail;
+       struct rcu_head *donelist;
+       struct rcu_head **donetail;
+};
+struct rcu_ctrlblk {
+       spinlock_t      fliplock;
+       long            completed;      /* Number of last completed batch. */
+};
+static struct rcu_data rcu_data;
+static struct rcu_ctrlblk rcu_ctrlblk = {
+       .fliplock = SPIN_LOCK_UNLOCKED,
+       .completed = 0,
+};
+static DEFINE_PER_CPU(int [2], rcu_flipctr) = { 0, 0 };
+
+/*
+ * States for rcu_try_flip() and friends.
+ */
+enum rcu_try_flip_state {
+       RCU_TRY_FLIP_IDLE,      /* "I" */
+       RCU_TRY_FLIP_GP,        /* "G" */
+       RCU_TRY_FLIP_WAITACK,   /* "A" */
+       RCU_TRY_FLIP_WAITZERO,  /* "Z" */
+       RCU_TRY_FLIP_WAITMB     /* "M" */
+};
+static enum rcu_try_flip_state rcu_try_flip_state = RCU_TRY_FLIP_IDLE;
+
+/*
+ * Enum and per-CPU flag to determine when each CPU has seen
+ * the most recent counter flip.
+ */
+enum rcu_flip_flag_value {
+       RCU_FLIP_SEEN,          /* Steady/initial state, last flip seen. */
+                               /* Only GP detector can update. */
+       RCU_FLIPPED             /* Flip just completed, need confirmation. */
+                               /* Only corresponding CPU can update. */
+};
+static DEFINE_PER_CPU(enum rcu_flip_flag_value, rcu_flip_flag) = RCU_FLIP_SEEN;
+
+/*
+ * Enum and per-CPU flag to determine when each CPU has executed the
+ * needed memory barrier to fence in memory references from its last RCU
+ * read-side critical section in the just-completed grace period.
+ */
+enum rcu_mb_flag_value {
+       RCU_MB_DONE,            /* Steady/initial state, no mb()s required. */
+                               /* Only GP detector can update. */
+       RCU_MB_NEEDED           /* Flip just completed, need an mb(). */
+                               /* Only corresponding CPU can update. */
+};
+static DEFINE_PER_CPU(enum rcu_mb_flag_value, rcu_mb_flag) = RCU_MB_DONE;
+
+/*
+ * Return the number of RCU batches processed thus far.  Useful
+ * for debug and statistics.
+ */
+long rcu_batches_completed(void)
+{
+       return rcu_ctrlblk.completed;
+}
+
+long rcu_batches_completed_bh(void)
+{
+       return rcu_ctrlblk.completed;
+}
+
+void __rcu_read_lock(void)
+{
+       int idx;
+       int nesting;
+
+       nesting = current->rcu_read_lock_nesting;
+       if (nesting != 0) {
+
+               /* An earlier rcu_read_lock() covers us, just count this one. */
+               current->rcu_read_lock_nesting = nesting + 1;
+
+       } else {
+               unsigned long oldirq;
+
+               /*
+                * Disable local interrupts to prevent the grace-period
+                * detection state machine from seeing us half-done.
+                */
+               local_irq_save(oldirq);
+
+               /*
+                * Outermost nesting of rcu_read_lock(), so atomically
+                * increment the current counter for the current CPU.
+                */
+               idx = rcu_ctrlblk.completed & 0x1;
+               smp_read_barrier_depends();
+               barrier();
+               __get_cpu_var(rcu_flipctr)[idx]++;
+               barrier();
+
+               /*
+                * Now that the per-CPU counter has been incremented, we
+                * are protected.  We can therefore safely increment
+                * the nesting counter, relieving further NMIs of the
+                * need to do so.
+                */
+               current->rcu_read_lock_nesting = nesting + 1;
+               barrier();
+
+               /*
+                * Now that we have prevented any NMIs from storing
+                * to the ->rcu_flipctr_idx, we can safely use it to
+                * remember which counter to decrement in the matching
+                * rcu_read_unlock().
+                */
+               current->rcu_flipctr_idx = idx;
+               local_irq_restore(oldirq);
+       }
+}
+
+void __rcu_read_unlock(void)
+{
+       int idx;
+       int nesting;
+
+       nesting = current->rcu_read_lock_nesting;
+       if (nesting > 1) {
+               /*
+                * We are still protected by an enclosing rcu_read_lock(),
+                * so simply decrement the counter.
+                */
+               current->rcu_read_lock_nesting = nesting - 1;
+
+       } else {
+               unsigned long oldirq;
+
+               /*
+                * Disable local interrupts to prevent the grace-period
+                * detection state machine from seeing us half-done.
+                */
+               local_irq_save(oldirq);
+
+               /*
+                * Outermost nesting of rcu_read_unlock(), so we must
+                * decrement the current counter for the current CPU.
+                * This must be done carefully, because NMIs can
+                * occur at any point in this code, and any rcu_read_lock()
+                * and rcu_read_unlock() pairs in the NMI handlers
+                * must interact non-destructively with this code.
+                * Lots of barrier() calls, and -very- careful ordering.
+                *
+                * Changes to this code, including this one, must be
+                * inspected, validated, and tested extremely carefully!!!
+                */
+
+               /* 
+                * First, pick up the index.  Enforce ordering for
+                * both compilers and for DEC Alpha.
+                */
+               idx = current->rcu_flipctr_idx;
+               smp_read_barrier_depends();
+               barrier();
+
+               /*
+                * It is now safe to decrement the task's nesting count.
+                * NMIs that occur after this statement will route
+                * their rcu_read_lock() calls through this "else" clause
+                * of this "if" statement, and thus will start incrementing
+                * the per-CPU counter on their own.  Enforce ordering for
+                * compilers.
+                */
+               current->rcu_read_lock_nesting = nesting - 1;
+               barrier();
+
+               /*
+                * Decrement the per-CPU counter. NMI handlers
+                * might increment it as well, but they had better
+                * properly nest their rcu_read_lock()/rcu_read_unlock()
+                * pairs so that the value is restored before the handler
+                * returns to us.
+                */
+               __get_cpu_var(rcu_flipctr)[idx]--;
+               local_irq_restore(oldirq);
+       }
+}
+
+static void __rcu_advance_callbacks(void)
+{
+       if ((rcu_data.completed >> 1) != (rcu_ctrlblk.completed >> 1)) {
+               if (rcu_data.waitlist != NULL) {
+                       *rcu_data.donetail = rcu_data.waitlist;
+                       rcu_data.donetail = rcu_data.waittail;
+               }
+               if (rcu_data.nextlist != NULL) {
+                       rcu_data.waitlist = rcu_data.nextlist;
+                       rcu_data.waittail = rcu_data.nexttail;
+                       rcu_data.nextlist = NULL;
+                       rcu_data.nexttail = &rcu_data.nextlist;
+               } else {
+                       rcu_data.waitlist = NULL;
+                       rcu_data.waittail = &rcu_data.waitlist;
+               }
+               rcu_data.completed = rcu_ctrlblk.completed;
+       } else if (rcu_data.completed != rcu_ctrlblk.completed)
+               rcu_data.completed = rcu_ctrlblk.completed;
+}
+
+/*
+ * Get here when RCU is idle.  Decide whether we need to
+ * move out of idle state, and return zero if so.
+ * "Straightforward" approach for the moment, might later
+ * use callback-list lengths, grace-period duration, or
+ * some such to determine when to exit idle state.
+ * Might also need a pre-idle test that does not acquire
+ * the lock, but let's get the simple case working first...
+ */
+static int rcu_try_flip_idle(int flipctr)
+{
+       if (!rcu_pending(smp_processor_id()))
+               return 1;
+       return 0;
+}
+
+/*
+ * Flip processing up to and including the flip, as well as
+ * telling CPUs to acknowledge the flip.
+ */
+static int rcu_try_flip_in_gp(int flipctr)
+{
+       int cpu;
+
+       /*
+        * Do the flip.
+        */
+       rcu_ctrlblk.completed++;  /* stands in for rcu_try_flip_g2 */
+
+       /*
+        * Need a memory barrier so that other CPUs see the new
+        * counter value before they see the subsequent change of all
+        * the rcu_flip_flag instances to RCU_FLIPPED.
+        */
+       smp_mb();
+
+       /* Now ask each CPU for acknowledgement of the flip. */
+
+       for_each_possible_cpu(cpu)
+               per_cpu(rcu_flip_flag, cpu) = RCU_FLIPPED;
+
+       return 0;
+}
+
+/*
+ * Wait for CPUs to acknowledge the flip.
+ */
+static int rcu_try_flip_waitack(int flipctr)
+{
+       int cpu;
+
+       for_each_possible_cpu(cpu)
+               if (per_cpu(rcu_flip_flag, cpu) != RCU_FLIP_SEEN) 
+                       return 1;
+
+       /*
+        * Make sure our checks above don't bleed into subsequent
+        * waiting for the sum of the counters to reach zero.
+        */
+       smp_mb();
+       return 0;
+}
+
+/*
+ * Wait for collective ``last'' counter to reach zero,
+ * then tell all CPUs to do an end-of-grace-period memory barrier.
+ */
+static int rcu_try_flip_waitzero(int flipctr)
+{
+       int cpu;
+       int lastidx = !(flipctr & 0x1);
+       int sum = 0;
+
+       /* Check to see if the sum of the "last" counters is zero. */
+
+       for_each_possible_cpu(cpu)
+               sum += per_cpu(rcu_flipctr, cpu)[lastidx];
+       if (sum != 0)
+               return 1;
+
+       /* Make sure we don't call for memory barriers before we see zero. */
+       smp_mb();
+
+       /* Call for a memory barrier from each CPU. */
+       for_each_possible_cpu(cpu)
+               per_cpu(rcu_mb_flag, cpu) = RCU_MB_NEEDED;
+
+       return 0;
+}
+
+/*
+ * Wait for all CPUs to do their end-of-grace-period memory barrier.
+ * Return 0 once all CPUs have done so.
+ */
+static int rcu_try_flip_waitmb(int flipctr)
+{
+       int cpu;
+
+       for_each_possible_cpu(cpu)
+               if (per_cpu(rcu_mb_flag, cpu) != RCU_MB_DONE)
+                       return 1;
+
+       smp_mb(); /* Ensure that the above checks precede any following flip. */
+       return 0;
+}
+
+/*
+ * Attempt a single flip of the counters.  Remember, a single flip does
+ * -not- constitute a grace period.  Instead, the interval between
+ * at least three consecutive flips is a grace period.
+ *
+ * If anyone is nuts enough to run this CONFIG_PREEMPT_RCU implementation
+ * on a large SMP, they might want to use a hierarchical organization of
+ * the per-CPU-counter pairs.
+ */
+static void rcu_try_flip(void)
+{
+       long flipctr;
+       unsigned long oldirq;
+
+       if (unlikely(!spin_trylock_irqsave(&rcu_ctrlblk.fliplock, oldirq))) 
+               return;
+
+       /*
+        * Take the next transition(s) through the RCU grace-period
+        * flip-counter state machine.
+        */
+       flipctr = rcu_ctrlblk.completed;
+       switch (rcu_try_flip_state) {
+       case RCU_TRY_FLIP_IDLE:
+               if (rcu_try_flip_idle(flipctr))
+                       break;
+               rcu_try_flip_state = RCU_TRY_FLIP_GP;
+       case RCU_TRY_FLIP_GP:
+               if (rcu_try_flip_in_gp(flipctr))
+                       break;
+               rcu_try_flip_state = RCU_TRY_FLIP_WAITACK;
+       case RCU_TRY_FLIP_WAITACK:
+               if (rcu_try_flip_waitack(flipctr))
+                       break;
+               rcu_try_flip_state = RCU_TRY_FLIP_WAITZERO;
+       case RCU_TRY_FLIP_WAITZERO:
+               if (rcu_try_flip_waitzero(flipctr))
+                       break;
+               rcu_try_flip_state = RCU_TRY_FLIP_WAITMB;
+       case RCU_TRY_FLIP_WAITMB:
+               if (rcu_try_flip_waitmb(flipctr))
+                       break;
+               rcu_try_flip_state = RCU_TRY_FLIP_IDLE;
+       }
+       spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, oldirq);
+}
+
+/*
+ * Check to see if this CPU needs to report that it has seen the most
+ * recent counter flip, thereby declaring that all subsequent
+ * rcu_read_lock() invocations will respect this flip.
+ */
+static void rcu_check_flipseen(int cpu)
+{
+       if (per_cpu(rcu_flip_flag, cpu) == RCU_FLIPPED) {
+               smp_mb();  /* Subsequent counter acccesses must see new value */
+               per_cpu(rcu_flip_flag, cpu) = RCU_FLIP_SEEN;
+               smp_mb();  /* probably be implied by interrupt, but... */
+       }
+}
+
+/*
+ * Check to see if this CPU needs to do a memory barrier in order to
+ * ensure that any prior RCU read-side critical sections have committed
+ * their counter manipulations and critical-section memory references
+ * before declaring the grace period to be completed.
+ */
+static void rcu_check_mb(int cpu)
+{
+       if (per_cpu(rcu_mb_flag, cpu) == RCU_MB_NEEDED) {
+               smp_mb();
+               per_cpu(rcu_mb_flag, cpu) = RCU_MB_DONE;
+       }
+}
+
+/*
+ * This function is periodically called from hardware-irq context on
+ * each CPU.
+ */
+void rcu_check_callbacks(int cpu, int user)
+{
+       unsigned long oldirq;
+
+       rcu_check_flipseen(cpu);
+       rcu_check_mb(cpu);
+       if (rcu_ctrlblk.completed == rcu_data.completed) {
+               rcu_try_flip();
+               if (rcu_ctrlblk.completed == rcu_data.completed) {
+                       return;
+               }
+       }
+       spin_lock_irqsave(&rcu_data.lock, oldirq);
+       __rcu_advance_callbacks();
+       if (rcu_data.donelist == NULL)
+               spin_unlock_irqrestore(&rcu_data.lock, oldirq);
+       else {
+               spin_unlock_irqrestore(&rcu_data.lock, oldirq);
+               raise_softirq(RCU_SOFTIRQ);
+       }
+}
+
+/*
+ * Check to see if any future RCU-related work will need to be done
+ * by the current CPU, even if none need be done immediately, returning
+ * 1 if so.  This function is part of the RCU implementation; it is -not-
+ * an exported member of the RCU API.
+ */
+int rcu_needs_cpu(int cpu)
+{
+       return rcu_pending(cpu);
+}
+
+/*
+ * Needed by dynticks, to make sure all RCU processing has finished
+ * when we go idle:
+ */
+void rcu_advance_callbacks(int cpu, int user)
+{
+       unsigned long oldirq;
+
+       if (rcu_ctrlblk.completed == rcu_data.completed) {
+               rcu_try_flip();
+               if (rcu_ctrlblk.completed == rcu_data.completed) {
+                       return;
+               }
+       }
+       spin_lock_irqsave(&rcu_data.lock, oldirq);
+       __rcu_advance_callbacks();
+       spin_unlock_irqrestore(&rcu_data.lock, oldirq);
+}
+
+static void rcu_process_callbacks(struct softirq_action *unused)
+{
+       unsigned long flags;
+       struct rcu_head *next, *list;
+
+       spin_lock_irqsave(&rcu_data.lock, flags);
+       list = rcu_data.donelist;
+       if (list == NULL) {
+               spin_unlock_irqrestore(&rcu_data.lock, flags);
+               return;
+       }
+       rcu_data.donelist = NULL;
+       rcu_data.donetail = &rcu_data.donelist;
+       spin_unlock_irqrestore(&rcu_data.lock, flags);
+       while (list) {
+               next = list->next;
+               list->func(list);
+               list = next;
+       }
+}
+
+void fastcall call_rcu(struct rcu_head *head,
+                               void (*func)(struct rcu_head *rcu))
+{
+       unsigned long flags;
+
+       head->func = func;
+       head->next = NULL;
+       spin_lock_irqsave(&rcu_data.lock, flags);
+       __rcu_advance_callbacks();
+       *rcu_data.nexttail = head;
+       rcu_data.nexttail = &head->next;
+       spin_unlock_irqrestore(&rcu_data.lock, flags);
+}
+
+/*
+ * Wait until all currently running preempt_disable() code segments
+ * (including hardware-irq-disable segments) complete.  Note that
+ * in -rt this does -not- necessarily result in all currently executing
+ * interrupt -handlers- having completed.
+ */
+void __synchronize_sched(void)
+{
+       cpumask_t oldmask;
+       int cpu;
+
+       if (sched_getaffinity(0, &oldmask) < 0) {
+               oldmask = cpu_possible_map;
+       }
+       for_each_online_cpu(cpu) {
+               sched_setaffinity(0, cpumask_of_cpu(cpu));
+               schedule();
+       }
+       sched_setaffinity(0, oldmask);
+}
+
+int rcu_pending(int cpu)
+{
+       return (rcu_data.donelist != NULL ||
+               rcu_data.waitlist != NULL ||
+               rcu_data.nextlist != NULL);
+}
+
+void __init __rcu_init(void)
+{
+       spin_lock_init(&rcu_data.lock);
+       rcu_data.completed = 0;
+       rcu_data.nextlist = NULL;
+       rcu_data.nexttail = &rcu_data.nextlist;
+       rcu_data.waitlist = NULL;
+       rcu_data.waittail = &rcu_data.waitlist;
+       rcu_data.donelist = NULL;
+       rcu_data.donetail = &rcu_data.donelist;
+       open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL);
+}
+
+/*
+ * Deprecated, use synchronize_rcu() or synchronize_sched() instead.
+ */
+void synchronize_kernel(void)
+{
+       synchronize_rcu();
+}
+
+
+EXPORT_SYMBOL_GPL(call_rcu);
+EXPORT_SYMBOL_GPL(rcu_batches_completed);
+EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
+EXPORT_SYMBOL_GPL(__synchronize_sched);
+EXPORT_SYMBOL_GPL(__rcu_read_lock);
+EXPORT_SYMBOL_GPL(__rcu_read_unlock);
+

_
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [mm PATCH 4/6] RCU: preemptible RCU

Reply via email to