Below is the original message from Andrey Smirnov.

I would like to address the issue and submit the patch for it.

Please find the patch attached to this mail.

The issue is mentioned below. It is probably a race in the scheduling of the notifier chain.
Built and tested on a Udoo board with the RT patch applied.

-Anand Moon

----------------------------------------------------------------------------

Hello everyone,

I am working on integrating PREEMPT-RT patches into 3.10.17 kernel BSP
release from Freescale which can be found at:

http://git.freescale.com/git/cgit.cgi/imx/linux-2.6-imx.git (tag
imx_3.10.17_1.0.0_ga)

and what I am finding is that if I select the "interactive" cpufreq
governor, I get a kernel that occasionally spouts this:

BUG: scheduling while atomic: swapper/3/0/0x00000002
Modules linked in:
Preemption disabled at:[<  (null)>]   (null)
CPU: 3 PID: 0 Comm: swapper/3 Not tainted 3.10.17-rt12-80705-g232293e-dirty #3
[<800139c4>] (unwind_backtrace+0x0/0xf8) from [<80011420>]
(show_stack+0x10/0x14)
[<80011420>] (show_stack+0x10/0x14) from [<805bfbbc>] (__schedule_bug+0x78/0x9c)
[<805bfbbc>] (__schedule_bug+0x78/0x9c) from [<805c431c>]
(__schedule+0x398/0x49c)
[<805c431c>] (__schedule+0x398/0x49c) from [<805c44d0>] (schedule+0x34/0xa0)
[<805c44d0>] (schedule+0x34/0xa0) from [<805c5250>]
(rt_spin_lock_slowlock+0xc0/0x258)
[<805c5250>] (rt_spin_lock_slowlock+0xc0/0x258) from [<80031d44>]
(lock_timer_base+0x2c/0x4c)
[<80031d44>] (lock_timer_base+0x2c/0x4c) from [<80032024>]
(mod_timer+0x60/0x1c0)
[<80032024>] (mod_timer+0x60/0x1c0) from [<803fe860>]
(cpufreq_interactive_idle_notifier+0xa4/0x13c)
[<803fe860>] (cpufreq_interactive_idle_notifier+0xa4/0x13c) from
[<80048444>] (notifier_call_chain+0)
[<80048444>] (notifier_call_chain+0x44/0x84) from [<80048754>]
(__atomic_notifier_call_chain+0x38/0x)
[<80048754>] (__atomic_notifier_call_chain+0x38/0x4c) from
[<80048780>] (atomic_notifier_call_chain+)
[<80048780>] (atomic_notifier_call_chain+0x18/0x20) from [<80058df8>]
(cpu_startup_entry+0x68/0x1a4)
[....] [<80058df8>] (cpu_startup_entry+0x68/0x1a4) from [<105bc204>]
(0x105bc204)

and eventually crashes. After doing some digging I believe the
sequence of events leading to this is the following:

secondary_start_kernel() calls preempt_disable(), then
cpu_startup_entry(CPUHP_ONLINE), which results in a call to
cpufreq_interactive_idle_notifier(), which in turn tries to use
mod_timer(). mod_timer() internally tries to acquire a spinlock, but
with the RT patches applied this unfolds into an rt_mutex; the
attempt to acquire it results in a call to schedule(), and that is
when we see the backtrace.

Eventually I am hoping to disable any sort of frequency scaling or
power management on our system, but I am still curious to know whether
this is a known issue for which patches exist. Does anyone have any leads/suggestions?

Thank you,
Andrey Smirnov
diff --git a/drivers/cpufreq/cpufreq_interactive.c 
b/drivers/cpufreq/cpufreq_interactive.c
index 9a6f64f..64f2784 100644
--- a/drivers/cpufreq/cpufreq_interactive.c
+++ b/drivers/cpufreq/cpufreq_interactive.c
@@ -17,6 +17,7 @@
  *
  */
 
+#include <linux/module.h>
 #include <linux/cpu.h>
 #include <linux/cpumask.h>
 #include <linux/cpufreq.h>
@@ -29,7 +30,6 @@
 #include <linux/kthread.h>
 #include <linux/mutex.h>
 #include <linux/kernel_stat.h>
-#include <linux/module.h>
 #include <asm/cputime.h>
 
 static atomic_t active_count = ATOMIC_INIT(0);
@@ -61,6 +61,8 @@ static cpumask_t down_cpumask;
 static spinlock_t down_cpumask_lock;
 static struct mutex set_speed_lock;
 
+#define MAX_RT_PRIO 100
+
 /* Hi speed to bump to from lo speed when load burst (default max) */
 static u64 hispeed_freq;
 
@@ -77,9 +79,7 @@ static unsigned long min_sample_time;
 /*
  * The sample rate of the timer used to increase frequency
  */
-#define DEFAULT_TIMER_RATE (50 * USEC_PER_MSEC)
-#define CPUFREQ_IRQ_LEN 60
-#define CPUFREQ_NOTE_LEN 120
+#define DEFAULT_TIMER_RATE (20 * USEC_PER_MSEC)
 static unsigned long timer_rate;
 
 static int cpufreq_governor_interactive(struct cpufreq_policy *policy,
@@ -133,8 +133,8 @@ static void cpufreq_interactive_timer(unsigned long data)
        if (!idle_exit_time)
                goto exit;
 
-       delta_idle = (unsigned int)(now_idle - time_in_idle);
-       delta_time = (unsigned int)(pcpu->timer_run_time - idle_exit_time);
+       delta_idle = (unsigned int) (now_idle - time_in_idle);
+       delta_time = (unsigned int) (pcpu->timer_run_time - idle_exit_time);
 
        /*
         * If timer ran less than 1ms after short-term sample started, retry.
@@ -147,9 +147,8 @@ static void cpufreq_interactive_timer(unsigned long data)
        else
                cpu_load = 100 * (delta_time - delta_idle) / delta_time;
 
-       delta_idle = (unsigned int)(now_idle - pcpu->freq_change_time_in_idle);
-       delta_time = (unsigned int)(pcpu->timer_run_time -
-               pcpu->freq_change_time);
+       delta_idle = (unsigned int) (now_idle - pcpu->freq_change_time_in_idle);
+       delta_time = (unsigned int) (pcpu->timer_run_time - 
pcpu->freq_change_time);
 
        if ((delta_time == 0) || (delta_idle > delta_time))
                load_since_change = 0;
@@ -250,10 +249,11 @@ static void cpufreq_interactive_idle_start(void)
                &per_cpu(cpuinfo, smp_processor_id());
        int pending;
 
-       pcpu->idling = 1;
-       smp_wmb();
        if (!pcpu->governor_enabled)
                return;
+
+       pcpu->idling = 1;
+       smp_wmb();
        pending = timer_pending(&pcpu->cpu_timer);
 
        if (pcpu->target_freq != pcpu->policy->min) {
@@ -619,10 +619,10 @@ static int cpufreq_interactive_idle_notifier(struct 
notifier_block *nb,
                                             void *data)
 {
        switch (val) {
-       case IDLE_START:
+       case SCHED_IDLE_START:
                cpufreq_interactive_idle_start();
                break;
-       case IDLE_END:
+       case SCHED_IDLE_END:
                cpufreq_interactive_idle_end();
                break;
        }
@@ -638,7 +638,7 @@ static int __init cpufreq_interactive_init(void)
 {
        unsigned int i;
        struct cpufreq_interactive_cpuinfo *pcpu;
-       struct sched_param param = { .sched_priority = 99 };
+       struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
 
        go_hispeed_load = DEFAULT_GO_HISPEED_LOAD;
        min_sample_time = DEFAULT_MIN_SAMPLE_TIME;
@@ -674,7 +674,7 @@ static int __init cpufreq_interactive_init(void)
        spin_lock_init(&down_cpumask_lock);
        mutex_init(&set_speed_lock);
 
-       idle_notifier_register(&cpufreq_interactive_idle_nb);
+       sched_idle_notifier_register(&cpufreq_interactive_idle_nb);
 
        return cpufreq_register_governor(&cpufreq_gov_interactive);
 
@@ -684,7 +684,7 @@ err_freeuptask:
 }
 
 #ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_INTERACTIVE
-late_initcall(cpufreq_interactive_init);
+fs_initcall(cpufreq_interactive_init);
 #else
 module_init(cpufreq_interactive_init);
 #endif
@@ -699,7 +699,7 @@ static void __exit cpufreq_interactive_exit(void)
 
 module_exit(cpufreq_interactive_exit);
 
-MODULE_AUTHOR("Mike Chan <[email protected]>");
+MODULE_AUTHOR("Mike Chan <mike at android.com>");
 MODULE_DESCRIPTION("'cpufreq_interactive' - A cpufreq governor for "
        "Latency sensitive workloads");
 MODULE_LICENSE("GPL");
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ecb2f9b..c247353 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1898,6 +1898,16 @@ extern void sched_clock_idle_sleep_event(void);
 extern void sched_clock_idle_wakeup_event(u64 delta_ns);
 #endif
 
+#define SCHED_IDLE_START   1
+#define SCHED_IDLE_END     2
+extern void sched_idle_notifier_register(struct notifier_block *nb);
+extern void sched_idle_notifier_unregister(struct notifier_block *nb);
+extern void sched_idle_notifier_call_chain(unsigned long val);
+extern void sched_idle_enter_condrcu(bool idle_uses_rcu);
+extern void sched_idle_exit_condrcu(bool idle_uses_rcu);
+static inline void sched_idle_enter(void) { sched_idle_enter_condrcu(0); }
+static inline void sched_idle_exit(void) { sched_idle_exit_condrcu(0); }
+
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
 /*
  * An i/f to runtime opt-in for irq time accounting based off of sched_clock.
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a060a09..2df783e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1828,6 +1828,44 @@ void wake_up_new_task(struct task_struct *p)
        task_rq_unlock(rq, p, &flags);
 }
 
+static ATOMIC_NOTIFIER_HEAD(sched_idle_notifier);
+
+void sched_idle_notifier_register(struct notifier_block *nb)
+{
+   atomic_notifier_chain_register(&sched_idle_notifier, nb);
+}
+EXPORT_SYMBOL_GPL(sched_idle_notifier_register);
+
+void sched_idle_notifier_unregister(struct notifier_block *nb)
+{
+   atomic_notifier_chain_unregister(&sched_idle_notifier, nb);
+}
+EXPORT_SYMBOL_GPL(sched_idle_notifier_unregister);
+
+void sched_idle_notifier_call_chain(unsigned long val)
+{
+   atomic_notifier_call_chain(&sched_idle_notifier, val, NULL);
+}
+EXPORT_SYMBOL_GPL(sched_idle_notifier_call_chain);
+
+void sched_idle_enter_condrcu(bool idle_uses_rcu)
+{
+   tick_nohz_idle_enter();
+   if (!idle_uses_rcu)
+       rcu_idle_enter();
+   sched_idle_notifier_call_chain(SCHED_IDLE_START);
+}
+EXPORT_SYMBOL_GPL(sched_idle_enter_condrcu);
+
+void sched_idle_exit_condrcu(bool idle_uses_rcu)
+{
+   sched_idle_notifier_call_chain(SCHED_IDLE_END);
+   if (!idle_uses_rcu)
+       rcu_idle_exit();
+   tick_nohz_idle_exit();
+}
+EXPORT_SYMBOL_GPL(sched_idle_exit_condrcu);
+
 #ifdef CONFIG_PREEMPT_NOTIFIERS
 
 /**
-- 
_______________________________________________
meta-freescale mailing list
[email protected]
https://lists.yoctoproject.org/listinfo/meta-freescale

Reply via email to