[PATCH v5 2/2] idle: add support for tasks that inject idle

2016-11-28 Thread Jacob Pan
From: Peter Zijlstra <peterz@infradead.org>

Idle injection drivers such as Intel powerclamp and ACPI PAD drivers use
realtime tasks to take control of CPU then inject idle. There are two
issues with this approach:

 1. Low efficiency: injected idle task is treated as busy so sched ticks
do not stop during injected idle period, the result of these
unwanted wakeups can be ~20% loss in power savings.

 2. Idle accounting: injected idle time is presented to user as busy.

This patch addresses the issues by introducing a new PF_IDLE flag which
allows any given task to be treated as idle task while the flag is set.
Therefore, idle injection tasks can run through the normal flow of NOHZ
idle enter/exit to get the correct accounting as well as tick stop when
possible.

The implication is that idle task is then no longer limited to PID == 0.

Acked-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
---
 include/linux/cpu.h   |   2 +
 include/linux/sched.h |   3 +-
 kernel/fork.c |   2 +-
 kernel/sched/core.c   |   1 +
 kernel/sched/idle.c   | 162 +++---
 5 files changed, 107 insertions(+), 63 deletions(-)

diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index b886dc1..ac0efae 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -245,6 +245,8 @@ static inline void enable_nonboot_cpus(void) {}
 int cpu_report_state(int cpu);
 int cpu_check_up_prepare(int cpu);
 void cpu_set_state_online(int cpu);
+void play_idle(unsigned long duration_ms);
+
 #ifdef CONFIG_HOTPLUG_CPU
 bool cpu_wait_death(unsigned int cpu, int seconds);
 bool cpu_report_death(void);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e9c009d..a3d338e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2254,6 +2254,7 @@ static inline cputime_t task_gtime(struct task_struct *t)
 /*
  * Per process flags
  */
+#define PF_IDLE 0x0002  /* I am an IDLE thread */
 #define PF_EXITING 0x0004  /* getting shut down */
 #define PF_EXITPIDONE  0x0008  /* pi exit done on shut down */
 #define PF_VCPU0x0010  /* I'm a virtual CPU */
@@ -2611,7 +2612,7 @@ extern int sched_setattr(struct task_struct *,
  */
 static inline bool is_idle_task(const struct task_struct *p)
 {
-   return p->pid == 0;
+   return !!(p->flags & PF_IDLE);
 }
 extern struct task_struct *curr_task(int cpu);
 extern void ia64_set_curr_task(int cpu, struct task_struct *p);
diff --git a/kernel/fork.c b/kernel/fork.c
index 997ac1d..a8eb821 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1540,7 +1540,7 @@ static __latent_entropy struct task_struct *copy_process(
goto bad_fork_cleanup_count;
 
delayacct_tsk_init(p);  /* Must remain after dup_task_struct() */
-   p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER);
+   p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER | PF_IDLE);
p->flags |= PF_FORKNOEXEC;
	INIT_LIST_HEAD(&p->children);
	INIT_LIST_HEAD(&p->sibling);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 154fd68..c95fbcd 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5279,6 +5279,7 @@ void init_idle(struct task_struct *idle, int cpu)
__sched_fork(0, idle);
idle->state = TASK_RUNNING;
idle->se.exec_start = sched_clock();
+   idle->flags |= PF_IDLE;
 
kasan_unpoison_task_stack(idle);
 
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 513e4df..6a4bae0 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -205,76 +205,65 @@ static void cpuidle_idle_call(void)
  *
  * Called with polling cleared.
  */
-static void cpu_idle_loop(void)
+static void do_idle(void)
 {
-   int cpu = smp_processor_id();
-
-   while (1) {
-   /*
-* If the arch has a polling bit, we maintain an invariant:
-*
-* Our polling bit is clear if we're not scheduled (i.e. if
-* rq->curr != rq->idle).  This means that, if rq->idle has
-* the polling bit set, then setting need_resched is
-* guaranteed to cause the cpu to reschedule.
-*/
-
-   __current_set_polling();
-   quiet_vmstat();
-   tick_nohz_idle_enter();
+   /*
+* If the arch has a polling bit, we maintain an invariant:
+*
+* Our polling bit is clear if we're not scheduled (i.e. if rq->curr !=
+* rq->idle). This means that, if rq->idle has the polling bit set,
+* then setting need_resched is guaranteed to cause the CPU to
+* reschedule.
+*/
 
-   while (!need_resched()) {
-   check_pgt_cache();
-   rmb();
+   __current_set_polling();
+   tick_nohz_idle_enter();
 
-   if (cpu_is_offline(cpu)) {

[PATCH v5 2/2] idle: add support for tasks that inject idle

2016-11-28 Thread Jacob Pan
From: Peter Zijlstra <peterz@infradead.org>

Idle injection drivers such as Intel powerclamp and ACPI PAD drivers use
realtime tasks to take control of CPU then inject idle. There are two
issues with this approach:

 1. Low efficiency: injected idle task is treated as busy so sched ticks
do not stop during injected idle period, the result of these
unwanted wakeups can be ~20% loss in power savings.

 2. Idle accounting: injected idle time is presented to user as busy.

This patch addresses the issues by introducing a new PF_IDLE flag which
allows any given task to be treated as idle task while the flag is set.
Therefore, idle injection tasks can run through the normal flow of NOHZ
idle enter/exit to get the correct accounting as well as tick stop when
possible.

The implication is that idle task is then no longer limited to PID == 0.

Acked-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
---
 include/linux/cpu.h   |   2 +
 include/linux/sched.h |   3 +-
 kernel/fork.c |   2 +-
 kernel/sched/core.c   |   1 +
 kernel/sched/idle.c   | 162 +++---
 5 files changed, 107 insertions(+), 63 deletions(-)

diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index b886dc1..ac0efae 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -245,6 +245,8 @@ static inline void enable_nonboot_cpus(void) {}
 int cpu_report_state(int cpu);
 int cpu_check_up_prepare(int cpu);
 void cpu_set_state_online(int cpu);
+void play_idle(unsigned long duration_ms);
+
 #ifdef CONFIG_HOTPLUG_CPU
 bool cpu_wait_death(unsigned int cpu, int seconds);
 bool cpu_report_death(void);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e9c009d..a3d338e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2254,6 +2254,7 @@ static inline cputime_t task_gtime(struct task_struct *t)
 /*
  * Per process flags
  */
+#define PF_IDLE 0x0002  /* I am an IDLE thread */
 #define PF_EXITING 0x0004  /* getting shut down */
 #define PF_EXITPIDONE  0x0008  /* pi exit done on shut down */
 #define PF_VCPU0x0010  /* I'm a virtual CPU */
@@ -2611,7 +2612,7 @@ extern int sched_setattr(struct task_struct *,
  */
 static inline bool is_idle_task(const struct task_struct *p)
 {
-   return p->pid == 0;
+   return !!(p->flags & PF_IDLE);
 }
 extern struct task_struct *curr_task(int cpu);
 extern void ia64_set_curr_task(int cpu, struct task_struct *p);
diff --git a/kernel/fork.c b/kernel/fork.c
index 997ac1d..a8eb821 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1540,7 +1540,7 @@ static __latent_entropy struct task_struct *copy_process(
goto bad_fork_cleanup_count;
 
delayacct_tsk_init(p);  /* Must remain after dup_task_struct() */
-   p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER);
+   p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER | PF_IDLE);
p->flags |= PF_FORKNOEXEC;
	INIT_LIST_HEAD(&p->children);
	INIT_LIST_HEAD(&p->sibling);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 154fd68..c95fbcd 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5279,6 +5279,7 @@ void init_idle(struct task_struct *idle, int cpu)
__sched_fork(0, idle);
idle->state = TASK_RUNNING;
idle->se.exec_start = sched_clock();
+   idle->flags |= PF_IDLE;
 
kasan_unpoison_task_stack(idle);
 
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 513e4df..6a4bae0 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -205,76 +205,65 @@ static void cpuidle_idle_call(void)
  *
  * Called with polling cleared.
  */
-static void cpu_idle_loop(void)
+static void do_idle(void)
 {
-   int cpu = smp_processor_id();
-
-   while (1) {
-   /*
-* If the arch has a polling bit, we maintain an invariant:
-*
-* Our polling bit is clear if we're not scheduled (i.e. if
-* rq->curr != rq->idle).  This means that, if rq->idle has
-* the polling bit set, then setting need_resched is
-* guaranteed to cause the cpu to reschedule.
-*/
-
-   __current_set_polling();
-   quiet_vmstat();
-   tick_nohz_idle_enter();
+   /*
+* If the arch has a polling bit, we maintain an invariant:
+*
+* Our polling bit is clear if we're not scheduled (i.e. if rq->curr !=
+* rq->idle). This means that, if rq->idle has the polling bit set,
+* then setting need_resched is guaranteed to cause the CPU to
+* reschedule.
+*/
 
-   while (!need_resched()) {
-   check_pgt_cache();
-   rmb();
+   __current_set_polling();
+   tick_nohz_idle_enter();
 
-   if (cpu_is_offline(cpu)) {
-   cpuhp_report_idle_dead();
-