Re: [RFC PATCH v2 2/4] cpuidle: (POWER) cpuidle driver for pSeries

2011-11-28 Thread Deepthi Dharwar
On 11/28/2011 04:33 AM, Benjamin Herrenschmidt wrote:

 On Thu, 2011-11-17 at 16:58 +0530, Deepthi Dharwar wrote:
 This patch implements a backhand cpuidle driver for pSeries
 based on pseries_dedicated_idle_loop and pseries_shared_idle_loop
 routines.  The driver is built only if CONFIG_CPU_IDLE is set. This
 cpuidle driver uses global registration of idle states and
 not per-cpu.
 
  .../...
 
 +#define MAX_IDLE_STATE_COUNT2
 +
 +static int max_cstate = MAX_IDLE_STATE_COUNT - 1;
 
 Why cstate ? This isn't an x86 :-)


Sure, I shall change it right away :)

 
 +static struct cpuidle_device __percpu *pseries_idle_cpuidle_devices;
 
 Shorter name please. pseries_cpuidle_devs is fine.


I ll do so.

 
 +static struct cpuidle_state *cpuidle_state_table;
 +
 +void update_smt_snooze_delay(int snooze)
 +{
 +struct cpuidle_driver *drv = cpuidle_get_driver();
 +if (drv)
 +drv-states[0].target_residency = snooze;
 +}
 +
 +static inline void idle_loop_prolog(unsigned long *in_purr, ktime_t 
 *kt_before)
 +{
 +
 +*kt_before = ktime_get_real();
 +*in_purr = mfspr(SPRN_PURR);
 +/*
 + * Indicate to the HV that we are idle. Now would be
 + * a good time to find other work to dispatch.
 + */
 +get_lppaca()-idle = 1;
 +get_lppaca()-donate_dedicated_cpu = 1;
 +}
 
 I notice that you call this on shared processors as well. The old ocde
 used to not set donate_dedicated_cpu in that case. I assume that's not a
 big deal and that the HV will just ignore it in the shared processor
 case but please add a comment after you've verified it.



Yes, the old code does not set donate_dedicated_cpu. But yes I will
try testing it in a shared proc config but also remove this
initialization for shared idle loop.

 
 +static inline  s64 idle_loop_epilog(unsigned long in_purr, ktime_t 
 kt_before)
 +{
 +get_lppaca()-wait_state_cycles += mfspr(SPRN_PURR) - in_purr;
 +get_lppaca()-donate_dedicated_cpu = 0;
 +get_lppaca()-idle = 0;
 +
 +return ktime_to_us(ktime_sub(ktime_get_real(), kt_before));
 +}
 +
 +
 +static int snooze_loop(struct cpuidle_device *dev,
 +struct cpuidle_driver *drv,
 +int index)
 +{
 +unsigned long in_purr;
 +ktime_t kt_before;
 +
 +idle_loop_prolog(in_purr, kt_before);
 +
 +local_irq_enable();
 +set_thread_flag(TIF_POLLING_NRFLAG);
 +while (!need_resched()) {
 +ppc64_runlatch_off();
 +HMT_low();
 +HMT_very_low();
 +}
 +HMT_medium();
 +clear_thread_flag(TIF_POLLING_NRFLAG);
 +smp_mb();
 +local_irq_disable();
 +
 +dev-last_residency =
 +(int)idle_loop_epilog(in_purr, kt_before);
 +return index;
 +}
 
 So your snooze loop has no timeout, is that handled by the cpuidle
 driver using some kind of timer ? That sounds a lot less efficient than
 passing a max delay to the snooze loop to handle getting into a deeper
 state after a bit of snoozing rather than interrupting etc...


My bad, snooze_loop is essential for a time out. Nope cpuidle
driver doesn't have any timer mechanism. I ll fix it.
Need to add loop for snooze time.

 +static int dedicated_cede_loop(struct cpuidle_device *dev,
 +struct cpuidle_driver *drv,
 +int index)
 +{
 +unsigned long in_purr;
 +ktime_t kt_before;
 +
 +idle_loop_prolog(in_purr, kt_before);
 +
 +ppc64_runlatch_off();
 +HMT_medium();
 +cede_processor();
 +
 +dev-last_residency =
 +(int)idle_loop_epilog(in_purr, kt_before);
 +return index;
 +}
 +
 +static int shared_cede_loop(struct cpuidle_device *dev,
 +struct cpuidle_driver *drv,
 +int index)
 +{
 +unsigned long in_purr;
 +ktime_t kt_before;
 +
 +idle_loop_prolog(in_purr, kt_before);
 +
 +/*
 + * Yield the processor to the hypervisor.  We return if
 + * an external interrupt occurs (which are driven prior
 + * to returning here) or if a prod occurs from another
 + * processor. When returning here, external interrupts
 + * are enabled.
 + */
 +cede_processor();
 +
 +dev-last_residency =
 +(int)idle_loop_epilog(in_purr, kt_before);
 +return index;
 +}
 +
 +/*
 + * States for dedicated partition case.
 + */
 +static struct cpuidle_state dedicated_states[MAX_IDLE_STATE_COUNT] = {
 +{ /* Snooze */
 +.name = snooze,
 +.desc = snooze,
 +.flags = CPUIDLE_FLAG_TIME_VALID,
 +.exit_latency = 0,
 +.target_residency = 0,
 +.enter = snooze_loop },
 +{ /* CEDE */
 +.name = CEDE,
 +.desc = CEDE,
 +.flags = CPUIDLE_FLAG_TIME_VALID,
 +.exit_latency = 1,
 +.target_residency = 10,
 +.enter = dedicated_cede_loop },
 +};
 +
 +/*
 + * States for shared partition case.
 + */
 +static struct cpuidle_state 

Re: [RFC PATCH v2 2/4] cpuidle: (POWER) cpuidle driver for pSeries

2011-11-27 Thread Benjamin Herrenschmidt
On Thu, 2011-11-17 at 16:58 +0530, Deepthi Dharwar wrote:
 This patch implements a backhand cpuidle driver for pSeries
 based on pseries_dedicated_idle_loop and pseries_shared_idle_loop
 routines.  The driver is built only if CONFIG_CPU_IDLE is set. This
 cpuidle driver uses global registration of idle states and
 not per-cpu.

 .../...

 +#define MAX_IDLE_STATE_COUNT 2
 +
 +static int max_cstate = MAX_IDLE_STATE_COUNT - 1;

Why cstate ? This isn't an x86 :-)

 +static struct cpuidle_device __percpu *pseries_idle_cpuidle_devices;

Shorter name please. pseries_cpuidle_devs is fine.

 +static struct cpuidle_state *cpuidle_state_table;
 +
 +void update_smt_snooze_delay(int snooze)
 +{
 + struct cpuidle_driver *drv = cpuidle_get_driver();
 + if (drv)
 + drv-states[0].target_residency = snooze;
 +}
 +
 +static inline void idle_loop_prolog(unsigned long *in_purr, ktime_t 
 *kt_before)
 +{
 +
 + *kt_before = ktime_get_real();
 + *in_purr = mfspr(SPRN_PURR);
 + /*
 +  * Indicate to the HV that we are idle. Now would be
 +  * a good time to find other work to dispatch.
 +  */
 + get_lppaca()-idle = 1;
 + get_lppaca()-donate_dedicated_cpu = 1;
 +}

I notice that you call this on shared processors as well. The old ocde
used to not set donate_dedicated_cpu in that case. I assume that's not a
big deal and that the HV will just ignore it in the shared processor
case but please add a comment after you've verified it.

 +static inline  s64 idle_loop_epilog(unsigned long in_purr, ktime_t kt_before)
 +{
 + get_lppaca()-wait_state_cycles += mfspr(SPRN_PURR) - in_purr;
 + get_lppaca()-donate_dedicated_cpu = 0;
 + get_lppaca()-idle = 0;
 +
 + return ktime_to_us(ktime_sub(ktime_get_real(), kt_before));
 +}
 +
 +
 +static int snooze_loop(struct cpuidle_device *dev,
 + struct cpuidle_driver *drv,
 + int index)
 +{
 + unsigned long in_purr;
 + ktime_t kt_before;
 +
 + idle_loop_prolog(in_purr, kt_before);
 +
 + local_irq_enable();
 + set_thread_flag(TIF_POLLING_NRFLAG);
 + while (!need_resched()) {
 + ppc64_runlatch_off();
 + HMT_low();
 + HMT_very_low();
 + }
 + HMT_medium();
 + clear_thread_flag(TIF_POLLING_NRFLAG);
 + smp_mb();
 + local_irq_disable();
 +
 + dev-last_residency =
 + (int)idle_loop_epilog(in_purr, kt_before);
 + return index;
 +}

So your snooze loop has no timeout, is that handled by the cpuidle
driver using some kind of timer ? That sounds a lot less efficient than
passing a max delay to the snooze loop to handle getting into a deeper
state after a bit of snoozing rather than interrupting etc...

 +static int dedicated_cede_loop(struct cpuidle_device *dev,
 + struct cpuidle_driver *drv,
 + int index)
 +{
 + unsigned long in_purr;
 + ktime_t kt_before;
 +
 + idle_loop_prolog(in_purr, kt_before);
 +
 + ppc64_runlatch_off();
 + HMT_medium();
 + cede_processor();
 +
 + dev-last_residency =
 + (int)idle_loop_epilog(in_purr, kt_before);
 + return index;
 +}
 +
 +static int shared_cede_loop(struct cpuidle_device *dev,
 + struct cpuidle_driver *drv,
 + int index)
 +{
 + unsigned long in_purr;
 + ktime_t kt_before;
 +
 + idle_loop_prolog(in_purr, kt_before);
 +
 + /*
 +  * Yield the processor to the hypervisor.  We return if
 +  * an external interrupt occurs (which are driven prior
 +  * to returning here) or if a prod occurs from another
 +  * processor. When returning here, external interrupts
 +  * are enabled.
 +  */
 + cede_processor();
 +
 + dev-last_residency =
 + (int)idle_loop_epilog(in_purr, kt_before);
 + return index;
 +}
 +
 +/*
 + * States for dedicated partition case.
 + */
 +static struct cpuidle_state dedicated_states[MAX_IDLE_STATE_COUNT] = {
 + { /* Snooze */
 + .name = snooze,
 + .desc = snooze,
 + .flags = CPUIDLE_FLAG_TIME_VALID,
 + .exit_latency = 0,
 + .target_residency = 0,
 + .enter = snooze_loop },
 + { /* CEDE */
 + .name = CEDE,
 + .desc = CEDE,
 + .flags = CPUIDLE_FLAG_TIME_VALID,
 + .exit_latency = 1,
 + .target_residency = 10,
 + .enter = dedicated_cede_loop },
 +};
 +
 +/*
 + * States for shared partition case.
 + */
 +static struct cpuidle_state shared_states[MAX_IDLE_STATE_COUNT] = {
 + { /* Shared Cede */
 + .name = Shared Cede,
 + .desc = Shared Cede,
 + .flags = CPUIDLE_FLAG_TIME_VALID,
 + .exit_latency = 0,
 + .target_residency = 0,
 + .enter = shared_cede_loop },
 +};
 +
 +int pseries_notify_cpuidle_add_cpu(int cpu)
 +{
 + struct cpuidle_device 

[RFC PATCH v2 2/4] cpuidle: (POWER) cpuidle driver for pSeries

2011-11-17 Thread Deepthi Dharwar
This patch implements a backhand cpuidle driver for pSeries
based on pseries_dedicated_idle_loop and pseries_shared_idle_loop
routines.  The driver is built only if CONFIG_CPU_IDLE is set. This
cpuidle driver uses global registration of idle states and
not per-cpu.

Signed-off-by: Deepthi Dharwar deep...@linux.vnet.ibm.com
Signed-off-by: Trinabh Gupta g.trin...@gmail.com
Signed-off-by: Arun R Bharadwaj arun.r.bharad...@gmail.com
---
 arch/powerpc/include/asm/system.h   |8 +
 arch/powerpc/kernel/sysfs.c |2 
 arch/powerpc/platforms/pseries/Kconfig  |9 +
 arch/powerpc/platforms/pseries/Makefile |1 
 arch/powerpc/platforms/pseries/processor_idle.c |  317 +++
 arch/powerpc/platforms/pseries/pseries.h|3 
 arch/powerpc/platforms/pseries/setup.c  |3 
 arch/powerpc/platforms/pseries/smp.c|1 
 8 files changed, 341 insertions(+), 3 deletions(-)
 create mode 100644 arch/powerpc/platforms/pseries/processor_idle.c

diff --git a/arch/powerpc/include/asm/system.h 
b/arch/powerpc/include/asm/system.h
index ff66680..f56a0a7 100644
--- a/arch/powerpc/include/asm/system.h
+++ b/arch/powerpc/include/asm/system.h
@@ -223,6 +223,14 @@ extern void *zalloc_maybe_bootmem(size_t size, gfp_t mask);
 extern int powersave_nap;  /* set if nap mode can be used in idle loop */
 void cpu_idle_wait(void);
 
+#ifdef CONFIG_PSERIES_IDLE
+extern void update_smt_snooze_delay(int snooze);
+extern int pseries_notify_cpuidle_add_cpu(int cpu);
+#else
+static inline void update_smt_snooze_delay(int snooze) {}
+static inline int pseries_notify_cpuidle_add_cpu(int cpu) { return 0; }
+#endif
+
 /*
  * Atomic exchange
  *
diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
index ce035c1..ebe5d78 100644
--- a/arch/powerpc/kernel/sysfs.c
+++ b/arch/powerpc/kernel/sysfs.c
@@ -18,6 +18,7 @@
 #include asm/machdep.h
 #include asm/smp.h
 #include asm/pmc.h
+#include asm/system.h
 
 #include cacheinfo.h
 
@@ -51,6 +52,7 @@ static ssize_t store_smt_snooze_delay(struct sys_device *dev,
return -EINVAL;
 
per_cpu(smt_snooze_delay, cpu-sysdev.id) = snooze;
+   update_smt_snooze_delay(snooze);
 
return count;
 }
diff --git a/arch/powerpc/platforms/pseries/Kconfig 
b/arch/powerpc/platforms/pseries/Kconfig
index c81f6bb..ae7b6d4 100644
--- a/arch/powerpc/platforms/pseries/Kconfig
+++ b/arch/powerpc/platforms/pseries/Kconfig
@@ -120,3 +120,12 @@ config DTL
  which are accessible through a debugfs file.
 
  Say N if you are unsure.
+
+config PSERIES_IDLE
+   tristate Cpuidle driver for pSeries platforms
+   depends on CPU_IDLE
+   depends on PPC_PSERIES
+   default y
+   help
+ Select this option to enable processor idle state management
+ through cpuidle subsystem.
diff --git a/arch/powerpc/platforms/pseries/Makefile 
b/arch/powerpc/platforms/pseries/Makefile
index 3556e40..236db46 100644
--- a/arch/powerpc/platforms/pseries/Makefile
+++ b/arch/powerpc/platforms/pseries/Makefile
@@ -22,6 +22,7 @@ obj-$(CONFIG_PHYP_DUMP)   += phyp_dump.o
 obj-$(CONFIG_CMM)  += cmm.o
 obj-$(CONFIG_DTL)  += dtl.o
 obj-$(CONFIG_IO_EVENT_IRQ) += io_event_irq.o
+obj-$(CONFIG_PSERIES_IDLE) += processor_idle.o
 
 ifeq ($(CONFIG_PPC_PSERIES),y)
 obj-$(CONFIG_SUSPEND)  += suspend.o
diff --git a/arch/powerpc/platforms/pseries/processor_idle.c 
b/arch/powerpc/platforms/pseries/processor_idle.c
new file mode 100644
index 000..b5addd7
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/processor_idle.c
@@ -0,0 +1,317 @@
+/*
+ *  processor_idle - idle state cpuidle driver.
+ *  Adapted from drivers/idle/intel_idle.c and
+ *  drivers/acpi/processor_idle.c
+ *
+ */
+
+#include linux/kernel.h
+#include linux/module.h
+#include linux/init.h
+#include linux/moduleparam.h
+#include linux/cpuidle.h
+#include linux/cpu.h
+
+#include asm/paca.h
+#include asm/reg.h
+#include asm/system.h
+#include asm/machdep.h
+#include asm/firmware.h
+
+#include plpar_wrappers.h
+#include pseries.h
+
+struct cpuidle_driver pseries_idle_driver = {
+   .name = pseries_idle,
+   .owner =THIS_MODULE,
+};
+
+#define MAX_IDLE_STATE_COUNT   2
+
+static int max_cstate = MAX_IDLE_STATE_COUNT - 1;
+static struct cpuidle_device __percpu *pseries_idle_cpuidle_devices;
+static struct cpuidle_state *cpuidle_state_table;
+
+void update_smt_snooze_delay(int snooze)
+{
+   struct cpuidle_driver *drv = cpuidle_get_driver();
+   if (drv)
+   drv-states[0].target_residency = snooze;
+}
+
+static inline void idle_loop_prolog(unsigned long *in_purr, ktime_t *kt_before)
+{
+
+   *kt_before = ktime_get_real();
+   *in_purr = mfspr(SPRN_PURR);
+   /*
+* Indicate to the HV that we are idle. Now would be
+* a good time to find other work to dispatch.
+*/
+   get_lppaca()-idle = 1;
+