[RFC v2 2/2] powerpc/powernv : Introduce capability for firmware-enabled-stop

2020-07-06 Thread Abhishek Goel
This patch introduces the capability for firmware to handle the stop
states instead of the kernel. A bit is set based on the discovery of the
feature, and with it the responsibility to handle the stop states.

If the kernel does not know about a stop version, it can fall back to
OPAL for idle stop support, provided the firmware-stop-supported
property is present.

An earlier part of this patch was posted in this series:
https://lkml.org/lkml/2020/3/4/589
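
For reference, the handler selection this series converges on in
pnv_init_idle_states() is sketched below (reconstructed from the RFC 3/3
version of this change further down; the hunk in this message is
truncated):

	if (cpuidle_disable != IDLE_NO_OVERRIDE ||
	    !(stop_dep.cpuidle_prop & STOP_ENABLE))
		goto out;

	/* Prefer a stop version the kernel knows; else fall back to
	 * the firmware-assisted stop entry.
	 */
	if (stop_dep.stop_version & STOP_VERSION_P9)
		stop_dep.idle_stop = power9_idle_stop;
	else if (stop_dep.cpuidle_prop & FIRMWARE_STOP_ENABLE)
		stop_dep.idle_stop = power9_firmware_idle_stop;
	else
		goto out;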

Signed-off-by: Abhishek Goel 
Signed-off-by: Pratik Rajesh Sampat 
---
v1->v2 : Combined patches 2 and 3 from the previous iteration and rebased.

 arch/powerpc/include/asm/processor.h  | 18 ++
 arch/powerpc/kernel/dt_cpu_ftrs.c | 13 +
 arch/powerpc/platforms/powernv/idle.c | 20 
 drivers/cpuidle/cpuidle-powernv.c |  3 ++-
 4 files changed, 49 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/include/asm/processor.h 
b/arch/powerpc/include/asm/processor.h
index bfa336fbcfeb..b8de7146387c 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -428,6 +428,24 @@ extern void power4_idle_nap(void);
 extern unsigned long cpuidle_disable;
 enum idle_boot_override {IDLE_NO_OVERRIDE = 0, IDLE_POWERSAVE_OFF};
 
+#define STOP_ENABLE            0x0001
+#define FIRMWARE_STOP_ENABLE   0x0010
+
+#define STOP_VERSION_P9   0x1
+
+/*
+ * Classify the dependencies of the stop states
+ * @idle_stop: function handler for the quirk stop version
+ * @cpuidle_prop: signifies support for stop states through kernel and/or firmware
+ * @stop_version: classifies quirk versions for stop states
+ */
+typedef struct {
+   unsigned long (*idle_stop)(unsigned long psscr, bool mmu_on);
+   uint8_t cpuidle_prop;
+   uint8_t stop_version;
+} stop_deps_t;
+extern stop_deps_t stop_dep;
+
 extern int powersave_nap;  /* set if nap mode can be used in idle loop */
 
 extern void power7_idle_type(unsigned long type);
diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c 
b/arch/powerpc/kernel/dt_cpu_ftrs.c
index 36bc0d5c4f3a..737686fae3c7 100644
--- a/arch/powerpc/kernel/dt_cpu_ftrs.c
+++ b/arch/powerpc/kernel/dt_cpu_ftrs.c
@@ -291,6 +291,15 @@ static int __init feat_enable_idle_stop(struct 
dt_cpu_feature *f)
lpcr |=  LPCR_PECE1;
lpcr |=  LPCR_PECE2;
mtspr(SPRN_LPCR, lpcr);
+   stop_dep.cpuidle_prop |= STOP_ENABLE;
+   stop_dep.stop_version = STOP_VERSION_P9;
+
+   return 1;
+}
+
+static int __init feat_enable_firmware_stop(struct dt_cpu_feature *f)
+{
+   stop_dep.cpuidle_prop |= FIRMWARE_STOP_ENABLE;
 
return 1;
 }
@@ -589,6 +598,7 @@ static struct dt_cpu_feature_match __initdata
{"idle-nap", feat_enable_idle_nap, 0},
{"alignment-interrupt-dsisr", feat_enable_align_dsisr, 0},
{"idle-stop", feat_enable_idle_stop, 0},
+   {"firmware-stop-supported", feat_enable_firmware_stop, 0},
{"machine-check-power8", feat_enable_mce_power8, 0},
{"performance-monitor-power8", feat_enable_pmu_power8, 0},
{"data-stream-control-register", feat_enable_dscr, CPU_FTR_DSCR},
@@ -656,6 +666,9 @@ static void __init cpufeatures_setup_start(u32 isa)
}
 }
 
+stop_deps_t stop_dep = {NULL, 0x0, 0x0};
+EXPORT_SYMBOL(stop_dep);
+
 static bool __init cpufeatures_process_feature(struct dt_cpu_feature *f)
 {
const struct dt_cpu_feature_match *m;
diff --git a/arch/powerpc/platforms/powernv/idle.c 
b/arch/powerpc/platforms/powernv/idle.c
index 3afd4293f729..3602950f6c08 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -824,7 +824,7 @@ static unsigned long power9_offline_stop(unsigned long 
psscr)
 
 #ifndef CONFIG_KVM_BOOK3S_HV_POSSIBLE
__ppc64_runlatch_off();
-   srr1 = power9_idle_stop(psscr, true);
+   srr1 = stop_dep.idle_stop(psscr, true);
__ppc64_runlatch_on();
 #else
/*
@@ -840,7 +840,7 @@ static unsigned long power9_offline_stop(unsigned long 
psscr)
local_paca->kvm_hstate.hwthread_state = KVM_HWTHREAD_IN_IDLE;
 
__ppc64_runlatch_off();
-   srr1 = power9_idle_stop(psscr, false);
+   srr1 = stop_dep.idle_stop(psscr, false);
__ppc64_runlatch_on();
 
local_paca->kvm_hstate.hwthread_state = KVM_HWTHREAD_IN_KERNEL;
@@ -868,7 +868,7 @@ void power9_idle_type(unsigned long stop_psscr_val,
psscr = (psscr & ~stop_psscr_mask) | stop_psscr_val;
 
__ppc64_runlatch_off();
-   srr1 = power9_idle_stop(psscr, true);
+   srr1 = stop_dep.idle_stop(psscr, true);
__ppc64_runlatch_on();
 
fini_irq_for_idle_irqsoff();
@@ -1365,8 +1365,20 @@ static int __init pnv_init_idle_states(void)
nr_pnv_idle_states = 0;
supported_cpuidle_states = 0;
 
-   if (cpuidle_disable != IDLE_NO_OVERRIDE)
+   if (cpuidle_disable != IDLE_NO_OVERRIDE ||
+   !(stop_dep.cpuidle_prop & STOP_ENABLE))

[PATCH] cpuidle/powernv : Remove dead code block

2020-07-05 Thread Abhishek Goel
Commit 1961acad2f88559c2cdd2ef67c58c3627f1f6e54 removed the last usage
of the function "validate_dt_prop_sizes". This patch removes the
now-unused function.

Signed-off-by: Abhishek Goel 
---
 drivers/cpuidle/cpuidle-powernv.c | 14 --
 1 file changed, 14 deletions(-)

diff --git a/drivers/cpuidle/cpuidle-powernv.c 
b/drivers/cpuidle/cpuidle-powernv.c
index 1b299e801f74..addaa6e6718b 100644
--- a/drivers/cpuidle/cpuidle-powernv.c
+++ b/drivers/cpuidle/cpuidle-powernv.c
@@ -244,20 +244,6 @@ static inline void add_powernv_state(int index, const char 
*name,
stop_psscr_table[index].mask = psscr_mask;
 }
 
-/*
- * Returns 0 if prop1_len == prop2_len. Else returns -1
- */
-static inline int validate_dt_prop_sizes(const char *prop1, int prop1_len,
-const char *prop2, int prop2_len)
-{
-   if (prop1_len == prop2_len)
-   return 0;
-
-   pr_warn("cpuidle-powernv: array sizes don't match for %s and %s\n",
-   prop1, prop2);
-   return -1;
-}
-
 extern u32 pnv_get_supported_cpuidle_states(void);
 static int powernv_add_idle_states(void)
 {
-- 
2.17.1



[RFC v2 1/2] powerpc/powernv : Add support for pre-entry and post-exit of stop state using OPAL V4 OS services

2020-07-05 Thread Abhishek Goel
This patch provides the kernel framework for OPAL support of save and
restore of SPRs in the idle stop loop. OPAL support for stop states is
needed to selectively enable stop states or to quickly introduce a quirk
in case a buggy stop state is present.

We make an OPAL call from the kernel if firmware-stop-support for stop
states is present and enabled. All the quirks for pre-entry of a stop
state are handled inside OPAL. A call from OPAL is then made back into
the kernel, where we execute stop after saving the NVGPRs.
After waking up from the 0x100 vector in the kernel, we enter back into
OPAL. All the quirks in the post-exit path, if any, are then handled in
OPAL, from where we return successfully back to the kernel.
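
A condensed sketch of the resulting kernel/OPAL handshake, using the
names introduced by this patch (illustrative, not the full code):

	/* Kernel side: hand the stop request to firmware. OPAL applies
	 * any pre-entry quirks, then calls back into os_idle_stop().
	 */
	rc = opal_cpu_idle(cpu_to_be64(__pa(&srr1)), (uint64_t) psscr);

	/* Kernel callback registered with OPAL: execute stop after the
	 * NVGPRs are saved. On wakeup through the 0x100 vector we
	 * re-enter OPAL, which applies post-exit quirks and returns
	 * here, with the wakeup SRR1 made available through srr1_addr.
	 */
	int64_t os_idle_stop(uint64_t srr1_addr, uint64_t psscr)
	{
		if (!(psscr & (PSSCR_EC | PSSCR_ESL)))
			return isa300_idle_stop_noloss(psscr);
		return isa300_idle_stop_mayloss(psscr);
	}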

Signed-off-by: Abhishek Goel 
---
v1->v2 : Rebased the patch on Nick's OPAL V4 OS patchset

 arch/powerpc/include/asm/opal-api.h|  4 +++-
 arch/powerpc/include/asm/opal.h|  1 +
 arch/powerpc/platforms/powernv/idle.c  | 12 
 arch/powerpc/platforms/powernv/opal-call.c |  1 +
 arch/powerpc/platforms/powernv/opal.c  | 15 +++
 5 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/opal-api.h 
b/arch/powerpc/include/asm/opal-api.h
index 97c5e5423827..437b6937685d 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -219,7 +219,8 @@
 #define OPAL_REPORT_TRAP   183
 #define OPAL_FIND_VM_AREA  184
 #define OPAL_REGISTER_OS_OPS   185
-#define OPAL_LAST  185
+#define OPAL_CPU_IDLE  186
+#define OPAL_LAST  186
 
 #define QUIESCE_HOLD   1 /* Spin all calls at entry */
 #define QUIESCE_REJECT 2 /* Fail all calls with OPAL_BUSY */
@@ -1207,6 +1208,7 @@ struct opal_os_ops {
__be64  os_printf; /* void printf(int32_t level, const char *str) */
__be64  os_vm_map; /* int64_t os_vm_map(uint64_t ea, uint64_t pa, 
uint64_t flags) */
__be64  os_vm_unmap; /* void os_vm_unmap(uint64_t ea) */
+   __be64  os_idle_stop; /* void os_idle_stop(uint64_t srr1_addr, uint64_t 
psscr) */
 };
 
 #endif /* __ASSEMBLY__ */
diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index 09985b7718b3..1774c056acb8 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -407,6 +407,7 @@ void opal_sensor_groups_init(void);
 
 int64_t opal_find_vm_area(uint64_t addr, struct opal_vm_area *opal_vm_area);
 int64_t opal_register_os_ops(struct opal_os_ops *ops, uint64_t size);
+int64_t opal_cpu_idle(uint64_t srr1_addr, uint64_t psscr);
 
 #endif /* __ASSEMBLY__ */
 
diff --git a/arch/powerpc/platforms/powernv/idle.c 
b/arch/powerpc/platforms/powernv/idle.c
index 78599bca66c2..3afd4293f729 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -805,6 +805,18 @@ static unsigned long power9_idle_stop(unsigned long psscr, 
bool mmu_on)
return srr1;
 }
 
+static unsigned long power9_firmware_idle_stop(unsigned long psscr, bool 
mmu_on)
+{
+   unsigned long srr1;
+   int rc;
+
+   rc = opal_cpu_idle(cpu_to_be64(__pa(&srr1)), (uint64_t) psscr);
+
+   if (mmu_on)
+   mtmsr(MSR_KERNEL);
+   return srr1;
+}
+
 #ifdef CONFIG_HOTPLUG_CPU
 static unsigned long power9_offline_stop(unsigned long psscr)
 {
diff --git a/arch/powerpc/platforms/powernv/opal-call.c 
b/arch/powerpc/platforms/powernv/opal-call.c
index 11f419e76059..79076ca2de03 100644
--- a/arch/powerpc/platforms/powernv/opal-call.c
+++ b/arch/powerpc/platforms/powernv/opal-call.c
@@ -351,3 +351,4 @@ OPAL_CALL(opal_sym_to_addr, 
OPAL_SYM_TO_ADDR);
 OPAL_CALL(opal_report_trap,OPAL_REPORT_TRAP);
 OPAL_CALL(opal_find_vm_area,   OPAL_FIND_VM_AREA);
 OPAL_CALL(opal_register_os_ops,OPAL_REGISTER_OS_OPS);
+OPAL_CALL(opal_cpu_idle,   OPAL_CPU_IDLE);
diff --git a/arch/powerpc/platforms/powernv/opal.c 
b/arch/powerpc/platforms/powernv/opal.c
index 93b9afaf33b3..1fbf7065f918 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -1150,6 +1150,20 @@ static void os_vm_unmap(uint64_t ea)
local_flush_tlb_mm(mm);
 }
 
+int64_t os_idle_stop(uint64_t srr1_addr, uint64_t psscr)
+{
+   /*
+* For a lite state, which does not lose even GPRs, we call
+* idle_stop_noloss, while for all other states we call
+* idle_stop_mayloss. Saving and restoring any additional
+* SPRs, if required, is handled in OPAL. All the quirks are
+* also handled in OPAL.
+*/
+   if (!(psscr & (PSSCR_EC|PSSCR_ESL)))
+   return isa300_idle_stop_noloss(psscr);
+   return isa300_idle_stop_mayloss(psscr);
+}
+
 static int __init opal_init_mm(void)
 {
struct mm_struct *mm;
@@ -1231,6 +1245,7 @@ static int __init opal_init_ear

[RFC 3/3] powernv/cpuidle : Introduce capability for firmware-enabled-stop

2020-04-26 Thread Abhishek Goel
This patch introduces the capability for firmware to handle the stop
states instead of the kernel. A bit is set based on the discovery of the
feature, and with it the responsibility to handle the stop states.

If the kernel does not know about a stop version, it can fall back to
OPAL for idle stop support, provided the firmware-stop-supported
property is present.

Earlier, this patch was posted as part of this series:
https://lkml.org/lkml/2020/3/4/589

Signed-off-by: Pratik Rajesh Sampat 
Signed-off-by: Abhishek Goel 
---

 v1->v2: This patch is newly added in this series.

 arch/powerpc/include/asm/processor.h  |  1 +
 arch/powerpc/kernel/dt_cpu_ftrs.c |  8 
 arch/powerpc/platforms/powernv/idle.c | 27 ---
 3 files changed, 25 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/include/asm/processor.h 
b/arch/powerpc/include/asm/processor.h
index 66fa20476d0e..d5c6532b11ef 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -430,6 +430,7 @@ extern unsigned long cpuidle_disable;
 enum idle_boot_override {IDLE_NO_OVERRIDE = 0, IDLE_POWERSAVE_OFF};
 
#define STOP_ENABLE            0x0001
+#define FIRMWARE_STOP_ENABLE   0x0010
 
 #define STOP_VERSION_P9   0x1
 
diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c 
b/arch/powerpc/kernel/dt_cpu_ftrs.c
index db1a525e090d..ff4a87b79702 100644
--- a/arch/powerpc/kernel/dt_cpu_ftrs.c
+++ b/arch/powerpc/kernel/dt_cpu_ftrs.c
@@ -298,6 +298,13 @@ static int __init feat_enable_idle_stop(struct 
dt_cpu_feature *f)
return 1;
 }
 
+static int __init feat_enable_firmware_stop(struct dt_cpu_feature *f)
+{
+   stop_dep.cpuidle_prop |= FIRMWARE_STOP_ENABLE;
+
+   return 1;
+}
+
 static int __init feat_enable_mmu_hash(struct dt_cpu_feature *f)
 {
u64 lpcr;
@@ -592,6 +599,7 @@ static struct dt_cpu_feature_match __initdata
{"idle-nap", feat_enable_idle_nap, 0},
{"alignment-interrupt-dsisr", feat_enable_align_dsisr, 0},
{"idle-stop", feat_enable_idle_stop, 0},
+   {"firmware-stop-supported", feat_enable_firmware_stop, 0},
{"machine-check-power8", feat_enable_mce_power8, 0},
{"performance-monitor-power8", feat_enable_pmu_power8, 0},
{"data-stream-control-register", feat_enable_dscr, CPU_FTR_DSCR},
diff --git a/arch/powerpc/platforms/powernv/idle.c 
b/arch/powerpc/platforms/powernv/idle.c
index 538f0842ac3f..0de5de81902e 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -633,16 +633,6 @@ static unsigned long power9_idle_stop(unsigned long psscr, 
bool mmu_on)
unsigned long mmcr0 = 0;
struct p9_sprs sprs = {}; /* avoid false used-uninitialised */
bool sprs_saved = false;
-   int rc = 0;
-
-   /*
-* Kernel takes decision whether to make OPAL call or not. This logic
-* will be combined with the logic for BE opal to take decision.
-*/
-   if (firmware_stop_supported) {
-   rc = opal_cpu_idle(cpu_to_be64(__pa()), (uint64_t) psscr);
-   goto out;
-   }
 
if (!(psscr & (PSSCR_EC|PSSCR_ESL))) {
/* EC=ESL=0 case */
@@ -835,6 +825,19 @@ static unsigned long power9_idle_stop(unsigned long psscr, 
bool mmu_on)
return srr1;
 }
 
+static unsigned long power9_firmware_idle_stop(unsigned long psscr, bool 
mmu_on)
+{
+   unsigned long srr1;
+   int rc;
+
+   rc = opal_cpu_idle(cpu_to_be64(__pa(&srr1)), (uint64_t) psscr);
+
+   if (mmu_on)
+   mtmsr(MSR_KERNEL);
+   return srr1;
+}
+
 #ifdef CONFIG_HOTPLUG_CPU
 static unsigned long power9_offline_stop(unsigned long psscr)
 {
@@ -1394,9 +1397,11 @@ static int __init pnv_init_idle_states(void)
!(stop_dep.cpuidle_prop & STOP_ENABLE))
goto out;
 
-   /* Check for supported version in kernel */
+   /* Check for a supported version in the kernel, else fall back to firmware */
if (stop_dep.stop_version & STOP_VERSION_P9) {
stop_dep.idle_stop = power9_idle_stop;
+   } else if (stop_dep.cpuidle_prop & FIRMWARE_STOP_ENABLE) {
+   stop_dep.idle_stop = power9_firmware_idle_stop;
} else {
stop_dep.idle_stop = NULL;
goto out;
-- 
2.17.1



[RFC 2/3] powernv/cpuidle : Interface for an idle-stop dependency structure

2020-04-26 Thread Abhishek Goel
This patch introduces the idea of having a dependency structure for
idle-stop. The structure encapsulates the following:
1. Bitmask for the version of idle-stop
2. Bitmask for properties like ENABLE/DISABLE
3. Function pointer which helps handle how the stop must be invoked

This patch lays a foundation for other idle-stop versions to be added
and handled cleanly based on their specified requirements.
Currently it handles the existing "idle-stop" version by setting the
discovery bits and the function pointer.
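
A sketch of how the structure is populated and then consumed (all names
from this patch):

	/* dt_cpu_ftrs.c: on discovering the "idle-stop" DT feature */
	stop_dep.cpuidle_prop |= STOP_ENABLE;
	stop_dep.stop_version  = STOP_VERSION_P9;

	/* pnv_init_idle_states(): bind the handler for a known version */
	if (stop_dep.stop_version & STOP_VERSION_P9)
		stop_dep.idle_stop = power9_idle_stop;

	/* idle fast path: dispatch through the function pointer */
	srr1 = stop_dep.idle_stop(psscr, true);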

Earlier, this patch was posted as part of this series:
https://lkml.org/lkml/2020/3/4/589

Signed-off-by: Pratik Rajesh Sampat 
Signed-off-by: Abhishek Goel 
---

 v1->v2: This patch is newly added in this series.

 arch/powerpc/include/asm/processor.h  | 17 +
 arch/powerpc/kernel/dt_cpu_ftrs.c |  5 +
 arch/powerpc/platforms/powernv/idle.c | 18 ++
 drivers/cpuidle/cpuidle-powernv.c |  3 ++-
 4 files changed, 38 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/include/asm/processor.h 
b/arch/powerpc/include/asm/processor.h
index eedcbfb9a6ff..66fa20476d0e 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -429,6 +429,23 @@ extern void power4_idle_nap(void);
 extern unsigned long cpuidle_disable;
 enum idle_boot_override {IDLE_NO_OVERRIDE = 0, IDLE_POWERSAVE_OFF};
 
+#define STOP_ENABLE            0x0001
+
+#define STOP_VERSION_P9   0x1
+
+/*
+ * Classify the dependencies of the stop states
+ * @idle_stop: function handler for the quirk stop version
+ * @cpuidle_prop: signifies support for stop states through kernel and/or firmware
+ * @stop_version: classifies quirk versions for stop states
+ */
+typedef struct {
+   unsigned long (*idle_stop)(unsigned long psscr, bool mmu_on);
+   uint8_t cpuidle_prop;
+   uint8_t stop_version;
+} stop_deps_t;
+extern stop_deps_t stop_dep;
+
 extern int powersave_nap;  /* set if nap mode can be used in idle loop */
 
 extern void power7_idle_type(unsigned long type);
diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c 
b/arch/powerpc/kernel/dt_cpu_ftrs.c
index 182b4047c1ef..db1a525e090d 100644
--- a/arch/powerpc/kernel/dt_cpu_ftrs.c
+++ b/arch/powerpc/kernel/dt_cpu_ftrs.c
@@ -292,6 +292,8 @@ static int __init feat_enable_idle_stop(struct 
dt_cpu_feature *f)
lpcr |=  LPCR_PECE1;
lpcr |=  LPCR_PECE2;
mtspr(SPRN_LPCR, lpcr);
+   stop_dep.cpuidle_prop |= STOP_ENABLE;
+   stop_dep.stop_version = STOP_VERSION_P9;
 
return 1;
 }
@@ -657,6 +659,9 @@ static void __init cpufeatures_setup_start(u32 isa)
}
 }
 
+stop_deps_t stop_dep = {NULL, 0x0, 0x0};
+EXPORT_SYMBOL(stop_dep);
+
 static bool __init cpufeatures_process_feature(struct dt_cpu_feature *f)
 {
const struct dt_cpu_feature_match *m;
diff --git a/arch/powerpc/platforms/powernv/idle.c 
b/arch/powerpc/platforms/powernv/idle.c
index 1841027b25c5..538f0842ac3f 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -842,7 +842,7 @@ static unsigned long power9_offline_stop(unsigned long 
psscr)
 
 #ifndef CONFIG_KVM_BOOK3S_HV_POSSIBLE
__ppc64_runlatch_off();
-   srr1 = power9_idle_stop(psscr, true);
+   srr1 = stop_dep.idle_stop(psscr, true);
__ppc64_runlatch_on();
 #else
/*
@@ -858,7 +858,7 @@ static unsigned long power9_offline_stop(unsigned long 
psscr)
local_paca->kvm_hstate.hwthread_state = KVM_HWTHREAD_IN_IDLE;
 
__ppc64_runlatch_off();
-   srr1 = power9_idle_stop(psscr, false);
+   srr1 = stop_dep.idle_stop(psscr, false);
__ppc64_runlatch_on();
 
local_paca->kvm_hstate.hwthread_state = KVM_HWTHREAD_IN_KERNEL;
@@ -886,7 +886,7 @@ void power9_idle_type(unsigned long stop_psscr_val,
psscr = (psscr & ~stop_psscr_mask) | stop_psscr_val;
 
__ppc64_runlatch_off();
-   srr1 = power9_idle_stop(psscr, true);
+   srr1 = stop_dep.idle_stop(psscr, true);
__ppc64_runlatch_on();
 
fini_irq_for_idle_irqsoff();
@@ -1390,8 +1390,18 @@ static int __init pnv_init_idle_states(void)
nr_pnv_idle_states = 0;
supported_cpuidle_states = 0;
 
-   if (cpuidle_disable != IDLE_NO_OVERRIDE)
+   if (cpuidle_disable != IDLE_NO_OVERRIDE ||
+   !(stop_dep.cpuidle_prop & STOP_ENABLE))
goto out;
+
+   /* Check for supported version in kernel */
+   if (stop_dep.stop_version & STOP_VERSION_P9) {
+   stop_dep.idle_stop = power9_idle_stop;
+   } else {
+   stop_dep.idle_stop = NULL;
+   goto out;
+   }
+
rc = pnv_parse_cpuidle_dt();
if (rc)
return rc;
diff --git a/drivers/cpuidle/cpuidle-powernv.c 
b/drivers/cpuidle/cpuidle-powernv.c
index 1b299e801f74..7430a8edf5c9 100644
--- a/drivers/cpuidle/cpuidle-powernv.c
+++ b/drivers/cpuidle/cpuidle-powernv.c
@@ -371,7 +371,8 @@ sta

[RFC 1/3] powernv/cpuidle : Support for pre-entry and post exit of stop state in firmware

2020-04-26 Thread Abhishek Goel
This patch provides the kernel framework for OPAL support of save and
restore of SPRs in the idle stop loop. OPAL support for stop states is
needed to selectively enable stop states or to quickly introduce a quirk
in case a buggy stop state is present.

We make an OPAL call from the kernel if firmware-stop-support for stop
states is present and enabled. All the quirks for pre-entry of a stop
state are handled inside OPAL. A call from OPAL is then made back into
the kernel, where we execute stop after saving the NVGPRs.
After waking up from the 0x100 vector in the kernel, we enter back into
OPAL. All the quirks in the post-exit path, if any, are then handled in
OPAL, from where we return successfully back to the kernel.
For deep stop states in which additional SPRs are lost, saving and
restoration will be done in OPAL.

This idea was first proposed by Nick here:
https://patchwork.ozlabs.org/patch/1208159/

The corresponding skiboot patch for this kernel patch is here:
https://patchwork.ozlabs.org/project/skiboot/list/?series=172831

When we call back from OPAL into the kernel, r13 is clobbered. So, to
access the PACA we need to restore it from HSPRG0. In the future we can
handle this inside OPAL, as done here:
https://patchwork.ozlabs.org/patch/1245275/

Signed-off-by: Abhishek Goel 
Signed-off-by: Nicholas Piggin 
---

 v1->v2 : No change in this patch.

 arch/powerpc/include/asm/opal-api.h|  8 -
 arch/powerpc/include/asm/opal.h|  3 ++
 arch/powerpc/kernel/idle_book3s.S  |  5 +++
 arch/powerpc/platforms/powernv/idle.c  | 37 ++
 arch/powerpc/platforms/powernv/opal-call.c |  2 ++
 5 files changed, 54 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/opal-api.h 
b/arch/powerpc/include/asm/opal-api.h
index c1f25a760eb1..a2c782c99c9e 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -214,7 +214,9 @@
#define OPAL_SECVAR_GET        176
 #define OPAL_SECVAR_GET_NEXT   177
 #define OPAL_SECVAR_ENQUEUE_UPDATE 178
-#define OPAL_LAST  178
+#define OPAL_REGISTER_OS_OPS   181
+#define OPAL_CPU_IDLE  182
+#define OPAL_LAST  182
 
 #define QUIESCE_HOLD   1 /* Spin all calls at entry */
 #define QUIESCE_REJECT 2 /* Fail all calls with OPAL_BUSY */
@@ -1181,6 +1183,10 @@ struct opal_mpipl_fadump {
struct  opal_mpipl_region region[];
 } __packed;
 
+struct opal_os_ops {
+   __be64 os_idle_stop;
+};
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* __OPAL_API_H */
diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index 9986ac34b8e2..3c340bc4df8e 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -400,6 +400,9 @@ void opal_powercap_init(void);
 void opal_psr_init(void);
 void opal_sensor_groups_init(void);
 
+extern int64_t opal_register_os_ops(struct opal_os_ops *os_ops);
+extern int64_t opal_cpu_idle(__be64 srr1_addr, uint64_t psscr);
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* _ASM_POWERPC_OPAL_H */
diff --git a/arch/powerpc/kernel/idle_book3s.S 
b/arch/powerpc/kernel/idle_book3s.S
index 22f249b6f58d..8d287d1d06c0 100644
--- a/arch/powerpc/kernel/idle_book3s.S
+++ b/arch/powerpc/kernel/idle_book3s.S
@@ -49,6 +49,8 @@ _GLOBAL(isa300_idle_stop_noloss)
  */
 _GLOBAL(isa300_idle_stop_mayloss)
mtspr   SPRN_PSSCR,r3
+   mr  r6, r13
+   mfspr   r13, SPRN_HSPRG0
std r1,PACAR1(r13)
mflr    r4
mfcr    r5
@@ -74,6 +76,7 @@ _GLOBAL(isa300_idle_stop_mayloss)
std r31,-8*18(r1)
std r4,-8*19(r1)
std r5,-8*20(r1)
+   std r6,-8*21(r1)
/* 168 bytes */
PPC_STOP
b   .   /* catch bugs */
@@ -91,8 +94,10 @@ _GLOBAL(idle_return_gpr_loss)
ld  r1,PACAR1(r13)
ld  r4,-8*19(r1)
ld  r5,-8*20(r1)
+   ld  r6,-8*21(r1)
mtlr    r4
mtcr    r5
+   mr  r13,r6
/*
 * KVM nap requires r2 to be saved, rather than just restoring it
 * from PACATOC. This could be avoided for that less common case
diff --git a/arch/powerpc/platforms/powernv/idle.c 
b/arch/powerpc/platforms/powernv/idle.c
index 78599bca66c2..1841027b25c5 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -35,6 +35,7 @@
 static u32 supported_cpuidle_states;
 struct pnv_idle_states_t *pnv_idle_states;
 int nr_pnv_idle_states;
+static bool firmware_stop_supported;
 
 /*
  * The default stop state that will be used by ppc_md.power_save
@@ -602,6 +603,25 @@ struct p9_sprs {
u64 uamor;
 };
 
+/*
+ * This function is called from OPAL if firmware support for stop
+ * states is present and enabled. It provides a fallback for idle
+ * stop states via OPAL.
+ */
+static uint64_t os_idle_stop(uint64_t psscr, bool save_gprs)
+{
+   /*
+* For l

[RFC] cpuidle/powernv : Support for pre-entry and post exit of stop state in firmware

2020-04-03 Thread Abhishek Goel
This patch provides the kernel framework for OPAL support of save and
restore of SPRs in the idle stop loop. OPAL support for stop states is
needed to selectively enable stop states or to quickly introduce a quirk
in case a buggy stop state is present.

We make an OPAL call from the kernel if firmware-stop-support for stop
states is present and enabled. All the quirks for pre-entry of a stop
state are handled inside OPAL. A call from OPAL is then made back into
the kernel, where we execute stop after saving the NVGPRs.
After waking up from the 0x100 vector in the kernel, we enter back into
OPAL. All the quirks in the post-exit path, if any, are then handled in
OPAL, from where we return successfully back to the kernel.
For deep stop states in which additional SPRs are lost, the saving and
restoration will be done in OPAL.

This idea was first proposed by Nick here:
https://patchwork.ozlabs.org/patch/1208159/

The corresponding skiboot patch for this kernel patch is here:
https://patchwork.ozlabs.org/patch/1265959/

When we call back from OPAL into the kernel, r13 is clobbered. So, to
access the PACA we need to restore it from HSPRG0. In the future we can
handle this inside OPAL, as done here:
https://patchwork.ozlabs.org/patch/1245275/

Signed-off-by: Abhishek Goel 
Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/opal-api.h|  8 -
 arch/powerpc/include/asm/opal.h|  3 ++
 arch/powerpc/kernel/idle_book3s.S  |  5 +++
 arch/powerpc/platforms/powernv/idle.c  | 37 ++
 arch/powerpc/platforms/powernv/opal-call.c |  2 ++
 5 files changed, 54 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/opal-api.h 
b/arch/powerpc/include/asm/opal-api.h
index c1f25a760eb1..a2c782c99c9e 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -214,7 +214,9 @@
#define OPAL_SECVAR_GET        176
 #define OPAL_SECVAR_GET_NEXT   177
 #define OPAL_SECVAR_ENQUEUE_UPDATE 178
-#define OPAL_LAST  178
+#define OPAL_REGISTER_OS_OPS   181
+#define OPAL_CPU_IDLE  182
+#define OPAL_LAST  182
 
 #define QUIESCE_HOLD   1 /* Spin all calls at entry */
 #define QUIESCE_REJECT 2 /* Fail all calls with OPAL_BUSY */
@@ -1181,6 +1183,10 @@ struct opal_mpipl_fadump {
struct  opal_mpipl_region region[];
 } __packed;
 
+struct opal_os_ops {
+   __be64 os_idle_stop;
+};
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* __OPAL_API_H */
diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index 9986ac34b8e2..3c340bc4df8e 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -400,6 +400,9 @@ void opal_powercap_init(void);
 void opal_psr_init(void);
 void opal_sensor_groups_init(void);
 
+extern int64_t opal_register_os_ops(struct opal_os_ops *os_ops);
+extern int64_t opal_cpu_idle(__be64 srr1_addr, uint64_t psscr);
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* _ASM_POWERPC_OPAL_H */
diff --git a/arch/powerpc/kernel/idle_book3s.S 
b/arch/powerpc/kernel/idle_book3s.S
index 22f249b6f58d..8d287d1d06c0 100644
--- a/arch/powerpc/kernel/idle_book3s.S
+++ b/arch/powerpc/kernel/idle_book3s.S
@@ -49,6 +49,8 @@ _GLOBAL(isa300_idle_stop_noloss)
  */
 _GLOBAL(isa300_idle_stop_mayloss)
mtspr   SPRN_PSSCR,r3
+   mr  r6, r13
+   mfspr   r13, SPRN_HSPRG0
std r1,PACAR1(r13)
mflr    r4
mfcr    r5
@@ -74,6 +76,7 @@ _GLOBAL(isa300_idle_stop_mayloss)
std r31,-8*18(r1)
std r4,-8*19(r1)
std r5,-8*20(r1)
+   std r6,-8*21(r1)
/* 168 bytes */
PPC_STOP
b   .   /* catch bugs */
@@ -91,8 +94,10 @@ _GLOBAL(idle_return_gpr_loss)
ld  r1,PACAR1(r13)
ld  r4,-8*19(r1)
ld  r5,-8*20(r1)
+   ld  r6,-8*21(r1)
mtlr    r4
mtcr    r5
+   mr  r13,r6
/*
 * KVM nap requires r2 to be saved, rather than just restoring it
 * from PACATOC. This could be avoided for that less common case
diff --git a/arch/powerpc/platforms/powernv/idle.c 
b/arch/powerpc/platforms/powernv/idle.c
index 78599bca66c2..1841027b25c5 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -35,6 +35,7 @@
 static u32 supported_cpuidle_states;
 struct pnv_idle_states_t *pnv_idle_states;
 int nr_pnv_idle_states;
+static bool firmware_stop_supported;
 
 /*
  * The default stop state that will be used by ppc_md.power_save
@@ -602,6 +603,25 @@ struct p9_sprs {
u64 uamor;
 };
 
+/*
+ * This function is called from OPAL if firmware support for stop
+ * states is present and enabled. It provides a fallback for idle
+ * stop states via OPAL.
+ */
+static uint64_t os_idle_stop(uint64_t psscr, bool save_gprs)
+{
+   /*
+* For a lite state, which does not lose even GPRs, we call

[RFC v5 3/3] cpuidle-powernv : Recompute the idle-state timeouts when state usage is enabled/disabled

2019-10-03 Thread Abhishek Goel
The disable callback can be used to compute the timeout for other states
whenever a state is enabled or disabled. We store the computed timeout
in "timeout", defined in the cpuidle state structure. Thus, we compute
the timeout only when some state is enabled or disabled, and not every
time in the fast idle path.
We also use the computed timeout to get the timeout for snooze, thus
getting rid of get_snooze_timeout for the snooze loop.
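
As a worked example (with hypothetical residency values, for
illustration only): with the states snooze, stop0 (target residency
20us) and stop1 (100us) all enabled, snooze's timeout is computed once
as 20 * tb_ticks_per_usec. If stop0 is then disabled through sysfs, the
callback recomputes the table and snooze's timeout becomes
100 * tb_ticks_per_usec; the fast idle path only reads the cached value.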

Signed-off-by: Abhishek Goel 
---
 drivers/cpuidle/cpuidle-powernv.c | 35 +++
 include/linux/cpuidle.h   |  1 +
 2 files changed, 13 insertions(+), 23 deletions(-)

diff --git a/drivers/cpuidle/cpuidle-powernv.c 
b/drivers/cpuidle/cpuidle-powernv.c
index d7686ce6e..a75226f52 100644
--- a/drivers/cpuidle/cpuidle-powernv.c
+++ b/drivers/cpuidle/cpuidle-powernv.c
@@ -45,7 +45,6 @@ struct stop_psscr_table {
 static struct stop_psscr_table stop_psscr_table[CPUIDLE_STATE_MAX] 
__read_mostly;
 
 static u64 default_snooze_timeout __read_mostly;
-static bool snooze_timeout_en __read_mostly;
 
 static u64 forced_wakeup_timeout(struct cpuidle_device *dev,
 struct cpuidle_driver *drv,
@@ -67,26 +66,13 @@ static u64 forced_wakeup_timeout(struct cpuidle_device *dev,
return 0;
 }
 
-static u64 get_snooze_timeout(struct cpuidle_device *dev,
- struct cpuidle_driver *drv,
- int index)
+static void pnv_disable_callback(struct cpuidle_device *dev,
+struct cpuidle_driver *drv)
 {
int i;
 
-   if (unlikely(!snooze_timeout_en))
-   return default_snooze_timeout;
-
-   for (i = index + 1; i < drv->state_count; i++) {
-   struct cpuidle_state *s = &drv->states[i];
-   struct cpuidle_state_usage *su = &dev->states_usage[i];
-
-   if (s->disabled || su->disable)
-   continue;
-
-   return s->target_residency * tb_ticks_per_usec;
-   }
-
-   return default_snooze_timeout;
+   for (i = 0; i < drv->state_count; i++)
+   drv->states[i].timeout = forced_wakeup_timeout(dev, drv, i);
 }
 
 static int snooze_loop(struct cpuidle_device *dev,
@@ -94,16 +80,20 @@ static int snooze_loop(struct cpuidle_device *dev,
int index)
 {
u64 snooze_exit_time;
+   u64 snooze_timeout = drv->states[index].timeout;
+
+   if (!snooze_timeout)
+   snooze_timeout = default_snooze_timeout;
 
set_thread_flag(TIF_POLLING_NRFLAG);
 
local_irq_enable();
 
-   snooze_exit_time = get_tb() + get_snooze_timeout(dev, drv, index);
+   snooze_exit_time = get_tb() + snooze_timeout;
ppc64_runlatch_off();
HMT_very_low();
while (!need_resched()) {
-   if (likely(snooze_timeout_en) && get_tb() > snooze_exit_time) {
+   if (get_tb() > snooze_exit_time) {
/*
 * Task has not woken up but we are exiting the polling
 * loop anyway. Require a barrier after polling is
@@ -168,7 +158,7 @@ static int stop_loop(struct cpuidle_device *dev,
u64 timeout_tb;
bool forced_wakeup = false;
 
-   timeout_tb = forced_wakeup_timeout(dev, drv, index);
+   timeout_tb = drv->states[index].timeout;
 
if (timeout_tb) {
/* Ensure that the timeout is at least one microsecond
@@ -263,6 +253,7 @@ static int powernv_cpuidle_driver_init(void)
 */
 
drv->cpumask = (struct cpumask *)cpu_present_mask;
+   drv->disable_callback = pnv_disable_callback;
 
return 0;
 }
@@ -422,8 +413,6 @@ static int powernv_idle_probe(void)
/* Device tree can indicate more idle states */
max_idle_state = powernv_add_idle_states();
default_snooze_timeout = TICK_USEC * tb_ticks_per_usec;
-   if (max_idle_state > 1)
-   snooze_timeout_en = true;
} else
return -ENODEV;
 
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index 1729a497b..64195861b 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -50,6 +50,7 @@ struct cpuidle_state {
int power_usage; /* in mW */
unsigned inttarget_residency; /* in US */
booldisabled; /* disabled on all CPUs */
+   unsigned long long timeout; /* timeout for exiting out of a state */
 
int (*enter)(struct cpuidle_device *dev,
struct cpuidle_driver *drv,
-- 
2.17.1



[RFC v5 2/3] cpuidle : Add callback whenever a state usage is enabled/disabled

2019-10-03 Thread Abhishek Goel
To forcefully wake up a CPU, we need to compute the timeout in the fast
idle path, since a state may be enabled or disabled, but there was no
feedback to the driver when a state is enabled or disabled.
This patch adds a callback whenever a state_usage records a store for
the disable attribute.
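
A driver opts in by setting the new hook at init time; a minimal sketch
(the powernv wiring shown here lands in the next patch of this series):

	drv->disable_callback = pnv_disable_callback;

The sysfs store path then invokes the callback once per successful
write to the "disable" attribute, so the driver can recompute any
cached per-state data outside the fast idle path.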

Signed-off-by: Abhishek Goel 
---
 drivers/cpuidle/sysfs.c | 15 ++-
 include/linux/cpuidle.h |  3 +++
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/drivers/cpuidle/sysfs.c b/drivers/cpuidle/sysfs.c
index 2bb2683b4..6c9bf2f7b 100644
--- a/drivers/cpuidle/sysfs.c
+++ b/drivers/cpuidle/sysfs.c
@@ -418,8 +418,21 @@ static ssize_t cpuidle_state_store(struct kobject *kobj, 
struct attribute *attr,
struct cpuidle_state_attr *cattr = attr_to_stateattr(attr);
struct cpuidle_device *dev = kobj_to_device(kobj);
 
-   if (cattr->store)
+   if (cattr->store) {
ret = cattr->store(state, state_usage, buf, size);
+   if (ret == size &&
+   !strncmp(cattr->attr.name, "disable",
+   strlen("disable"))) {
+   struct kobject *cpuidle_kobj = kobj->parent;
+   struct cpuidle_device *dev =
+   to_cpuidle_device(cpuidle_kobj);
+   struct cpuidle_driver *drv =
+   cpuidle_get_cpu_driver(dev);
+
+   if (drv->disable_callback)
+   drv->disable_callback(dev, drv);
+   }
+   }
 
/* reset poll time cache */
dev->poll_limit_ns = 0;
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index 4b6b5bea8..1729a497b 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -122,6 +122,9 @@ struct cpuidle_driver {
/* the driver handles the cpus in cpumask */
struct cpumask  *cpumask;
 
+   void (*disable_callback)(struct cpuidle_device *dev,
+struct cpuidle_driver *drv);
+
/* preferred governor to switch at register time */
const char  *governor;
 };
-- 
2.17.1



[PATCH v5 1/3] cpuidle-powernv : forced wakeup for stop states

2019-10-03 Thread Abhishek Goel
Currently, the cpuidle governors determine what idle state an idling CPU
should enter into based on heuristics that depend on the idle history on
that CPU. Given that no predictive heuristic is perfect, there are cases
where the governor predicts a shallow idle state, hoping that the CPU will
be busy soon. However, if no new workload is scheduled on that CPU in the
near future, the CPU may end up stuck in the shallow state.

This is problematic when the predicted state in the aforementioned
scenario is a shallow stop state on a tickless system, as we might get
stuck in shallow states for hours in the absence of ticks or interrupts.

To address this, we forcefully wake up the CPU by setting the
decrementer. The decrementer is set to a value that corresponds to the
residency of the next available state, thus arming a timer that will
forcefully wake up the CPU. A few such iterations will essentially train
the governor to select a deeper state for that CPU, as the timer here
corresponds to the next available cpuidle state residency. Thus, the CPU
will eventually end up in the deepest possible state.
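
As a worked example (hypothetical residencies): if the governor picks
stop0 (target residency 10us) while the next enabled state, stop1, has a
residency of 100us, the decrementer is programmed to fire after 100us
worth of timebase ticks before entering stop0. Absent any other
interrupt, the CPU is forced awake after ~100us; the governor observes
the long residency, and subsequent selections move toward stop1 and
deeper states.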

Signed-off-by: Abhishek Goel 
---

Auto-promotion
  v1 : Started as auto-promotion logic for cpuidle states in the generic
driver
  v2 : Removed timeout_needed and rebased the code to the upstream kernel
Forced-wakeup
  v1 : New patch with the name of forced wakeup started
  v2 : Extended the forced wakeup logic for all states. Setting the
decrementer instead of queuing up a hrtimer to implement the logic.
  v3 : Cleanly handle setting/resetting of the decrementer so as to not
break irq work
  v4 : Changed the type and name of the set/reset decrementer function
       Handled irq_work_pending in try_set_dec_before_idle
  v5 : Removed forced wakeup for the last stop state by changing the
       checking condition of timeout_tb

 arch/powerpc/include/asm/time.h   |  2 ++
 arch/powerpc/kernel/time.c| 43 +++
 drivers/cpuidle/cpuidle-powernv.c | 40 
 3 files changed, 85 insertions(+)

diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
index 08dbe3e68..06a6a2314 100644
--- a/arch/powerpc/include/asm/time.h
+++ b/arch/powerpc/include/asm/time.h
@@ -184,6 +184,8 @@ static inline unsigned long tb_ticks_since(unsigned long 
tstamp)
 extern u64 mulhdu(u64, u64);
 #endif
 
+extern bool try_set_dec_before_idle(u64 timeout);
+extern void try_reset_dec_after_idle(void);
 extern void div128_by_32(u64 dividend_high, u64 dividend_low,
 unsigned divisor, struct div_result *dr);
 
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 694522308..d004c0d8e 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -576,6 +576,49 @@ void arch_irq_work_raise(void)
 
 #endif /* CONFIG_IRQ_WORK */
 
+/*
+ * This function tries setting decrementer before entering into idle.
+ * Returns true if we have reprogrammed the decrementer for idle.
+ * Returns false if the decrementer is unchanged.
+ */
+bool try_set_dec_before_idle(u64 timeout)
+{
+   u64 *next_tb = this_cpu_ptr(&decrementers_next_tb);
+   u64 now = get_tb_or_rtc();
+
+   if (now + timeout > *next_tb)
+   return false;
+
+   set_dec(timeout);
+   if (test_irq_work_pending())
+   set_dec(1);
+
+   return true;
+}
+
+/*
+ * This function gets called if we have set decrementer before
+ * entering into idle. It tries to reset/restore the decrementer
+ * to its original value.
+ */
+void try_reset_dec_after_idle(void)
+{
+   u64 now;
+   u64 *next_tb;
+
+   if (test_irq_work_pending())
+   return;
+
+   now = get_tb_or_rtc();
+   next_tb = this_cpu_ptr(&decrementers_next_tb);
+   if (now >= *next_tb)
+   return;
+
+   set_dec(*next_tb - now);
+   if (test_irq_work_pending())
+   set_dec(1);
+}
+
 /*
  * timer_interrupt - gets called when the decrementer overflows,
  * with interrupts disabled.
diff --git a/drivers/cpuidle/cpuidle-powernv.c 
b/drivers/cpuidle/cpuidle-powernv.c
index 84b1ebe21..d7686ce6e 100644
--- a/drivers/cpuidle/cpuidle-powernv.c
+++ b/drivers/cpuidle/cpuidle-powernv.c
@@ -21,6 +21,7 @@
 #include 
 #include 
 #include 
+#include <asm/time.h>
 
 /*
  * Expose only those Hardware idle states via the cpuidle framework
@@ -46,6 +47,26 @@ static struct stop_psscr_table 
stop_psscr_table[CPUIDLE_STATE_MAX] __read_mostly
 static u64 default_snooze_timeout __read_mostly;
 static bool snooze_timeout_en __read_mostly;
 
+static u64 forced_wakeup_timeout(struct cpuidle_device *dev,
+struct cpuidle_driver *drv,
+int index)
+{
+   int i;
+
+   for (i = index + 1; i < drv->state_count; i++) {
+   struct cpuidle_state *s = &drv->states[i];
+   struct cpuidle_state_usage *su = &dev->states_usage[i];
+
+   if (s->disabled || su->disable)
+   continue;

[PATCH v5 0/3] Forced-wakeup for stop states on Powernv

2019-10-03 Thread Abhishek Goel
Currently, the cpuidle governors determine what idle state an idling CPU
should enter into based on heuristics that depend on the idle history on
that CPU. Given that no predictive heuristic is perfect, there are cases
where the governor predicts a shallow idle state, hoping that the CPU will
be busy soon. However, if no new workload is scheduled on that CPU in the
near future, the CPU will end up in the shallow state.

Motivation
--
In the case of POWER, this is problematic when the predicted state in
the aforementioned scenario is a shallow stop state on a tickless
system, as we might get stuck in shallow states even for hours, in the
absence of ticks or interrupts.

To address this, we forcefully wake up the CPU by setting the
decrementer. The decrementer is set to a value that corresponds to the
residency of the next available state, thus arming a timer that will
forcefully wake up the CPU. A few such iterations will essentially train
the governor to select a deeper state for that CPU, as the timer here
corresponds to the next available cpuidle state residency. Thus, the CPU
will eventually end up in the deepest possible state and we won't get
stuck in a shallow state for a long duration.

Experiment
--
For earlier versions, when this feature was meant to be only for shallow
lite states, I performed experiments for three scenarios to collect some
data.

case 1 :
Without this patch and without the tick retained, i.e. in an upstream
kernel, it would take more than a second to get out of stop0_lite.

case 2 : With the tick retained in an upstream kernel -

Generally, we have a sched tick at 4ms (CONFIG_HZ = 250). Ideally I
expected it to take 8 sched ticks to get out of stop0_lite.
Experimentally, the observation was

=========================================
sample    min     max      99percentile
20        4ms     12ms     4ms
=========================================

It would take at least one sched tick to get out of stop0_lite.

case 3 : With this patch (not stopping the tick, but explicitly queuing
a timer)

=========================================
sample    min      max      99percentile
20        144us    192us    144us
=========================================

Description of current implementation
-

We calculate timeout for the current idle state as the residency value
of the next available idle state. If the decrementer is set to be
greater than this timeout, we update the decrementer value with the
residency of next available idle state. Thus, essentially training the
governor to select the next available deeper state until we reach the
deepest state. Hence, we won't get stuck unnecessarily in shallow states
for longer duration.
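
A condensed sketch of how these pieces fit together in the driver's
stop_loop(), using the helpers added by patch 1 (illustrative; the full
hunks are in the patches above):

	u64 timeout_tb = forced_wakeup_timeout(dev, drv, index);
	bool forced_wakeup = false;

	/* Reprogram the decrementer only if it would otherwise fire
	 * later than the next enabled state's residency.
	 */
	if (timeout_tb && try_set_dec_before_idle(timeout_tb))
		forced_wakeup = true;

	power9_idle_type(stop_psscr_table[index].val,
			 stop_psscr_table[index].mask);

	/* Restore the original deadline if we changed it and no irq
	 * work is pending.
	 */
	if (forced_wakeup)
		try_reset_dec_after_idle();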


v1 of auto-promotion : https://lkml.org/lkml/2019/3/22/58 This patch was
implemented only for shallow lite states in the generic cpuidle driver.

v2 : Removed timeout_needed and rebased to current
upstream kernel

Then,
v1 of forced-wakeup : Moved the code to cpuidle powernv driver and started
as forced wakeup instead of auto-promotion

v2 : Extended the forced wakeup logic for all states.
Setting the decrementer instead of queuing up a hrtimer to implement the
logic.

v3 : 1) Cleanly handle setting the decrementer after exiting out of stop
   states.
 2) Added a disable_callback feature to compute the timeout whenever a
state is enabled or disabled, instead of computing it every time in the
fast idle path.
 3) Use the disable callback to recompute the timeout whenever state
usage is changed for a state. Also, cleaned up the get_snooze_timeout
function.

v4 : Changed the type and name of the set/reset decrementer function.
Handled irq work pending in try_set_dec_before_idle.
No change in patch 2 and 3.

v5 : Removed forced wakeup for the last state. We don't want to wake up
unnecessarily when already in the deepest state. It was a mistake in the
previous patches that was found out in recent experiments.
No change in patch 2 and 3.

Abhishek Goel (3):
  cpuidle-powernv : forced wakeup for stop states
  cpuidle : Add callback whenever a state usage is enabled/disabled
  cpuidle-powernv : Recompute the idle-state timeouts when state usage
is enabled/disabled

 arch/powerpc/include/asm/time.h   |  2 ++
 arch/powerpc/kernel/time.c| 43 
 drivers/cpuidle/cpuidle-powernv.c | 55 +++
 drivers/cpuidle/sysfs.c   | 15 -
 include/linux/cpuidle.h   |  4 +++
 5 files changed, 105 insertions(+), 14 deletions(-)

-- 
2.17.1



[RFC 3/3] cpuidle/powernv : Add flags to identify stop state type

2019-08-23 Thread Abhishek Goel
Removed the threshold latency which was being used to decide if a state
is of cpuidle type or not. This decision can now be taken using flags,
as this information is encapsulated in state->flags and is read from
the idle device tree.

Signed-off-by: Abhishek Goel 
---
 arch/powerpc/include/asm/opal-api.h |  7 +++
 drivers/cpuidle/cpuidle-powernv.c   | 16 +---
 2 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/include/asm/opal-api.h 
b/arch/powerpc/include/asm/opal-api.h
index 383242eb0dea..b9068fee21d8 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -233,6 +233,13 @@
 #define OPAL_PM_STOP_INST_FAST 0x0010
 #define OPAL_PM_STOP_INST_DEEP 0x0020
 
+/*
+ * Flags for stop states to distinguish between cpuidle and
+ * cpuoffline type of states.
+ */
+#define OPAL_PM_STOP_CPUIDLE   0x0100
+#define OPAL_PM_STOP_CPUHOTPLUG0x0200
+
 /*
  * OPAL_CONFIG_CPU_IDLE_STATE parameters
  */
diff --git a/drivers/cpuidle/cpuidle-powernv.c 
b/drivers/cpuidle/cpuidle-powernv.c
index 1b6c84d4ac77..1a33a548b769 100644
--- a/drivers/cpuidle/cpuidle-powernv.c
+++ b/drivers/cpuidle/cpuidle-powernv.c
@@ -270,8 +270,13 @@ static int powernv_add_idle_states(void)
goto out;
}
 
-   /* TODO: Count only states which are eligible for cpuidle */
-   dt_idle_states = nr_pnv_idle_states;
+   /* Count only cpuidle states */
+   for (i = 0; i < nr_pnv_idle_states; i++) {
+   if (pnv_idle_states[i].flags & OPAL_PM_STOP_CPUIDLE)
+   dt_idle_states++;
+   }
+   pr_info("idle states in dt = %d , states with idle flag = %d",
+   nr_pnv_idle_states, dt_idle_states);
 
/*
 * Since snooze is used as first idle state, max idle states allowed is
@@ -300,11 +305,8 @@ static int powernv_add_idle_states(void)
continue;
}
 
-   /*
-* If an idle state has exit latency beyond
-* POWERNV_THRESHOLD_LATENCY_NS then don't use it in cpu-idle.
-*/
-   if (state->latency_ns > POWERNV_THRESHOLD_LATENCY_NS) {
+   /* Check whether a state is of cpuidle type */
+   if (!(state->flags & OPAL_PM_STOP_CPUIDLE)) {
pr_info("State %d is not idletype\n", i);
continue;
}
-- 
2.17.1



[RFC 2/3] cpuidle/powernv: Add support for versioned stop states

2019-08-23 Thread Abhishek Goel
The new device tree format adds a compatible string which has a version
corresponding to every state, so that only a kernel which has the
capability to handle that version of the stop state will enable it. The
drawback of the array-based dt node is that versioning of idle states is
not possible.

Older kernels will still see stop0 and stop0_lite in the older format,
and we will deprecate it after some time.

Consider a case where stop4 has a bug. We take the following steps to
mitigate the problem.

1) Change the compatible string for stop4 in OPAL to "stop4,v2" from
"stop4,v1", i.e. basically bump up the previous version, and ship the
new firmware.
2) The kernel will ignore stop4 as it won't be able to recognize this
new version. The kernel will also ignore all the deeper states, because
it is possible that a CPU has requested a deeper state but was never
able to enter it. But we will still have the shallower states that come
before stop4. This thus prevents completely disabling stop states.

The Linux kernel can now look at the version string and decide if it has
the ability to handle that idle state. Henceforth, if the kernel does
not know about a version, it will skip that state and all deeper states.

Once the workarounds are implemented in the kernel, we can bump up the
known version in the kernel for that state, so that support can be
enabled once again.
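
A sketch of the matching this enables, based on the stop_version_t
table introduced in the patch below (the entries here are illustrative,
not the actual table):

	static const struct stop_version_t known_versions[] = {
		{ "stop0_lite,v1", power9_stop_lite },
		{ "stop0,v1",      power9_stop_shallow },
		{ "stop4,v1",      power9_stop_shallow },
		{ "stop11,v1",     power9_stop_deep },
	};

A state's compatible string from the device tree is compared against
this table; on a match the state's stop_handle is bound, and on a miss
the state and every deeper state are skipped.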

New Device-tree :

Final output
   power-mgt {
...
 ibm,enabled-stop-levels = <0xec00>;
 ibm,cpu-idle-state-psscr-mask = <0x0 0x3003ff 0x0 0x3003ff>;
 ibm,cpu-idle-state-latencies-ns = <0x3e8 0x7d0>;
 ibm,cpu-idle-state-psscr = <0x0 0x330 0x0 0x300330>;
 ibm,cpu-idle-state-flags = <0x10 0x101000>;
 ibm,cpu-idle-state-residency-ns = <0x2710 0x4e20>;
 ibm,idle-states {
 stop4 {
 flags = <0x207000>;
 compatible = "stop4,v1",
 psscr-mask = <0x0 0x3003ff>;
 latency-ns = <0x186a0>;
 residency-ns = <0x989680>;
 psscr = <0x0 0x300374>;
 ...
  };
...
stop11 {
 ...
 compatible = "stop11,v1",
 ...
  };
 };

Since the state pointer is now being passed to the stop loop, I have
separated the idle fast path for lite, shallow and deep states. This
improves the readability of the code and also its future
maintainability. The stop handle corresponding to each state can be
called directly, since the state pointer is passed.

Signed-off-by: Abhishek Goel 
---
 arch/powerpc/include/asm/cpuidle.h|   8 +-
 arch/powerpc/platforms/powernv/idle.c | 331 +++---
 2 files changed, 299 insertions(+), 40 deletions(-)

diff --git a/arch/powerpc/include/asm/cpuidle.h 
b/arch/powerpc/include/asm/cpuidle.h
index 9844b3ded187..5eb9a98fcb86 100644
--- a/arch/powerpc/include/asm/cpuidle.h
+++ b/arch/powerpc/include/asm/cpuidle.h
@@ -70,15 +70,19 @@
 
 #ifndef __ASSEMBLY__
 
-#define PNV_IDLE_NAME_LEN  16
+#define PNV_VERSION_LEN    25
+#define PNV_IDLE_NAME_LEN  16
 struct pnv_idle_states_t {
char name[PNV_IDLE_NAME_LEN];
+   char compat_version[PNV_VERSION_LEN];
u32 latency_ns;
u32 residency_ns;
u64 psscr_val;
u64 psscr_mask;
-   u32 flags;
+   u64 flags;
bool valid;
+   unsigned long (*stop_handle)(struct pnv_idle_states_t *state,
+bool mmu_on);
 };
 
 extern struct pnv_idle_states_t *pnv_idle_states;
diff --git a/arch/powerpc/platforms/powernv/idle.c 
b/arch/powerpc/platforms/powernv/idle.c
index db810c0a16e1..7398a66e1ddb 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -49,6 +49,12 @@ static bool default_stop_found;
 static u64 pnv_first_tb_loss_level = MAX_STOP_STATE + 1;
 static u64 pnv_first_spr_loss_level = MAX_STOP_STATE + 1;
 
+struct stop_version_t {
+   const char compat_version[PNV_VERSION_LEN];
+   unsigned long (*stop_handle)(struct pnv_idle_states_t *state,
+bool mmu_on);
+};
+
 /*
  * psscr value and mask of the deepest stop idle state.
  * Used when a cpu is offlined.
@@ -599,40 +605,152 @@ struct p9_sprs {
u64 uamor;
 };
 
-static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on)
+/* Splitting the previous power9_idle_stop into three functions
+ * to separately handle lite, shallow and deep states.
+ */
+
+static unsigned long power9_stop_lite(struct pnv_idle_states_t *state,
+ bool mmu_on)
 {
-   int cpu = raw_smp_processor_id();
-   int first = cpu_first_thread_sib

[RFC 1/3] cpuidle/powernv : Pass state pointer instead of values to stop loop

2019-08-23 Thread Abhishek Goel
Instead of passing psscr_val and psscr_mask, we will pass the state
pointer to the stop loop. This will help figure out the method to
enter/exit the idle state. Removed psscr_mask and psscr_val from the
driver idle code; the same has also been done in the platform idle code.
Also, added some cleanups and debugging info.

Signed-off-by: Abhishek Goel 
---
 arch/powerpc/include/asm/processor.h  |  5 +-
 arch/powerpc/platforms/powernv/idle.c | 50 ---
 drivers/cpuidle/cpuidle-powernv.c | 69 +--
 3 files changed, 55 insertions(+), 69 deletions(-)

diff --git a/arch/powerpc/include/asm/processor.h 
b/arch/powerpc/include/asm/processor.h
index a9993e7a443b..855666e8d271 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -39,6 +39,7 @@
 #include 
 #include 
 #include 
+#include <asm/cpuidle.h>
 
 /* We do _not_ want to define new machine types at all, those must die
  * in favor of using the device-tree
@@ -419,9 +420,7 @@ enum idle_boot_override {IDLE_NO_OVERRIDE = 0, 
IDLE_POWERSAVE_OFF};
 extern int powersave_nap;  /* set if nap mode can be used in idle loop */
 
 extern void power7_idle_type(unsigned long type);
-extern void power9_idle_type(unsigned long stop_psscr_val,
- unsigned long stop_psscr_mask);
-
+extern void power9_idle_type(struct pnv_idle_states_t *state);
 extern void flush_instruction_cache(void);
 extern void hard_reset_now(void);
 extern void poweroff_now(void);
diff --git a/arch/powerpc/platforms/powernv/idle.c 
b/arch/powerpc/platforms/powernv/idle.c
index 09f49eed7fb8..db810c0a16e1 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -40,8 +40,7 @@ int nr_pnv_idle_states;
  * The default stop state that will be used by ppc_md.power_save
  * function on platforms that support stop instruction.
  */
-static u64 pnv_default_stop_val;
-static u64 pnv_default_stop_mask;
+struct pnv_idle_states_t *pnv_default_state;
 static bool default_stop_found;
 
 /*
@@ -54,9 +53,7 @@ static u64 pnv_first_spr_loss_level = MAX_STOP_STATE + 1;
  * psscr value and mask of the deepest stop idle state.
  * Used when a cpu is offlined.
  */
-static u64 pnv_deepest_stop_psscr_val;
-static u64 pnv_deepest_stop_psscr_mask;
-static u64 pnv_deepest_stop_flag;
+static struct pnv_idle_states_t *pnv_deepest_state;
 static bool deepest_stop_found;
 
 static unsigned long power7_offline_type;
@@ -78,7 +75,7 @@ static int pnv_save_sprs_for_deep_states(void)
uint64_t hid5_val   = mfspr(SPRN_HID5);
uint64_t hmeer_val  = mfspr(SPRN_HMEER);
uint64_t msr_val = MSR_IDLE;
-   uint64_t psscr_val = pnv_deepest_stop_psscr_val;
+   uint64_t psscr_val = pnv_deepest_state->psscr_val;
 
for_each_present_cpu(cpu) {
uint64_t pir = get_hard_smp_processor_id(cpu);
@@ -804,9 +801,13 @@ static unsigned long power9_idle_stop(unsigned long psscr, 
bool mmu_on)
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
-static unsigned long power9_offline_stop(unsigned long psscr)
+static unsigned long power9_offline_stop(struct pnv_idle_states_t *state)
 {
unsigned long srr1;
+   unsigned long psscr;
+
+   psscr = mfspr(SPRN_PSSCR);
+   psscr = (psscr & ~state->psscr_mask) | state->psscr_val;
 
 #ifndef CONFIG_KVM_BOOK3S_HV_POSSIBLE
__ppc64_runlatch_off();
@@ -841,8 +842,7 @@ static unsigned long power9_offline_stop(unsigned long 
psscr)
 }
 #endif
 
-void power9_idle_type(unsigned long stop_psscr_val,
- unsigned long stop_psscr_mask)
+void power9_idle_type(struct pnv_idle_states_t *state)
 {
unsigned long psscr;
unsigned long srr1;
@@ -851,7 +851,7 @@ void power9_idle_type(unsigned long stop_psscr_val,
return;
 
psscr = mfspr(SPRN_PSSCR);
-   psscr = (psscr & ~stop_psscr_mask) | stop_psscr_val;
+   psscr = (psscr & ~state->psscr_mask) | state->psscr_val;
 
__ppc64_runlatch_off();
srr1 = power9_idle_stop(psscr, true);
@@ -867,7 +867,7 @@ void power9_idle_type(unsigned long stop_psscr_val,
  */
 void power9_idle(void)
 {
-   power9_idle_type(pnv_default_stop_val, pnv_default_stop_mask);
+   power9_idle_type(pnv_default_state);
 }
 
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
@@ -970,12 +970,7 @@ unsigned long pnv_cpu_offline(unsigned int cpu)
__ppc64_runlatch_off();
 
if (cpu_has_feature(CPU_FTR_ARCH_300) && deepest_stop_found) {
-   unsigned long psscr;
-
-   psscr = mfspr(SPRN_PSSCR);
-   psscr = (psscr & ~pnv_deepest_stop_psscr_mask) |
-   pnv_deepest_stop_psscr_val;
-   srr1 = power9_offline_stop(psscr);
+   srr1 = power9_offline_stop(pnv_deepest_state);
} else if (cpu_has_feature(CPU_FTR_ARCH_206) && power7_offline_type) {
srr1 = power7_offline();
} el

[RFC 0/3] New idle device-tree format and support for versioned stop state

2019-08-23 Thread Abhishek Goel
Background
--

Previously, if an older kernel ran on newer firmware, it could enable
all available states irrespective of its capability to handle them.
Consider a case where some stop state has a bug: we end up disabling all
the stop states. This patch series introduces selective control to solve
this problem.

The previous version of these patches can be found at:
https://lkml.org/lkml/2018/10/11/544
That version, however, also had patches for support of OPAL
save-restore, which I am now decoupling and will take up separately.
I have posted the corresponding skiboot patches for this kernel patchset
here : https://patchwork.ozlabs.org/cover/1144587/

What's new?


Add stop states under ibm,idle-states in addition to the current array
based device tree properties.

The new device tree format adds a compatible string which has a version
corresponding to every state, so that only a kernel which has the
capability to handle that version of the stop state will enable it. The
drawback of the array-based dt node is that versioning of idle states is
not possible.

Older kernels will still see stop0 and stop0_lite in the older format,
and we will deprecate it after some time.

Consider a case where stop4 has a bug. We take the following steps to
mitigate the problem.

1) Change the compatible string for stop4 in OPAL to "stop4,v2" from
"stop4,v1", i.e. basically bump up the previous version, and ship the
new firmware.

2) The kernel will ignore stop4 as it won't be able to recognize this
new version. The kernel will also ignore all the deeper states, because
it is possible that a CPU has requested a deeper state but was never
able to enter it. But we will still have the shallower states that come
before stop4. This thus prevents completely disabling stop states.

The Linux kernel can now look at the version string and decide if it has
the ability to handle that idle state. Henceforth, if the kernel does
not know about a version, it will skip that state and all deeper states.

Once the workarounds are implemented in the kernel, we can bump up the
known version in the kernel for that state, so that support can be
enabled once again.

New Device-tree :

Final output
    power-mgt {
            ...
            ibm,enabled-stop-levels = <0xec00>;
            ibm,cpu-idle-state-psscr-mask = <0x0 0x3003ff 0x0 0x3003ff>;
            ibm,cpu-idle-state-latencies-ns = <0x3e8 0x7d0>;
            ibm,cpu-idle-state-psscr = <0x0 0x330 0x0 0x300330>;
            ibm,cpu-idle-state-flags = <0x10 0x101000>;
            ibm,cpu-idle-state-residency-ns = <0x2710 0x4e20>;
            ibm,idle-states {
                    stop4 {
                            flags = <0x207000>;
                            compatible = "stop4,v1";
                            psscr-mask = <0x0 0x3003ff>;
                            latency-ns = <0x186a0>;
                            residency-ns = <0x989680>;
                            psscr = <0x0 0x300374>;
                            ...
                    };
                    ...
                    stop11 {
                            ...
                            compatible = "stop11,v1";
                            ...
                    };
            };
    };
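To make the "skip unknown and deeper" rule concrete, below is a minimal
sketch of how a kernel could walk such an ibm,idle-states node. It is an
illustration, not part of the posted patches: the known_versions table
and the pnv_parse_versioned_states() helper are hypothetical names, and
the sketch assumes child nodes are listed shallow-to-deep, as in the
example above.

#include <linux/kernel.h>
#include <linux/of.h>
#include <linux/string.h>

/* Versions this kernel knows how to handle (illustrative values). */
static const char * const known_versions[] = {
	"stop4,v1",
	"stop11,v1",
};

static bool version_is_known(const char *compat)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(known_versions); i++)
		if (!strcmp(compat, known_versions[i]))
			return true;
	return false;
}

/* Returns the number of states enabled before the first unknown version. */
static int pnv_parse_versioned_states(struct device_node *power_mgt)
{
	struct device_node *dn, *child;
	const char *compat;
	int nr_states = 0;

	dn = of_get_child_by_name(power_mgt, "ibm,idle-states");
	if (!dn)
		return -ENODEV;

	for_each_child_of_node(dn, child) {
		/*
		 * An unreadable or unrecognized compatible string means
		 * this state, and every deeper state after it, is skipped.
		 */
		if (of_property_read_string(child, "compatible", &compat) ||
		    !version_is_known(compat)) {
			of_node_put(child);
			break;
		}
		nr_states++;
	}
	of_node_put(dn);
	return nr_states;
}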


Abhishek Goel (3):
  cpuidle/powernv : Pass state pointer instead of values to stop loop
  cpuidle/powernv: Add support for versioned stop states
  cpuidle/powernv : Add flags to identify stop state type

 arch/powerpc/include/asm/cpuidle.h|   8 +-
 arch/powerpc/include/asm/opal-api.h   |   7 +
 arch/powerpc/include/asm/processor.h  |   5 +-
 arch/powerpc/platforms/powernv/idle.c | 371 +-
 drivers/cpuidle/cpuidle-powernv.c |  81 +++---
 5 files changed, 363 insertions(+), 109 deletions(-)

-- 
2.17.1



[RFC v4 3/3] cpuidle-powernv : Recompute the idle-state timeouts when state usage is enabled/disabled

2019-07-12 Thread Abhishek Goel
The disable callback can be used to compute the timeout for other states
whenever a state is enabled or disabled. We store the computed timeout
in the "timeout" field added to the cpuidle state structure. Thus, we
compute the timeout only when some state is enabled or disabled, and not
every time in the fast idle path.
We also use the computed timeout to derive the timeout for snooze, thus
getting rid of get_snooze_timeout for the snooze loop.

Signed-off-by: Abhishek Goel 
---
 drivers/cpuidle/cpuidle-powernv.c | 35 +++
 include/linux/cpuidle.h   |  1 +
 2 files changed, 13 insertions(+), 23 deletions(-)

diff --git a/drivers/cpuidle/cpuidle-powernv.c 
b/drivers/cpuidle/cpuidle-powernv.c
index 17e20e408ffe..29add322d0c4 100644
--- a/drivers/cpuidle/cpuidle-powernv.c
+++ b/drivers/cpuidle/cpuidle-powernv.c
@@ -45,7 +45,6 @@ struct stop_psscr_table {
 static struct stop_psscr_table stop_psscr_table[CPUIDLE_STATE_MAX] 
__read_mostly;
 
 static u64 default_snooze_timeout __read_mostly;
-static bool snooze_timeout_en __read_mostly;
 
 static u64 forced_wakeup_timeout(struct cpuidle_device *dev,
 struct cpuidle_driver *drv,
@@ -67,26 +66,13 @@ static u64 forced_wakeup_timeout(struct cpuidle_device *dev,
return 0;
 }
 
-static u64 get_snooze_timeout(struct cpuidle_device *dev,
- struct cpuidle_driver *drv,
- int index)
+static void pnv_disable_callback(struct cpuidle_device *dev,
+struct cpuidle_driver *drv)
 {
int i;
 
-   if (unlikely(!snooze_timeout_en))
-   return default_snooze_timeout;
-
-   for (i = index + 1; i < drv->state_count; i++) {
-   struct cpuidle_state *s = &drv->states[i];
-   struct cpuidle_state_usage *su = &dev->states_usage[i];
-
-   if (s->disabled || su->disable)
-   continue;
-
-   return s->target_residency * tb_ticks_per_usec;
-   }
-
-   return default_snooze_timeout;
+   for (i = 0; i < drv->state_count; i++)
+   drv->states[i].timeout = forced_wakeup_timeout(dev, drv, i);
 }
 
 static int snooze_loop(struct cpuidle_device *dev,
@@ -94,16 +80,20 @@ static int snooze_loop(struct cpuidle_device *dev,
int index)
 {
u64 snooze_exit_time;
+   u64 snooze_timeout = drv->states[index].timeout;
+
+   if (!snooze_timeout)
+   snooze_timeout = default_snooze_timeout;
 
set_thread_flag(TIF_POLLING_NRFLAG);
 
local_irq_enable();
 
-   snooze_exit_time = get_tb() + get_snooze_timeout(dev, drv, index);
+   snooze_exit_time = get_tb() + snooze_timeout;
ppc64_runlatch_off();
HMT_very_low();
while (!need_resched()) {
-   if (likely(snooze_timeout_en) && get_tb() > snooze_exit_time) {
+   if (get_tb() > snooze_exit_time) {
/*
 * Task has not woken up but we are exiting the polling
 * loop anyway. Require a barrier after polling is
@@ -168,7 +158,7 @@ static int stop_loop(struct cpuidle_device *dev,
u64 timeout_tb;
bool forced_wakeup = false;
 
-   timeout_tb = forced_wakeup_timeout(dev, drv, index);
+   timeout_tb = drv->states[index].timeout;
 
/* Ensure that the timeout is at least one microsecond
 * greater than current decrement value. Else, we will
@@ -263,6 +253,7 @@ static int powernv_cpuidle_driver_init(void)
 */
 
drv->cpumask = (struct cpumask *)cpu_present_mask;
+   drv->disable_callback = pnv_disable_callback;
 
return 0;
 }
@@ -422,8 +413,6 @@ static int powernv_idle_probe(void)
/* Device tree can indicate more idle states */
max_idle_state = powernv_add_idle_states();
default_snooze_timeout = TICK_USEC * tb_ticks_per_usec;
-   if (max_idle_state > 1)
-   snooze_timeout_en = true;
} else
return -ENODEV;
 
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index 8a0e54bd0d5d..31662b657b9c 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -50,6 +50,7 @@ struct cpuidle_state {
int power_usage; /* in mW */
unsigned int    target_residency; /* in US */
bool            disabled; /* disabled on all CPUs */
+   unsigned long long timeout; /* timeout for exiting out of a state */
 
int (*enter)(struct cpuidle_device *dev,
struct cpuidle_driver *drv,
-- 
2.17.1



[RFC v4 2/3] cpuidle : Add callback whenever a state usage is enabled/disabled

2019-07-12 Thread Abhishek Goel
To force-wake a CPU we need a timeout, which so far had to be computed
in the fast idle path, since a state may be enabled or disabled at any
time and there was no feedback to the driver when that happened.
This patch adds a callback that is invoked whenever a state_usage
records a store to its disable attribute.

Signed-off-by: Abhishek Goel 
---
 drivers/cpuidle/sysfs.c | 15 ++-
 include/linux/cpuidle.h |  4 
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/drivers/cpuidle/sysfs.c b/drivers/cpuidle/sysfs.c
index eb20adb5de23..141671a53967 100644
--- a/drivers/cpuidle/sysfs.c
+++ b/drivers/cpuidle/sysfs.c
@@ -415,8 +415,21 @@ static ssize_t cpuidle_state_store(struct kobject *kobj, 
struct attribute *attr,
struct cpuidle_state_usage *state_usage = kobj_to_state_usage(kobj);
struct cpuidle_state_attr *cattr = attr_to_stateattr(attr);
 
-   if (cattr->store)
+   if (cattr->store) {
ret = cattr->store(state, state_usage, buf, size);
+   if (ret == size &&
+   !strncmp(cattr->attr.name, "disable",
+   strlen("disable"))) {
+   struct kobject *cpuidle_kobj = kobj->parent;
+   struct cpuidle_device *dev =
+   to_cpuidle_device(cpuidle_kobj);
+   struct cpuidle_driver *drv =
+   cpuidle_get_cpu_driver(dev);
+
+   if (drv->disable_callback)
+   drv->disable_callback(dev, drv);
+   }
+   }
 
return ret;
 }
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index bb9a0db89f1a..8a0e54bd0d5d 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -119,6 +119,10 @@ struct cpuidle_driver {
 
/* the driver handles the cpus in cpumask */
struct cpumask  *cpumask;
+
+   void (*disable_callback)(struct cpuidle_device *dev,
+   struct cpuidle_driver *drv);
+
 };
 
 #ifdef CONFIG_CPU_IDLE
-- 
2.17.1



[PATCH v4 1/3] cpuidle-powernv : forced wakeup for stop states

2019-07-12 Thread Abhishek Goel
Currently, the cpuidle governors determine what idle state an idling CPU
should enter based on heuristics that depend on the idle history of that
CPU. Given that no predictive heuristic is perfect, there are cases where
the governor predicts a shallow idle state, hoping that the CPU will be
busy soon. However, if no new workload is scheduled on that CPU in the
near future, the CPU may end up in the shallow state.

This is problematic when the predicted state in the aforementioned
scenario is a shallow stop state on a tickless system, as we might get
stuck in shallow states for hours in the absence of ticks or interrupts.

To address this, we forcefully wake up the CPU by setting the
decrementer. The decrementer is set to a value that corresponds to the
residency of the next available state, effectively arming a timer that
will forcefully wake up the CPU. A few such iterations will train the
governor to select a deeper state for that CPU, since the timer
corresponds to the residency of the next available cpuidle state. Thus,
the CPU will eventually end up in the deepest possible state.

Signed-off-by: Abhishek Goel 
---

Auto-promotion
  v1 : started as auto promotion logic for cpuidle states in generic
driver
  v2 : Removed timeout_needed and rebased the code to upstream kernel
Forced-wakeup
  v1 : New patch with name of forced wakeup started
  v2 : Extending the forced wakeup logic for all states. Setting the
decrementer instead of queuing up a hrtimer to implement the logic.
  v3 : Cleanly handle setting/resetting of decrementer so as to not break
irq work
  v4 : Changed the type and name of the set/reset decrementer functions
   Handled irq_work_pending in try_set_dec_before_idle

 arch/powerpc/include/asm/time.h   |  2 ++
 arch/powerpc/kernel/time.c| 43 +++
 drivers/cpuidle/cpuidle-powernv.c | 40 
 3 files changed, 85 insertions(+)

diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
index 54f4ec1f9fab..294a472ce161 100644
--- a/arch/powerpc/include/asm/time.h
+++ b/arch/powerpc/include/asm/time.h
@@ -188,6 +188,8 @@ static inline unsigned long tb_ticks_since(unsigned long 
tstamp)
 extern u64 mulhdu(u64, u64);
 #endif
 
+extern bool try_set_dec_before_idle(u64 timeout);
+extern void try_reset_dec_after_idle(void);
 extern void div128_by_32(u64 dividend_high, u64 dividend_low,
 unsigned divisor, struct div_result *dr);
 
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 694522308cd5..d004c0d8e099 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -576,6 +576,49 @@ void arch_irq_work_raise(void)
 
 #endif /* CONFIG_IRQ_WORK */
 
+/*
+ * This function tries setting the decrementer before entering idle.
+ * Returns true if we have reprogrammed the decrementer for idle.
+ * Returns false if the decrementer is unchanged.
+ */
+bool try_set_dec_before_idle(u64 timeout)
+{
+   u64 *next_tb = this_cpu_ptr(&decrementers_next_tb);
+   u64 now = get_tb_or_rtc();
+
+   if (now + timeout > *next_tb)
+   return false;
+
+   set_dec(timeout);
+   if (test_irq_work_pending())
+   set_dec(1);
+
+   return true;
+}
+
+/*
+ * This function gets called if we have set the decrementer before
+ * entering idle. It tries to reset/restore the decrementer
+ * to its original value.
+ */
+void try_reset_dec_after_idle(void)
+{
+   u64 now;
+   u64 *next_tb;
+
+   if (test_irq_work_pending())
+   return;
+
+   now = get_tb_or_rtc();
+   next_tb = this_cpu_ptr(&decrementers_next_tb);
+   if (now >= *next_tb)
+   return;
+
+   set_dec(*next_tb - now);
+   if (test_irq_work_pending())
+   set_dec(1);
+}
+
 /*
  * timer_interrupt - gets called when the decrementer overflows,
  * with interrupts disabled.
diff --git a/drivers/cpuidle/cpuidle-powernv.c 
b/drivers/cpuidle/cpuidle-powernv.c
index 84b1ebe212b3..17e20e408ffe 100644
--- a/drivers/cpuidle/cpuidle-powernv.c
+++ b/drivers/cpuidle/cpuidle-powernv.c
@@ -21,6 +21,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /*
  * Expose only those Hardware idle states via the cpuidle framework
@@ -46,6 +47,26 @@ static struct stop_psscr_table 
stop_psscr_table[CPUIDLE_STATE_MAX] __read_mostly
 static u64 default_snooze_timeout __read_mostly;
 static bool snooze_timeout_en __read_mostly;
 
+static u64 forced_wakeup_timeout(struct cpuidle_device *dev,
+struct cpuidle_driver *drv,
+int index)
+{
+   int i;
+
+   for (i = index + 1; i < drv->state_count; i++) {
+   struct cpuidle_state *s = &drv->states[i];
+   struct cpuidle_state_usage *su = &dev->states_usage[i];
+
+   if (s->disabled || su->disable)
+   continue;
+
+   return (s->target_residency + 2 * s->exit_latency) *
+  tb_ticks_per_usec;
+   }
+
+   return 0;
+}

[PATCH v4 0/3] Forced-wakeup for stop states on Powernv

2019-07-12 Thread Abhishek Goel
Currently, the cpuidle governors determine what idle state an idling CPU
should enter based on heuristics that depend on the idle history of that
CPU. Given that no predictive heuristic is perfect, there are cases where
the governor predicts a shallow idle state, hoping that the CPU will be
busy soon. However, if no new workload is scheduled on that CPU in the
near future, the CPU will end up in the shallow state.

Motivation
--
In case of POWER, this is problematic when the predicted state in the
aforementioned scenario is a shallow stop state on a tickless system, as
we might get stuck in shallow states even for hours, in the absence of
ticks or interrupts.

To address this, we forcefully wake up the CPU by setting the
decrementer. The decrementer is set to a value that corresponds to the
residency of the next available state, effectively arming a timer that
will forcefully wake up the CPU. A few such iterations will train the
governor to select a deeper state for that CPU, since the timer
corresponds to the residency of the next available cpuidle state. Thus,
the CPU will eventually end up in the deepest possible state and won't
get stuck in a shallow state for long durations.

Experiment
--
For earlier versions, when this feature was meant only for shallow lite
states, I performed experiments for three scenarios to collect some data.

case 1 :
Without this patch and without the tick retained, i.e. in an upstream
kernel, it could take more than a second to get out of stop0_lite.

case 2 : With the tick retained in an upstream kernel -

Generally, we have a sched tick every 4ms (CONFIG_HZ = 250). Ideally I
expected it to take 8 sched ticks to get out of stop0_lite.
Experimentally, the observation was:

=================================================
sample      min       max       99th percentile
20          4ms       12ms      4ms
=================================================

It takes at least one sched tick to get out of stop0_lite.

case 3 : With this patch (not stopping the tick, but explicitly queuing a
         timer)

=================================================
sample      min       max       99th percentile
20          144us     192us     144us
=================================================


Description of current implementation
--

We calculate the timeout for the current idle state as the residency
value of the next available idle state. If the decrementer is set to
fire later than this timeout, we update the decrementer value with the
residency of the next available idle state. This essentially trains the
governor to select the next available deeper state until we reach the
deepest state, so we won't get stuck unnecessarily in shallow states for
long durations.
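Condensed into code, the flow looks roughly like the sketch below. This
is a simplified illustration assembled from the patches in this series
(try_set_dec_before_idle()/try_reset_dec_after_idle() come from patch 1,
the precomputed per-state timeout from patches 2 and 3); it is not a
verbatim copy of any one diff, and it omits the corner-case handling the
real patches do.

static int stop_loop(struct cpuidle_device *dev,
		     struct cpuidle_driver *drv,
		     int index)
{
	/* Recomputed only when a state is enabled/disabled; 0 = no timeout. */
	u64 timeout_tb = drv->states[index].timeout;
	bool forced_wakeup = false;

	if (timeout_tb)
		/* Only shortens the decrementer, never lengthens it. */
		forced_wakeup = try_set_dec_before_idle(timeout_tb);

	power9_idle_type(stop_psscr_table[index].val,
			 stop_psscr_table[index].mask);

	if (forced_wakeup)
		/* Restore the decrementer to its original expiry. */
		try_reset_dec_after_idle();

	return index;
}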


v1 of auto-promotion : https://lkml.org/lkml/2019/3/22/58 This patch was
implemented only for shallow lite state in generic cpuidle driver.

v2 : Removed timeout_needed and rebased to current
upstream kernel

Then,
v1 of forced-wakeup : Moved the code to cpuidle powernv driver and started
as forced wakeup instead of auto-promotion

v2 : Extended the forced wakeup logic for all states.
Setting the decrementer instead of queuing up a hrtimer to implement the
logic.

v3 : 1) Cleanly handle setting the decrementer after exiting out of stop
        states.
     2) Added a disable_callback feature to compute the timeout whenever
a state is enabled or disabled, instead of computing it every time in
the fast idle path.
     3) Use the disable callback to recompute the timeout whenever state
usage is changed for a state. Also cleaned up the get_snooze_timeout
function.

v4 : Changed the type and name of the set/reset decrementer functions.
Handled irq work pending in try_set_dec_before_idle.
No change in patches 2 and 3.

Abhishek Goel (3):
  cpuidle-powernv : forced wakeup for stop states
  cpuidle : Add callback whenever a state usage is enabled/disabled
  cpuidle-powernv : Recompute the idle-state timeouts when state usage
is enabled/disabled

 arch/powerpc/include/asm/time.h   |  2 ++
 arch/powerpc/kernel/time.c| 43 
 drivers/cpuidle/cpuidle-powernv.c | 55 +++
 drivers/cpuidle/sysfs.c   | 15 -
 include/linux/cpuidle.h   |  5 +++
 5 files changed, 106 insertions(+), 14 deletions(-)

-- 
2.17.1



[RFC v3 3/3] cpuidle-powernv : Recompute the idle-state timeouts when state usage is enabled/disabled

2019-07-04 Thread Abhishek Goel
The disable callback can be used to compute the timeout for other states
whenever a state is enabled or disabled. We store the computed timeout
in the "timeout" field added to the cpuidle state structure. Thus, we
compute the timeout only when some state is enabled or disabled, and not
every time in the fast idle path.
We also use the computed timeout to derive the timeout for snooze, thus
getting rid of get_snooze_timeout for the snooze loop.

Signed-off-by: Abhishek Goel 
---
 drivers/cpuidle/cpuidle-powernv.c | 35 +++
 include/linux/cpuidle.h   |  1 +
 2 files changed, 13 insertions(+), 23 deletions(-)

diff --git a/drivers/cpuidle/cpuidle-powernv.c 
b/drivers/cpuidle/cpuidle-powernv.c
index f51478460..7350f404a 100644
--- a/drivers/cpuidle/cpuidle-powernv.c
+++ b/drivers/cpuidle/cpuidle-powernv.c
@@ -45,7 +45,6 @@ struct stop_psscr_table {
 static struct stop_psscr_table stop_psscr_table[CPUIDLE_STATE_MAX] 
__read_mostly;
 
 static u64 default_snooze_timeout __read_mostly;
-static bool snooze_timeout_en __read_mostly;
 
 static u64 forced_wakeup_timeout(struct cpuidle_device *dev,
 struct cpuidle_driver *drv,
@@ -67,26 +66,13 @@ static u64 forced_wakeup_timeout(struct cpuidle_device *dev,
return 0;
 }
 
-static u64 get_snooze_timeout(struct cpuidle_device *dev,
- struct cpuidle_driver *drv,
- int index)
+static void pnv_disable_callback(struct cpuidle_device *dev,
+struct cpuidle_driver *drv)
 {
int i;
 
-   if (unlikely(!snooze_timeout_en))
-   return default_snooze_timeout;
-
-   for (i = index + 1; i < drv->state_count; i++) {
-   struct cpuidle_state *s = &drv->states[i];
-   struct cpuidle_state_usage *su = &dev->states_usage[i];
-
-   if (s->disabled || su->disable)
-   continue;
-
-   return s->target_residency * tb_ticks_per_usec;
-   }
-
-   return default_snooze_timeout;
+   for (i = 0; i < drv->state_count; i++)
+   drv->states[i].timeout = forced_wakeup_timeout(dev, drv, i);
 }
 
 static int snooze_loop(struct cpuidle_device *dev,
@@ -94,16 +80,20 @@ static int snooze_loop(struct cpuidle_device *dev,
int index)
 {
u64 snooze_exit_time;
+   u64 snooze_timeout = drv->states[index].timeout;
+
+   if (!snooze_timeout)
+   snooze_timeout = default_snooze_timeout;
 
set_thread_flag(TIF_POLLING_NRFLAG);
 
local_irq_enable();
 
-   snooze_exit_time = get_tb() + get_snooze_timeout(dev, drv, index);
+   snooze_exit_time = get_tb() + snooze_timeout;
ppc64_runlatch_off();
HMT_very_low();
while (!need_resched()) {
-   if (likely(snooze_timeout_en) && get_tb() > snooze_exit_time) {
+   if (get_tb() > snooze_exit_time) {
/*
 * Task has not woken up but we are exiting the polling
 * loop anyway. Require a barrier after polling is
@@ -168,7 +158,7 @@ static int stop_loop(struct cpuidle_device *dev,
u64 timeout_tb;
int forced_wakeup = 0;
 
-   timeout_tb = forced_wakeup_timeout(dev, drv, index);
+   timeout_tb = drv->states[index].timeout;
if (timeout_tb)
forced_wakeup = set_dec_before_idle(timeout_tb);
 
@@ -255,6 +245,7 @@ static int powernv_cpuidle_driver_init(void)
 */
 
drv->cpumask = (struct cpumask *)cpu_present_mask;
+   drv->disable_callback = pnv_disable_callback;
 
return 0;
 }
@@ -414,8 +405,6 @@ static int powernv_idle_probe(void)
/* Device tree can indicate more idle states */
max_idle_state = powernv_add_idle_states();
default_snooze_timeout = TICK_USEC * tb_ticks_per_usec;
-   if (max_idle_state > 1)
-   snooze_timeout_en = true;
} else
return -ENODEV;
 
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index 8a0e54bd0..31662b657 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -50,6 +50,7 @@ struct cpuidle_state {
int power_usage; /* in mW */
unsigned int    target_residency; /* in US */
bool            disabled; /* disabled on all CPUs */
+   unsigned long long timeout; /* timeout for exiting out of a state */
 
int (*enter)(struct cpuidle_device *dev,
struct cpuidle_driver *drv,
-- 
2.17.1



[RFC v3 2/3] cpuidle : Add callback whenever a state usage is enabled/disabled

2019-07-04 Thread Abhishek Goel
To force-wake a CPU we need a timeout, which so far had to be computed
in the fast idle path, since a state may be enabled or disabled at any
time and there was no feedback to the driver when that happened.
This patch adds a callback that is invoked whenever a state_usage
records a store to its disable attribute.

Signed-off-by: Abhishek Goel 
---
 drivers/cpuidle/sysfs.c | 15 ++-
 include/linux/cpuidle.h |  4 
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/drivers/cpuidle/sysfs.c b/drivers/cpuidle/sysfs.c
index eb20adb5d..141671a53 100644
--- a/drivers/cpuidle/sysfs.c
+++ b/drivers/cpuidle/sysfs.c
@@ -415,8 +415,21 @@ static ssize_t cpuidle_state_store(struct kobject *kobj, 
struct attribute *attr,
struct cpuidle_state_usage *state_usage = kobj_to_state_usage(kobj);
struct cpuidle_state_attr *cattr = attr_to_stateattr(attr);
 
-   if (cattr->store)
+   if (cattr->store) {
ret = cattr->store(state, state_usage, buf, size);
+   if (ret == size &&
+   !strncmp(cattr->attr.name, "disable",
+   strlen("disable"))) {
+   struct kobject *cpuidle_kobj = kobj->parent;
+   struct cpuidle_device *dev =
+   to_cpuidle_device(cpuidle_kobj);
+   struct cpuidle_driver *drv =
+   cpuidle_get_cpu_driver(dev);
+
+   if (drv->disable_callback)
+   drv->disable_callback(dev, drv);
+   }
+   }
 
return ret;
 }
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index bb9a0db89..8a0e54bd0 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -119,6 +119,10 @@ struct cpuidle_driver {
 
/* the driver handles the cpus in cpumask */
struct cpumask  *cpumask;
+
+   void (*disable_callback)(struct cpuidle_device *dev,
+   struct cpuidle_driver *drv);
+
 };
 
 #ifdef CONFIG_CPU_IDLE
-- 
2.17.1



[PATCH v3 1/3] cpuidle-powernv : forced wakeup for stop states

2019-07-04 Thread Abhishek Goel
Currently, the cpuidle governors determine what idle state an idling CPU
should enter based on heuristics that depend on the idle history of that
CPU. Given that no predictive heuristic is perfect, there are cases where
the governor predicts a shallow idle state, hoping that the CPU will be
busy soon. However, if no new workload is scheduled on that CPU in the
near future, the CPU may end up in the shallow state.

This is problematic when the predicted state in the aforementioned
scenario is a shallow stop state on a tickless system, as we might get
stuck in shallow states for hours in the absence of ticks or interrupts.

To address this, we forcefully wake up the CPU by setting the
decrementer. The decrementer is set to a value that corresponds to the
residency of the next available state, effectively arming a timer that
will forcefully wake up the CPU. A few such iterations will train the
governor to select a deeper state for that CPU, since the timer
corresponds to the residency of the next available cpuidle state. Thus,
the CPU will eventually end up in the deepest possible state.

Signed-off-by: Abhishek Goel 
---

Auto-promotion
 v1 : started as auto promotion logic for cpuidle states in generic
driver
 v2 : Removed timeout_needed and rebased the code to upstream kernel
Forced-wakeup
 v1 : New patch with name of forced wakeup started
 v2 : Extending the forced wakeup logic for all states. Setting the
decrementer instead of queuing up a hrtimer to implement the logic.
 v3 : Cleanly handle setting/resetting of decrementer so as to not break
irq work 

 arch/powerpc/include/asm/time.h   |  2 ++
 arch/powerpc/kernel/time.c| 40 +++
 drivers/cpuidle/cpuidle-powernv.c | 32 +
 3 files changed, 74 insertions(+)

diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
index 54f4ec1f9..a3bd4f3c0 100644
--- a/arch/powerpc/include/asm/time.h
+++ b/arch/powerpc/include/asm/time.h
@@ -188,6 +188,8 @@ static inline unsigned long tb_ticks_since(unsigned long 
tstamp)
 extern u64 mulhdu(u64, u64);
 #endif
 
+extern int set_dec_before_idle(u64 timeout);
+extern void reset_dec_after_idle(void);
 extern void div128_by_32(u64 dividend_high, u64 dividend_low,
 unsigned divisor, struct div_result *dr);
 
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 694522308..814de3469 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -576,6 +576,46 @@ void arch_irq_work_raise(void)
 
 #endif /* CONFIG_IRQ_WORK */
 
+/*
+ * Returns 1 if we have reprogrammed the decrementer for idle.
+ * Returns 0 if the decrementer is unchanged.
+ */
+int set_dec_before_idle(u64 timeout)
+{
+   u64 *next_tb = this_cpu_ptr(&decrementers_next_tb);
+   u64 now = get_tb_or_rtc();
+
+   /*
+* Ensure that the timeout is at least one microsecond
+* before the current decrement value. Else, we will
+* unnecessarily wake up again within a microsecond.
+*/
+   if (now + timeout + 512 > *next_tb)
+   return 0;
+
+   set_dec(timeout);
+
+   return 1;
+}
+
+void reset_dec_after_idle(void)
+{
+   u64 now;
+   u64 *next_tb;
+
+   if (test_irq_work_pending())
+   return;
+
+   now = get_tb_or_rtc();
+   next_tb = this_cpu_ptr(&decrementers_next_tb);
+   if (now >= *next_tb)
+   return;
+
+   set_dec(*next_tb - now);
+   if (test_irq_work_pending())
+   set_dec(1);
+}
+
 /*
  * timer_interrupt - gets called when the decrementer overflows,
  * with interrupts disabled.
diff --git a/drivers/cpuidle/cpuidle-powernv.c 
b/drivers/cpuidle/cpuidle-powernv.c
index 84b1ebe21..f51478460 100644
--- a/drivers/cpuidle/cpuidle-powernv.c
+++ b/drivers/cpuidle/cpuidle-powernv.c
@@ -21,6 +21,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /*
  * Expose only those Hardware idle states via the cpuidle framework
@@ -46,6 +47,26 @@ static struct stop_psscr_table 
stop_psscr_table[CPUIDLE_STATE_MAX] __read_mostly
 static u64 default_snooze_timeout __read_mostly;
 static bool snooze_timeout_en __read_mostly;
 
+static u64 forced_wakeup_timeout(struct cpuidle_device *dev,
+struct cpuidle_driver *drv,
+int index)
+{
+   int i;
+
+   for (i = index + 1; i < drv->state_count; i++) {
+   struct cpuidle_state *s = &drv->states[i];
+   struct cpuidle_state_usage *su = &dev->states_usage[i];
+
+   if (s->disabled || su->disable)
+   continue;
+
+   return (s->target_residency + 2 * s->exit_latency) *
+   tb_ticks_per_usec;
+   }
+
+   return 0;
+}
+
 static u64 get_snooze_timeout(struct cpuidle_device *dev,
  struct cpuidle_driver *drv,
  int index)
@@ -144,8 +165,19 @@ static int stop_l

[PATCH v3 0/3] Forced-wakeup for stop states on Powernv

2019-07-04 Thread Abhishek Goel
Currently, the cpuidle governors determine what idle state an idling CPU
should enter based on heuristics that depend on the idle history of that
CPU. Given that no predictive heuristic is perfect, there are cases where
the governor predicts a shallow idle state, hoping that the CPU will be
busy soon. However, if no new workload is scheduled on that CPU in the
near future, the CPU will end up in the shallow state.

Motivation
--
In case of POWER, this is problematic when the predicted state in the
aforementioned scenario is a shallow stop state on a tickless system, as
we might get stuck in shallow states even for hours, in the absence of
ticks or interrupts.

To address this, we forcefully wake up the CPU by setting the
decrementer. The decrementer is set to a value that corresponds to the
residency of the next available state, effectively arming a timer that
will forcefully wake up the CPU. A few such iterations will train the
governor to select a deeper state for that CPU, since the timer
corresponds to the residency of the next available cpuidle state. Thus,
the CPU will eventually end up in the deepest possible state and won't
get stuck in a shallow state for long durations.

Experiment
--
For earlier versions, when this feature was meant only for shallow lite
states, I performed experiments for three scenarios to collect some data.

case 1 :
Without this patch and without the tick retained, i.e. in an upstream
kernel, it could take more than a second to get out of stop0_lite.

case 2 : With the tick retained in an upstream kernel -

Generally, we have a sched tick every 4ms (CONFIG_HZ = 250). Ideally I
expected it to take 8 sched ticks to get out of stop0_lite.
Experimentally, the observation was:

=================================================
sample      min       max       99th percentile
20          4ms       12ms      4ms
=================================================

It takes at least one sched tick to get out of stop0_lite.

case 3 : With this patch (not stopping the tick, but explicitly queuing a
         timer)

=================================================
sample      min       max       99th percentile
20          144us     192us     144us
=================================================


Description of current implementation
--

We calculate the timeout for the current idle state as the residency
value of the next available idle state. If the decrementer is set to
fire later than this timeout, we update the decrementer value with the
residency of the next available idle state. This essentially trains the
governor to select the next available deeper state until we reach the
deepest state, so we won't get stuck unnecessarily in shallow states for
long durations.


v1 of auto-promotion : https://lkml.org/lkml/2019/3/22/58 This patch was
implemented only for shallow lite state in generic cpuidle driver.

v2 : Removed timeout_needed and rebased to current
upstream kernel

Then,
v1 of forced-wakeup : Moved the code to cpuidle powernv driver and started
as forced wakeup instead of auto-promotion

v2 : Extended the forced wakeup logic for all states.
Setting the decrementer instead of queuing up a hrtimer to implement the
logic.

v3 : 1) Cleanly handle setting the decrementer after exiting out of stop
        states.
     2) Added a disable_callback feature to compute the timeout whenever
a state is enabled or disabled, instead of computing it every time in
the fast idle path.
     3) Use the disable callback to recompute the timeout whenever state
usage is changed for a state. Also cleaned up the get_snooze_timeout
function.


Abhishek Goel (3):
  cpuidle-powernv : forced wakeup for stop states
  cpuidle : Add callback whenever a state usage is enabled/disabled
  cpuidle-powernv : Recompute the idle-state timeouts when state usage
is enabled/disabled

 arch/powerpc/include/asm/time.h   |  2 ++
 arch/powerpc/kernel/time.c| 40 ++
 drivers/cpuidle/cpuidle-powernv.c | 47 ++-
 drivers/cpuidle/sysfs.c   | 15 +-
 include/linux/cpuidle.h   |  5 
 5 files changed, 95 insertions(+), 14 deletions(-)

-- 
2.17.1



[PATCH v2 1/1] cpuidle-powernv : forced wakeup for stop states

2019-06-17 Thread Abhishek Goel
Currently, the cpuidle governors determine what idle state an idling CPU
should enter based on heuristics that depend on the idle history of that
CPU. Given that no predictive heuristic is perfect, there are cases where
the governor predicts a shallow idle state, hoping that the CPU will be
busy soon. However, if no new workload is scheduled on that CPU in the
near future, the CPU may end up in the shallow state.

This is problematic when the predicted state in the aforementioned
scenario is a shallow stop state on a tickless system, as we might get
stuck in shallow states for hours in the absence of ticks or interrupts.

To address this, we forcefully wake up the CPU by setting the
decrementer. The decrementer is set to a value that corresponds to the
residency of the next available state, effectively arming a timer that
will forcefully wake up the CPU. A few such iterations will train the
governor to select a deeper state for that CPU, since the timer
corresponds to the residency of the next available cpuidle state. Thus,
the CPU will eventually end up in the deepest possible state.

Signed-off-by: Abhishek Goel 
---

Auto-promotion
 v1 : started as auto promotion logic for cpuidle states in generic
driver
 v2 : Removed timeout_needed and rebased the code to upstream kernel
Forced-wakeup
 v1 : New patch with name of forced wakeup started
 v2 : Extending the forced wakeup logic for all states. Setting the
decrementer instead of queuing up a hrtimer to implement the logic.

 drivers/cpuidle/cpuidle-powernv.c | 38 +++
 1 file changed, 38 insertions(+)

diff --git a/drivers/cpuidle/cpuidle-powernv.c 
b/drivers/cpuidle/cpuidle-powernv.c
index 84b1ebe212b3..bc9ca18ae7e3 100644
--- a/drivers/cpuidle/cpuidle-powernv.c
+++ b/drivers/cpuidle/cpuidle-powernv.c
@@ -46,6 +46,26 @@ static struct stop_psscr_table 
stop_psscr_table[CPUIDLE_STATE_MAX] __read_mostly
 static u64 default_snooze_timeout __read_mostly;
 static bool snooze_timeout_en __read_mostly;
 
+static u64 forced_wakeup_timeout(struct cpuidle_device *dev,
+struct cpuidle_driver *drv,
+int index)
+{
+   int i;
+
+   for (i = index + 1; i < drv->state_count; i++) {
+   struct cpuidle_state *s = &drv->states[i];
+   struct cpuidle_state_usage *su = &dev->states_usage[i];
+
+   if (s->disabled || su->disable)
+   continue;
+
+   return (s->target_residency + 2 * s->exit_latency) *
+   tb_ticks_per_usec;
+   }
+
+   return 0;
+}
+
 static u64 get_snooze_timeout(struct cpuidle_device *dev,
  struct cpuidle_driver *drv,
  int index)
@@ -144,8 +164,26 @@ static int stop_loop(struct cpuidle_device *dev,
 struct cpuidle_driver *drv,
 int index)
 {
+   u64 dec_expiry_tb, dec, timeout_tb, forced_wakeup;
+
+   dec = mfspr(SPRN_DEC);
+   timeout_tb = forced_wakeup_timeout(dev, drv, index);
+   forced_wakeup = 0;
+
+   if (timeout_tb && timeout_tb < dec) {
+   forced_wakeup = 1;
+   dec_expiry_tb = mftb() + dec;
+   }
+
+   if (forced_wakeup)
+   mtspr(SPRN_DEC, timeout_tb);
+
power9_idle_type(stop_psscr_table[index].val,
 stop_psscr_table[index].mask);
+
+   if (forced_wakeup)
+   mtspr(SPRN_DEC, dec_expiry_tb - mftb());
+
return index;
 }
 
-- 
2.17.1



[PATCH v2 0/1] Forced-wakeup for stop states on Powernv

2019-06-17 Thread Abhishek Goel
Currently, the cpuidle governors determine what idle state an idling CPU
should enter based on heuristics that depend on the idle history of that
CPU. Given that no predictive heuristic is perfect, there are cases where
the governor predicts a shallow idle state, hoping that the CPU will be
busy soon. However, if no new workload is scheduled on that CPU in the
near future, the CPU will end up in the shallow state.

Motivation
--
In case of POWER, this is problematic when the predicted state in the
aforementioned scenario is a shallow stop state on a tickless system, as
we might get stuck in shallow states even for hours, in the absence of
ticks or interrupts.

To address this, we forcefully wake up the CPU by setting the
decrementer. The decrementer is set to a value that corresponds to the
residency of the next available state, effectively arming a timer that
will forcefully wake up the CPU. A few such iterations will train the
governor to select a deeper state for that CPU, since the timer
corresponds to the residency of the next available cpuidle state. Thus,
the CPU will eventually end up in the deepest possible state and won't
get stuck in a shallow state for long durations.

Experiment
--
For earlier versions, when this feature was meant only for shallow lite
states, I performed experiments for three scenarios to collect some data.

case 1 :
Without this patch and without the tick retained, i.e. in an upstream
kernel, it could take more than a second to get out of stop0_lite.

case 2 : With the tick retained in an upstream kernel -

Generally, we have a sched tick every 4ms (CONFIG_HZ = 250). Ideally I
expected it to take 8 sched ticks to get out of stop0_lite.
Experimentally, the observation was:

=================================================
sample      min       max       99th percentile
20          4ms       12ms      4ms
=================================================

It takes at least one sched tick to get out of stop0_lite.

case 3 : With this patch (not stopping the tick, but explicitly queuing a
         timer)

=================================================
sample      min       max       99th percentile
20          144us     192us     144us
=================================================


Description of current implementation
--

We calculate the timeout for the current idle state as the residency
value of the next available idle state. If the decrementer is set to
fire later than this timeout, we update the decrementer value with the
residency of the next available idle state. This essentially trains the
governor to select the next available deeper state until we reach the
deepest state, so we won't get stuck unnecessarily in shallow states for
long durations.


v1 of auto-promotion : https://lkml.org/lkml/2019/3/22/58 This patch was
implemented only for shallow lite state in generic cpuidle driver.

v2 of auto-promotion : Removed timeout_needed and rebased to current
upstream kernel

Then, 
v1 of forced-wakeup : Moved the code to cpuidle powernv driver and started
as forced wakeup instead of auto-promotion

v2 of forced-wakeup : Extended the forced wakeup logic for all states.
Setting the decrementer instead of queuing up a hrtimer to implement the
logic.

Abhishek Goel (1):
  cpuidle-powernv : forced wakeup for stop states

 drivers/cpuidle/cpuidle-powernv.c | 38 +++
 1 file changed, 38 insertions(+)

-- 
2.17.1



[PATCH 1/1] cpuidle-powernv : forced wakeup for stop lite states

2019-04-22 Thread Abhishek Goel
Currently, the cpuidle governors determine what idle state an idling CPU
should enter based on heuristics that depend on the idle history of that
CPU. Given that no predictive heuristic is perfect, there are cases where
the governor predicts a shallow idle state, hoping that the CPU will be
busy soon. However, if no new workload is scheduled on that CPU in the
near future, the CPU will end up in the shallow state.

In case of POWER, this is problematic when the predicted state in the
aforementioned scenario is a lite stop state, as such lite states
inhibit SMT folding, thereby depriving the other threads in the core of
core resources.

So we do not want to get stuck in such states for long durations. To
address this, the cpuidle core can queue a timer corresponding to the
residency value of the next available state. This timer will forcefully
wake up the CPU. A few such iterations will train the governor to select
a deeper state for that CPU, since the timer corresponds to the
residency of the next available cpuidle state. The CPU will be kicked
out of the lite state and end up in a non-lite state.

Signed-off-by: Abhishek Goel 
---
 arch/powerpc/include/asm/opal-api.h |  1 +
 drivers/cpuidle/cpuidle-powernv.c   | 71 -
 2 files changed, 71 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/opal-api.h 
b/arch/powerpc/include/asm/opal-api.h
index 870fb7b23..735dec731 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -226,6 +226,7 @@
  */
 
 #define OPAL_PM_TIMEBASE_STOP  0x0002
+#define OPAL_PM_LOSE_USER_CONTEXT  0x1000
 #define OPAL_PM_LOSE_HYP_CONTEXT   0x2000
 #define OPAL_PM_LOSE_FULL_CONTEXT  0x4000
 #define OPAL_PM_NAP_ENABLED0x0001
diff --git a/drivers/cpuidle/cpuidle-powernv.c 
b/drivers/cpuidle/cpuidle-powernv.c
index 84b1ebe21..30b877962 100644
--- a/drivers/cpuidle/cpuidle-powernv.c
+++ b/drivers/cpuidle/cpuidle-powernv.c
@@ -15,6 +15,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -43,6 +44,40 @@ struct stop_psscr_table {
 
 static struct stop_psscr_table stop_psscr_table[CPUIDLE_STATE_MAX] 
__read_mostly;
 
+DEFINE_PER_CPU(struct hrtimer, forced_wakeup_timer);
+
+static int forced_wakeup_time_compute(struct cpuidle_device *dev,
+ struct cpuidle_driver *drv,
+ int index)
+{
+   int i, timeout_us = 0;
+
+   for (i = index + 1; i < drv->state_count; i++) {
+   if (drv->states[i].disabled || dev->states_usage[i].disable)
+   continue;
+   timeout_us = drv->states[i].target_residency +
+   2 * drv->states[i].exit_latency;
+   break;
+   }
+
+   return timeout_us;
+}
+
+enum hrtimer_restart forced_wakeup_hrtimer_callback(struct hrtimer *hrtimer)
+{
+   return HRTIMER_NORESTART;
+}
+
+static void forced_wakeup_timer_init(int cpu, struct cpuidle_driver *drv)
+{
+   struct hrtimer *cpu_forced_wakeup_timer = &per_cpu(forced_wakeup_timer,
+   cpu);
+
+   hrtimer_init(cpu_forced_wakeup_timer, CLOCK_MONOTONIC,
+   HRTIMER_MODE_REL);
+   cpu_forced_wakeup_timer->function = forced_wakeup_hrtimer_callback;
+}
+
 static u64 default_snooze_timeout __read_mostly;
 static bool snooze_timeout_en __read_mostly;
 
@@ -103,6 +138,28 @@ static int snooze_loop(struct cpuidle_device *dev,
return index;
 }
 
+static int stop_lite_loop(struct cpuidle_device *dev,
+ struct cpuidle_driver *drv,
+ int index)
+{
+   int timeout_us;
+   struct hrtimer *this_timer = &per_cpu(forced_wakeup_timer, dev->cpu);
+
+   timeout_us = forced_wakeup_time_compute(dev, drv, index);
+
+   if (timeout_us > 0)
+   hrtimer_start(this_timer, ns_to_ktime(timeout_us * 1000),
+   HRTIMER_MODE_REL_PINNED);
+
+   power9_idle_type(stop_psscr_table[index].val,
+stop_psscr_table[index].mask);
+
+   if (unlikely(hrtimer_is_queued(this_timer)))
+   hrtimer_cancel(this_timer);
+
+   return index;
+}
+
 static int nap_loop(struct cpuidle_device *dev,
struct cpuidle_driver *drv,
int index)
@@ -190,7 +247,7 @@ static int powernv_cpuidle_cpu_dead(unsigned int cpu)
  */
 static int powernv_cpuidle_driver_init(void)
 {
-   int idle_state;
+   int idle_state, cpu;
struct cpuidle_driver *drv = _idle_driver;
 
drv->state_count = 0;
@@ -224,6 +281,9 @@ static int powernv_cpuidle_driver_init(void)
 
drv->cpumask = (struct cpumask *)cpu_present_mask;
 
+   for_each_cpu(cpu, drv->cpumask)
+   forced_wakeup_timer_init(cpu, drv);

[PATCH 0/1] Forced-wakeup for stop lite states on Powernv

2019-04-22 Thread Abhishek Goel
Currently, the cpuidle governors determine what idle state an idling CPU
should enter based on heuristics that depend on the idle history of that
CPU. Given that no predictive heuristic is perfect, there are cases where
the governor predicts a shallow idle state, hoping that the CPU will be
busy soon. However, if no new workload is scheduled on that CPU in the
near future, the CPU will end up in the shallow state.

Motivation
--
In case of POWER, this is problematic when the predicted state in the
aforementioned scenario is a lite stop state, as such lite states
inhibit SMT folding, thereby depriving the other threads in the core of
core resources.

So we do not want to get stuck in such states for long durations. To
address this, the cpuidle core can queue a timer corresponding to the
residency value of the next available state. This timer will forcefully
wake up the CPU. A few such iterations will train the governor to select
a deeper state for that CPU, since the timer corresponds to the
residency of the next available cpuidle state. The CPU will be kicked
out of the lite state and end up in a non-lite state.

Experiment
--
I performed experiments for three scenarios to collect some data.

case 1 :
Without this patch and without the tick retained, i.e. in an upstream
kernel, it could take more than a second to get out of stop0_lite.

case 2 : With the tick retained in an upstream kernel -

Generally, we have a sched tick every 4ms (CONFIG_HZ = 250). Ideally I
expected it to take 8 sched ticks to get out of stop0_lite.
Experimentally, the observation was:

=================================================
sample      min       max       99th percentile
20          4ms       12ms      4ms
=================================================

It takes at least one sched tick to get out of stop0_lite.

case 3 : With this patch (not stopping the tick, but explicitly queuing a
         timer)

=================================================
sample      min       max       99th percentile
20          144us     192us     144us
=================================================


In this patch, we queue a timer just before entering stop0_lite. The
timer fires after (residency of the next available state + 2 * exit
latency of the next available state). Say the next state (stop0) is
available with a residency of 20us: we should get out in as little as
(20 + 2*2) * 8 = 192us, based on the formula
(residency + 2 x latency) * history length. Ideally we would expect 8
iterations; it was observed to get out in 6-7 iterations. Even if, say,
stop2 is the next available state (stop0 and stop1 both unavailable), it
would take (100 + 2*10) * 8 = 960us to get into stop2.

So we are able to get out of stop0_lite in roughly 150us (with this
patch) as compared to 4ms (with the tick retained). As stated earlier,
we do not want to get stuck in stop0_lite, as it inhibits SMT folding
for the sibling threads, depriving them of core resources. The current
patch uses forced wakeup only for stop0_lite, as it gives a performance
benefit (the primary reason) along with lowering power consumption. We
may extend this model to other states in the future.
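As a sanity check on the arithmetic above, the short standalone C
program below reproduces the expected training times. It is illustrative
only: the residency/latency numbers and the history length of 8 are the
example values quoted in this letter, not values read from firmware.

#include <stdio.h>

int main(void)
{
	/* Governor history length assumed in the example above. */
	const unsigned int history_len = 8;

	/* Timer period = residency + 2 * exit latency, in microseconds. */
	const unsigned int stop0_period = 20 + 2 * 2;	/* 24us  */
	const unsigned int stop2_period = 100 + 2 * 10;	/* 120us */

	printf("train into stop0: %u us\n", stop0_period * history_len); /* 192 */
	printf("train into stop2: %u us\n", stop2_period * history_len); /* 960 */
	return 0;
}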

Abhishek Goel (1):
  cpuidle-powernv : forced wakeup for stop lite states

 arch/powerpc/include/asm/opal-api.h |  1 +
 drivers/cpuidle/cpuidle-powernv.c   | 71 -
 2 files changed, 71 insertions(+), 1 deletion(-)

-- 
2.17.1



[PATCH v2 2/2] cpuidle : Add auto-promotion flag to cpuidle flags

2019-04-05 Thread Abhishek Goel
This patch sets up a flag for the states which need to be auto-promoted.
On POWERNV systems, only lite states need to be auto-promoted. We
identify lite states as those which do not lose user context; that
information is used to set the flag for lite states.

Signed-off-by: Abhishek Goel 
---
 arch/powerpc/include/asm/opal-api.h |  1 +
 drivers/cpuidle/Kconfig |  4 
 drivers/cpuidle/cpuidle-powernv.c   | 13 +++--
 3 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/opal-api.h 
b/arch/powerpc/include/asm/opal-api.h
index 870fb7b23..735dec731 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -226,6 +226,7 @@
  */
 
 #define OPAL_PM_TIMEBASE_STOP  0x0002
+#define OPAL_PM_LOSE_USER_CONTEXT  0x1000
 #define OPAL_PM_LOSE_HYP_CONTEXT   0x2000
 #define OPAL_PM_LOSE_FULL_CONTEXT  0x4000
 #define OPAL_PM_NAP_ENABLED0x0001
diff --git a/drivers/cpuidle/Kconfig b/drivers/cpuidle/Kconfig
index 8caccbbd7..9b8e9b96a 100644
--- a/drivers/cpuidle/Kconfig
+++ b/drivers/cpuidle/Kconfig
@@ -35,6 +35,10 @@ config CPU_IDLE_GOV_TEO
 config DT_IDLE_STATES
bool
 
+config CPU_IDLE_AUTO_PROMOTION
+   bool
+   default y if PPC_POWERNV
+
 menu "ARM CPU Idle Drivers"
 depends on ARM || ARM64
 source "drivers/cpuidle/Kconfig.arm"
diff --git a/drivers/cpuidle/cpuidle-powernv.c 
b/drivers/cpuidle/cpuidle-powernv.c
index 84b1ebe21..0dd767270 100644
--- a/drivers/cpuidle/cpuidle-powernv.c
+++ b/drivers/cpuidle/cpuidle-powernv.c
@@ -299,6 +299,7 @@ static int powernv_add_idle_states(void)
for (i = 0; i < dt_idle_states; i++) {
unsigned int exit_latency, target_residency;
bool stops_timebase = false;
+   bool lose_user_context = false;
struct pnv_idle_states_t *state = _idle_states[i];
 
/*
@@ -324,6 +325,9 @@ static int powernv_add_idle_states(void)
if (has_stop_states && !(state->valid))
continue;
 
+   if (state->flags & OPAL_PM_LOSE_USER_CONTEXT)
+   lose_user_context = true;
+
if (state->flags & OPAL_PM_TIMEBASE_STOP)
stops_timebase = true;
 
@@ -332,12 +336,17 @@ static int powernv_add_idle_states(void)
add_powernv_state(nr_idle_states, "Nap",
  CPUIDLE_FLAG_NONE, nap_loop,
  target_residency, exit_latency, 0, 0);
+   } else if (has_stop_states && !lose_user_context) {
+   add_powernv_state(nr_idle_states, state->name,
+ CPUIDLE_FLAG_AUTO_PROMOTION,
+ stop_loop, target_residency,
+ exit_latency, state->psscr_val,
+ state->psscr_mask);
} else if (has_stop_states && !stops_timebase) {
add_powernv_state(nr_idle_states, state->name,
  CPUIDLE_FLAG_NONE, stop_loop,
  target_residency, exit_latency,
- state->psscr_val,
- state->psscr_mask);
+ state->psscr_val, state->psscr_mask);
}
 
/*
-- 
2.17.1



[PATCH v2 1/2] cpuidle : auto-promotion for cpuidle states

2019-04-05 Thread Abhishek Goel
Currently, the cpuidle governors (menu/ladder) determine what idle state
an idling CPU should enter based on heuristics that depend on the idle
history of that CPU. Given that no predictive heuristic is perfect,
there are cases where the governor predicts a shallow idle state, hoping
that the CPU will be busy soon. However, if no new workload is scheduled
on that CPU in the near future, the CPU will end up in the shallow state.

In case of POWER, this is problematic when the predicted state in the
aforementioned scenario is a lite stop state, as such lite states
inhibit SMT folding, thereby depriving the other threads in the core of
core resources.

To address this, such lite states need to be auto-promoted. The cpuidle
core can queue a timer corresponding to the residency value of the next
available state, leading to auto-promotion to a deeper idle state as
soon as possible.

Signed-off-by: Abhishek Goel 
---

v1->v2 : Removed timeout_needed and rebased to current upstream kernel

 drivers/cpuidle/cpuidle.c  | 68 +-
 drivers/cpuidle/governors/ladder.c |  3 +-
 drivers/cpuidle/governors/menu.c   | 22 +-
 include/linux/cpuidle.h| 10 -
 4 files changed, 99 insertions(+), 4 deletions(-)

diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index 7f108309e..11ce43f19 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -36,6 +36,11 @@ static int enabled_devices;
 static int off __read_mostly;
 static int initialized __read_mostly;
 
+struct auto_promotion {
+   struct hrtimer  hrtimer;
+   unsigned long   timeout_us;
+};
+
 int cpuidle_disabled(void)
 {
return off;
@@ -188,6 +193,54 @@ int cpuidle_enter_s2idle(struct cpuidle_driver *drv, 
struct cpuidle_device *dev)
 }
 #endif /* CONFIG_SUSPEND */
 
+enum hrtimer_restart auto_promotion_hrtimer_callback(struct hrtimer *hrtimer)
+{
+   return HRTIMER_NORESTART;
+}
+
+#ifdef CONFIG_CPU_IDLE_AUTO_PROMOTION
+DEFINE_PER_CPU(struct auto_promotion, ap);
+
+static void cpuidle_auto_promotion_start(int cpu, struct cpuidle_state *state)
+{
+   struct auto_promotion *this_ap = &per_cpu(ap, cpu);
+
+   if (state->flags & CPUIDLE_FLAG_AUTO_PROMOTION)
+   hrtimer_start(&this_ap->hrtimer, ns_to_ktime(this_ap->timeout_us
+   * 1000), HRTIMER_MODE_REL_PINNED);
+}
+
+static void cpuidle_auto_promotion_cancel(int cpu)
+{
+   struct hrtimer *hrtimer;
+
+   hrtimer = &per_cpu(ap, cpu).hrtimer;
+   if (hrtimer_is_queued(hrtimer))
+   hrtimer_cancel(hrtimer);
+}
+
+static void cpuidle_auto_promotion_update(int cpu, unsigned long timeout)
+{
+   per_cpu(ap, cpu).timeout_us = timeout;
+}
+
+static void cpuidle_auto_promotion_init(int cpu, struct cpuidle_driver *drv)
+{
+   struct auto_promotion *this_ap = &per_cpu(ap, cpu);
+
+   hrtimer_init(&this_ap->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+   this_ap->hrtimer.function = auto_promotion_hrtimer_callback;
+}
+#else
+static inline void cpuidle_auto_promotion_start(int cpu, struct cpuidle_state
+   *state) { }
+static inline void cpuidle_auto_promotion_cancel(int cpu) { }
+static inline void cpuidle_auto_promotion_update(int cpu, unsigned long
+   timeout) { }
+static inline void cpuidle_auto_promotion_init(int cpu, struct cpuidle_driver
+   *drv) { }
+#endif
+
 /**
  * cpuidle_enter_state - enter the state and update stats
  * @dev: cpuidle device for this cpu
@@ -225,12 +278,17 @@ int cpuidle_enter_state(struct cpuidle_device *dev, 
struct cpuidle_driver *drv,
trace_cpu_idle_rcuidle(index, dev->cpu);
time_start = ns_to_ktime(local_clock());
 
+   cpuidle_auto_promotion_start(dev->cpu, target_state);
+
stop_critical_timings();
entered_state = target_state->enter(dev, drv, index);
start_critical_timings();
 
sched_clock_idle_wakeup_event();
time_end = ns_to_ktime(local_clock());
+
+   cpuidle_auto_promotion_cancel(dev->cpu);
+
trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu);
 
/* The cpu is no longer idle or about to enter idle. */
@@ -312,7 +370,13 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct 
cpuidle_driver *drv,
 int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
   bool *stop_tick)
 {
-   return cpuidle_curr_governor->select(drv, dev, stop_tick);
+   unsigned long timeout_us, ret;
+
+   timeout_us = UINT_MAX;
+   ret = cpuidle_curr_governor->select(drv, dev, stop_tick, &timeout_us);
+   cpuidle_auto_promotion_update(dev->cpu, timeout_us);
+
+   return ret;
 }
 
 /**
@@ -658,6 +722,8 @@ int cpuidle_register(struct cpuidle_driver *drv,
device = _cpu(cpuidl

[PATCH v2 0/2] Auto-promotion logic for cpuidle states

2019-04-05 Thread Abhishek Goel
Currently, the cpuidle governors (menu/ladder) determine what idle state
an idling CPU should enter based on heuristics that depend on the idle
history of that CPU. Given that no predictive heuristic is perfect,
there are cases where the governor predicts a shallow idle state, hoping
that the CPU will be busy soon. However, if no new workload is scheduled
on that CPU in the near future, the CPU will end up in the shallow state.

Motivation
--
In case of POWER, this is problematic when the predicted state in the
aforementioned scenario is a lite stop state, as such lite states
inhibit SMT folding, thereby depriving the other threads in the core of
core resources.

To address this, such lite states need to be auto-promoted. The cpuidle
core can queue a timer corresponding to the residency value of the next
available state, leading to auto-promotion to a deeper idle state as
soon as possible.

Experiment
--
I performed experiments for three scenarios to collect some data.

case 1 :
Without this patch and without the tick retained, i.e. in an upstream
kernel, it could take more than a second to get out of stop0_lite.

case 2 : With the tick retained (as suggested) -

Generally, we have a sched tick every 4ms (CONFIG_HZ = 250). Ideally I
expected it to take 8 sched ticks to get out of stop0_lite.
Experimentally, the observation was:

=================================================
sample      min       max       99th percentile
20          4ms       12ms      4ms
=================================================
*ms = milliseconds

It takes at least one sched tick to get out of stop0_lite.

case 3 : With this patch (not stopping the tick, but explicitly queuing a
         timer)

=================================================
sample      min       max       99th percentile
20          144us     192us     144us
=================================================
*us = microseconds

In this patch, we queue a timer just before entering stop0_lite. The
timer fires after (residency of the next available state + 2 * exit
latency of the next available state). Say the next state (stop0) is
available with a residency of 20us: we should get out in as little as
(20 + 2*2) * 8 = 192us, based on the formula
(residency + 2 x latency) * history length. Ideally we would expect 8
iterations; it was observed to get out in 6-7 iterations. Even if, say,
stop2 is the next available state (stop0 and stop1 both unavailable), it
would take (100 + 2*10) * 8 = 960us to get into stop2.

So we are able to get out of stop0_lite in roughly 150us (with this
patch) as compared to 4ms (with the tick retained). As stated earlier,
we do not want to get stuck in stop0_lite, as it inhibits SMT folding
for the sibling threads, depriving them of core resources. The current
patch uses auto-promotion only for stop0_lite, as it gives a performance
benefit (the primary reason) along with lowering power consumption. We
may extend this model to other states in the future.

Abhishek Goel (2):
  cpuidle : auto-promotion for cpuidle states
  cpuidle : Add auto-promotion flag to cpuidle flags

 arch/powerpc/include/asm/opal-api.h |  1 +
 drivers/cpuidle/Kconfig |  4 ++
 drivers/cpuidle/cpuidle-powernv.c   | 13 +-
 drivers/cpuidle/cpuidle.c   | 68 -
 drivers/cpuidle/governors/ladder.c  |  3 +-
 drivers/cpuidle/governors/menu.c| 22 +-
 include/linux/cpuidle.h | 10 -
 7 files changed, 115 insertions(+), 6 deletions(-)

-- 
2.17.1



[PATCH 2/2] cpuidle : Add auto-promotion flag to cpuidle flags

2019-03-22 Thread Abhishek Goel
This patch sets up a flag for the states which need to be auto-promoted.
On powernv systems, lite states do not even lose user context; that
information is used to set the flag for lite states.

Signed-off-by: Abhishek Goel 
---
 arch/powerpc/include/asm/opal-api.h |  1 +
 drivers/cpuidle/Kconfig |  4 
 drivers/cpuidle/cpuidle-powernv.c   | 13 +++--
 3 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/opal-api.h 
b/arch/powerpc/include/asm/opal-api.h
index 870fb7b23..735dec731 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -226,6 +226,7 @@
  */
 
 #define OPAL_PM_TIMEBASE_STOP  0x0002
+#define OPAL_PM_LOSE_USER_CONTEXT  0x1000
 #define OPAL_PM_LOSE_HYP_CONTEXT   0x2000
 #define OPAL_PM_LOSE_FULL_CONTEXT  0x4000
 #define OPAL_PM_NAP_ENABLED0x0001
diff --git a/drivers/cpuidle/Kconfig b/drivers/cpuidle/Kconfig
index 7e48eb5bf..0ece62684 100644
--- a/drivers/cpuidle/Kconfig
+++ b/drivers/cpuidle/Kconfig
@@ -26,6 +26,10 @@ config CPU_IDLE_GOV_MENU
 config DT_IDLE_STATES
bool
 
+config CPU_IDLE_AUTO_PROMOTION
+   bool
+   default y if PPC_POWERNV
+
 menu "ARM CPU Idle Drivers"
 depends on ARM || ARM64
 source "drivers/cpuidle/Kconfig.arm"
diff --git a/drivers/cpuidle/cpuidle-powernv.c 
b/drivers/cpuidle/cpuidle-powernv.c
index 84b1ebe21..e351f5f9c 100644
--- a/drivers/cpuidle/cpuidle-powernv.c
+++ b/drivers/cpuidle/cpuidle-powernv.c
@@ -299,6 +299,7 @@ static int powernv_add_idle_states(void)
for (i = 0; i < dt_idle_states; i++) {
unsigned int exit_latency, target_residency;
bool stops_timebase = false;
+   bool lose_user_context = false;
struct pnv_idle_states_t *state = _idle_states[i];
 
/*
@@ -324,6 +325,9 @@ static int powernv_add_idle_states(void)
if (has_stop_states && !(state->valid))
continue;
 
+   if (state->flags & OPAL_PM_LOSE_USER_CONTEXT)
+   lose_user_context = true;
+
if (state->flags & OPAL_PM_TIMEBASE_STOP)
stops_timebase = true;
 
@@ -332,12 +336,17 @@ static int powernv_add_idle_states(void)
add_powernv_state(nr_idle_states, "Nap",
  CPUIDLE_FLAG_NONE, nap_loop,
  target_residency, exit_latency, 0, 0);
+   } else if (has_stop_states && !lose_user_context) {
+   add_powernv_state(nr_idle_states, state->name,
+ CPUIDLE_FLAG_AUTO_PROMOTION,
+ stop_loop, target_residency,
+ exit_latency, state->psscr_val,
+ state->psscr_mask);
} else if (has_stop_states && !stops_timebase) {
add_powernv_state(nr_idle_states, state->name,
  CPUIDLE_FLAG_NONE, stop_loop,
  target_residency, exit_latency,
- state->psscr_val,
- state->psscr_mask);
+ state->psscr_val, state->psscr_mask);
}
 
/*
-- 
2.17.1



[PATCH 1/2] cpuidle : auto-promotion for cpuidle states

2019-03-22 Thread Abhishek Goel
Currently, the cpuidle governors (menu/ladder) determine what idle state
an idling CPU should enter based on heuristics that depend on the
idle history on that CPU. Given that no predictive heuristic is perfect,
there are cases where the governor predicts a shallow idle state, hoping
that the CPU will be busy soon. However, if no new workload is scheduled
on that CPU in the near future, the CPU will end up in the shallow state.

In case of POWER, this is problematic when the predicted state in the
aforementioned scenario is a lite stop state, as such lite states will
inhibit SMT folding, thereby depriving the other threads in the core of
the core resources.

To address this, such lite states need to be auto-promoted. The cpuidle
core can queue a timer corresponding to the residency value of the next
available state, thus leading to auto-promotion to a deeper idle state as
soon as possible.
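
As a minimal sketch of the timeout selection this implies (assuming the
driver's states are ordered shallow to deep; the helper below is made up
for illustration and is not the function added by this patch):

    /*
     * Pick the auto-promotion timeout for the chosen shallow state:
     * the target residency of the next enabled, deeper state.
     */
    static int auto_promotion_timeout_us(struct cpuidle_driver *drv,
                                         struct cpuidle_device *dev,
                                         int index)
    {
            int i;

            for (i = index + 1; i < drv->state_count; i++) {
                    if (dev->states_usage[i].disable)
                            continue;
                    return drv->states[i].target_residency;
            }

            return INT_MAX; /* no deeper state to promote into */
    }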

Signed-off-by: Abhishek Goel 
---
 drivers/cpuidle/cpuidle.c  | 79 +-
 drivers/cpuidle/governors/ladder.c |  3 +-
 drivers/cpuidle/governors/menu.c   | 23 -
 include/linux/cpuidle.h| 12 +++--
 4 files changed, 111 insertions(+), 6 deletions(-)

diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index 7f108309e..c4d1c1b38 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -36,6 +36,12 @@ static int enabled_devices;
 static int off __read_mostly;
 static int initialized __read_mostly;
 
+struct auto_promotion {
+   struct hrtimer  hrtimer;
+   int timeout;
+   booltimeout_needed;
+};
+
 int cpuidle_disabled(void)
 {
return off;
@@ -188,6 +194,64 @@ int cpuidle_enter_s2idle(struct cpuidle_driver *drv, 
struct cpuidle_device *dev)
 }
 #endif /* CONFIG_SUSPEND */
 
+enum hrtimer_restart auto_promotion_hrtimer_callback(struct hrtimer *hrtimer)
+{
+   return HRTIMER_NORESTART;
+}
+
+#ifdef CONFIG_CPU_IDLE_AUTO_PROMOTION
+DEFINE_PER_CPU(struct auto_promotion, ap);
+
+static void cpuidle_auto_promotion_start(struct cpuidle_state *state, int cpu)
+{
+   struct auto_promotion *this_ap = &per_cpu(ap, cpu);
+
+   if (this_ap->timeout_needed && (state->flags &
+   CPUIDLE_FLAG_AUTO_PROMOTION))
+   hrtimer_start(&this_ap->hrtimer, ns_to_ktime(this_ap->timeout
+   * 1000), HRTIMER_MODE_REL_PINNED);
+}
+
+static void cpuidle_auto_promotion_cancel(int cpu)
+{
+   struct hrtimer *hrtimer;
+
+   hrtimer = &per_cpu(ap, cpu).hrtimer;
+   if (hrtimer_is_queued(hrtimer))
+   hrtimer_cancel(hrtimer);
+}
+
+static void cpuidle_auto_promotion_update(int time, int cpu)
+{
+   per_cpu(ap, cpu).timeout = time;
+}
+
+static void cpuidle_auto_promotion_init(struct cpuidle_driver *drv, int cpu)
+{
+   int i;
+   struct auto_promotion *this_ap = &per_cpu(ap, cpu);
+
+   this_ap->timeout_needed = 0;
+
+   for (i = 0; i < drv->state_count; i++) {
+   if (drv->states[i].flags & CPUIDLE_FLAG_AUTO_PROMOTION) {
+   this_ap->timeout_needed = 1;
+   break;
+   }
+   }
+
+   hrtimer_init(&this_ap->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+   this_ap->hrtimer.function = auto_promotion_hrtimer_callback;
+}
+#else
+static inline void cpuidle_auto_promotion_start(struct cpuidle_state *state,
+   int cpu) { }
+static inline void cpuidle_auto_promotion_cancel(int cpu) { }
+static inline void cpuidle_auto_promotion_update(int timeout, int cpu) { }
+static inline void cpuidle_auto_promotion_init(struct cpuidle_driver *drv,
+   int cpu) { }
+#endif
+
 /**
  * cpuidle_enter_state - enter the state and update stats
  * @dev: cpuidle device for this cpu
@@ -225,12 +289,17 @@ int cpuidle_enter_state(struct cpuidle_device *dev, 
struct cpuidle_driver *drv,
trace_cpu_idle_rcuidle(index, dev->cpu);
time_start = ns_to_ktime(local_clock());
 
+   cpuidle_auto_promotion_start(target_state, dev->cpu);
+
stop_critical_timings();
entered_state = target_state->enter(dev, drv, index);
start_critical_timings();
 
sched_clock_idle_wakeup_event();
time_end = ns_to_ktime(local_clock());
+
+   cpuidle_auto_promotion_cancel(dev->cpu);
+
trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu);
 
/* The cpu is no longer idle or about to enter idle. */
@@ -312,7 +381,13 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct 
cpuidle_driver *drv,
 int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
   bool *stop_tick)
 {
-   return cpuidle_curr_governor->select(drv, dev, stop_tick);
+   int timeout, ret;
+
+   timeout = INT_MAX;
+   ret = cpuidle_curr_governor->select

[PATCH 0/2] Auto-promotion logic for cpuidle states

2019-03-22 Thread Abhishek Goel
Currently, the cpuidle governors (menu/ladder) determine what idle state an
idling CPU should enter based on heuristics that depend on the idle
history on that CPU. Given that no predictive heuristic is perfect, there
are cases where the governor predicts a shallow idle state, hoping that
the CPU will be busy soon. However, if no new workload is scheduled on
that CPU in the near future, the CPU will end up in the shallow state.

Motivation
--
In case of POWER, this is problematic when the predicted state in the
aforementioned scenario is a lite stop state, as such lite states will
inhibit SMT folding, thereby depriving the other threads in the core of
the core resources.

To address this, such lite states need to be auto-promoted. The cpuidle
core can queue a timer corresponding to the residency value of the next
available state, thus leading to auto-promotion to a deeper idle state as
soon as possible.

Experiment
--
Without this patch -
It was seen that on an idle system, a CPU may remain in stop0_lite for a
few seconds and then go directly to a deeper state such as stop2.

With this patch -
A CPU will not remain in stop0_lite for longer than the residency of the
next available state, and thus will go to a deeper state in a conservative
fashion. Using this, we may spend even less than 20 milliseconds there if
subsequent stop states are enabled. In the worst case, we may end up
spending more than a second, as was the case without this patch. The
worst case occurs when no other shallow states are enabled and only deep
states are available for auto-promotion.

Abhishek Goel (2):
  cpuidle : auto-promotion for cpuidle states
  cpuidle : Add auto-promotion flag to cpuidle flags

 arch/powerpc/include/asm/opal-api.h |  1 +
 drivers/cpuidle/Kconfig |  4 ++
 drivers/cpuidle/cpuidle-powernv.c   | 13 -
 drivers/cpuidle/cpuidle.c   | 79 -
 drivers/cpuidle/governors/ladder.c  |  3 +-
 drivers/cpuidle/governors/menu.c| 23 -
 include/linux/cpuidle.h | 12 +++--
 7 files changed, 127 insertions(+), 8 deletions(-)

-- 
2.17.1



[PATCH 0/2] Auto-promotion logic for cpuidle states

2019-03-22 Thread Abhishek Goel
Currently, the cpuidle governors (menu/ladder) determine what idle state an
idling CPU should enter based on heuristics that depend on the idle
history on that CPU. Given that no predictive heuristic is perfect, there
are cases where the governor predicts a shallow idle state, hoping that
the CPU will be busy soon. However, if no new workload is scheduled on
that CPU in the near future, the CPU will end up in the shallow state.

Motivation
--
In case of POWER, this is problematic when the predicted state in the
aforementioned scenario is a lite stop state, as such lite states will
inhibit SMT folding, thereby depriving the other threads in the core of
the core resources.

To address this, such lite states need to be auto-promoted. The cpuidle
core can queue a timer corresponding to the residency value of the next
available state, thus leading to auto-promotion to a deeper idle state as
soon as possible.

Experiment
--
Without this patch -
It was seen that on an idle system, a CPU may remain in stop0_lite for a
few seconds and then go directly to a deeper state such as stop2.

With this patch -
A CPU will not remain in stop0_lite for longer than the residency of the
next available state, and thus will go to a deeper state in a conservative
fashion. Using this, we may spend even less than 20 milliseconds there if
subsequent stop states are enabled. In the worst case, we may end up
spending more than a second, as was the case without this patch. The
worst case occurs when no other shallow states are enabled and only deep
states are available for auto-promotion.

Abhishek Goel (2):
  cpuidle : auto-promotion for cpuidle states
  cpuidle : Add auto-promotion flag to cpuidle flags

 arch/powerpc/include/asm/opal-api.h |  1 +
 drivers/cpuidle/Kconfig |  4 
 drivers/cpuidle/cpuidle-powernv.c   | 13 +++--
 drivers/cpuidle/cpuidle.c   |  3 ---
 4 files changed, 16 insertions(+), 5 deletions(-)

-- 
2.17.1



[PATCH 1/2] cpuidle : auto-promotion for cpuidle states

2019-03-22 Thread Abhishek Goel
Currently, the cpuidle governors (menu/ladder) determine what idle state
an idling CPU should enter based on heuristics that depend on the
idle history on that CPU. Given that no predictive heuristic is perfect,
there are cases where the governor predicts a shallow idle state, hoping
that the CPU will be busy soon. However, if no new workload is scheduled
on that CPU in the near future, the CPU will end up in the shallow state.

In case of POWER, this is problematic when the predicted state in the
aforementioned scenario is a lite stop state, as such lite states will
inhibit SMT folding, thereby depriving the other threads in the core of
the core resources.

To address this, such lite states need to be auto-promoted. The cpuidle
core can queue a timer corresponding to the residency value of the next
available state, thus leading to auto-promotion to a deeper idle state as
soon as possible.

Signed-off-by: Abhishek Goel 
---
 drivers/cpuidle/cpuidle.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index 2406e2655..c4d1c1b38 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -584,11 +584,8 @@ static void __cpuidle_unregister_device(struct 
cpuidle_device *dev)
 
 static void __cpuidle_device_init(struct cpuidle_device *dev)
 {
-   int i;
memset(dev->states_usage, 0, sizeof(dev->states_usage));
dev->last_residency = 0;
-   for (i = 0; i < CPUIDLE_STATE_MAX; i++)
-   dev->states_usage[i].disable = true;
 }
 
 /**
-- 
2.17.1



[RFC 1/1] cpuidle : Move saving and restoring of sprs to opal

2018-07-23 Thread Abhishek Goel
This patch moves the saving and restoring of SPRs for P9 cpuidle
from the kernel to OPAL. It still uses the existing code to detect
the first thread in the core.
In an attempt to make the powernv idle code backward compatible,
and to some extent forward compatible, add support for pre-stop entry
and post-stop exit actions in OPAL. If a kernel knows about this
OPAL call, then supporting newer hardware only requires a firmware
update, instead of waiting for kernel updates.
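
For clarity, a sketch of the call flow this enables (the prototypes match
the diff below; the wrapper, its call site and the scope handling are
assumptions made for illustration, not the posted implementation):

    /*
     * OPAL performs the pre-stop save and the post-stop restore of the
     * SPRs, so the kernel needs no per-SPR, per-generation knowledge.
     */
    static unsigned long stop_with_opal_sprs(u64 *stop_sprs, int scope,
                                             u64 psscr)
    {
            unsigned long srr1;

            opal_cpuidle_save(stop_sprs, scope, psscr);   /* pre-stop entry */
            srr1 = power9_idle_stop(psscr, true);         /* architected stop */
            opal_cpuidle_restore(stop_sprs, scope, psscr, srr1); /* post-stop exit */

            return srr1;
    }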

Signed-off-by: Abhishek Goel 
---

Link to the Skiboot patch corresponding to this patch:
http://patchwork.ozlabs.org/patch/947568/

 arch/powerpc/include/asm/cpuidle.h|  10 --
 arch/powerpc/include/asm/opal-api.h   |   4 +-
 arch/powerpc/include/asm/opal.h   |   3 +
 arch/powerpc/include/asm/paca.h   |   5 +-
 arch/powerpc/kernel/asm-offsets.c |  10 +-
 arch/powerpc/kernel/idle_book3s.S | 130 ++
 arch/powerpc/platforms/powernv/idle.c |   4 +
 .../powerpc/platforms/powernv/opal-wrappers.S |   2 +
 arch/powerpc/xmon/xmon.c  |  12 +-
 9 files changed, 61 insertions(+), 119 deletions(-)

diff --git a/arch/powerpc/include/asm/cpuidle.h 
b/arch/powerpc/include/asm/cpuidle.h
index e210a83eb196..c10f47af9a55 100644
--- a/arch/powerpc/include/asm/cpuidle.h
+++ b/arch/powerpc/include/asm/cpuidle.h
@@ -68,16 +68,6 @@
 #define ERR_DEEP_STATE_ESL_MISMATCH-2
 
 #ifndef __ASSEMBLY__
-/* Additional SPRs that need to be saved/restored during stop */
-struct stop_sprs {
-   u64 pid;
-   u64 ldbar;
-   u64 fscr;
-   u64 hfscr;
-   u64 mmcr1;
-   u64 mmcr2;
-   u64 mmcra;
-};
 
 extern u32 pnv_fastsleep_workaround_at_entry[];
 extern u32 pnv_fastsleep_workaround_at_exit[];
diff --git a/arch/powerpc/include/asm/opal-api.h 
b/arch/powerpc/include/asm/opal-api.h
index 3bab299eda49..6792a737bc9a 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -208,7 +208,9 @@
 #define OPAL_SENSOR_READ_U64   162
 #define OPAL_PCI_GET_PBCQ_TUNNEL_BAR   164
 #define OPAL_PCI_SET_PBCQ_TUNNEL_BAR   165
-#define OPAL_LAST  165
+#define OPAL_IDLE_SAVE 168
+#define OPAL_IDLE_RESTORE  169
+#define OPAL_LAST  169
 
 #define QUIESCE_HOLD   1 /* Spin all calls at entry */
 #define QUIESCE_REJECT 2 /* Fail all calls with OPAL_BUSY */
diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index e1b2910c6e81..12d57aeacde2 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -356,6 +356,9 @@ extern void opal_kmsg_init(void);
 
 extern int opal_event_request(unsigned int opal_event_nr);
 
+extern int opal_cpuidle_save(u64 *stop_sprs, int scope, u64 psscr);
+extern int opal_cpuidle_restore(u64 *stop_sprs, int scope, u64 psscr, u64 
srr1);
+
 struct opal_sg_list *opal_vmalloc_to_sg_list(void *vmalloc_addr,
 unsigned long vmalloc_size);
 void opal_free_sg_list(struct opal_sg_list *sg);
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index 6d34bd71139d..765524e76beb 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -195,11 +195,12 @@ struct paca_struct {
/* The PSSCR value that the kernel requested before going to stop */
u64 requested_psscr;
 
+   u64 wakeup_psscr;
/*
-* Save area for additional SPRs that need to be
+* Save area for SPRs that need to be
 * saved/restored during cpuidle stop.
 */
-   struct stop_sprs stop_sprs;
+   u64 *opal_stop_sprs;
 #endif
 
 #ifdef CONFIG_PPC_BOOK3S_64
diff --git a/arch/powerpc/kernel/asm-offsets.c 
b/arch/powerpc/kernel/asm-offsets.c
index 0a0544335950..65a3d8582017 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -769,14 +769,8 @@ int main(void)
OFFSET(PACA_SIBLING_PACA_PTRS, paca_struct, thread_sibling_pacas);
OFFSET(PACA_REQ_PSSCR, paca_struct, requested_psscr);
OFFSET(PACA_DONT_STOP, paca_struct, dont_stop);
-#define STOP_SPR(x, f) OFFSET(x, paca_struct, stop_sprs.f)
-   STOP_SPR(STOP_PID, pid);
-   STOP_SPR(STOP_LDBAR, ldbar);
-   STOP_SPR(STOP_FSCR, fscr);
-   STOP_SPR(STOP_HFSCR, hfscr);
-   STOP_SPR(STOP_MMCR1, mmcr1);
-   STOP_SPR(STOP_MMCR2, mmcr2);
-   STOP_SPR(STOP_MMCRA, mmcra);
+   OFFSET(PACA_WAKEUP_PSSCR, paca_struct, wakeup_psscr);
+   OFFSET(STOP_SPRS, paca_struct, opal_stop_sprs);
 #endif
 
DEFINE(PPC_DBELL_SERVER, PPC_DBELL_SERVER);
diff --git a/arch/powerpc/kernel/idle_book3s.S 
b/arch/powerpc/kernel/idle_book3s.S
index e734f6e45abc..66fc955abee3 100644
--- a/arch/powerpc/kernel/idle_book3s.S
+++ b/arch/powerpc/kernel/idle_book3s.S
@@ -45,6 +45,9 @@
 
 #define

[PATCH v2] cpuidle/powernv : Add Description for cpuidle state

2018-06-04 Thread Abhishek Goel
Names of cpuidle states were being used as the description of states
on POWER, as no descriptions had been added in the device tree. This
patch reads the descriptions for idle states that have been added in
the device tree.
On POWER, the description of the idle states can be printed using
"cpupower monitor -l" or "cpupower idle-info".

Signed-off-by: Abhishek Goel 
---

The skiboot patch which adds description for idle states in device tree
can be found here: https://patchwork.ozlabs.org/patch/924879/

 drivers/cpuidle/cpuidle-powernv.c | 19 +++
 include/linux/cpuidle.h   |  2 +-
 2 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/drivers/cpuidle/cpuidle-powernv.c 
b/drivers/cpuidle/cpuidle-powernv.c
index 1a8234e706bc..08d8b0953a14 100644
--- a/drivers/cpuidle/cpuidle-powernv.c
+++ b/drivers/cpuidle/cpuidle-powernv.c
@@ -133,7 +133,7 @@ static int stop_loop(struct cpuidle_device *dev,
 static struct cpuidle_state powernv_states[CPUIDLE_STATE_MAX] = {
{ /* Snooze */
.name = "snooze",
-   .desc = "snooze",
+   .desc = "idle polling state",
.exit_latency = 0,
.target_residency = 0,
.enter = snooze_loop },
@@ -206,6 +206,7 @@ static int powernv_cpuidle_driver_init(void)
 }
 
 static inline void add_powernv_state(int index, const char *name,
+const char *desc,
 unsigned int flags,
 int (*idle_fn)(struct cpuidle_device *,
struct cpuidle_driver *,
@@ -215,7 +216,7 @@ static inline void add_powernv_state(int index, const char 
*name,
 u64 psscr_val, u64 psscr_mask)
 {
strlcpy(powernv_states[index].name, name, CPUIDLE_NAME_LEN);
-   strlcpy(powernv_states[index].desc, name, CPUIDLE_NAME_LEN);
+   strlcpy(powernv_states[index].desc, desc, CPUIDLE_DESC_LEN);
powernv_states[index].flags = flags;
powernv_states[index].target_residency = target_residency;
powernv_states[index].exit_latency = exit_latency;
@@ -250,6 +251,7 @@ static int powernv_add_idle_states(void)
u64 psscr_val[CPUIDLE_STATE_MAX];
u64 psscr_mask[CPUIDLE_STATE_MAX];
const char *names[CPUIDLE_STATE_MAX];
+   const char *descs[CPUIDLE_STATE_MAX];
u32 has_stop_states = 0;
int i, rc;
u32 supported_flags = pnv_get_supported_cpuidle_states();
@@ -311,6 +313,13 @@ static int powernv_add_idle_states(void)
pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-names in 
DT\n");
goto out;
}
+   if (of_property_read_string_array(power_mgt,
+   "ibm,cpu-idle-state-descs", descs, dt_idle_states) < 0) {
+   of_property_read_string_array(power_mgt,
+   "ibm,cpu-idle-state-names", descs, 
dt_idle_states);
+   pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-descs in 
DT\n"
+   "Name will be used for description\n");
+   }
 
/*
 * If the idle states use stop instruction, probe for psscr values
@@ -414,10 +423,11 @@ static int powernv_add_idle_states(void)
target_residency = 100;
/* Add NAP state */
add_powernv_state(nr_idle_states, "Nap",
+ "stop processor execution",
  CPUIDLE_FLAG_NONE, nap_loop,
  target_residency, exit_latency, 0, 0);
} else if (has_stop_states && !stops_timebase) {
-   add_powernv_state(nr_idle_states, names[i],
+   add_powernv_state(nr_idle_states, names[i], descs[i],
  CPUIDLE_FLAG_NONE, stop_loop,
  target_residency, exit_latency,
  psscr_val[i], psscr_mask[i]);
@@ -434,11 +444,12 @@ static int powernv_add_idle_states(void)
target_residency = 30;
/* Add FASTSLEEP state */
add_powernv_state(nr_idle_states, "FastSleep",
+ "Core and L2 clock gating",
  CPUIDLE_FLAG_TIMER_STOP,
  fastsleep_loop,
  target_residency, exit_latency, 0, 0);
} else if (has_stop_states && stops_timebase) {
-   add_powernv_state(nr_idle_states, names

[PATCH] cpuidle/powernv : Add Description for cpuidle state

2018-05-28 Thread Abhishek Goel
Names of cpuidle states were being used as the description of states
on POWER, as no descriptions had been added in the device tree. This
patch reads the descriptions for idle states that have been added in
the device tree.
On POWER, the description of the idle states can be printed using
"cpupower monitor -l" or "cpupower idle-info".

Signed-off-by: Abhishek Goel 
---

The skiboot patch which adds description for idle states in device tree
can be found here: https://patchwork.ozlabs.org/patch/921637/

 drivers/cpuidle/cpuidle-powernv.c | 17 +
 include/linux/cpuidle.h   |  2 +-
 2 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/drivers/cpuidle/cpuidle-powernv.c 
b/drivers/cpuidle/cpuidle-powernv.c
index 1a8234e706bc..d3915a965128 100644
--- a/drivers/cpuidle/cpuidle-powernv.c
+++ b/drivers/cpuidle/cpuidle-powernv.c
@@ -133,7 +133,7 @@ static int stop_loop(struct cpuidle_device *dev,
 static struct cpuidle_state powernv_states[CPUIDLE_STATE_MAX] = {
{ /* Snooze */
.name = "snooze",
-   .desc = "snooze",
+   .desc = "idle polling state",
.exit_latency = 0,
.target_residency = 0,
.enter = snooze_loop },
@@ -206,6 +206,7 @@ static int powernv_cpuidle_driver_init(void)
 }
 
 static inline void add_powernv_state(int index, const char *name,
+const char *desc,
 unsigned int flags,
 int (*idle_fn)(struct cpuidle_device *,
struct cpuidle_driver *,
@@ -215,7 +216,7 @@ static inline void add_powernv_state(int index, const char 
*name,
 u64 psscr_val, u64 psscr_mask)
 {
strlcpy(powernv_states[index].name, name, CPUIDLE_NAME_LEN);
-   strlcpy(powernv_states[index].desc, name, CPUIDLE_NAME_LEN);
+   strlcpy(powernv_states[index].desc, desc, CPUIDLE_DESC_LEN);
powernv_states[index].flags = flags;
powernv_states[index].target_residency = target_residency;
powernv_states[index].exit_latency = exit_latency;
@@ -250,6 +251,7 @@ static int powernv_add_idle_states(void)
u64 psscr_val[CPUIDLE_STATE_MAX];
u64 psscr_mask[CPUIDLE_STATE_MAX];
const char *names[CPUIDLE_STATE_MAX];
+   const char *descs[CPUIDLE_STATE_MAX];
u32 has_stop_states = 0;
int i, rc;
u32 supported_flags = pnv_get_supported_cpuidle_states();
@@ -311,6 +313,11 @@ static int powernv_add_idle_states(void)
pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-names in 
DT\n");
goto out;
}
+   if (of_property_read_string_array(power_mgt,
+   "ibm,cpu-idle-state-descs", descs, dt_idle_states) < 0) {
+   pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-descs in 
DT\n");
+   goto out;
+   }
 
/*
 * If the idle states use stop instruction, probe for psscr values
@@ -414,10 +421,11 @@ static int powernv_add_idle_states(void)
target_residency = 100;
/* Add NAP state */
add_powernv_state(nr_idle_states, "Nap",
+ "stop processor execution",
  CPUIDLE_FLAG_NONE, nap_loop,
  target_residency, exit_latency, 0, 0);
} else if (has_stop_states && !stops_timebase) {
-   add_powernv_state(nr_idle_states, names[i],
+   add_powernv_state(nr_idle_states, names[i], descs[i],
  CPUIDLE_FLAG_NONE, stop_loop,
  target_residency, exit_latency,
  psscr_val[i], psscr_mask[i]);
@@ -434,11 +442,12 @@ static int powernv_add_idle_states(void)
target_residency = 30;
/* Add FASTSLEEP state */
add_powernv_state(nr_idle_states, "FastSleep",
+ "Core and L2 clock gating",
  CPUIDLE_FLAG_TIMER_STOP,
  fastsleep_loop,
  target_residency, exit_latency, 0, 0);
} else if (has_stop_states && stops_timebase) {
-   add_powernv_state(nr_idle_states, names[i],
+   add_powernv_state(nr_idle_states, names[i], descs[i],
  CPUIDLE_FLAG_TIMER_STOP, stop_loop,
  target_residency, exit_latency,
   

[PATCH v3] cpufreq: powernv: Add support of frequency domain

2018-01-22 Thread Abhishek Goel
A frequency domain indicates a group of CPUs that share the same
frequency. It is detected using the device-tree property
"frequency-domain-indicator", a bitmask whose value differs depending
upon the generation of the processor.

CPUs of the same chip for which the result of a bitwise AND between
their PIR and the frequency-domain-indicator is the same share the same
frequency.

In this patch, we define a hash table indexed by the aforementioned
bitwise-ANDed value to store the cpumask of the CPUs sharing the same
frequency domain. Further, the cpufreq policy will be created per
frequency domain.

So for POWER9 a cpufreq policy is created per quad, while for POWER8 it
is created per core. The governor decides the frequency for each policy,
but multiple cores may come under the same policy; in that case the
frequency needs to be set on each core sharing that policy.
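
As a worked example of the PIR grouping (the indicator value 0xFFF0 below
is a made-up illustration, not a value any real processor reports):

    #include <stdint.h>
    #include <stdio.h>

    /* Hypothetical indicator; the real one comes from the device tree. */
    #define FREQ_DOMAIN_INDICATOR 0xFFF0u

    int main(void)
    {
            uint32_t pirs[] = { 0x0810, 0x0811, 0x0814, 0x0820 };
            unsigned int i;

            for (i = 0; i < sizeof(pirs) / sizeof(pirs[0]); i++) {
                    /*
                     * 0x0810, 0x0811 and 0x0814 map to key 0x0810 and so
                     * share one cpufreq policy; 0x0820 gets its own.
                     */
                    printf("PIR 0x%04x -> domain key 0x%04x\n",
                           pirs[i], pirs[i] & FREQ_DOMAIN_INDICATOR);
            }

            return 0;
    }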

Signed-off-by: Abhishek Goel <hunt...@linux.vnet.ibm.com>
---

Skiboot patch required for the corresponding device-tree changes have been
posted here : http://patchwork.ozlabs.org/patch/862256/

 drivers/cpufreq/powernv-cpufreq.c | 104 ++
 1 file changed, 95 insertions(+), 9 deletions(-)

diff --git a/drivers/cpufreq/powernv-cpufreq.c 
b/drivers/cpufreq/powernv-cpufreq.c
index b6d7c4c..aab23a4 100644
--- a/drivers/cpufreq/powernv-cpufreq.c
+++ b/drivers/cpufreq/powernv-cpufreq.c
@@ -37,6 +37,7 @@
 #include <asm/smp.h> /* Required for cpu_sibling_mask() in UP configs */
 #include <asm/opal.h>
 #include <linux/timer.h>
+#include <linux/hashtable.h>
 
 #define POWERNV_MAX_PSTATES   256
 #define PMSR_PSAFE_ENABLE  (1UL << 30)
@@ -130,6 +131,9 @@ enum throttle_reason_type {
 static int nr_chips;
 static DEFINE_PER_CPU(struct chip *, chip_info);
 
+static u32 freq_domain_indicator;
+static bool p9_occ_quirk;
+
 /*
  * Note:
  * The set of pstates consists of contiguous integers.
@@ -194,6 +198,38 @@ static inline void reset_gpstates(struct cpufreq_policy 
*policy)
gpstates->last_gpstate_idx = 0;
 }
 
+#define SIZE NR_CPUS
+#define ORDER_FREQ_MAP ilog2(SIZE)
+
+static DEFINE_HASHTABLE(freq_domain_map, ORDER_FREQ_MAP);
+
+struct hashmap {
+   cpumask_t mask;
+   int chip_id;
+   u32 pir_key;
+   struct hlist_node hash_node;
+};
+
+static void insert(u32 key, int cpu)
+{
+   struct hashmap *data;
+
+   hash_for_each_possible(freq_domain_map, data, hash_node, key%SIZE) {
+   if (data->chip_id == cpu_to_chip_id(cpu) &&
+   data->pir_key == key) {
+   cpumask_set_cpu(cpu, &data->mask);
+   return;
+   }
+   }
+
+   data = kzalloc(sizeof(*data), GFP_KERNEL);
+   hash_add(freq_domain_map, &data->hash_node, key%SIZE);
+   cpumask_set_cpu(cpu, &data->mask);
+   data->chip_id = cpu_to_chip_id(cpu);
+   data->pir_key = key;
+
+}
+
 /*
  * Initialize the freq table based on data obtained
  * from the firmware passed via device-tree
@@ -206,6 +242,7 @@ static int init_powernv_pstates(void)
u32 len_ids, len_freqs;
u32 pstate_min, pstate_max, pstate_nominal;
u32 pstate_turbo, pstate_ultra_turbo;
+   u32 key;
 
power_mgt = of_find_node_by_path("/ibm,opal/power-mgt");
if (!power_mgt) {
@@ -246,9 +283,18 @@ static int init_powernv_pstates(void)
else
powernv_pstate_info.wof_enabled = true;
 
+   if (of_device_is_compatible(power_mgt, "freq-domain-v1") &&
+   of_property_read_u32(power_mgt, "ibm,freq-domain-indicator",
+   &freq_domain_indicator))
+   pr_warn("ibm,freq-domain-indicator not found\n");
+
+   if (of_device_is_compatible(power_mgt, "p9-occ-quirk"))
+   p9_occ_quirk = true;
+
 next:
pr_info("cpufreq pstate min %d nominal %d max %d\n", pstate_min,
pstate_nominal, pstate_max);
+   pr_info("frequency domain indicator %d", freq_domain_indicator);
pr_info("Workload Optimized Frequency is %s in the platform\n",
(powernv_pstate_info.wof_enabled) ? "enabled" : "disabled");
 
@@ -276,6 +322,15 @@ static int init_powernv_pstates(void)
return -ENODEV;
}
 
+   if (freq_domain_indicator) {
+   hash_init(freq_domain_map);
+   for_each_possible_cpu(i) {
+   key = ((u32) get_hard_smp_processor_id(i) &
+   freq_domain_indicator);
+   insert(key, i);
+   }
+   }
+
powernv_pstate_info.nr_pstates = nr_pstates;
pr_debug("NR PStates %d\n", nr_pstates);
for (i = 0; i < nr_pstates; i++) {
@@ -760,25 +815,56 @@ static int powernv_cpufreq_target_index(struct 
cpufreq_policy *policy,
 
spin_unlock(&gpstates->gpstate_lock);
 
- 

[PATCH v2] cpufreq: powernv: Add support of frequency domain

2017-12-19 Thread Abhishek Goel
A frequency domain indicates a group of CPUs that share the same
frequency. It is detected using the device-tree property
"frequency-domain-indicator", a bitmask whose value differs depending
upon the generation of the processor.

CPUs of the same chip for which the result of a bitwise AND between
their PIR and the frequency-domain-indicator is the same share the same
frequency.

In this patch, we define a hash table indexed by the aforementioned
bitwise-ANDed value to store the cpumask of the CPUs sharing the same
frequency domain. Further, the cpufreq policy will be created per
frequency domain.

So for POWER9 a cpufreq policy is created per quad, while for POWER8 it
is created per core. The governor decides the frequency for each policy,
but multiple cores may come under the same policy; in that case the
frequency needs to be set on each core sharing that policy.

Signed-off-by: Abhishek Goel <hunt...@linux.vnet.ibm.com>
---
 drivers/cpufreq/powernv-cpufreq.c | 110 ++
 1 file changed, 100 insertions(+), 10 deletions(-)

diff --git a/drivers/cpufreq/powernv-cpufreq.c 
b/drivers/cpufreq/powernv-cpufreq.c
index b6d7c4c..fd642bc 100644
--- a/drivers/cpufreq/powernv-cpufreq.c
+++ b/drivers/cpufreq/powernv-cpufreq.c
@@ -37,6 +37,7 @@
 #include <asm/smp.h> /* Required for cpu_sibling_mask() in UP configs */
 #include <asm/opal.h>
 #include <linux/timer.h>
+#include <linux/hashtable.h>
 
 #define POWERNV_MAX_PSTATES   256
 #define PMSR_PSAFE_ENABLE  (1UL << 30)
@@ -130,6 +131,9 @@ static struct chip {
 static int nr_chips;
 static DEFINE_PER_CPU(struct chip *, chip_info);
 
+static u32 freq_domain_indicator;
+static u32 flag;
+
 /*
  * Note:
  * The set of pstates consists of contiguous integers.
@@ -194,6 +198,38 @@ static inline void reset_gpstates(struct cpufreq_policy 
*policy)
gpstates->last_gpstate_idx = 0;
 }
 
+#define SIZE NR_CPUS
+#define ORDER_FREQ_MAP ilog2(SIZE)
+
+static DEFINE_HASHTABLE(freq_domain_map, ORDER_FREQ_MAP);
+
+struct hashmap {
+   cpumask_t mask;
+   int chip_id;
+   u32 pir_key;
+   struct hlist_node hash_node;
+};
+
+static void insert(u32 key, int cpu)
+{
+   struct hashmap *data;
+
+   hash_for_each_possible(freq_domain_map, data, hash_node, key%SIZE) {
+   if (data->chip_id == cpu_to_chip_id(cpu) &&
+   data->pir_key == key) {
+   cpumask_set_cpu(cpu, &data->mask);
+   return;
+   }
+   }
+
+   data = kzalloc(sizeof(*data), GFP_KERNEL);
+   hash_add(freq_domain_map, &data->hash_node, key%SIZE);
+   cpumask_set_cpu(cpu, &data->mask);
+   data->chip_id = cpu_to_chip_id(cpu);
+   data->pir_key = key;
+
+}
+
 /*
  * Initialize the freq table based on data obtained
  * from the firmware passed via device-tree
@@ -206,7 +242,9 @@ static int init_powernv_pstates(void)
u32 len_ids, len_freqs;
u32 pstate_min, pstate_max, pstate_nominal;
u32 pstate_turbo, pstate_ultra_turbo;
+   u32 key;
 
+   flag = 0;
power_mgt = of_find_node_by_path("/ibm,opal/power-mgt");
if (!power_mgt) {
pr_warn("power-mgt node not found\n");
@@ -229,6 +267,17 @@ static int init_powernv_pstates(void)
return -ENODEV;
}
 
+   if (of_device_is_compatible(power_mgt, "freq-domain-v1") &&
+   of_property_read_u32(power_mgt, "ibm,freq-domain-indicator",
+   &freq_domain_indicator)) {
+   pr_warn("ibm,freq-domain-indicator not found\n");
+   freq_domain_indicator = 0;
+   }
+
+   if (of_device_is_compatible(power_mgt, "P9-occ-quirk")) {
+   flag = 1;
+   }
+
if (of_property_read_u32(power_mgt, "ibm,pstate-ultra-turbo",
 &pstate_ultra_turbo)) {
powernv_pstate_info.wof_enabled = false;
@@ -249,6 +298,7 @@ static int init_powernv_pstates(void)
 next:
pr_info("cpufreq pstate min %d nominal %d max %d\n", pstate_min,
pstate_nominal, pstate_max);
+   pr_info("frequency domain indicator %d", freq_domain_indicator);
pr_info("Workload Optimized Frequency is %s in the platform\n",
(powernv_pstate_info.wof_enabled) ? "enabled" : "disabled");
 
@@ -276,6 +326,15 @@ static int init_powernv_pstates(void)
return -ENODEV;
}
 
+   if (freq_domain_indicator) {
+   hash_init(freq_domain_map);
+   for_each_possible_cpu(i) {
+   key = ((u32) get_hard_smp_processor_id(i) &
+   freq_domain_indicator);
+   insert(key, i);
+   }
+   }
+
powernv_pstate_info.nr_pstates = nr_pstates;
pr_debug("NR PStates %d\n", n

[PATCH] cpufreq: powernv: Add support of frequency domain

2017-12-13 Thread Abhishek Goel
A frequency domain indicates a group of CPUs that share the same
frequency. It is detected using the device-tree property
"frequency-domain-indicator", a bitmask whose value differs depending
upon the generation of the processor.

CPUs of the same chip for which the result of a bitwise AND between
their PIR and the frequency-domain-indicator is the same share the same
frequency.

In this patch, we define a hash table indexed by the aforementioned
bitwise-ANDed value to store the cpumask of the CPUs sharing the same
frequency domain. Further, the cpufreq policy will be created per
frequency domain.

So for POWER9 a cpufreq policy is created per quad, while for POWER8 it
is created per core. The governor decides the frequency for each policy,
but multiple cores may come under the same policy; in that case the
frequency needs to be set on each core sharing that policy.

Signed-off-by: Abhishek Goel <hunt...@linux.vnet.ibm.com>
---
 drivers/cpufreq/powernv-cpufreq.c | 95 +++
 1 file changed, 87 insertions(+), 8 deletions(-)

diff --git a/drivers/cpufreq/powernv-cpufreq.c 
b/drivers/cpufreq/powernv-cpufreq.c
index b6d7c4c..9384110 100644
--- a/drivers/cpufreq/powernv-cpufreq.c
+++ b/drivers/cpufreq/powernv-cpufreq.c
@@ -37,6 +37,7 @@
 #include <asm/smp.h> /* Required for cpu_sibling_mask() in UP configs */
 #include <asm/opal.h>
 #include <linux/timer.h>
+#include <linux/hashtable.h>
 
 #define POWERNV_MAX_PSTATES   256
 #define PMSR_PSAFE_ENABLE  (1UL << 30)
@@ -130,6 +131,8 @@ static struct chip {
 static int nr_chips;
 static DEFINE_PER_CPU(struct chip *, chip_info);
 
+static u32 freq_domain_indicator;
+
 /*
  * Note:
  * The set of pstates consists of contiguous integers.
@@ -194,6 +197,38 @@ static inline void reset_gpstates(struct cpufreq_policy 
*policy)
gpstates->last_gpstate_idx = 0;
 }
 
+#define SIZE NR_CPUS
+#define ORDER_FREQ_MAP ilog2(SIZE)
+
+static DEFINE_HASHTABLE(freq_domain_map, ORDER_FREQ_MAP);
+
+struct hashmap {
+   cpumask_t mask;
+   int chip_id;
+   u32 pir_key;
+   struct hlist_node hash_node;
+};
+
+static void insert(u32 key, int cpu)
+{
+   struct hashmap *data;
+
+   hash_for_each_possible(freq_domain_map, data, hash_node, key%SIZE) {
+   if (data->chip_id == cpu_to_chip_id(cpu) &&
+   data->pir_key == key) {
+   cpumask_set_cpu(cpu, &data->mask);
+   return;
+   }
+   }
+
+   data = kzalloc(sizeof(*data), GFP_KERNEL);
+   hash_add(freq_domain_map, &data->hash_node, key%SIZE);
+   cpumask_set_cpu(cpu, &data->mask);
+   data->chip_id = cpu_to_chip_id(cpu);
+   data->pir_key = key;
+
+}
+
 /*
  * Initialize the freq table based on data obtained
  * from the firmware passed via device-tree
@@ -206,6 +241,7 @@ static int init_powernv_pstates(void)
u32 len_ids, len_freqs;
u32 pstate_min, pstate_max, pstate_nominal;
u32 pstate_turbo, pstate_ultra_turbo;
+   u32 key;
 
power_mgt = of_find_node_by_path("/ibm,opal/power-mgt");
if (!power_mgt) {
@@ -229,6 +265,13 @@ static int init_powernv_pstates(void)
return -ENODEV;
}
 
+   if (of_device_is_compatible(power_mgt, "freq-domain-v1") &&
+   of_property_read_u32(power_mgt, "ibm,freq-domain-indicator",
+   &freq_domain_indicator)) {
+   pr_warn("ibm,freq-domain-indicator not found\n");
+   freq_domain_indicator = 0;
+   }
+
if (of_property_read_u32(power_mgt, "ibm,pstate-ultra-turbo",
 &pstate_ultra_turbo)) {
powernv_pstate_info.wof_enabled = false;
@@ -249,6 +292,7 @@ static int init_powernv_pstates(void)
 next:
pr_info("cpufreq pstate min %d nominal %d max %d\n", pstate_min,
pstate_nominal, pstate_max);
+   pr_info("frequency domain indicator %d", freq_domain_indicator);
pr_info("Workload Optimized Frequency is %s in the platform\n",
(powernv_pstate_info.wof_enabled) ? "enabled" : "disabled");
 
@@ -276,6 +320,15 @@ static int init_powernv_pstates(void)
return -ENODEV;
}
 
+   if (freq_domain_indicator) {
+   hash_init(freq_domain_map);
+   for_each_possible_cpu(i) {
+   key = ((u32) get_hard_smp_processor_id(i) &
+   freq_domain_indicator);
+   insert(key, i);
+   }
+   }
+
powernv_pstate_info.nr_pstates = nr_pstates;
pr_debug("NR PStates %d\n", nr_pstates);
for (i = 0; i < nr_pstates; i++) {
@@ -693,6 +746,8 @@ static int powernv_cpufreq_target_index(struct 
cpufreq_policy *policy,
 {
struct powernv_smp_call_data freq_data;
unsigned in