[RFC v2 2/2] powerpc/powernv : Introduce capability for firmware-enabled-stop
This patch introduces the capability for firmware to handle the stop states instead. A bit is set based on the discovery of the feature and correspondingly also the responsibility to handle the stop states. If Kernel does not know about stop version, it can fallback to opal for idle stop support if firmware-stop-supported property is present. Earlier part of this patch was posted in this series : https://lkml.org/lkml/2020/3/4/589 Signed-off-by: Abhishek Goel Signed-off-by: Pratik Rajesh Sampat --- v1->v2 : Combined patch 2 and 3 from previous iteration and rebased it. arch/powerpc/include/asm/processor.h | 18 ++ arch/powerpc/kernel/dt_cpu_ftrs.c | 13 + arch/powerpc/platforms/powernv/idle.c | 20 drivers/cpuidle/cpuidle-powernv.c | 3 ++- 4 files changed, 49 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index bfa336fbcfeb..b8de7146387c 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -428,6 +428,24 @@ extern void power4_idle_nap(void); extern unsigned long cpuidle_disable; enum idle_boot_override {IDLE_NO_OVERRIDE = 0, IDLE_POWERSAVE_OFF}; +#define STOP_ENABLE0x0001 +#define FIRMWARE_STOP_ENABLE 0x0010 + +#define STOP_VERSION_P9 0x1 + +/* + * Classify the dependencies of the stop states + * @idle_stop: function handler to handle the quirk stop version + * @cpuidle_prop: Signify support for stop states through kernel and/or firmware + * @stop_version: Classify quirk versions for stop states + */ +typedef struct { + unsigned long (*idle_stop)(unsigned long psscr, bool mmu_on); + uint8_t cpuidle_prop; + uint8_t stop_version; +} stop_deps_t; +extern stop_deps_t stop_dep; + extern int powersave_nap; /* set if nap mode can be used in idle loop */ extern void power7_idle_type(unsigned long type); diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c index 36bc0d5c4f3a..737686fae3c7 100644 --- a/arch/powerpc/kernel/dt_cpu_ftrs.c 
+++ b/arch/powerpc/kernel/dt_cpu_ftrs.c @@ -291,6 +291,15 @@ static int __init feat_enable_idle_stop(struct dt_cpu_feature *f) lpcr |= LPCR_PECE1; lpcr |= LPCR_PECE2; mtspr(SPRN_LPCR, lpcr); + stop_dep.cpuidle_prop |= STOP_ENABLE; + stop_dep.stop_version = STOP_VERSION_P9; + + return 1; +} + +static int __init feat_enable_firmware_stop(struct dt_cpu_feature *f) +{ + stop_dep.cpuidle_prop |= FIRMWARE_STOP_ENABLE; return 1; } @@ -589,6 +598,7 @@ static struct dt_cpu_feature_match __initdata {"idle-nap", feat_enable_idle_nap, 0}, {"alignment-interrupt-dsisr", feat_enable_align_dsisr, 0}, {"idle-stop", feat_enable_idle_stop, 0}, + {"firmware-stop-supported", feat_enable_firmware_stop, 0}, {"machine-check-power8", feat_enable_mce_power8, 0}, {"performance-monitor-power8", feat_enable_pmu_power8, 0}, {"data-stream-control-register", feat_enable_dscr, CPU_FTR_DSCR}, @@ -656,6 +666,9 @@ static void __init cpufeatures_setup_start(u32 isa) } } +stop_deps_t stop_dep = {NULL, 0x0, 0x0}; +EXPORT_SYMBOL(stop_dep); + static bool __init cpufeatures_process_feature(struct dt_cpu_feature *f) { const struct dt_cpu_feature_match *m; diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index 3afd4293f729..3602950f6c08 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -824,7 +824,7 @@ static unsigned long power9_offline_stop(unsigned long psscr) #ifndef CONFIG_KVM_BOOK3S_HV_POSSIBLE __ppc64_runlatch_off(); - srr1 = power9_idle_stop(psscr, true); + srr1 = stop_dep.idle_stop(psscr, true); __ppc64_runlatch_on(); #else /* @@ -840,7 +840,7 @@ static unsigned long power9_offline_stop(unsigned long psscr) local_paca->kvm_hstate.hwthread_state = KVM_HWTHREAD_IN_IDLE; __ppc64_runlatch_off(); - srr1 = power9_idle_stop(psscr, false); + srr1 = stop_dep.idle_stop(psscr, true); __ppc64_runlatch_on(); local_paca->kvm_hstate.hwthread_state = KVM_HWTHREAD_IN_KERNEL; @@ -868,7 +868,7 @@ void power9_idle_type(unsigned 
long stop_psscr_val, psscr = (psscr & ~stop_psscr_mask) | stop_psscr_val; __ppc64_runlatch_off(); - srr1 = power9_idle_stop(psscr, true); + srr1 = stop_dep.idle_stop(psscr, true); __ppc64_runlatch_on(); fini_irq_for_idle_irqsoff(); @@ -1365,8 +1365,20 @@ static int __init pnv_init_idle_states(void) nr_pnv_idle_states = 0; supported_cpuidle_states = 0; - if (cpuidle_disable != IDLE_NO_OVERRIDE) + if (cpuidle_disable != IDLE_NO_OVERRIDE || + !(stop_dep.cpuidle_prop
[PATCH] cpuidle/powernv : Remove dead code block
Commit 1961acad2f88559c2cdd2ef67c58c3627f1f6e54 removes usage of function "validate_dt_prop_sizes". This patch removes this unused function. Signed-off-by: Abhishek Goel --- drivers/cpuidle/cpuidle-powernv.c | 14 -- 1 file changed, 14 deletions(-) diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c index 1b299e801f74..addaa6e6718b 100644 --- a/drivers/cpuidle/cpuidle-powernv.c +++ b/drivers/cpuidle/cpuidle-powernv.c @@ -244,20 +244,6 @@ static inline void add_powernv_state(int index, const char *name, stop_psscr_table[index].mask = psscr_mask; } -/* - * Returns 0 if prop1_len == prop2_len. Else returns -1 - */ -static inline int validate_dt_prop_sizes(const char *prop1, int prop1_len, -const char *prop2, int prop2_len) -{ - if (prop1_len == prop2_len) - return 0; - - pr_warn("cpuidle-powernv: array sizes don't match for %s and %s\n", - prop1, prop2); - return -1; -} - extern u32 pnv_get_supported_cpuidle_states(void); static int powernv_add_idle_states(void) { -- 2.17.1
[RFC v2 1/2] powerpc/powernv : Add support for pre-entry and post-exit of stop state using OPAL V4 OS services
This patch provides kernel framework fro opal support of save restore of sprs in idle stop loop. Opal support for stop states is needed to selectively enable stop states or to introduce a quirk quickly in case a buggy stop state is present. We make a opal call from kernel if firmware-stop-support for stop states is present and enabled. All the quirks for pre-entry of stop state is handled inside opal. A call from opal is made into kernel where we execute stop afer saving of NVGPRs. After waking up from 0x100 vector in kernel, we enter back into opal. All the quirks in post exit path, if any, are then handled in opal, from where we return successfully back to kernel. Signed-off-by: Abhishek Goel --- v1->v2 : Rebased the patch on Nick's Opal V4 OS patchset arch/powerpc/include/asm/opal-api.h| 4 +++- arch/powerpc/include/asm/opal.h| 1 + arch/powerpc/platforms/powernv/idle.c | 12 arch/powerpc/platforms/powernv/opal-call.c | 1 + arch/powerpc/platforms/powernv/opal.c | 15 +++ 5 files changed, 32 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index 97c5e5423827..437b6937685d 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -219,7 +219,8 @@ #define OPAL_REPORT_TRAP 183 #define OPAL_FIND_VM_AREA 184 #define OPAL_REGISTER_OS_OPS 185 -#define OPAL_LAST 185 +#define OPAL_CPU_IDLE 186 +#define OPAL_LAST 186 #define QUIESCE_HOLD 1 /* Spin all calls at entry */ #define QUIESCE_REJECT 2 /* Fail all calls with OPAL_BUSY */ @@ -1207,6 +1208,7 @@ struct opal_os_ops { __be64 os_printf; /* void printf(int32_t level, const char *str) */ __be64 os_vm_map; /* int64_t os_vm_map(uint64_t ea, uint64_t pa, uint64_t flags) */ __be64 os_vm_unmap; /* void os_vm_unmap(uint64_t ea) */ + __be64 os_idle_stop; /* void os_idle_stop(uint64_t srr1_addr, uint64_t psscr) */ }; #endif /* __ASSEMBLY__ */ diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h index 
09985b7718b3..1774c056acb8 100644 --- a/arch/powerpc/include/asm/opal.h +++ b/arch/powerpc/include/asm/opal.h @@ -407,6 +407,7 @@ void opal_sensor_groups_init(void); int64_t opal_find_vm_area(uint64_t addr, struct opal_vm_area *opal_vm_area); int64_t opal_register_os_ops(struct opal_os_ops *ops, uint64_t size); +int64_t opal_cpu_idle(uint64_t srr1_addr, uint64_t psscr); #endif /* __ASSEMBLY__ */ diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index 78599bca66c2..3afd4293f729 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -805,6 +805,18 @@ static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on) return srr1; } +static unsigned long power9_firmware_idle_stop(unsigned long psscr, bool mmu_on) +{ + unsigned long srr1; + int rc; + + rc = opal_cpu_idle(cpu_to_be64(), (uint64_t) psscr); + + if (mmu_on) + mtmsr(MSR_KERNEL); + return srr1; +} + #ifdef CONFIG_HOTPLUG_CPU static unsigned long power9_offline_stop(unsigned long psscr) { diff --git a/arch/powerpc/platforms/powernv/opal-call.c b/arch/powerpc/platforms/powernv/opal-call.c index 11f419e76059..79076ca2de03 100644 --- a/arch/powerpc/platforms/powernv/opal-call.c +++ b/arch/powerpc/platforms/powernv/opal-call.c @@ -351,3 +351,4 @@ OPAL_CALL(opal_sym_to_addr, OPAL_SYM_TO_ADDR); OPAL_CALL(opal_report_trap,OPAL_REPORT_TRAP); OPAL_CALL(opal_find_vm_area, OPAL_FIND_VM_AREA); OPAL_CALL(opal_register_os_ops,OPAL_REGISTER_OS_OPS); +OPAL_CALL(opal_cpu_idle, OPAL_CPU_IDLE); diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index 93b9afaf33b3..1fbf7065f918 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -1150,6 +1150,20 @@ static void os_vm_unmap(uint64_t ea) local_flush_tlb_mm(mm); } +int64_t os_idle_stop(uint64_t srr1_addr, uint64_t psscr) +{ + /* +* For lite state which does not lose even GPRS we call +* idle_stop_noloss while 
for all other states we call +* idle_stop_mayloss. Saving and restoration of other additional +* SPRs if required is handled in OPAL. All the quirks are also +* handled in OPAL. +*/ + if (!(psscr & (PSSCR_EC|PSSCR_ESL))) + return isa300_idle_stop_noloss(psscr); + return isa300_idle_stop_mayloss(psscr); +} + static int __init opal_init_mm(void) { struct mm_struct *mm; @@ -1231,6 +1245,7 @@ static int __init opal_init_ear
[RFC 3/3] powernv/cpuidle : Introduce capability for firmware-enabled-stop
This patch introduces the capability for firmware to handle the stop states instead. A bit is set based on the discovery of the feature and correspondingly also the responsibility to handle the stop states. If Kernel does not know about stop version, it can fallback to opal for idle stop support if firmware-stop-supported property is present. Earlier this patch was posted as part of this series : https://lkml.org/lkml/2020/3/4/589 Signed-off-by: Pratik Rajesh Sampat Signed-off-by: Abhishek Goel --- v1->v2: This patch is newly added in this series. arch/powerpc/include/asm/processor.h | 1 + arch/powerpc/kernel/dt_cpu_ftrs.c | 8 arch/powerpc/platforms/powernv/idle.c | 27 --- 3 files changed, 25 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index 66fa20476d0e..d5c6532b11ef 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -430,6 +430,7 @@ extern unsigned long cpuidle_disable; enum idle_boot_override {IDLE_NO_OVERRIDE = 0, IDLE_POWERSAVE_OFF}; #define STOP_ENABLE0x0001 +#define FIRMWARE_STOP_ENABLE 0x0010 #define STOP_VERSION_P9 0x1 diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c index db1a525e090d..ff4a87b79702 100644 --- a/arch/powerpc/kernel/dt_cpu_ftrs.c +++ b/arch/powerpc/kernel/dt_cpu_ftrs.c @@ -298,6 +298,13 @@ static int __init feat_enable_idle_stop(struct dt_cpu_feature *f) return 1; } +static int __init feat_enable_firmware_stop(struct dt_cpu_feature *f) +{ + stop_dep.cpuidle_prop |= FIRMWARE_STOP_ENABLE; + + return 1; +} + static int __init feat_enable_mmu_hash(struct dt_cpu_feature *f) { u64 lpcr; @@ -592,6 +599,7 @@ static struct dt_cpu_feature_match __initdata {"idle-nap", feat_enable_idle_nap, 0}, {"alignment-interrupt-dsisr", feat_enable_align_dsisr, 0}, {"idle-stop", feat_enable_idle_stop, 0}, + {"firmware-stop-supported", feat_enable_firmware_stop, 0}, {"machine-check-power8", 
feat_enable_mce_power8, 0}, {"performance-monitor-power8", feat_enable_pmu_power8, 0}, {"data-stream-control-register", feat_enable_dscr, CPU_FTR_DSCR}, diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index 538f0842ac3f..0de5de81902e 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -633,16 +633,6 @@ static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on) unsigned long mmcr0 = 0; struct p9_sprs sprs = {}; /* avoid false used-uninitialised */ bool sprs_saved = false; - int rc = 0; - - /* -* Kernel takes decision whether to make OPAL call or not. This logic -* will be combined with the logic for BE opal to take decision. -*/ - if (firmware_stop_supported) { - rc = opal_cpu_idle(cpu_to_be64(__pa()), (uint64_t) psscr); - goto out; - } if (!(psscr & (PSSCR_EC|PSSCR_ESL))) { /* EC=ESL=0 case */ @@ -835,6 +825,19 @@ static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on) return srr1; } +static unsigned long power9_firmware_idle_stop(unsigned long psscr, bool mmu_on) +{ + unsigned long srr1; + int rc; + + rc = opal_cpu_idle(cpu_to_be64(__pa()), (uint64_t) psscr); + + if (mmu_on) + mtmsr(MSR_KERNEL); + return srr1; + +} + #ifdef CONFIG_HOTPLUG_CPU static unsigned long power9_offline_stop(unsigned long psscr) { @@ -1394,9 +1397,11 @@ static int __init pnv_init_idle_states(void) !(stop_dep.cpuidle_prop & STOP_ENABLE)) goto out; - /* Check for supported version in kernel */ + /* Check for supported version in kernel or fallback to kernel*/ if (stop_dep.stop_version & STOP_VERSION_P9) { stop_dep.idle_stop = power9_idle_stop; + } else if (stop_dep.cpuidle_prop & FIRMWARE_STOP_ENABLE) { + stop_dep.idle_stop = power9_firmware_idle_stop; } else { stop_dep.idle_stop = NULL; goto out; -- 2.17.1
[RFC 2/3] powernv/cpuidle : Interface for an idle-stop dependency structure
This patch introduces the idea of having a dependency structure for idle-stop. The structure encapsulates the following: 1. Bitmask for version of idle-stop 2. Bitmask for propterties like ENABLE/DISABLE 3. Function pointer which helps handle how the stop must be invoked This patch lays a foundation for other idle-stop versions to be added and handled cleanly based on their specified requirments. Currently it handles the existing "idle-stop" version by setting the discovery bits and the function pointer. Earlier this patch was posted as part of this series : https://lkml.org/lkml/2020/3/4/589 Signed-off-by: Pratik Rajesh Sampat Signed-off-by: Abhishek Goel --- v1->v2: This patch is newly added in this series. arch/powerpc/include/asm/processor.h | 17 + arch/powerpc/kernel/dt_cpu_ftrs.c | 5 + arch/powerpc/platforms/powernv/idle.c | 18 ++ drivers/cpuidle/cpuidle-powernv.c | 3 ++- 4 files changed, 38 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index eedcbfb9a6ff..66fa20476d0e 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -429,6 +429,23 @@ extern void power4_idle_nap(void); extern unsigned long cpuidle_disable; enum idle_boot_override {IDLE_NO_OVERRIDE = 0, IDLE_POWERSAVE_OFF}; +#define STOP_ENABLE0x0001 + +#define STOP_VERSION_P9 0x1 + +/* + * Classify the dependencies of the stop states + * @idle_stop: function handler to handle the quirk stop version + * @cpuidle_prop: Signify support for stop states through kernel and/or firmware + * @stop_version: Classify quirk versions for stop states + */ +typedef struct { + unsigned long (*idle_stop)(unsigned long psscr, bool mmu_on); + uint8_t cpuidle_prop; + uint8_t stop_version; +} stop_deps_t; +extern stop_deps_t stop_dep; + extern int powersave_nap; /* set if nap mode can be used in idle loop */ extern void power7_idle_type(unsigned long type); diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c 
b/arch/powerpc/kernel/dt_cpu_ftrs.c index 182b4047c1ef..db1a525e090d 100644 --- a/arch/powerpc/kernel/dt_cpu_ftrs.c +++ b/arch/powerpc/kernel/dt_cpu_ftrs.c @@ -292,6 +292,8 @@ static int __init feat_enable_idle_stop(struct dt_cpu_feature *f) lpcr |= LPCR_PECE1; lpcr |= LPCR_PECE2; mtspr(SPRN_LPCR, lpcr); + stop_dep.cpuidle_prop |= STOP_ENABLE; + stop_dep.stop_version = STOP_VERSION_P9; return 1; } @@ -657,6 +659,9 @@ static void __init cpufeatures_setup_start(u32 isa) } } +stop_deps_t stop_dep = {NULL, 0x0, 0x0}; +EXPORT_SYMBOL(stop_dep); + static bool __init cpufeatures_process_feature(struct dt_cpu_feature *f) { const struct dt_cpu_feature_match *m; diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index 1841027b25c5..538f0842ac3f 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -842,7 +842,7 @@ static unsigned long power9_offline_stop(unsigned long psscr) #ifndef CONFIG_KVM_BOOK3S_HV_POSSIBLE __ppc64_runlatch_off(); - srr1 = power9_idle_stop(psscr, true); + srr1 = stop_dep.idle_stop(psscr, true); __ppc64_runlatch_on(); #else /* @@ -858,7 +858,7 @@ static unsigned long power9_offline_stop(unsigned long psscr) local_paca->kvm_hstate.hwthread_state = KVM_HWTHREAD_IN_IDLE; __ppc64_runlatch_off(); - srr1 = power9_idle_stop(psscr, false); + srr1 = stop_dep.idle_stop(psscr, true); __ppc64_runlatch_on(); local_paca->kvm_hstate.hwthread_state = KVM_HWTHREAD_IN_KERNEL; @@ -886,7 +886,7 @@ void power9_idle_type(unsigned long stop_psscr_val, psscr = (psscr & ~stop_psscr_mask) | stop_psscr_val; __ppc64_runlatch_off(); - srr1 = power9_idle_stop(psscr, true); + srr1 = stop_dep.idle_stop(psscr, true); __ppc64_runlatch_on(); fini_irq_for_idle_irqsoff(); @@ -1390,8 +1390,18 @@ static int __init pnv_init_idle_states(void) nr_pnv_idle_states = 0; supported_cpuidle_states = 0; - if (cpuidle_disable != IDLE_NO_OVERRIDE) + if (cpuidle_disable != IDLE_NO_OVERRIDE || + !(stop_dep.cpuidle_prop & 
STOP_ENABLE)) goto out; + + /* Check for supported version in kernel */ + if (stop_dep.stop_version & STOP_VERSION_P9) { + stop_dep.idle_stop = power9_idle_stop; + } else { + stop_dep.idle_stop = NULL; + goto out; + } + rc = pnv_parse_cpuidle_dt(); if (rc) return rc; diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c index 1b299e801f74..7430a8edf5c9 100644 --- a/drivers/cpuidle/cpuidle-powernv.c +++ b/drivers/cpuidle/cpuidle-powernv.c @@ -371,7 +371,8 @@ sta
[RFC 1/3] powernv/cpuidle : Support for pre-entry and post exit of stop state in firmware
This patch provides a kernel framework for OPAL support of save/restore of SPRs in the idle stop loop. OPAL support for stop states is needed to selectively enable stop states or to introduce a quirk quickly in case a buggy stop state is present. We make an OPAL call from the kernel if firmware-stop-support for stop states is present and enabled. All the quirks for pre-entry of a stop state are handled inside OPAL. A call from OPAL is made into the kernel where we execute stop after saving of NVGPRs. After waking up from the 0x100 vector in the kernel, we enter back into OPAL. All the quirks in the post-exit path, if any, are then handled in OPAL, from where we return successfully back to the kernel. For deep stop states in which additional SPRs are lost, saving and restoration will be done in OPAL. This idea was first proposed by Nick here: https://patchwork.ozlabs.org/patch/1208159/ The corresponding skiboot patch for this kernel patch is here: https://patchwork.ozlabs.org/project/skiboot/list/?series=172831 When we callback from OPAL into the kernel, r13 is clobbered. So, to access the PACA we need to restore it from HSPRG0. In future we can handle this in OPAL as in here: https://patchwork.ozlabs.org/patch/1245275/ Signed-off-by: Abhishek Goel Signed-off-by: Nicholas Piggin --- v1->v2 : No change in this patch. 
arch/powerpc/include/asm/opal-api.h| 8 - arch/powerpc/include/asm/opal.h| 3 ++ arch/powerpc/kernel/idle_book3s.S | 5 +++ arch/powerpc/platforms/powernv/idle.c | 37 ++ arch/powerpc/platforms/powernv/opal-call.c | 2 ++ 5 files changed, 54 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index c1f25a760eb1..a2c782c99c9e 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -214,7 +214,9 @@ #define OPAL_SECVAR_GET176 #define OPAL_SECVAR_GET_NEXT 177 #define OPAL_SECVAR_ENQUEUE_UPDATE 178 -#define OPAL_LAST 178 +#define OPAL_REGISTER_OS_OPS 181 +#define OPAL_CPU_IDLE 182 +#define OPAL_LAST 182 #define QUIESCE_HOLD 1 /* Spin all calls at entry */ #define QUIESCE_REJECT 2 /* Fail all calls with OPAL_BUSY */ @@ -1181,6 +1183,10 @@ struct opal_mpipl_fadump { struct opal_mpipl_region region[]; } __packed; +struct opal_os_ops { + __be64 os_idle_stop; +}; + #endif /* __ASSEMBLY__ */ #endif /* __OPAL_API_H */ diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h index 9986ac34b8e2..3c340bc4df8e 100644 --- a/arch/powerpc/include/asm/opal.h +++ b/arch/powerpc/include/asm/opal.h @@ -400,6 +400,9 @@ void opal_powercap_init(void); void opal_psr_init(void); void opal_sensor_groups_init(void); +extern int64_t opal_register_os_ops(struct opal_os_ops *os_ops); +extern int64_t opal_cpu_idle(__be64 srr1_addr, uint64_t psscr); + #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_OPAL_H */ diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S index 22f249b6f58d..8d287d1d06c0 100644 --- a/arch/powerpc/kernel/idle_book3s.S +++ b/arch/powerpc/kernel/idle_book3s.S @@ -49,6 +49,8 @@ _GLOBAL(isa300_idle_stop_noloss) */ _GLOBAL(isa300_idle_stop_mayloss) mtspr SPRN_PSSCR,r3 + mr r6, r13 + mfspr r13, SPRN_HSPRG0 std r1,PACAR1(r13) mflrr4 mfcrr5 @@ -74,6 +76,7 @@ _GLOBAL(isa300_idle_stop_mayloss) std r31,-8*18(r1) std r4,-8*19(r1) std 
r5,-8*20(r1) + std r6,-8*21(r1) /* 168 bytes */ PPC_STOP b . /* catch bugs */ @@ -91,8 +94,10 @@ _GLOBAL(idle_return_gpr_loss) ld r1,PACAR1(r13) ld r4,-8*19(r1) ld r5,-8*20(r1) + ld r6,-8*21(r1) mtlrr4 mtcrr5 + mr r13,r6 /* * KVM nap requires r2 to be saved, rather than just restoring it * from PACATOC. This could be avoided for that less common case diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index 78599bca66c2..1841027b25c5 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -35,6 +35,7 @@ static u32 supported_cpuidle_states; struct pnv_idle_states_t *pnv_idle_states; int nr_pnv_idle_states; +static bool firmware_stop_supported; /* * The default stop state that will be used by ppc_md.power_save @@ -602,6 +603,25 @@ struct p9_sprs { u64 uamor; }; +/* + * This function is called from OPAL if firmware support for stop + * states is present and enabled. It provides a fallback for idle + * stop states via OPAL. + */ +static uint64_t os_idle_stop(uint64_t psscr, bool save_gprs) +{ + /* +* For l
[RFC] cpuidle/powernv : Support for pre-entry and post exit of stop state in firmware
This patch provides a kernel framework for OPAL support of save/restore of SPRs in the idle stop loop. OPAL support for stop states is needed to selectively enable stop states or to introduce a quirk quickly in case a buggy stop state is present. We make an OPAL call from the kernel if firmware-stop-support for stop states is present and enabled. All the quirks for pre-entry of a stop state are handled inside OPAL. A call from OPAL is made into the kernel where we execute stop after saving of NVGPRs. After waking up from the 0x100 vector in the kernel, we enter back into OPAL. All the quirks in the post-exit path, if any, are then handled in OPAL, from where we return successfully back to the kernel. For deep stop states in which additional SPRs are lost, saving and restoration will be done in OPAL. This idea was first proposed by Nick here: https://patchwork.ozlabs.org/patch/1208159/ The corresponding skiboot patch for this kernel patch is here: https://patchwork.ozlabs.org/patch/1265959/ When we callback from OPAL into the kernel, r13 is clobbered. So, to access the PACA we need to restore it from HSPRG0. 
In future we can handle this into OPAL as in here: https://patchwork.ozlabs.org/patch/1245275/ Signed-off-by: Abhishek Goel Signed-off-by: Nicholas Piggin --- arch/powerpc/include/asm/opal-api.h| 8 - arch/powerpc/include/asm/opal.h| 3 ++ arch/powerpc/kernel/idle_book3s.S | 5 +++ arch/powerpc/platforms/powernv/idle.c | 37 ++ arch/powerpc/platforms/powernv/opal-call.c | 2 ++ 5 files changed, 54 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index c1f25a760eb1..a2c782c99c9e 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -214,7 +214,9 @@ #define OPAL_SECVAR_GET176 #define OPAL_SECVAR_GET_NEXT 177 #define OPAL_SECVAR_ENQUEUE_UPDATE 178 -#define OPAL_LAST 178 +#define OPAL_REGISTER_OS_OPS 181 +#define OPAL_CPU_IDLE 182 +#define OPAL_LAST 182 #define QUIESCE_HOLD 1 /* Spin all calls at entry */ #define QUIESCE_REJECT 2 /* Fail all calls with OPAL_BUSY */ @@ -1181,6 +1183,10 @@ struct opal_mpipl_fadump { struct opal_mpipl_region region[]; } __packed; +struct opal_os_ops { + __be64 os_idle_stop; +}; + #endif /* __ASSEMBLY__ */ #endif /* __OPAL_API_H */ diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h index 9986ac34b8e2..3c340bc4df8e 100644 --- a/arch/powerpc/include/asm/opal.h +++ b/arch/powerpc/include/asm/opal.h @@ -400,6 +400,9 @@ void opal_powercap_init(void); void opal_psr_init(void); void opal_sensor_groups_init(void); +extern int64_t opal_register_os_ops(struct opal_os_ops *os_ops); +extern int64_t opal_cpu_idle(__be64 srr1_addr, uint64_t psscr); + #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_OPAL_H */ diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S index 22f249b6f58d..8d287d1d06c0 100644 --- a/arch/powerpc/kernel/idle_book3s.S +++ b/arch/powerpc/kernel/idle_book3s.S @@ -49,6 +49,8 @@ _GLOBAL(isa300_idle_stop_noloss) */ _GLOBAL(isa300_idle_stop_mayloss) mtspr SPRN_PSSCR,r3 + mr r6, r13 
+ mfspr r13, SPRN_HSPRG0 std r1,PACAR1(r13) mflrr4 mfcrr5 @@ -74,6 +76,7 @@ _GLOBAL(isa300_idle_stop_mayloss) std r31,-8*18(r1) std r4,-8*19(r1) std r5,-8*20(r1) + std r6,-8*21(r1) /* 168 bytes */ PPC_STOP b . /* catch bugs */ @@ -91,8 +94,10 @@ _GLOBAL(idle_return_gpr_loss) ld r1,PACAR1(r13) ld r4,-8*19(r1) ld r5,-8*20(r1) + ld r6,-8*21(r1) mtlrr4 mtcrr5 + mr r13,r6 /* * KVM nap requires r2 to be saved, rather than just restoring it * from PACATOC. This could be avoided for that less common case diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index 78599bca66c2..1841027b25c5 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -35,6 +35,7 @@ static u32 supported_cpuidle_states; struct pnv_idle_states_t *pnv_idle_states; int nr_pnv_idle_states; +static bool firmware_stop_supported; /* * The default stop state that will be used by ppc_md.power_save @@ -602,6 +603,25 @@ struct p9_sprs { u64 uamor; }; +/* + * This function is called from OPAL if firmware support for stop + * states is present and enabled. It provides a fallback for idle + * stop states via OPAL. + */ +static uint64_t os_idle_stop(uint64_t psscr, bool save_gprs) +{ + /* +* For lite state which does not lose even GPRS we call
[RFC v5 3/3] cpuidle-powernv : Recompute the idle-state timeouts when state usage is enabled/disabled
The disable callback can be used to compute timeout for other states whenever a state is enabled or disabled. We store the computed timeout in "timeout" defined in cpuidle state strucure. So, we compute timeout only when some state is enabled or disabled and not every time in the fast idle path. We also use the computed timeout to get timeout for snooze, thus getting rid of get_snooze_timeout for snooze loop. Signed-off-by: Abhishek Goel --- drivers/cpuidle/cpuidle-powernv.c | 35 +++ include/linux/cpuidle.h | 1 + 2 files changed, 13 insertions(+), 23 deletions(-) diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c index d7686ce6e..a75226f52 100644 --- a/drivers/cpuidle/cpuidle-powernv.c +++ b/drivers/cpuidle/cpuidle-powernv.c @@ -45,7 +45,6 @@ struct stop_psscr_table { static struct stop_psscr_table stop_psscr_table[CPUIDLE_STATE_MAX] __read_mostly; static u64 default_snooze_timeout __read_mostly; -static bool snooze_timeout_en __read_mostly; static u64 forced_wakeup_timeout(struct cpuidle_device *dev, struct cpuidle_driver *drv, @@ -67,26 +66,13 @@ static u64 forced_wakeup_timeout(struct cpuidle_device *dev, return 0; } -static u64 get_snooze_timeout(struct cpuidle_device *dev, - struct cpuidle_driver *drv, - int index) +static void pnv_disable_callback(struct cpuidle_device *dev, +struct cpuidle_driver *drv) { int i; - if (unlikely(!snooze_timeout_en)) - return default_snooze_timeout; - - for (i = index + 1; i < drv->state_count; i++) { - struct cpuidle_state *s = >states[i]; - struct cpuidle_state_usage *su = >states_usage[i]; - - if (s->disabled || su->disable) - continue; - - return s->target_residency * tb_ticks_per_usec; - } - - return default_snooze_timeout; + for (i = 0; i < drv->state_count; i++) + drv->states[i].timeout = forced_wakeup_timeout(dev, drv, i); } static int snooze_loop(struct cpuidle_device *dev, @@ -94,16 +80,20 @@ static int snooze_loop(struct cpuidle_device *dev, int index) { u64 snooze_exit_time; + u64 
snooze_timeout = drv->states[index].timeout; + + if (!snooze_timeout) + snooze_timeout = default_snooze_timeout; set_thread_flag(TIF_POLLING_NRFLAG); local_irq_enable(); - snooze_exit_time = get_tb() + get_snooze_timeout(dev, drv, index); + snooze_exit_time = get_tb() + snooze_timeout; ppc64_runlatch_off(); HMT_very_low(); while (!need_resched()) { - if (likely(snooze_timeout_en) && get_tb() > snooze_exit_time) { + if (get_tb() > snooze_exit_time) { /* * Task has not woken up but we are exiting the polling * loop anyway. Require a barrier after polling is @@ -168,7 +158,7 @@ static int stop_loop(struct cpuidle_device *dev, u64 timeout_tb; bool forced_wakeup = false; - timeout_tb = forced_wakeup_timeout(dev, drv, index); + timeout_tb = drv->states[index].timeout; if (timeout_tb) { /* Ensure that the timeout is at least one microsecond @@ -263,6 +253,7 @@ static int powernv_cpuidle_driver_init(void) */ drv->cpumask = (struct cpumask *)cpu_present_mask; + drv->disable_callback = pnv_disable_callback; return 0; } @@ -422,8 +413,6 @@ static int powernv_idle_probe(void) /* Device tree can indicate more idle states */ max_idle_state = powernv_add_idle_states(); default_snooze_timeout = TICK_USEC * tb_ticks_per_usec; - if (max_idle_state > 1) - snooze_timeout_en = true; } else return -ENODEV; diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index 1729a497b..64195861b 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -50,6 +50,7 @@ struct cpuidle_state { int power_usage; /* in mW */ unsigned inttarget_residency; /* in US */ booldisabled; /* disabled on all CPUs */ + unsigned long long timeout; /* timeout for exiting out of a state */ int (*enter)(struct cpuidle_device *dev, struct cpuidle_driver *drv, -- 2.17.1
[RFC v5 2/3] cpuidle : Add callback whenever a state usage is enabled/disabled
To force wakeup a cpu, we need to compute the timeout in the fast idle path as a state may be enabled or disabled but there did not exist a feedback to driver when a state is enabled or disabled. This patch adds a callback whenever a state_usage records a store for disable attribute Signed-off-by: Abhishek Goel --- drivers/cpuidle/sysfs.c | 15 ++- include/linux/cpuidle.h | 3 +++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/drivers/cpuidle/sysfs.c b/drivers/cpuidle/sysfs.c index 2bb2683b4..6c9bf2f7b 100644 --- a/drivers/cpuidle/sysfs.c +++ b/drivers/cpuidle/sysfs.c @@ -418,8 +418,21 @@ static ssize_t cpuidle_state_store(struct kobject *kobj, struct attribute *attr, struct cpuidle_state_attr *cattr = attr_to_stateattr(attr); struct cpuidle_device *dev = kobj_to_device(kobj); - if (cattr->store) + if (cattr->store) { ret = cattr->store(state, state_usage, buf, size); + if (ret == size && + strncmp(cattr->attr.name, "disable", + strlen("disable"))) { + struct kobject *cpuidle_kobj = kobj->parent; + struct cpuidle_device *dev = + to_cpuidle_device(cpuidle_kobj); + struct cpuidle_driver *drv = + cpuidle_get_cpu_driver(dev); + + if (drv->disable_callback) + drv->disable_callback(dev, drv); + } + } /* reset poll time cache */ dev->poll_limit_ns = 0; diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index 4b6b5bea8..1729a497b 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -122,6 +122,9 @@ struct cpuidle_driver { /* the driver handles the cpus in cpumask */ struct cpumask *cpumask; + void (*disable_callback)(struct cpuidle_device *dev, +struct cpuidle_driver *drv); + /* preferred governor to switch at register time */ const char *governor; }; -- 2.17.1
[PATCH v5 1/3] cpuidle-powernv : forced wakeup for stop states
Currently, the cpuidle governors determine what idle state a idling CPU should enter into based on heuristics that depend on the idle history on that CPU. Given that no predictive heuristic is perfect, there are cases where the governor predicts a shallow idle state, hoping that the CPU will be busy soon. However, if no new workload is scheduled on that CPU in the near future, the CPU may end up in the shallow state. This is problematic, when the predicted state in the aforementioned scenario is a shallow stop state on a tickless system. As we might get stuck into shallow states for hours, in absence of ticks or interrupts. To address this, We forcefully wakeup the cpu by setting the decrementer. The decrementer is set to a value that corresponds with the residency of the next available state. Thus firing up a timer that will forcefully wakeup the cpu. Few such iterations will essentially train the governor to select a deeper state for that cpu, as the timer here corresponds to the next available cpuidle state residency. Thus, cpu will eventually end up in the deepest possible state. Signed-off-by: Abhishek Goel --- Auto-promotion v1 : started as auto promotion logic for cpuidle states in generic driver v2 : Removed timeout_needed and rebased the code to upstream kernel Forced-wakeup v1 : New patch with name of forced wakeup started v2 : Extending the forced wakeup logic for all states. Setting the decrementer instead of queuing up a hrtimer to implement the logic. 
v3 : Cleanly handle setting/resetting of decrementer so as to not break irq work v4 : Changed type and name of set/reset decrementer fucntion Handled irq_work_pending in try_set_dec_before_idle v5 : Removed forced wakeup for last stop state by changing the checking conditon of timeout_tb arch/powerpc/include/asm/time.h | 2 ++ arch/powerpc/kernel/time.c| 43 +++ drivers/cpuidle/cpuidle-powernv.c | 40 3 files changed, 85 insertions(+) diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h index 08dbe3e68..06a6a2314 100644 --- a/arch/powerpc/include/asm/time.h +++ b/arch/powerpc/include/asm/time.h @@ -184,6 +184,8 @@ static inline unsigned long tb_ticks_since(unsigned long tstamp) extern u64 mulhdu(u64, u64); #endif +extern bool try_set_dec_before_idle(u64 timeout); +extern void try_reset_dec_after_idle(void); extern void div128_by_32(u64 dividend_high, u64 dividend_low, unsigned divisor, struct div_result *dr); diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 694522308..d004c0d8e 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -576,6 +576,49 @@ void arch_irq_work_raise(void) #endif /* CONFIG_IRQ_WORK */ +/* + * This function tries setting decrementer before entering into idle. + * Returns true if we have reprogrammed the decrementer for idle. + * Returns false if the decrementer is unchanged. + */ +bool try_set_dec_before_idle(u64 timeout) +{ + u64 *next_tb = this_cpu_ptr(_next_tb); + u64 now = get_tb_or_rtc(); + + if (now + timeout > *next_tb) + return false; + + set_dec(timeout); + if (test_irq_work_pending()) + set_dec(1); + + return true; +} + +/* + * This function gets called if we have set decrementer before + * entering into idle. It tries to reset/restore the decrementer + * to its original value. 
+ */ +void try_reset_dec_after_idle(void) +{ + u64 now; + u64 *next_tb; + + if (test_irq_work_pending()) + return; + + now = get_tb_or_rtc(); + next_tb = this_cpu_ptr(_next_tb); + if (now >= *next_tb) + return; + + set_dec(*next_tb - now); + if (test_irq_work_pending()) + set_dec(1); +} + /* * timer_interrupt - gets called when the decrementer overflows, * with interrupts disabled. diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c index 84b1ebe21..d7686ce6e 100644 --- a/drivers/cpuidle/cpuidle-powernv.c +++ b/drivers/cpuidle/cpuidle-powernv.c @@ -21,6 +21,7 @@ #include #include #include +#include /* * Expose only those Hardware idle states via the cpuidle framework @@ -46,6 +47,26 @@ static struct stop_psscr_table stop_psscr_table[CPUIDLE_STATE_MAX] __read_mostly static u64 default_snooze_timeout __read_mostly; static bool snooze_timeout_en __read_mostly; +static u64 forced_wakeup_timeout(struct cpuidle_device *dev, +struct cpuidle_driver *drv, +int index) +{ + int i; + + for (i = index + 1; i < drv->state_count; i++) { + struct cpuidle_state *s = >states[i]; + struct cpuidle_state_usage *su = >states_usage[i]; + + if (s->disabled || su->disable) + continue;
[PATCH v5 0/3] Forced-wakeup for stop states on Powernv
Currently, the cpuidle governors determine what idle state an idling CPU should enter into based on heuristics that depend on the idle history on that CPU. Given that no predictive heuristic is perfect, there are cases where the governor predicts a shallow idle state, hoping that the CPU will be busy soon. However, if no new workload is scheduled on that CPU in the near future, the CPU will end up in the shallow state. Motivation -- In case of POWER, this is problematic, when the predicted state in the aforementioned scenario is a shallow stop state on a tickless system. As we might get stuck into shallow states even for hours, in absence of ticks or interrupts. To address this, we forcefully wakeup the cpu by setting the decrementer. The decrementer is set to a value that corresponds with the residency of the next available state. Thus firing up a timer that will forcefully wakeup the cpu. Few such iterations will essentially train the governor to select a deeper state for that cpu, as the timer here corresponds to the next available cpuidle state residency. Thus, cpu will eventually end up in the deepest possible state and we won't get stuck in a shallow state for long duration. Experiment -- For earlier versions when this feature was meant to be only for shallow lite states, I performed experiments for three scenarios to collect some data. case 1 : Without this patch and without tick retained, i.e. in an upstream kernel, it would spend more than even a second to get out of stop0_lite. case 2 : With tick retained in an upstream kernel - Generally, we have a sched tick at 4ms(CONFIG_HZ = 250). Ideally I expected it to take 8 sched ticks to get out of stop0_lite. Experimentally, observation was = sample minmax 99percentile 20 4ms12ms 4ms = It would take at least one sched tick to get out of stop0_lite. 
case 3 : With this patch (not stopping tick, but explicitly queuing a timer) sample min max 99percentile 20 144us 192us 144us Description of current implementation - We calculate timeout for the current idle state as the residency value of the next available idle state. If the decrementer is set to be greater than this timeout, we update the decrementer value with the residency of next available idle state. Thus, essentially training the governor to select the next available deeper state until we reach the deepest state. Hence, we won't get stuck unnecessarily in shallow states for longer duration. v1 of auto-promotion : https://lkml.org/lkml/2019/3/22/58 This patch was implemented only for shallow lite state in generic cpuidle driver. v2 : Removed timeout_needed and rebased to current upstream kernel Then, v1 of forced-wakeup : Moved the code to cpuidle powernv driver and started as forced wakeup instead of auto-promotion v2 : Extended the forced wakeup logic for all states. Setting the decrementer instead of queuing up a hrtimer to implement the logic. v3 : 1) Cleanly handle setting the decrementer after exiting out of stop states. 2) Added a disable_callback feature to compute timeout whenever a state is enabled or disabled instead of computing every time in fast idle path. 3) Use disable callback to recompute timeout whenever state usage is changed for a state. Also, cleaned up the get_snooze_timeout function. v4 : Changed the type and name of set/reset decrementer function. Handled irq work pending in try_set_dec_before_idle. No change in patch 2 and 3. v5 : Removed forced wakeup for the last state. We don't want to wakeup unnecessarily when already in deepest state. It was a mistake in previous patches that was found out in recent experiments. No change in patch 2 and 3. 
Abhishek Goel (3): cpuidle-powernv : forced wakeup for stop states cpuidle : Add callback whenever a state usage is enabled/disabled cpuidle-powernv : Recompute the idle-state timeouts when state usage is enabled/disabled arch/powerpc/include/asm/time.h | 2 ++ arch/powerpc/kernel/time.c| 43 drivers/cpuidle/cpuidle-powernv.c | 55 +++ drivers/cpuidle/sysfs.c | 15 - include/linux/cpuidle.h | 4 +++ 5 files changed, 105 insertions(+), 14 deletions(-) -- 2.17.1
[RFC 3/3] cpuidle/powernv : Add flags to identify stop state type
Removed threshold latency which was being used to decide if a state is cpuidle type or not. This decision can be taken using flags, as this information has been encapsulated in the state->flags and being read from idle device-tree. Signed-off-by: Abhishek Goel --- arch/powerpc/include/asm/opal-api.h | 7 +++ drivers/cpuidle/cpuidle-powernv.c | 16 +--- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index 383242eb0dea..b9068fee21d8 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -233,6 +233,13 @@ #define OPAL_PM_STOP_INST_FAST 0x0010 #define OPAL_PM_STOP_INST_DEEP 0x0020 +/* + * Flags for stop states to distinguish between cpuidle and + * cpuoffline type of states. + */ +#define OPAL_PM_STOP_CPUIDLE 0x0100 +#define OPAL_PM_STOP_CPUHOTPLUG0x0200 + /* * OPAL_CONFIG_CPU_IDLE_STATE parameters */ diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c index 1b6c84d4ac77..1a33a548b769 100644 --- a/drivers/cpuidle/cpuidle-powernv.c +++ b/drivers/cpuidle/cpuidle-powernv.c @@ -270,8 +270,13 @@ static int powernv_add_idle_states(void) goto out; } - /* TODO: Count only states which are eligible for cpuidle */ - dt_idle_states = nr_pnv_idle_states; + /* Count only cpuidle states*/ + for (i = 0; i < nr_pnv_idle_states; i++) { + if (pnv_idle_states[i].flags & OPAL_PM_STOP_CPUIDLE) + dt_idle_states++; + } + pr_info("idle states in dt = %d , states with idle flag = %d", + nr_pnv_idle_states, dt_idle_states); /* * Since snooze is used as first idle state, max idle states allowed is @@ -300,11 +305,8 @@ static int powernv_add_idle_states(void) continue; } - /* -* If an idle state has exit latency beyond -* POWERNV_THRESHOLD_LATENCY_NS then don't use it in cpu-idle. 
-*/ - if (state->latency_ns > POWERNV_THRESHOLD_LATENCY_NS) { + /* Check whether a state is of cpuidle type */ + if ((state->flags & OPAL_PM_STOP_CPUIDLE) != state->flags) { pr_info("State %d is not idletype\n", i); continue; } -- 2.17.1
[RFC 2/3] cpuidle/powernv: Add support for versioned stop states
New device tree format adds a compatible flag which has version corresponding to every state, so that only kernel which has the capability to handle the version of stop state will enable it. Drawback of the array based dt node is that versioning of idle states is not possible. Older kernel will still see stop0 and stop0_lite in older format and we will deprecate it after some time. Consider a case that stop4 has a bug. We take the following steps to mitigate the problem. 1) Change compatible string for stop4 in OPAL to "stop4,v2" from "stop4,v1", i.e. basicallly bump up the previous version and ship the new firmware. 2) The kernel will ignore stop4 as it won't be able to recognize this new version. Kernel will also ignore all the deeper states because its possible that a cpu have requested for a deeper state but was never able to enter into it. But we will still have shallower states that are there before stop 4. This, thus prevents from completely disabling stop states. Linux kernel can now look at the version string and decide if it has the ability to handle that idle state. Henceforth, if kernel does not know about a version, it will skip that state and all the deeper state. Once when the workaround are implemented into the kernel, we can bump up the known version in kernel for that state, so that support can be enabled once again in kernel. New Device-tree : Final output power-mgt { ... ibm,enabled-stop-levels = <0xec00>; ibm,cpu-idle-state-psscr-mask = <0x0 0x3003ff 0x0 0x3003ff>; ibm,cpu-idle-state-latencies-ns = <0x3e8 0x7d0>; ibm,cpu-idle-state-psscr = <0x0 0x330 0x0 0x300330>; ibm,cpu-idle-state-flags = <0x10 0x101000>; ibm,cpu-idle-state-residency-ns = <0x2710 0x4e20>; ibm,idle-states { stop4 { flags = <0x207000>; compatible = "stop4,v1", psscr-mask = <0x0 0x3003ff>; latency-ns = <0x186a0>; residency-ns = <0x989680>; psscr = <0x0 0x300374>; ... }; ... stop11 { ... compatible = "stop11,v1", ... 
}; }; Since state pointer is being passed to stop loop, I have separated idle fast path for lite, shallow and deep states. This improves the readability of the code and also future maintainability of the code. Stop handle corresponding to each state can be called directly since state pointer is being passed now. Signed-off-by: Abhishek Goel --- arch/powerpc/include/asm/cpuidle.h| 8 +- arch/powerpc/platforms/powernv/idle.c | 331 +++--- 2 files changed, 299 insertions(+), 40 deletions(-) diff --git a/arch/powerpc/include/asm/cpuidle.h b/arch/powerpc/include/asm/cpuidle.h index 9844b3ded187..5eb9a98fcb86 100644 --- a/arch/powerpc/include/asm/cpuidle.h +++ b/arch/powerpc/include/asm/cpuidle.h @@ -70,15 +70,19 @@ #ifndef __ASSEMBLY__ -#define PNV_IDLE_NAME_LEN16 +#define PNV_VERSION_LEN25 +#define PNV_IDLE_NAME_LEN 16 struct pnv_idle_states_t { char name[PNV_IDLE_NAME_LEN]; + char compat_version[PNV_VERSION_LEN]; u32 latency_ns; u32 residency_ns; u64 psscr_val; u64 psscr_mask; - u32 flags; + u64 flags; bool valid; + unsigned long (*stop_handle)(struct pnv_idle_states_t *state, +bool mmu_on); }; extern struct pnv_idle_states_t *pnv_idle_states; diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index db810c0a16e1..7398a66e1ddb 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -49,6 +49,12 @@ static bool default_stop_found; static u64 pnv_first_tb_loss_level = MAX_STOP_STATE + 1; static u64 pnv_first_spr_loss_level = MAX_STOP_STATE + 1; +struct stop_version_t { + const char compat_version[PNV_VERSION_LEN]; + unsigned long (*stop_handle)(struct pnv_idle_states_t *state, +bool mmu_on); +}; + /* * psscr value and mask of the deepest stop idle state. * Used when a cpu is offlined. 
@@ -599,40 +605,152 @@ struct p9_sprs { u64 uamor; }; -static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on) +/* Splitting the previous power9_idle_stop into three functions + * to separately handle lite, shallow and deep states. + */ + +static unsigned long power9_stop_lite(struct pnv_idle_states_t *state, + bool mmu_on) { - int cpu = raw_smp_processor_id(); - int first = cpu_first_thread_sib
[RFC 1/3] cpuidle/powernv : Pass state pointer instead of values to stop loop
Instead of passing psscr_val and psscr_mask, we will pass state pointer to stop loop. This will help to figure out the method to enter/exit idle state. Removed psscr_mask and psscr_val from driver idle code. Same has also been done in platform idle code. Also, added some cleanups and debugging info. Signed-off-by: Abhishek Goel --- arch/powerpc/include/asm/processor.h | 5 +- arch/powerpc/platforms/powernv/idle.c | 50 --- drivers/cpuidle/cpuidle-powernv.c | 69 +-- 3 files changed, 55 insertions(+), 69 deletions(-) diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index a9993e7a443b..855666e8d271 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -39,6 +39,7 @@ #include #include #include +#include /* We do _not_ want to define new machine types at all, those must die * in favor of using the device-tree @@ -419,9 +420,7 @@ enum idle_boot_override {IDLE_NO_OVERRIDE = 0, IDLE_POWERSAVE_OFF}; extern int powersave_nap; /* set if nap mode can be used in idle loop */ extern void power7_idle_type(unsigned long type); -extern void power9_idle_type(unsigned long stop_psscr_val, - unsigned long stop_psscr_mask); - +extern void power9_idle_type(struct pnv_idle_states_t *state); extern void flush_instruction_cache(void); extern void hard_reset_now(void); extern void poweroff_now(void); diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index 09f49eed7fb8..db810c0a16e1 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -40,8 +40,7 @@ int nr_pnv_idle_states; * The default stop state that will be used by ppc_md.power_save * function on platforms that support stop instruction. 
*/ -static u64 pnv_default_stop_val; -static u64 pnv_default_stop_mask; +struct pnv_idle_states_t *pnv_default_state; static bool default_stop_found; /* @@ -54,9 +53,7 @@ static u64 pnv_first_spr_loss_level = MAX_STOP_STATE + 1; * psscr value and mask of the deepest stop idle state. * Used when a cpu is offlined. */ -static u64 pnv_deepest_stop_psscr_val; -static u64 pnv_deepest_stop_psscr_mask; -static u64 pnv_deepest_stop_flag; +static struct pnv_idle_states_t *pnv_deepest_state; static bool deepest_stop_found; static unsigned long power7_offline_type; @@ -78,7 +75,7 @@ static int pnv_save_sprs_for_deep_states(void) uint64_t hid5_val = mfspr(SPRN_HID5); uint64_t hmeer_val = mfspr(SPRN_HMEER); uint64_t msr_val = MSR_IDLE; - uint64_t psscr_val = pnv_deepest_stop_psscr_val; + uint64_t psscr_val = pnv_deepest_state->psscr_val; for_each_present_cpu(cpu) { uint64_t pir = get_hard_smp_processor_id(cpu); @@ -804,9 +801,13 @@ static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on) } #ifdef CONFIG_HOTPLUG_CPU -static unsigned long power9_offline_stop(unsigned long psscr) +static unsigned long power9_offline_stop(struct pnv_idle_states_t *state) { unsigned long srr1; + unsigned long psscr; + + psscr = mfspr(SPRN_PSSCR); + psscr = (psscr & ~state->psscr_mask) | state->psscr_val; #ifndef CONFIG_KVM_BOOK3S_HV_POSSIBLE __ppc64_runlatch_off(); @@ -841,8 +842,7 @@ static unsigned long power9_offline_stop(unsigned long psscr) } #endif -void power9_idle_type(unsigned long stop_psscr_val, - unsigned long stop_psscr_mask) +void power9_idle_type(struct pnv_idle_states_t *state) { unsigned long psscr; unsigned long srr1; @@ -851,7 +851,7 @@ void power9_idle_type(unsigned long stop_psscr_val, return; psscr = mfspr(SPRN_PSSCR); - psscr = (psscr & ~stop_psscr_mask) | stop_psscr_val; + psscr = (psscr & ~state->psscr_mask) | state->psscr_val; __ppc64_runlatch_off(); srr1 = power9_idle_stop(psscr, true); @@ -867,7 +867,7 @@ void power9_idle_type(unsigned long stop_psscr_val, 
*/ void power9_idle(void) { - power9_idle_type(pnv_default_stop_val, pnv_default_stop_mask); + power9_idle_type(pnv_default_state); } #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE @@ -970,12 +970,7 @@ unsigned long pnv_cpu_offline(unsigned int cpu) __ppc64_runlatch_off(); if (cpu_has_feature(CPU_FTR_ARCH_300) && deepest_stop_found) { - unsigned long psscr; - - psscr = mfspr(SPRN_PSSCR); - psscr = (psscr & ~pnv_deepest_stop_psscr_mask) | - pnv_deepest_stop_psscr_val; - srr1 = power9_offline_stop(psscr); + srr1 = power9_offline_stop(pnv_deepest_state); } else if (cpu_has_feature(CPU_FTR_ARCH_206) && power7_offline_type) { srr1 = power7_offline(); } el
[RFC 0/3] New idle device-tree format and support for versioned stop state
Background -- Previously if an older kernel runs on a newer firmware, it may enable all available states irrespective of its capability of handling it. Consider a case that some stop state has a bug, we end up disabling all the stop states. This patch introduces selective control to solve this problem. Previous version of these patches can be found at: https://lkml.org/lkml/2018/10/11/544 These patches however also had patches for support of opal save-restore which now I am decoupling and will take them separately. I have posted the corresponding skiboot patches for this kernel patchset here : https://patchwork.ozlabs.org/cover/1144587/ What's new? Add stop states under ibm,idle-states in addition to the current array based device tree properties. New device tree format adds a compatible flag which has version corresponding to every state, so that only a kernel which has the capability to handle the version of a stop state will enable it. Drawback of the array based dt node is that versioning of idle states is not possible. Older kernels will still see stop0 and stop0_lite in the older format and we will deprecate it after some time. Consider a case that stop4 has a bug. We take the following steps to mitigate the problem. 1) Change compatible string for stop4 in OPAL to "stop4,v2" from "stop4,v1", i.e. basically bump up the previous version and ship the new firmware. 2) The kernel will ignore stop4 as it won't be able to recognize this new version. Kernel will also ignore all the deeper states because it's possible that a cpu may have requested a deeper state but was never able to enter into it. But we will still have shallower states that are there before stop4. This, thus, prevents completely disabling stop states. The Linux kernel can now look at the version string and decide if it has the ability to handle that idle state. Henceforth, if the kernel does not know about a version, it will skip that state and all the deeper states. 
Once when the workaround are implemented into the kernel, we can bump up the known version in kernel for that state, so that support can be enabled once again in kernel. New Device-tree : Final output power-mgt { ... ibm,enabled-stop-levels = <0xec00>; ibm,cpu-idle-state-psscr-mask = <0x0 0x3003ff 0x0 0x3003ff>; ibm,cpu-idle-state-latencies-ns = <0x3e8 0x7d0>; ibm,cpu-idle-state-psscr = <0x0 0x330 0x0 0x300330>; ibm,cpu-idle-state-flags = <0x10 0x101000>; ibm,cpu-idle-state-residency-ns = <0x2710 0x4e20>; ibm,idle-states { stop4 { flags = <0x207000>; compatible = "stop4,v1", psscr-mask = <0x0 0x3003ff>; latency-ns = <0x186a0>; residency-ns = <0x989680>; psscr = <0x0 0x300374>; ... }; ... stop11 { ... compatible = "stop11,v1", ... }; }; Abhishek Goel (3): cpuidle/powernv : Pass state pointer instead of values to stop loop cpuidle/powernv: Add support for versioned stop states cpuidle/powernv : Add flags to identify stop state type arch/powerpc/include/asm/cpuidle.h| 8 +- arch/powerpc/include/asm/opal-api.h | 7 + arch/powerpc/include/asm/processor.h | 5 +- arch/powerpc/platforms/powernv/idle.c | 371 +- drivers/cpuidle/cpuidle-powernv.c | 81 +++--- 5 files changed, 363 insertions(+), 109 deletions(-) -- 2.17.1
[RFC v4 3/3] cpuidle-powernv : Recompute the idle-state timeouts when state usage is enabled/disabled
The disable callback can be used to compute timeout for other states whenever a state is enabled or disabled. We store the computed timeout in "timeout" defined in cpuidle state strucure. So, we compute timeout only when some state is enabled or disabled and not every time in the fast idle path. We also use the computed timeout to get timeout for snooze, thus getting rid of get_snooze_timeout for snooze loop. Signed-off-by: Abhishek Goel --- drivers/cpuidle/cpuidle-powernv.c | 35 +++ include/linux/cpuidle.h | 1 + 2 files changed, 13 insertions(+), 23 deletions(-) diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c index 17e20e408ffe..29add322d0c4 100644 --- a/drivers/cpuidle/cpuidle-powernv.c +++ b/drivers/cpuidle/cpuidle-powernv.c @@ -45,7 +45,6 @@ struct stop_psscr_table { static struct stop_psscr_table stop_psscr_table[CPUIDLE_STATE_MAX] __read_mostly; static u64 default_snooze_timeout __read_mostly; -static bool snooze_timeout_en __read_mostly; static u64 forced_wakeup_timeout(struct cpuidle_device *dev, struct cpuidle_driver *drv, @@ -67,26 +66,13 @@ static u64 forced_wakeup_timeout(struct cpuidle_device *dev, return 0; } -static u64 get_snooze_timeout(struct cpuidle_device *dev, - struct cpuidle_driver *drv, - int index) +static void pnv_disable_callback(struct cpuidle_device *dev, +struct cpuidle_driver *drv) { int i; - if (unlikely(!snooze_timeout_en)) - return default_snooze_timeout; - - for (i = index + 1; i < drv->state_count; i++) { - struct cpuidle_state *s = >states[i]; - struct cpuidle_state_usage *su = >states_usage[i]; - - if (s->disabled || su->disable) - continue; - - return s->target_residency * tb_ticks_per_usec; - } - - return default_snooze_timeout; + for (i = 0; i < drv->state_count; i++) + drv->states[i].timeout = forced_wakeup_timeout(dev, drv, i); } static int snooze_loop(struct cpuidle_device *dev, @@ -94,16 +80,20 @@ static int snooze_loop(struct cpuidle_device *dev, int index) { u64 snooze_exit_time; + 
u64 snooze_timeout = drv->states[index].timeout; + + if (!snooze_timeout) + snooze_timeout = default_snooze_timeout; set_thread_flag(TIF_POLLING_NRFLAG); local_irq_enable(); - snooze_exit_time = get_tb() + get_snooze_timeout(dev, drv, index); + snooze_exit_time = get_tb() + snooze_timeout; ppc64_runlatch_off(); HMT_very_low(); while (!need_resched()) { - if (likely(snooze_timeout_en) && get_tb() > snooze_exit_time) { + if (get_tb() > snooze_exit_time) { /* * Task has not woken up but we are exiting the polling * loop anyway. Require a barrier after polling is @@ -168,7 +158,7 @@ static int stop_loop(struct cpuidle_device *dev, u64 timeout_tb; bool forced_wakeup = false; - timeout_tb = forced_wakeup_timeout(dev, drv, index); + timeout_tb = drv->states[index].timeout; /* Ensure that the timeout is at least one microsecond * greater than current decrement value. Else, we will @@ -263,6 +253,7 @@ static int powernv_cpuidle_driver_init(void) */ drv->cpumask = (struct cpumask *)cpu_present_mask; + drv->disable_callback = pnv_disable_callback; return 0; } @@ -422,8 +413,6 @@ static int powernv_idle_probe(void) /* Device tree can indicate more idle states */ max_idle_state = powernv_add_idle_states(); default_snooze_timeout = TICK_USEC * tb_ticks_per_usec; - if (max_idle_state > 1) - snooze_timeout_en = true; } else return -ENODEV; diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index 8a0e54bd0d5d..31662b657b9c 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -50,6 +50,7 @@ struct cpuidle_state { int power_usage; /* in mW */ unsigned inttarget_residency; /* in US */ booldisabled; /* disabled on all CPUs */ + unsigned long long timeout; /* timeout for exiting out of a state */ int (*enter)(struct cpuidle_device *dev, struct cpuidle_driver *drv, -- 2.17.1
[RFC v4 2/3] cpuidle : Add callback whenever a state usage is enabled/disabled
To force wakeup a cpu, we need to compute the timeout in the fast idle path as a state may be enabled or disabled but there did not exist a feedback to driver when a state is enabled or disabled. This patch adds a callback whenever a state_usage records a store for disable attribute. Signed-off-by: Abhishek Goel --- drivers/cpuidle/sysfs.c | 15 ++- include/linux/cpuidle.h | 4 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/drivers/cpuidle/sysfs.c b/drivers/cpuidle/sysfs.c index eb20adb5de23..141671a53967 100644 --- a/drivers/cpuidle/sysfs.c +++ b/drivers/cpuidle/sysfs.c @@ -415,8 +415,21 @@ static ssize_t cpuidle_state_store(struct kobject *kobj, struct attribute *attr, struct cpuidle_state_usage *state_usage = kobj_to_state_usage(kobj); struct cpuidle_state_attr *cattr = attr_to_stateattr(attr); - if (cattr->store) + if (cattr->store) { ret = cattr->store(state, state_usage, buf, size); + if (ret == size && + strncmp(cattr->attr.name, "disable", + strlen("disable"))) { + struct kobject *cpuidle_kobj = kobj->parent; + struct cpuidle_device *dev = + to_cpuidle_device(cpuidle_kobj); + struct cpuidle_driver *drv = + cpuidle_get_cpu_driver(dev); + + if (drv->disable_callback) + drv->disable_callback(dev, drv); + } + } return ret; } diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index bb9a0db89f1a..8a0e54bd0d5d 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -119,6 +119,10 @@ struct cpuidle_driver { /* the driver handles the cpus in cpumask */ struct cpumask *cpumask; + + void (*disable_callback)(struct cpuidle_device *dev, + struct cpuidle_driver *drv); + }; #ifdef CONFIG_CPU_IDLE -- 2.17.1
[PATCH v4 1/3] cpuidle-powernv : forced wakeup for stop states
Currently, the cpuidle governors determine what idle state a idling CPU should enter into based on heuristics that depend on the idle history on that CPU. Given that no predictive heuristic is perfect, there are cases where the governor predicts a shallow idle state, hoping that the CPU will be busy soon. However, if no new workload is scheduled on that CPU in the near future, the CPU may end up in the shallow state. This is problematic, when the predicted state in the aforementioned scenario is a shallow stop state on a tickless system. As we might get stuck into shallow states for hours, in absence of ticks or interrupts. To address this, We forcefully wakeup the cpu by setting the decrementer. The decrementer is set to a value that corresponds with the residency of the next available state. Thus firing up a timer that will forcefully wakeup the cpu. Few such iterations will essentially train the governor to select a deeper state for that cpu, as the timer here corresponds to the next available cpuidle state residency. Thus, cpu will eventually end up in the deepest possible state. Signed-off-by: Abhishek Goel --- Auto-promotion v1 : started as auto promotion logic for cpuidle states in generic driver v2 : Removed timeout_needed and rebased the code to upstream kernel Forced-wakeup v1 : New patch with name of forced wakeup started v2 : Extending the forced wakeup logic for all states. Setting the decrementer instead of queuing up a hrtimer to implement the logic. 
v3 : Cleanly handle setting/resetting of decrementer so as to not break irq work v4 : Changed type and name of set/reset decrementer fucntion Handled irq_work_pending in try_set_dec_before_idle arch/powerpc/include/asm/time.h | 2 ++ arch/powerpc/kernel/time.c| 43 +++ drivers/cpuidle/cpuidle-powernv.c | 40 3 files changed, 85 insertions(+) diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h index 54f4ec1f9fab..294a472ce161 100644 --- a/arch/powerpc/include/asm/time.h +++ b/arch/powerpc/include/asm/time.h @@ -188,6 +188,8 @@ static inline unsigned long tb_ticks_since(unsigned long tstamp) extern u64 mulhdu(u64, u64); #endif +extern bool try_set_dec_before_idle(u64 timeout); +extern void try_reset_dec_after_idle(void); extern void div128_by_32(u64 dividend_high, u64 dividend_low, unsigned divisor, struct div_result *dr); diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 694522308cd5..d004c0d8e099 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -576,6 +576,49 @@ void arch_irq_work_raise(void) #endif /* CONFIG_IRQ_WORK */ +/* + * This function tries setting decrementer before entering into idle. + * Returns true if we have reprogrammed the decrementer for idle. + * Returns false if the decrementer is unchanged. + */ +bool try_set_dec_before_idle(u64 timeout) +{ + u64 *next_tb = this_cpu_ptr(_next_tb); + u64 now = get_tb_or_rtc(); + + if (now + timeout > *next_tb) + return false; + + set_dec(timeout); + if (test_irq_work_pending()) + set_dec(1); + + return true; +} + +/* + * This function gets called if we have set decrementer before + * entering into idle. It tries to reset/restore the decrementer + * to its original value. 
+ */ +void try_reset_dec_after_idle(void) +{ + u64 now; + u64 *next_tb; + + if (test_irq_work_pending()) + return; + + now = get_tb_or_rtc(); + next_tb = this_cpu_ptr(_next_tb); + if (now >= *next_tb) + return; + + set_dec(*next_tb - now); + if (test_irq_work_pending()) + set_dec(1); +} + /* * timer_interrupt - gets called when the decrementer overflows, * with interrupts disabled. diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c index 84b1ebe212b3..17e20e408ffe 100644 --- a/drivers/cpuidle/cpuidle-powernv.c +++ b/drivers/cpuidle/cpuidle-powernv.c @@ -21,6 +21,7 @@ #include #include #include +#include /* * Expose only those Hardware idle states via the cpuidle framework @@ -46,6 +47,26 @@ static struct stop_psscr_table stop_psscr_table[CPUIDLE_STATE_MAX] __read_mostly static u64 default_snooze_timeout __read_mostly; static bool snooze_timeout_en __read_mostly; +static u64 forced_wakeup_timeout(struct cpuidle_device *dev, +struct cpuidle_driver *drv, +int index) +{ + int i; + + for (i = index + 1; i < drv->state_count; i++) { + struct cpuidle_state *s = >states[i]; + struct cpuidle_state_usage *su = >states_usage[i]; + + if (s->disabled || su->disable) + continue; + + return (s->target_residency + 2 * s->exit_latency) * +
[PATCH v4 0/3] Forced-wakeup for stop states on Powernv
Currently, the cpuidle governors determine what idle state an idling CPU should enter into based on heuristics that depend on the idle history on that CPU. Given that no predictive heuristic is perfect, there are cases where the governor predicts a shallow idle state, hoping that the CPU will be busy soon. However, if no new workload is scheduled on that CPU in the near future, the CPU will end up in the shallow state. Motivation -- In case of POWER, this is problematic, when the predicted state in the aforementioned scenario is a shallow stop state on a tickless system. As we might get stuck into shallow states even for hours, in absence of ticks or interrupts. To address this, we forcefully wakeup the cpu by setting the decrementer. The decrementer is set to a value that corresponds with the residency of the next available state. Thus firing up a timer that will forcefully wakeup the cpu. Few such iterations will essentially train the governor to select a deeper state for that cpu, as the timer here corresponds to the next available cpuidle state residency. Thus, cpu will eventually end up in the deepest possible state and we won't get stuck in a shallow state for long duration. Experiment -- For earlier versions when this feature was meant to be only for shallow lite states, I performed experiments for three scenarios to collect some data. case 1 : Without this patch and without tick retained, i.e. in an upstream kernel, It would spend more than even a second to get out of stop0_lite. case 2 : With tick retained in an upstream kernel - Generally, we have a sched tick at 4ms(CONFIG_HZ = 250). Ideally I expected it to take 8 sched ticks to get out of stop0_lite. Experimentally, observation was = sample min max 99percentile 20 4ms 12ms 4ms = It would take at least one sched tick to get out of stop0_lite. 
case 3 : With this patch (not stopping tick, but explicitly queuing a timer) sample min max 99percentile 20 144us 192us 144us Description of current implementation - We calculate timeout for the current idle state as the residency value of the next available idle state. If the decrementer is set to be greater than this timeout, we update the decrementer value with the residency of the next available idle state. Thus, essentially training the governor to select the next available deeper state until we reach the deepest state. Hence, we won't get stuck unnecessarily in shallow states for a longer duration. v1 of auto-promotion : https://lkml.org/lkml/2019/3/22/58 This patch was implemented only for shallow lite states in the generic cpuidle driver. v2 : Removed timeout_needed and rebased to current upstream kernel Then, v1 of forced-wakeup : Moved the code to the cpuidle powernv driver and started as forced wakeup instead of auto-promotion v2 : Extended the forced wakeup logic for all states. Setting the decrementer instead of queuing up an hrtimer to implement the logic. v3 : 1) Cleanly handle setting the decrementer after exiting out of stop states. 2) Added a disable_callback feature to compute timeout whenever a state is enabled or disabled instead of computing every time in the fast idle path. 3) Use disable callback to recompute timeout whenever state usage is changed for a state. Also, cleaned up the get_snooze_timeout function. v4 : Changed the type and name of set/reset decrementer function. Handled irq work pending in try_set_dec_before_idle. No change in patch 2 and 3. 
Abhishek Goel (3): cpuidle-powernv : forced wakeup for stop states cpuidle : Add callback whenever a state usage is enabled/disabled cpuidle-powernv : Recompute the idle-state timeouts when state usage is enabled/disabled arch/powerpc/include/asm/time.h | 2 ++ arch/powerpc/kernel/time.c| 43 drivers/cpuidle/cpuidle-powernv.c | 55 +++ drivers/cpuidle/sysfs.c | 15 - include/linux/cpuidle.h | 5 +++ 5 files changed, 106 insertions(+), 14 deletions(-) -- 2.17.1
[RFC v3 3/3] cpuidle-powernv : Recompute the idle-state timeouts when state usage is enabled/disabled
The disable callback can be used to compute timeout for other states whenever a state is enabled or disabled. We store the computed timeout in "timeout" defined in cpuidle state strucure. So, we compute timeout only when some state is enabled or disabled and not every time in the fast idle path. We also use the computed timeout to get timeout for snooze, thus getting rid of get_snooze_timeout for snooze loop. Signed-off-by: Abhishek Goel --- drivers/cpuidle/cpuidle-powernv.c | 35 +++ include/linux/cpuidle.h | 1 + 2 files changed, 13 insertions(+), 23 deletions(-) diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c index f51478460..7350f404a 100644 --- a/drivers/cpuidle/cpuidle-powernv.c +++ b/drivers/cpuidle/cpuidle-powernv.c @@ -45,7 +45,6 @@ struct stop_psscr_table { static struct stop_psscr_table stop_psscr_table[CPUIDLE_STATE_MAX] __read_mostly; static u64 default_snooze_timeout __read_mostly; -static bool snooze_timeout_en __read_mostly; static u64 forced_wakeup_timeout(struct cpuidle_device *dev, struct cpuidle_driver *drv, @@ -67,26 +66,13 @@ static u64 forced_wakeup_timeout(struct cpuidle_device *dev, return 0; } -static u64 get_snooze_timeout(struct cpuidle_device *dev, - struct cpuidle_driver *drv, - int index) +static void pnv_disable_callback(struct cpuidle_device *dev, +struct cpuidle_driver *drv) { int i; - if (unlikely(!snooze_timeout_en)) - return default_snooze_timeout; - - for (i = index + 1; i < drv->state_count; i++) { - struct cpuidle_state *s = >states[i]; - struct cpuidle_state_usage *su = >states_usage[i]; - - if (s->disabled || su->disable) - continue; - - return s->target_residency * tb_ticks_per_usec; - } - - return default_snooze_timeout; + for (i = 0; i < drv->state_count; i++) + drv->states[i].timeout = forced_wakeup_timeout(dev, drv, i); } static int snooze_loop(struct cpuidle_device *dev, @@ -94,16 +80,20 @@ static int snooze_loop(struct cpuidle_device *dev, int index) { u64 snooze_exit_time; + u64 
snooze_timeout = drv->states[index].timeout; + + if (!snooze_timeout) + snooze_timeout = default_snooze_timeout; set_thread_flag(TIF_POLLING_NRFLAG); local_irq_enable(); - snooze_exit_time = get_tb() + get_snooze_timeout(dev, drv, index); + snooze_exit_time = get_tb() + snooze_timeout; ppc64_runlatch_off(); HMT_very_low(); while (!need_resched()) { - if (likely(snooze_timeout_en) && get_tb() > snooze_exit_time) { + if (get_tb() > snooze_exit_time) { /* * Task has not woken up but we are exiting the polling * loop anyway. Require a barrier after polling is @@ -168,7 +158,7 @@ static int stop_loop(struct cpuidle_device *dev, u64 timeout_tb; int forced_wakeup = 0; - timeout_tb = forced_wakeup_timeout(dev, drv, index); + timeout_tb = drv->states[index].timeout; if (timeout_tb) forced_wakeup = set_dec_before_idle(timeout_tb); @@ -255,6 +245,7 @@ static int powernv_cpuidle_driver_init(void) */ drv->cpumask = (struct cpumask *)cpu_present_mask; + drv->disable_callback = pnv_disable_callback; return 0; } @@ -414,8 +405,6 @@ static int powernv_idle_probe(void) /* Device tree can indicate more idle states */ max_idle_state = powernv_add_idle_states(); default_snooze_timeout = TICK_USEC * tb_ticks_per_usec; - if (max_idle_state > 1) - snooze_timeout_en = true; } else return -ENODEV; diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index 8a0e54bd0..31662b657 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -50,6 +50,7 @@ struct cpuidle_state { int power_usage; /* in mW */ unsigned inttarget_residency; /* in US */ booldisabled; /* disabled on all CPUs */ + unsigned long long timeout; /* timeout for exiting out of a state */ int (*enter)(struct cpuidle_device *dev, struct cpuidle_driver *drv, -- 2.17.1
[RFC v3 2/3] cpuidle : Add callback whenever a state usage is enabled/disabled
To force wakeup a cpu, we need to compute the timeout in the fast idle path as a state may be enabled or disabled but there did not exist a feedback to driver when a state is enabled or disabled. This patch adds a callback whenever a state_usage records a store for disable attribute. Signed-off-by: Abhishek Goel --- drivers/cpuidle/sysfs.c | 15 ++- include/linux/cpuidle.h | 4 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/drivers/cpuidle/sysfs.c b/drivers/cpuidle/sysfs.c index eb20adb5d..141671a53 100644 --- a/drivers/cpuidle/sysfs.c +++ b/drivers/cpuidle/sysfs.c @@ -415,8 +415,21 @@ static ssize_t cpuidle_state_store(struct kobject *kobj, struct attribute *attr, struct cpuidle_state_usage *state_usage = kobj_to_state_usage(kobj); struct cpuidle_state_attr *cattr = attr_to_stateattr(attr); - if (cattr->store) + if (cattr->store) { ret = cattr->store(state, state_usage, buf, size); + if (ret == size && + strncmp(cattr->attr.name, "disable", + strlen("disable"))) { + struct kobject *cpuidle_kobj = kobj->parent; + struct cpuidle_device *dev = + to_cpuidle_device(cpuidle_kobj); + struct cpuidle_driver *drv = + cpuidle_get_cpu_driver(dev); + + if (drv->disable_callback) + drv->disable_callback(dev, drv); + } + } return ret; } diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index bb9a0db89..8a0e54bd0 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -119,6 +119,10 @@ struct cpuidle_driver { /* the driver handles the cpus in cpumask */ struct cpumask *cpumask; + + void (*disable_callback)(struct cpuidle_device *dev, + struct cpuidle_driver *drv); + }; #ifdef CONFIG_CPU_IDLE -- 2.17.1
[PATCH v3 1/3] cpuidle-powernv : forced wakeup for stop states
Currently, the cpuidle governors determine what idle state a idling CPU should enter into based on heuristics that depend on the idle history on that CPU. Given that no predictive heuristic is perfect, there are cases where the governor predicts a shallow idle state, hoping that the CPU will be busy soon. However, if no new workload is scheduled on that CPU in the near future, the CPU may end up in the shallow state. This is problematic, when the predicted state in the aforementioned scenario is a shallow stop state on a tickless system. As we might get stuck into shallow states for hours, in absence of ticks or interrupts. To address this, We forcefully wakeup the cpu by setting the decrementer. The decrementer is set to a value that corresponds with the residency of the next available state. Thus firing up a timer that will forcefully wakeup the cpu. Few such iterations will essentially train the governor to select a deeper state for that cpu, as the timer here corresponds to the next available cpuidle state residency. Thus, cpu will eventually end up in the deepest possible state. Signed-off-by: Abhishek Goel --- Auto-promotion v1 : started as auto promotion logic for cpuidle states in generic driver v2 : Removed timeout_needed and rebased the code to upstream kernel Forced-wakeup v1 : New patch with name of forced wakeup started v2 : Extending the forced wakeup logic for all states. Setting the decrementer instead of queuing up a hrtimer to implement the logic. 
v3 : Cleanly handle setting/resetting of decrementer so as to not break irq work arch/powerpc/include/asm/time.h | 2 ++ arch/powerpc/kernel/time.c| 40 +++ drivers/cpuidle/cpuidle-powernv.c | 32 + 3 files changed, 74 insertions(+) diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h index 54f4ec1f9..a3bd4f3c0 100644 --- a/arch/powerpc/include/asm/time.h +++ b/arch/powerpc/include/asm/time.h @@ -188,6 +188,8 @@ static inline unsigned long tb_ticks_since(unsigned long tstamp) extern u64 mulhdu(u64, u64); #endif +extern int set_dec_before_idle(u64 timeout); +extern void reset_dec_after_idle(void); extern void div128_by_32(u64 dividend_high, u64 dividend_low, unsigned divisor, struct div_result *dr); diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 694522308..814de3469 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -576,6 +576,46 @@ void arch_irq_work_raise(void) #endif /* CONFIG_IRQ_WORK */ +/* + * Returns 1 if we have reprogrammed the decrementer for idle. + * Returns 0 if the decrementer is unchanged. + */ +int set_dec_before_idle(u64 timeout) +{ + u64 *next_tb = this_cpu_ptr(_next_tb); + u64 now = get_tb_or_rtc(); + + /* +* Ensure that the timeout is at least one microsecond +* before the current decrement value. Else, we will +* unnecesarily wakeup again within a microsecond. +*/ + if (now + timeout + 512 > *next_tb) + return 0; + + set_dec(timeout); + + return 1; +} + +void reset_dec_after_idle(void) +{ + u64 now; + u64 *next_tb; + + if (test_irq_work_pending()) + return; + + now = get_tb_or_rtc(); + next_tb = this_cpu_ptr(_next_tb); + if (now >= *next_tb) + return; + + set_dec(*next_tb - now); + if (test_irq_work_pending()) + set_dec(1); +} + /* * timer_interrupt - gets called when the decrementer overflows, * with interrupts disabled. 
diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c index 84b1ebe21..f51478460 100644 --- a/drivers/cpuidle/cpuidle-powernv.c +++ b/drivers/cpuidle/cpuidle-powernv.c @@ -21,6 +21,7 @@ #include #include #include +#include /* * Expose only those Hardware idle states via the cpuidle framework @@ -46,6 +47,26 @@ static struct stop_psscr_table stop_psscr_table[CPUIDLE_STATE_MAX] __read_mostly static u64 default_snooze_timeout __read_mostly; static bool snooze_timeout_en __read_mostly; +static u64 forced_wakeup_timeout(struct cpuidle_device *dev, +struct cpuidle_driver *drv, +int index) +{ + int i; + + for (i = index + 1; i < drv->state_count; i++) { + struct cpuidle_state *s = >states[i]; + struct cpuidle_state_usage *su = >states_usage[i]; + + if (s->disabled || su->disable) + continue; + + return (s->target_residency + 2 * s->exit_latency) * + tb_ticks_per_usec; + } + + return 0; +} + static u64 get_snooze_timeout(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index) @@ -144,8 +165,19 @@ static int stop_l
[PATCH v3 0/3] Forced-wakeup for stop states on Powernv
Currently, the cpuidle governors determine what idle state a idling CPU should enter into based on heuristics that depend on the idle history on that CPU. Given that no predictive heuristic is perfect, there are cases where the governor predicts a shallow idle state, hoping that the CPU will be busy soon. However, if no new workload is scheduled on that CPU in the near future, the CPU will end up in the shallow state. Motivation -- In case of POWER, this is problematic, when the predicted state in the aforementioned scenario is a shallow stop state on a tickless system. As we might get stuck into shallow states even for hours, in absence of ticks or interrupts. To address this, We forcefully wakeup the cpu by setting the decrementer. The decrementer is set to a value that corresponds with the residency of the next available state. Thus firing up a timer that will forcefully wakeup the cpu. Few such iterations will essentially train the governor to select a deeper state for that cpu, as the timer here corresponds to the next available cpuidle state residency. Thus, cpu will eventually end up in the deepest possible state and we won't get stuck in a shallow state for long duration. Experiment -- For earlier versions when this feature was meat to be only for shallow lite states, I performed experiments for three scenarios to collect some data. case 1 : Without this patch and without tick retained, i.e. in a upstream kernel, It would spend more than even a second to get out of stop0_lite. case 2 : With tick retained in a upstream kernel - Generally, we have a sched tick at 4ms(CONF_HZ = 250). Ideally I expected it to take 8 sched tick to get out of stop0_lite. Experimentally, observation was = sample minmax 99percentile 20 4ms12ms 4ms = It would take atleast one sched tick to get out of stop0_lite. 
case 2 : With this patch (not stopping tick, but explicitly queuing a timer) sample min max 99percentile 20 144us 192us 144us Description of current implementation - We calculate timeout for the current idle state as the residency value of the next available idle state. If the decrementer is set to be greater than this timeout, we update the decrementer value with the residency of next available idle state. Thus, essentially training the governor to select the next available deeper state until we reach the deepest state. Hence, we won't get stuck unnecessarily in shallow states for longer duration. v1 of auto-promotion : https://lkml.org/lkml/2019/3/22/58 This patch was implemented only for shallow lite state in generic cpuidle driver. v2 : Removed timeout_needed and rebased to current upstream kernel Then, v1 of forced-wakeup : Moved the code to cpuidle powernv driver and started as forced wakeup instead of auto-promotion v2 : Extended the forced wakeup logic for all states. Setting the decrementer instead of queuing up a hrtimer to implement the logic. v3 : 1) Cleanly handle setting the decrementer after exiting out of stop states. 2) Added a disable_callback feature to compute timeout whenever a state is enbaled or disabled instead of computing everytime in fast idle path. 3) Use disable callback to recompute timeout whenever state usage is changed for a state. Also, cleaned up the get_snooze_timeout function. Abhishek Goel (3): cpuidle-powernv : forced wakeup for stop states cpuidle : Add callback whenever a state usage is enabled/disabled cpuidle-powernv : Recompute the idle-state timeouts when state usage is enabled/disabled arch/powerpc/include/asm/time.h | 2 ++ arch/powerpc/kernel/time.c| 40 ++ drivers/cpuidle/cpuidle-powernv.c | 47 ++- drivers/cpuidle/sysfs.c | 15 +- include/linux/cpuidle.h | 5 5 files changed, 95 insertions(+), 14 deletions(-) -- 2.17.1
[PATCH v2 1/1] cpuidle-powernv : forced wakeup for stop states
Currently, the cpuidle governors determine what idle state a idling CPU should enter into based on heuristics that depend on the idle history on that CPU. Given that no predictive heuristic is perfect, there are cases where the governor predicts a shallow idle state, hoping that the CPU will be busy soon. However, if no new workload is scheduled on that CPU in the near future, the CPU may end up in the shallow state. This is problematic, when the predicted state in the aforementioned scenario is a shallow stop state on a tickless system. As we might get stuck into shallow states for hours, in absence of ticks or interrupts. To address this, We forcefully wakeup the cpu by setting the decrementer. The decrementer is set to a value that corresponds with the residency of the next available state. Thus firing up a timer that will forcefully wakeup the cpu. Few such iterations will essentially train the governor to select a deeper state for that cpu, as the timer here corresponds to the next available cpuidle state residency. Thus, cpu will eventually end up in the deepest possible state. Signed-off-by: Abhishek Goel --- Auto-promotion v1 : started as auto promotion logic for cpuidle states in generic driver v2 : Removed timeout_needed and rebased the code to upstream kernel Forced-wakeup v1 : New patch with name of forced wakeup started v2 : Extending the forced wakeup logic for all states. Setting the decrementer instead of queuing up a hrtimer to implement the logic. 
drivers/cpuidle/cpuidle-powernv.c | 38 +++ 1 file changed, 38 insertions(+) diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c index 84b1ebe212b3..bc9ca18ae7e3 100644 --- a/drivers/cpuidle/cpuidle-powernv.c +++ b/drivers/cpuidle/cpuidle-powernv.c @@ -46,6 +46,26 @@ static struct stop_psscr_table stop_psscr_table[CPUIDLE_STATE_MAX] __read_mostly static u64 default_snooze_timeout __read_mostly; static bool snooze_timeout_en __read_mostly; +static u64 forced_wakeup_timeout(struct cpuidle_device *dev, +struct cpuidle_driver *drv, +int index) +{ + int i; + + for (i = index + 1; i < drv->state_count; i++) { + struct cpuidle_state *s = >states[i]; + struct cpuidle_state_usage *su = >states_usage[i]; + + if (s->disabled || su->disable) + continue; + + return (s->target_residency + 2 * s->exit_latency) * + tb_ticks_per_usec; + } + + return 0; +} + static u64 get_snooze_timeout(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index) @@ -144,8 +164,26 @@ static int stop_loop(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index) { + u64 dec_expiry_tb, dec, timeout_tb, forced_wakeup; + + dec = mfspr(SPRN_DEC); + timeout_tb = forced_wakeup_timeout(dev, drv, index); + forced_wakeup = 0; + + if (timeout_tb && timeout_tb < dec) { + forced_wakeup = 1; + dec_expiry_tb = mftb() + dec; + } + + if (forced_wakeup) + mtspr(SPRN_DEC, timeout_tb); + power9_idle_type(stop_psscr_table[index].val, stop_psscr_table[index].mask); + + if (forced_wakeup) + mtspr(SPRN_DEC, dec_expiry_tb - mftb()); + return index; } -- 2.17.1
[PATCH v2 0/1] Forced-wakeup for stop states on Powernv
Currently, the cpuidle governors determine what idle state a idling CPU should enter into based on heuristics that depend on the idle history on that CPU. Given that no predictive heuristic is perfect, there are cases where the governor predicts a shallow idle state, hoping that the CPU will be busy soon. However, if no new workload is scheduled on that CPU in the near future, the CPU will end up in the shallow state. Motivation -- In case of POWER, this is problematic, when the predicted state in the aforementioned scenario is a shallow stop state on a tickless system. As we might get stuck into shallow states even for hours, in absence of ticks or interrupts. To address this, We forcefully wakeup the cpu by setting the decrementer. The decrementer is set to a value that corresponds with the residency of the next available state. Thus firing up a timer that will forcefully wakeup the cpu. Few such iterations will essentially train the governor to select a deeper state for that cpu, as the timer here corresponds to the next available cpuidle state residency. Thus, cpu will eventually end up in the deepest possible state and we won't get stuck in a shallow state for long duration. Experiment -- For earlier versions when this feature was meat to be only for shallow lite states, I performed experiments for three scenarios to collect some data. case 1 : Without this patch and without tick retained, i.e. in a upstream kernel, It would spend more than even a second to get out of stop0_lite. case 2 : With tick retained in a upstream kernel - Generally, we have a sched tick at 4ms(CONF_HZ = 250). Ideally I expected it to take 8 sched tick to get out of stop0_lite. Experimentally, observation was = sample minmax 99percentile 20 4ms12ms 4ms = It would take atleast one sched tick to get out of stop0_lite. 
case 2 : With this patch (not stopping tick, but explicitly queuing a timer) sample min max 99percentile 20 144us 192us 144us Description of current implementation - We calculate timeout for the current idle state as the residency value of the next available idle state. If the decrementer is set to be greater than this timeout, we update the decrementer value with the residency of next available idle state. Thus, essentially training the governor to select the next available deeper state until we reach the deepest state. Hence, we won't get stuck unnecessarily in shallow states for longer duration. v1 of auto-promotion : https://lkml.org/lkml/2019/3/22/58 This patch was implemented only for shallow lite state in generic cpuidle driver. v2 of auto-promotion : Removed timeout_needed and rebased to current upstream kernel Then, v1 of forced-wakeup : Moved the code to cpuidle powernv driver and started as forced wakeup instead of auto-promotion v2 of forced-wakeup : Extended the forced wakeup logic for all states. Setting the decrementer instead of queuing up a hrtimer to implement the logic. Abhishek Goel (1): cpuidle-powernv : forced wakeup for stop states drivers/cpuidle/cpuidle-powernv.c | 38 +++ 1 file changed, 38 insertions(+) -- 2.17.1
[PATCH 1/1] cpuidle-powernv : forced wakeup for stop lite states
Currently, the cpuidle governors determine what idle state an idling CPU should enter into based on heuristics that depend on the idle history on that CPU. Given that no predictive heuristic is perfect, there are cases where the governor predicts a shallow idle state, hoping that the CPU will be busy soon. However, if no new workload is scheduled on that CPU in the near future, the CPU will end up in the shallow state. In case of POWER, this is problematic, when the predicted state in the aforementioned scenario is a lite stop state, as such lite states will inhibit SMT folding, thereby depriving the other threads in the core from using the core resources. So we do not want to get stuck in such states for a longer duration. To address this, the cpuidle-core can queue a timer to correspond with the residency value of the next available state. This timer will forcefully wakeup the cpu. Few such iterations will essentially train the governor to select a deeper state for that cpu, as the timer here corresponds to the next available cpuidle state residency. Cpu will be kicked out of the lite state and end up in a non-lite state. 
Signed-off-by: Abhishek Goel --- arch/powerpc/include/asm/opal-api.h | 1 + drivers/cpuidle/cpuidle-powernv.c | 71 - 2 files changed, 71 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index 870fb7b23..735dec731 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -226,6 +226,7 @@ */ #define OPAL_PM_TIMEBASE_STOP 0x0002 +#define OPAL_PM_LOSE_USER_CONTEXT 0x1000 #define OPAL_PM_LOSE_HYP_CONTEXT 0x2000 #define OPAL_PM_LOSE_FULL_CONTEXT 0x4000 #define OPAL_PM_NAP_ENABLED0x0001 diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c index 84b1ebe21..30b877962 100644 --- a/drivers/cpuidle/cpuidle-powernv.c +++ b/drivers/cpuidle/cpuidle-powernv.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -43,6 +44,40 @@ struct stop_psscr_table { static struct stop_psscr_table stop_psscr_table[CPUIDLE_STATE_MAX] __read_mostly; +DEFINE_PER_CPU(struct hrtimer, forced_wakeup_timer); + +static int forced_wakeup_time_compute(struct cpuidle_device *dev, + struct cpuidle_driver *drv, + int index) +{ + int i, timeout_us = 0; + + for (i = index + 1; i < drv->state_count; i++) { + if (drv->states[i].disabled || dev->states_usage[i].disable) + continue; + timeout_us = drv->states[i].target_residency + + 2 * drv->states[i].exit_latency; + break; + } + + return timeout_us; +} + +enum hrtimer_restart forced_wakeup_hrtimer_callback(struct hrtimer *hrtimer) +{ + return HRTIMER_NORESTART; +} + +static void forced_wakeup_timer_init(int cpu, struct cpuidle_driver *drv) +{ + struct hrtimer *cpu_forced_wakeup_timer = _cpu(forced_wakeup_timer, + cpu); + + hrtimer_init(cpu_forced_wakeup_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); + cpu_forced_wakeup_timer->function = forced_wakeup_hrtimer_callback; +} + static u64 default_snooze_timeout __read_mostly; static bool snooze_timeout_en __read_mostly; @@ -103,6 +138,28 @@ static int 
snooze_loop(struct cpuidle_device *dev, return index; } +static int stop_lite_loop(struct cpuidle_device *dev, + struct cpuidle_driver *drv, + int index) +{ + int timeout_us; + struct hrtimer *this_timer = _cpu(forced_wakeup_timer, dev->cpu); + + timeout_us = forced_wakeup_time_compute(dev, drv, index); + + if (timeout_us > 0) + hrtimer_start(this_timer, ns_to_ktime(timeout_us * 1000), + HRTIMER_MODE_REL_PINNED); + + power9_idle_type(stop_psscr_table[index].val, +stop_psscr_table[index].mask); + + if (unlikely(hrtimer_is_queued(this_timer))) + hrtimer_cancel(this_timer); + + return index; +} + static int nap_loop(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index) @@ -190,7 +247,7 @@ static int powernv_cpuidle_cpu_dead(unsigned int cpu) */ static int powernv_cpuidle_driver_init(void) { - int idle_state; + int idle_state, cpu; struct cpuidle_driver *drv = _idle_driver; drv->state_count = 0; @@ -224,6 +281,9 @@ static int powernv_cpuidle_driver_init(void) drv->cpumask = (struct cpumask *)cpu_present_mask; + for_each_cpu(cpu, drv->cpumask) + forced_wakeup_time
[PATCH 0/1] Forced-wakeup for stop lite states on Powernv
Currently, the cpuidle governors determine what idle state an idling CPU should enter into based on heuristics that depend on the idle history on that CPU. Given that no predictive heuristic is perfect, there are cases where the governor predicts a shallow idle state, hoping that the CPU will be busy soon. However, if no new workload is scheduled on that CPU in the near future, the CPU will end up in the shallow state. Motivation -- In case of POWER, this is problematic, when the predicted state in the aforementioned scenario is a lite stop state, as such lite states will inhibit SMT folding, thereby depriving the other threads in the core from using the core resources. So we do not want to get stuck in such states for a longer duration. To address this, the cpuidle-core can queue a timer to correspond with the residency value of the next available state. This timer will forcefully wakeup the cpu. Few such iterations will essentially train the governor to select a deeper state for that cpu, as the timer here corresponds to the next available cpuidle state residency. Cpu will be kicked out of the lite state and end up in a non-lite state. Experiment -- I performed experiments for three scenarios to collect some data. case 1 : Without this patch and without tick retained, i.e. in an upstream kernel, It would spend more than even a second to get out of stop0_lite. case 2 : With tick retained in an upstream kernel - Generally, we have a sched tick at 4ms(CONFIG_HZ = 250). Ideally I expected it to take 8 sched ticks to get out of stop0_lite. Experimentally, observation was = sample min max 99percentile 20 4ms 12ms 4ms = It would take at least one sched tick to get out of stop0_lite. case 3 : With this patch (not stopping tick, but explicitly queuing a timer) sample min max 99percentile 20 144us 192us 144us In this patch, we queue a timer just before entering into a stop0_lite state. The timer fires at (residency of next available state + exit latency of next available state * 2). 
Let's say if the next state(stop0) is available which has a residency of 20us, it should get out in as low as (20+2*2)*8 [Based on the formula (residency + 2xlatency)*history length] microseconds = 192us. Ideally we would expect 8 iterations, it was observed to get out in 6-7 iterations. Even if let's say stop2 is the next available state(stop0 and stop1 both are unavailable), it would take (100+2*10)*8 = 960us to get into stop2. So, we are able to get out of stop0_lite generally in 150us(with this patch) as compared to 4ms(with tick retained). As stated earlier, we do not want to get stuck in stop0_lite as it inhibits SMT folding for other sibling threads, depriving them of core resources. The current patch is using forced-wakeup only for stop0_lite, as it gives performance benefit(primary reason) along with lowering down power consumption. We may extend this model for other states in future. Abhishek Goel (1): cpuidle-powernv : forced wakeup for stop lite states arch/powerpc/include/asm/opal-api.h | 1 + drivers/cpuidle/cpuidle-powernv.c | 71 - 2 files changed, 71 insertions(+), 1 deletion(-) -- 2.17.1
[PATCH v2 2/2] cpuidle : Add auto-promotion flag to cpuidle flags
This patch sets up flags for the state which needs to be auto-promoted. On POWERNV system, only lite states need to be autopromoted. We identify lite states by those which do not lose user context. That information has been used to set the flag for lite states. Signed-off-by: Abhishek Goel --- arch/powerpc/include/asm/opal-api.h | 1 + drivers/cpuidle/Kconfig | 4 drivers/cpuidle/cpuidle-powernv.c | 13 +++-- 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index 870fb7b23..735dec731 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -226,6 +226,7 @@ */ #define OPAL_PM_TIMEBASE_STOP 0x0002 +#define OPAL_PM_LOSE_USER_CONTEXT 0x1000 #define OPAL_PM_LOSE_HYP_CONTEXT 0x2000 #define OPAL_PM_LOSE_FULL_CONTEXT 0x4000 #define OPAL_PM_NAP_ENABLED0x0001 diff --git a/drivers/cpuidle/Kconfig b/drivers/cpuidle/Kconfig index 8caccbbd7..9b8e9b96a 100644 --- a/drivers/cpuidle/Kconfig +++ b/drivers/cpuidle/Kconfig @@ -35,6 +35,10 @@ config CPU_IDLE_GOV_TEO config DT_IDLE_STATES bool +config CPU_IDLE_AUTO_PROMOTION + bool + default y if PPC_POWERNV + menu "ARM CPU Idle Drivers" depends on ARM || ARM64 source "drivers/cpuidle/Kconfig.arm" diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c index 84b1ebe21..0dd767270 100644 --- a/drivers/cpuidle/cpuidle-powernv.c +++ b/drivers/cpuidle/cpuidle-powernv.c @@ -299,6 +299,7 @@ static int powernv_add_idle_states(void) for (i = 0; i < dt_idle_states; i++) { unsigned int exit_latency, target_residency; bool stops_timebase = false; + bool lose_user_context = false; struct pnv_idle_states_t *state = _idle_states[i]; /* @@ -324,6 +325,9 @@ static int powernv_add_idle_states(void) if (has_stop_states && !(state->valid)) continue; + if (state->flags & OPAL_PM_LOSE_USER_CONTEXT) + lose_user_context = true; + if (state->flags & OPAL_PM_TIMEBASE_STOP) stops_timebase = true; @@ -332,12 +336,17 
@@ static int powernv_add_idle_states(void) add_powernv_state(nr_idle_states, "Nap", CPUIDLE_FLAG_NONE, nap_loop, target_residency, exit_latency, 0, 0); + } else if (has_stop_states && !lose_user_context) { + add_powernv_state(nr_idle_states, state->name, + CPUIDLE_FLAG_AUTO_PROMOTION, + stop_loop, target_residency, + exit_latency, state->psscr_val, + state->psscr_mask); } else if (has_stop_states && !stops_timebase) { add_powernv_state(nr_idle_states, state->name, CPUIDLE_FLAG_NONE, stop_loop, target_residency, exit_latency, - state->psscr_val, - state->psscr_mask); + state->psscr_val, state->psscr_mask); } /* -- 2.17.1
[PATCH v2 1/2] cpuidle : auto-promotion for cpuidle states
Currently, the cpuidle governors (menu /ladder) determine what idle state an idling CPU should enter into based on heuristics that depend on the idle history on that CPU. Given that no predictive heuristic is perfect, there are cases where the governor predicts a shallow idle state, hoping that the CPU will be busy soon. However, if no new workload is scheduled on that CPU in the near future, the CPU will end up in the shallow state. In case of POWER, this is problematic, when the predicted state in the aforementioned scenario is a lite stop state, as such lite states will inhibit SMT folding, thereby depriving the other threads in the core from using the core resources. To address this, such lite states need to be autopromoted. The cpuidle- core can queue timer to correspond with the residency value of the next available state. Thus leading to auto-promotion to a deeper idle state as soon as possible. Signed-off-by: Abhishek Goel --- v1->v2 : Removed timeout_needed and rebased to current upstream kernel drivers/cpuidle/cpuidle.c | 68 +- drivers/cpuidle/governors/ladder.c | 3 +- drivers/cpuidle/governors/menu.c | 22 +- include/linux/cpuidle.h| 10 - 4 files changed, 99 insertions(+), 4 deletions(-) diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index 7f108309e..11ce43f19 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -36,6 +36,11 @@ static int enabled_devices; static int off __read_mostly; static int initialized __read_mostly; +struct auto_promotion { + struct hrtimer hrtimer; + unsigned long timeout_us; +}; + int cpuidle_disabled(void) { return off; @@ -188,6 +193,54 @@ int cpuidle_enter_s2idle(struct cpuidle_driver *drv, struct cpuidle_device *dev) } #endif /* CONFIG_SUSPEND */ +enum hrtimer_restart auto_promotion_hrtimer_callback(struct hrtimer *hrtimer) +{ + return HRTIMER_NORESTART; +} + +#ifdef CONFIG_CPU_IDLE_AUTO_PROMOTION +DEFINE_PER_CPU(struct auto_promotion, ap); + +static void 
cpuidle_auto_promotion_start(int cpu, struct cpuidle_state *state) +{ + struct auto_promotion *this_ap = _cpu(ap, cpu); + + if (state->flags & CPUIDLE_FLAG_AUTO_PROMOTION) + hrtimer_start(_ap->hrtimer, ns_to_ktime(this_ap->timeout_us + * 1000), HRTIMER_MODE_REL_PINNED); +} + +static void cpuidle_auto_promotion_cancel(int cpu) +{ + struct hrtimer *hrtimer; + + hrtimer = _cpu(ap, cpu).hrtimer; + if (hrtimer_is_queued(hrtimer)) + hrtimer_cancel(hrtimer); +} + +static void cpuidle_auto_promotion_update(int cpu, unsigned long timeout) +{ + per_cpu(ap, cpu).timeout_us = timeout; +} + +static void cpuidle_auto_promotion_init(int cpu, struct cpuidle_driver *drv) +{ + struct auto_promotion *this_ap = _cpu(ap, cpu); + + hrtimer_init(_ap->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + this_ap->hrtimer.function = auto_promotion_hrtimer_callback; +} +#else +static inline void cpuidle_auto_promotion_start(int cpu, struct cpuidle_state + *state) { } +static inline void cpuidle_auto_promotion_cancel(int cpu) { } +static inline void cpuidle_auto_promotion_update(int cpu, unsigned long + timeout) { } +static inline void cpuidle_auto_promotion_init(int cpu, struct cpuidle_driver + *drv) { } +#endif + /** * cpuidle_enter_state - enter the state and update stats * @dev: cpuidle device for this cpu @@ -225,12 +278,17 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv, trace_cpu_idle_rcuidle(index, dev->cpu); time_start = ns_to_ktime(local_clock()); + cpuidle_auto_promotion_start(dev->cpu, target_state); + stop_critical_timings(); entered_state = target_state->enter(dev, drv, index); start_critical_timings(); sched_clock_idle_wakeup_event(); time_end = ns_to_ktime(local_clock()); + + cpuidle_auto_promotion_cancel(dev->cpu); + trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu); /* The cpu is no longer idle or about to enter idle. 
*/ @@ -312,7 +370,13 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv, int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, bool *stop_tick) { - return cpuidle_curr_governor->select(drv, dev, stop_tick); + unsigned long timeout_us, ret; + + timeout_us = UINT_MAX; + ret = cpuidle_curr_governor->select(drv, dev, stop_tick, _us); + cpuidle_auto_promotion_update(dev->cpu, timeout_us); + + return ret; } /** @@ -658,6 +722,8 @@ int cpuidle_register(struct cpuidle_driver *drv, device = _cpu(cpuidl
[PATCH v2 0/2] Auto-promotion logic for cpuidle states
Currently, the cpuidle governors (menu/ladder) determine what idle state an idling CPU should enter into based on heuristics that depend on the idle history on that CPU. Given that no predictive heuristic is perfect, there are cases where the governor predicts a shallow idle state, hoping that the CPU will be busy soon. However, if no new workload is scheduled on that CPU in the near future, the CPU will end up in the shallow state. Motivation -- In case of POWER, this is problematic, when the predicted state in the aforementioned scenario is a lite stop state, as such lite states will inhibit SMT folding, thereby depriving the other threads in the core from using the core resources. To address this, such lite states need to be autopromoted. The cpuidle-core can queue timer to correspond with the residency value of the next available state. Thus leading to auto-promotion to a deeper idle state as soon as possible. Experiment -- I performed experiments for three scenarios to collect some data. case 1 : Without this patch and without tick retained, i.e. in an upstream kernel, it would spend more than even a second to get out of stop0_lite. case 2 : With tick retained (as suggested) - Generally, we have a sched tick at 4ms (CONFIG_HZ = 250). Ideally I expected it to take 8 sched ticks to get out of stop0_lite. Experimentally, observation was = sample min max 99percentile 20 4ms 12ms 4ms = *ms = milliseconds It would take at least one sched tick to get out of stop0_lite. case 3 : With this patch (not stopping tick, but explicitly queuing a timer) sample min max 99percentile 20 144us 192us 144us *us = microseconds In this patch, we queue a timer just before entering into a stop0_lite state. The timer fires at (residency of next available state + exit latency of next available state * 2). 
Let's say if next state(stop0) is available which has residency of 20us, it should get out in as low as (20+2*2)*8 [Based on the formula (residency + 2xlatency)*history length] microseconds = 192us. Ideally we would expect 8 iterations, it was observed to get out in 6-7 iterations. Even if let's say stop2 is next available state(stop0 and stop1 both are unavailable), it would take (100+2*10)*8 = 960us to get into stop2. So, we are able to get out of stop0_lite generally in 150us(with this patch) as compared to 4ms(with tick retained). As stated earlier, we do not want to get stuck into stop0_lite as it inhibits SMT folding for other sibling threads, depriving them of core resources. Current patch is using auto-promotion only for stop0_lite, as it gives performance benefit(primary reason) along with lowering down power consumption. We may extend this model for other states in future. Abhishek Goel (2): cpuidle : auto-promotion for cpuidle states cpuidle : Add auto-promotion flag to cpuidle flags arch/powerpc/include/asm/opal-api.h | 1 + drivers/cpuidle/Kconfig | 4 ++ drivers/cpuidle/cpuidle-powernv.c | 13 +- drivers/cpuidle/cpuidle.c | 68 - drivers/cpuidle/governors/ladder.c | 3 +- drivers/cpuidle/governors/menu.c| 22 +- include/linux/cpuidle.h | 10 - 7 files changed, 115 insertions(+), 6 deletions(-) -- 2.17.1
[PATCH 2/2] cpuidle : Add auto-promotion flag to cpuidle flags
This patch sets up flags for the state which needs to be auto-promoted. For powernv systems, lite states do not even lose user context. That information has been used to set the flag for lite states. Signed-off-by: Abhishek Goel --- arch/powerpc/include/asm/opal-api.h | 1 + drivers/cpuidle/Kconfig | 4 drivers/cpuidle/cpuidle-powernv.c | 13 +++-- 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index 870fb7b23..735dec731 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -226,6 +226,7 @@ */ #define OPAL_PM_TIMEBASE_STOP 0x0002 +#define OPAL_PM_LOSE_USER_CONTEXT 0x1000 #define OPAL_PM_LOSE_HYP_CONTEXT 0x2000 #define OPAL_PM_LOSE_FULL_CONTEXT 0x4000 #define OPAL_PM_NAP_ENABLED0x0001 diff --git a/drivers/cpuidle/Kconfig b/drivers/cpuidle/Kconfig index 7e48eb5bf..0ece62684 100644 --- a/drivers/cpuidle/Kconfig +++ b/drivers/cpuidle/Kconfig @@ -26,6 +26,10 @@ config CPU_IDLE_GOV_MENU config DT_IDLE_STATES bool +config CPU_IDLE_AUTO_PROMOTION + bool + default y if PPC_POWERNV + menu "ARM CPU Idle Drivers" depends on ARM || ARM64 source "drivers/cpuidle/Kconfig.arm" diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c index 84b1ebe21..e351f5f9c 100644 --- a/drivers/cpuidle/cpuidle-powernv.c +++ b/drivers/cpuidle/cpuidle-powernv.c @@ -299,6 +299,7 @@ static int powernv_add_idle_states(void) for (i = 0; i < dt_idle_states; i++) { unsigned int exit_latency, target_residency; bool stops_timebase = false; + bool lose_user_context = false; struct pnv_idle_states_t *state = _idle_states[i]; /* @@ -324,6 +325,9 @@ static int powernv_add_idle_states(void) if (has_stop_states && !(state->valid)) continue; + if (state->flags & OPAL_PM_LOSE_USER_CONTEXT) + lose_user_context = true; + if (state->flags & OPAL_PM_TIMEBASE_STOP) stops_timebase = true; @@ -332,12 +336,17 @@ static int powernv_add_idle_states(void) 
add_powernv_state(nr_idle_states, "Nap", CPUIDLE_FLAG_NONE, nap_loop, target_residency, exit_latency, 0, 0); + } else if (has_stop_states & !lose_user_context) { + add_powernv_state(nr_idle_states, state->name, + CPUIDLE_FLAG_AUTO_PROMOTION, + stop_loop, target_residency, + exit_latency, state->psscr_val, + state->psscr_mask); } else if (has_stop_states && !stops_timebase) { add_powernv_state(nr_idle_states, state->name, CPUIDLE_FLAG_NONE, stop_loop, target_residency, exit_latency, - state->psscr_val, - state->psscr_mask); + state->psscr_val, state->psscr_mask); } /* -- 2.17.1
[PATCH 1/2] cpuidle : auto-promotion for cpuidle states
Currently, the cpuidle governors (menu /ladder) determine what idle state an idling CPU should enter into based on heuristics that depend on the idle history on that CPU. Given that no predictive heuristic is perfect, there are cases where the governor predicts a shallow idle state, hoping that the CPU will be busy soon. However, if no new workload is scheduled on that CPU in the near future, the CPU will end up in the shallow state. In case of POWER, this is problematic, when the predicted state in the aforementioned scenario is a lite stop state, as such lite states will inhibit SMT folding, thereby depriving the other threads in the core from using the core resources. To address this, such lite states need to be autopromoted. The cpuidle- core can queue timer to correspond with the residency value of the next available state. Thus leading to auto-promotion to a deeper idle state as soon as possible. Signed-off-by: Abhishek Goel --- drivers/cpuidle/cpuidle.c | 79 +- drivers/cpuidle/governors/ladder.c | 3 +- drivers/cpuidle/governors/menu.c | 23 - include/linux/cpuidle.h| 12 +++-- 4 files changed, 111 insertions(+), 6 deletions(-) diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index 7f108309e..c4d1c1b38 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -36,6 +36,12 @@ static int enabled_devices; static int off __read_mostly; static int initialized __read_mostly; +struct auto_promotion { + struct hrtimer hrtimer; + int timeout; + booltimeout_needed; +}; + int cpuidle_disabled(void) { return off; @@ -188,6 +194,64 @@ int cpuidle_enter_s2idle(struct cpuidle_driver *drv, struct cpuidle_device *dev) } #endif /* CONFIG_SUSPEND */ +enum hrtimer_restart auto_promotion_hrtimer_callback(struct hrtimer *hrtimer) +{ + return HRTIMER_NORESTART; +} + +#ifdef CONFIG_CPU_IDLE_AUTO_PROMOTION +DEFINE_PER_CPU(struct auto_promotion, ap); + +static void cpuidle_auto_promotion_start(struct cpuidle_state *state, int cpu) +{ + struct 
auto_promotion *this_ap = _cpu(ap, cpu); + + if (this_ap->timeout_needed && (state->flags & + CPUIDLE_FLAG_AUTO_PROMOTION)) + hrtimer_start(_ap->hrtimer, ns_to_ktime(this_ap->timeout + * 1000), HRTIMER_MODE_REL_PINNED); +} + +static void cpuidle_auto_promotion_cancel(int cpu) +{ + struct hrtimer *hrtimer; + + hrtimer = _cpu(ap, cpu).hrtimer; + if (hrtimer_is_queued(hrtimer)) + hrtimer_cancel(hrtimer); +} + +static void cpuidle_auto_promotion_update(int time, int cpu) +{ + per_cpu(ap, cpu).timeout = time; +} + +static void cpuidle_auto_promotion_init(struct cpuidle_driver *drv, int cpu) +{ + int i; + struct auto_promotion *this_ap = _cpu(ap, cpu); + + this_ap->timeout_needed = 0; + + for (i = 0; i < drv->state_count; i++) { + if (drv->states[i].flags & CPUIDLE_FLAG_AUTO_PROMOTION) { + this_ap->timeout_needed = 1; + break; + } + } + + hrtimer_init(_ap->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + this_ap->hrtimer.function = auto_promotion_hrtimer_callback; +} +#else +static inline void cpuidle_auto_promotion_start(struct cpuidle_state *state, + int cpu) { } +static inline void cpuidle_auto_promotion_cancel(int cpu) { } +static inline void cpuidle_auto_promotion_update(int timeout, int cpu) { } +static inline void cpuidle_auto_promotion_init(struct cpuidle_driver *drv, + int cpu) { } +#endif + /** * cpuidle_enter_state - enter the state and update stats * @dev: cpuidle device for this cpu @@ -225,12 +289,17 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv, trace_cpu_idle_rcuidle(index, dev->cpu); time_start = ns_to_ktime(local_clock()); + cpuidle_auto_promotion_start(target_state, dev->cpu); + stop_critical_timings(); entered_state = target_state->enter(dev, drv, index); start_critical_timings(); sched_clock_idle_wakeup_event(); time_end = ns_to_ktime(local_clock()); + + cpuidle_auto_promotion_cancel(dev->cpu); + trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu); /* The cpu is no longer idle or about to enter idle. 
*/ @@ -312,7 +381,13 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv, int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, bool *stop_tick) { - return cpuidle_curr_governor->select(drv, dev, stop_tick); + int timeout, ret; + + timeout = INT_MAX; + ret = cpuidle_curr_governor->select
[PATCH 0/2] Auto-promotion logic for cpuidle states
Currently, the cpuidle governors (menu/ladder) determine what idle state an idling CPU should enter into based on heuristics that depend on the idle history on that CPU. Given that no predictive heuristic is perfect, there are cases where the governor predicts a shallow idle state, hoping that the CPU will be busy soon. However, if no new workload is scheduled on that CPU in the near future, the CPU will end up in the shallow state. Motivation -- In case of POWER, this is problematic, when the predicted state in the aforementioned scenario is a lite stop state, as such lite states will inhibit SMT folding, thereby depriving the other threads in the core from using the core resources. To address this, such lite states need to be autopromoted. The cpuidle-core can queue timer to correspond with the residency value of the next available state. Thus leading to auto-promotion to a deeper idle state as soon as possible. Experiment -- Without this patch - It was seen that for an idle system, a cpu may remain in stop0_lite for few seconds and then directly goes to a deeper state such as stop2. With this patch - A cpu will not remain in stop0_lite for more than the residency of next available state, and thus it will go to a deeper state in conservative fashion. Using this, we may spend even less than 20 milliseconds if subsequent stop states are enabled. In the worst case, we may end up spending more than a second, as was the case without this patch. The worst case will occur in the scenario when no other shallow states are enabled, and only deep states are available for auto-promotion. 
Abhishek Goel (2): cpuidle : auto-promotion for cpuidle states cpuidle : Add auto-promotion flag to cpuidle flags arch/powerpc/include/asm/opal-api.h | 1 + drivers/cpuidle/Kconfig | 4 ++ drivers/cpuidle/cpuidle-powernv.c | 13 - drivers/cpuidle/cpuidle.c | 79 - drivers/cpuidle/governors/ladder.c | 3 +- drivers/cpuidle/governors/menu.c| 23 - include/linux/cpuidle.h | 12 +++-- 7 files changed, 127 insertions(+), 8 deletions(-) -- 2.17.1
[PATCH 0/2] Auto-promotion logic for cpuidle states
Currently, the cpuidle governors (menu/ladder) determine what idle state an idling CPU should enter into based on heuristics that depend on the idle history on that CPU. Given that no predictive heuristic is perfect, there are cases where the governor predicts a shallow idle state, hoping that the CPU will be busy soon. However, if no new workload is scheduled on that CPU in the near future, the CPU will end up in the shallow state. Motivation -- In case of POWER, this is problematic, when the predicted state in the aforementioned scenario is a lite stop state, as such lite states will inhibit SMT folding, thereby depriving the other threads in the core from using the core resources. To address this, such lite states need to be autopromoted. The cpuidle-core can queue timer to correspond with the residency value of the next available state. Thus leading to auto-promotion to a deeper idle state as soon as possible. Experiment -- Without this patch - It was seen that for an idle system, a cpu may remain in stop0_lite for few seconds and then directly goes to a deeper state such as stop2. With this patch - A cpu will not remain in stop0_lite for more than the residency of next available state, and thus it will go to a deeper state in conservative fashion. Using this, we may spend even less than 20 milliseconds if subsequent stop states are enabled. In the worst case, we may end up spending more than a second, as was the case without this patch. The worst case will occur in the scenario when no other shallow states are enabled, and only deep states are available for auto-promotion. Abhishek Goel (2): cpuidle : auto-promotion for cpuidle states cpuidle : Add auto-promotion flag to cpuidle flags arch/powerpc/include/asm/opal-api.h | 1 + drivers/cpuidle/Kconfig | 4 drivers/cpuidle/cpuidle-powernv.c | 13 +++-- drivers/cpuidle/cpuidle.c | 3 --- 4 files changed, 16 insertions(+), 5 deletions(-) -- 2.17.1
[PATCH 2/2] cpuidle : Add auto-promotion flag to cpuidle flags
This patch sets up flags for the state which needs to be auto-promoted. For powernv systems, lite states do not even lose user context. That information has been used to set the flag for lite states. Signed-off-by: Abhishek Goel --- arch/powerpc/include/asm/opal-api.h | 1 + drivers/cpuidle/Kconfig | 4 drivers/cpuidle/cpuidle-powernv.c | 13 +++-- 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index 870fb7b23..735dec731 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -226,6 +226,7 @@ */ #define OPAL_PM_TIMEBASE_STOP 0x0002 +#define OPAL_PM_LOSE_USER_CONTEXT 0x1000 #define OPAL_PM_LOSE_HYP_CONTEXT 0x2000 #define OPAL_PM_LOSE_FULL_CONTEXT 0x4000 #define OPAL_PM_NAP_ENABLED0x0001 diff --git a/drivers/cpuidle/Kconfig b/drivers/cpuidle/Kconfig index 7e48eb5bf..0ece62684 100644 --- a/drivers/cpuidle/Kconfig +++ b/drivers/cpuidle/Kconfig @@ -26,6 +26,10 @@ config CPU_IDLE_GOV_MENU config DT_IDLE_STATES bool +config CPU_IDLE_AUTO_PROMOTION + bool + default y if PPC_POWERNV + menu "ARM CPU Idle Drivers" depends on ARM || ARM64 source "drivers/cpuidle/Kconfig.arm" diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c index 84b1ebe21..e351f5f9c 100644 --- a/drivers/cpuidle/cpuidle-powernv.c +++ b/drivers/cpuidle/cpuidle-powernv.c @@ -299,6 +299,7 @@ static int powernv_add_idle_states(void) for (i = 0; i < dt_idle_states; i++) { unsigned int exit_latency, target_residency; bool stops_timebase = false; + bool lose_user_context = false; struct pnv_idle_states_t *state = _idle_states[i]; /* @@ -324,6 +325,9 @@ static int powernv_add_idle_states(void) if (has_stop_states && !(state->valid)) continue; + if (state->flags & OPAL_PM_LOSE_USER_CONTEXT) + lose_user_context = true; + if (state->flags & OPAL_PM_TIMEBASE_STOP) stops_timebase = true; @@ -332,12 +336,17 @@ static int powernv_add_idle_states(void) 
add_powernv_state(nr_idle_states, "Nap", CPUIDLE_FLAG_NONE, nap_loop, target_residency, exit_latency, 0, 0); + } else if (has_stop_states & !lose_user_context) { + add_powernv_state(nr_idle_states, state->name, + CPUIDLE_FLAG_AUTO_PROMOTION, + stop_loop, target_residency, + exit_latency, state->psscr_val, + state->psscr_mask); } else if (has_stop_states && !stops_timebase) { add_powernv_state(nr_idle_states, state->name, CPUIDLE_FLAG_NONE, stop_loop, target_residency, exit_latency, - state->psscr_val, - state->psscr_mask); + state->psscr_val, state->psscr_mask); } /* -- 2.17.1
[PATCH 1/2] cpuidle : auto-promotion for cpuidle states
Currently, the cpuidle governors (menu /ladder) determine what idle state an idling CPU should enter into based on heuristics that depend on the idle history on that CPU. Given that no predictive heuristic is perfect, there are cases where the governor predicts a shallow idle state, hoping that the CPU will be busy soon. However, if no new workload is scheduled on that CPU in the near future, the CPU will end up in the shallow state. In case of POWER, this is problematic, when the predicted state in the aforementioned scenario is a lite stop state, as such lite states will inhibit SMT folding, thereby depriving the other threads in the core from using the core resources. To address this, such lite states need to be autopromoted. The cpuidle- core can queue timer to correspond with the residency value of the next available state. Thus leading to auto-promotion to a deeper idle state as soon as possible. Signed-off-by: Abhishek Goel --- drivers/cpuidle/cpuidle.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index 2406e2655..c4d1c1b38 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -584,11 +584,8 @@ static void __cpuidle_unregister_device(struct cpuidle_device *dev) static void __cpuidle_device_init(struct cpuidle_device *dev) { - int i; memset(dev->states_usage, 0, sizeof(dev->states_usage)); dev->last_residency = 0; - for (i = 0; i < CPUIDLE_STATE_MAX; i++) - dev->states_usage[i].disable = true; } /** -- 2.17.1
[RFC 1/1] cpuidle : Move saving and restoring of sprs to opal
This patch moves the saving and restoring of sprs for P9 cpuidle from kernel to opal. This patch still uses existing code to detect first thread in core. In an attempt to make the powernv idle code backward compatible, and to some extent forward compatible, add support for pre-stop entry and post-stop exit actions in OPAL. If a kernel knows about this opal call, then just a firmware supporting newer hardware is required, instead of waiting for kernel updates. Signed-off-by: Abhishek Goel --- Link to the Skiboot patch corresponding to this patch: http://patchwork.ozlabs.org/patch/947568/ arch/powerpc/include/asm/cpuidle.h| 10 -- arch/powerpc/include/asm/opal-api.h | 4 +- arch/powerpc/include/asm/opal.h | 3 + arch/powerpc/include/asm/paca.h | 5 +- arch/powerpc/kernel/asm-offsets.c | 10 +- arch/powerpc/kernel/idle_book3s.S | 130 ++ arch/powerpc/platforms/powernv/idle.c | 4 + .../powerpc/platforms/powernv/opal-wrappers.S | 2 + arch/powerpc/xmon/xmon.c | 12 +- 9 files changed, 61 insertions(+), 119 deletions(-) diff --git a/arch/powerpc/include/asm/cpuidle.h b/arch/powerpc/include/asm/cpuidle.h index e210a83eb196..c10f47af9a55 100644 --- a/arch/powerpc/include/asm/cpuidle.h +++ b/arch/powerpc/include/asm/cpuidle.h @@ -68,16 +68,6 @@ #define ERR_DEEP_STATE_ESL_MISMATCH-2 #ifndef __ASSEMBLY__ -/* Additional SPRs that need to be saved/restored during stop */ -struct stop_sprs { - u64 pid; - u64 ldbar; - u64 fscr; - u64 hfscr; - u64 mmcr1; - u64 mmcr2; - u64 mmcra; -}; extern u32 pnv_fastsleep_workaround_at_entry[]; extern u32 pnv_fastsleep_workaround_at_exit[]; diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index 3bab299eda49..6792a737bc9a 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -208,7 +208,9 @@ #define OPAL_SENSOR_READ_U64 162 #define OPAL_PCI_GET_PBCQ_TUNNEL_BAR 164 #define OPAL_PCI_SET_PBCQ_TUNNEL_BAR 165 -#define OPAL_LAST 165 +#define OPAL_IDLE_SAVE 168 +#define 
OPAL_IDLE_RESTORE 169 +#define OPAL_LAST 169 #define QUIESCE_HOLD 1 /* Spin all calls at entry */ #define QUIESCE_REJECT 2 /* Fail all calls with OPAL_BUSY */ diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h index e1b2910c6e81..12d57aeacde2 100644 --- a/arch/powerpc/include/asm/opal.h +++ b/arch/powerpc/include/asm/opal.h @@ -356,6 +356,9 @@ extern void opal_kmsg_init(void); extern int opal_event_request(unsigned int opal_event_nr); +extern int opal_cpuidle_save(u64 *stop_sprs, int scope, u64 psscr); +extern int opal_cpuidle_restore(u64 *stop_sprs, int scope, u64 psscr, u64 srr1); + struct opal_sg_list *opal_vmalloc_to_sg_list(void *vmalloc_addr, unsigned long vmalloc_size); void opal_free_sg_list(struct opal_sg_list *sg); diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index 6d34bd71139d..765524e76beb 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -195,11 +195,12 @@ struct paca_struct { /* The PSSCR value that the kernel requested before going to stop */ u64 requested_psscr; + u64 wakeup_psscr; /* -* Save area for additional SPRs that need to be +* Save area for SPRs that need to be * saved/restored during cpuidle stop. 
*/ - struct stop_sprs stop_sprs; + u64 *opal_stop_sprs; #endif #ifdef CONFIG_PPC_BOOK3S_64 diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 0a0544335950..65a3d8582017 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -769,14 +769,8 @@ int main(void) OFFSET(PACA_SIBLING_PACA_PTRS, paca_struct, thread_sibling_pacas); OFFSET(PACA_REQ_PSSCR, paca_struct, requested_psscr); OFFSET(PACA_DONT_STOP, paca_struct, dont_stop); -#define STOP_SPR(x, f) OFFSET(x, paca_struct, stop_sprs.f) - STOP_SPR(STOP_PID, pid); - STOP_SPR(STOP_LDBAR, ldbar); - STOP_SPR(STOP_FSCR, fscr); - STOP_SPR(STOP_HFSCR, hfscr); - STOP_SPR(STOP_MMCR1, mmcr1); - STOP_SPR(STOP_MMCR2, mmcr2); - STOP_SPR(STOP_MMCRA, mmcra); + OFFSET(PACA_WAKEUP_PSSCR, paca_struct, wakeup_psscr); + OFFSET(STOP_SPRS, paca_struct, opal_stop_sprs); #endif DEFINE(PPC_DBELL_SERVER, PPC_DBELL_SERVER); diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S index e734f6e45abc..66fc955abee3 100644 --- a/arch/powerpc/kernel/idle_book3s.S +++ b/arch/powerpc/kernel/idle_book3s.S @@ -45,6 +45,9 @@ #define
[PATCH v2] cpuidle/powernv : Add Description for cpuidle state
Names of cpuidle states were being used for description of states in POWER as no descriptions were added in device tree. This patch reads description for idle states which have been added in device tree. The description for idle states in case of POWER can be printed using "cpupower monitor -l" or "cpupower idle-info". Signed-off-by: Abhishek Goel --- The skiboot patch which adds description for idle states in device tree can be found here: https://patchwork.ozlabs.org/patch/924879/ drivers/cpuidle/cpuidle-powernv.c | 19 +++ include/linux/cpuidle.h | 2 +- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c index 1a8234e706bc..08d8b0953a14 100644 --- a/drivers/cpuidle/cpuidle-powernv.c +++ b/drivers/cpuidle/cpuidle-powernv.c @@ -133,7 +133,7 @@ static int stop_loop(struct cpuidle_device *dev, static struct cpuidle_state powernv_states[CPUIDLE_STATE_MAX] = { { /* Snooze */ .name = "snooze", - .desc = "snooze", + .desc = "idle polling state", .exit_latency = 0, .target_residency = 0, .enter = snooze_loop }, @@ -206,6 +206,7 @@ static int powernv_cpuidle_driver_init(void) } static inline void add_powernv_state(int index, const char *name, +const char *desc, unsigned int flags, int (*idle_fn)(struct cpuidle_device *, struct cpuidle_driver *, @@ -215,7 +216,7 @@ static inline void add_powernv_state(int index, const char *name, u64 psscr_val, u64 psscr_mask) { strlcpy(powernv_states[index].name, name, CPUIDLE_NAME_LEN); - strlcpy(powernv_states[index].desc, name, CPUIDLE_NAME_LEN); + strlcpy(powernv_states[index].desc, desc, CPUIDLE_DESC_LEN); powernv_states[index].flags = flags; powernv_states[index].target_residency = target_residency; powernv_states[index].exit_latency = exit_latency; @@ -250,6 +251,7 @@ static int powernv_add_idle_states(void) u64 psscr_val[CPUIDLE_STATE_MAX]; u64 psscr_mask[CPUIDLE_STATE_MAX]; const char *names[CPUIDLE_STATE_MAX]; + const char *descs[CPUIDLE_STATE_MAX]; 
u32 has_stop_states = 0; int i, rc; u32 supported_flags = pnv_get_supported_cpuidle_states(); @@ -311,6 +313,13 @@ static int powernv_add_idle_states(void) pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-names in DT\n"); goto out; } + if (of_property_read_string_array(power_mgt, + "ibm,cpu-idle-state-descs", descs, dt_idle_states) < 0) { + of_property_read_string_array(power_mgt, + "ibm,cpu-idle-state-names", descs, dt_idle_states); + pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-descs in DT\n" + "Name will be used for description\n"); + } /* * If the idle states use stop instruction, probe for psscr values @@ -414,10 +423,11 @@ static int powernv_add_idle_states(void) target_residency = 100; /* Add NAP state */ add_powernv_state(nr_idle_states, "Nap", + "stop processor execution", CPUIDLE_FLAG_NONE, nap_loop, target_residency, exit_latency, 0, 0); } else if (has_stop_states && !stops_timebase) { - add_powernv_state(nr_idle_states, names[i], + add_powernv_state(nr_idle_states, names[i], descs[i], CPUIDLE_FLAG_NONE, stop_loop, target_residency, exit_latency, psscr_val[i], psscr_mask[i]); @@ -434,11 +444,12 @@ static int powernv_add_idle_states(void) target_residency = 30; /* Add FASTSLEEP state */ add_powernv_state(nr_idle_states, "FastSleep", + "Core and L2 clock gating", CPUIDLE_FLAG_TIMER_STOP, fastsleep_loop, target_residency, exit_latency, 0, 0); } else if (has_stop_states && stops_timebase) { - add_powernv_state(nr_idle_states, names
[PATCH] cpuidle/powernv : Add Description for cpuidle state
Names of cpuidle states were being used for description of states in POWER as no descriptions were added in device tree. This patch reads description for idle states which have been added in device tree. The description for idle states in case of POWER can be printed using "cpupower monitor -l" or "cpupower idle-info". Signed-off-by: Abhishek Goel --- The skiboot patch which adds description for idle states in device tree can be found here: https://patchwork.ozlabs.org/patch/921637/ drivers/cpuidle/cpuidle-powernv.c | 17 + include/linux/cpuidle.h | 2 +- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c index 1a8234e706bc..d3915a965128 100644 --- a/drivers/cpuidle/cpuidle-powernv.c +++ b/drivers/cpuidle/cpuidle-powernv.c @@ -133,7 +133,7 @@ static int stop_loop(struct cpuidle_device *dev, static struct cpuidle_state powernv_states[CPUIDLE_STATE_MAX] = { { /* Snooze */ .name = "snooze", - .desc = "snooze", + .desc = "idle polling state", .exit_latency = 0, .target_residency = 0, .enter = snooze_loop }, @@ -206,6 +206,7 @@ static int powernv_cpuidle_driver_init(void) } static inline void add_powernv_state(int index, const char *name, +const char *desc, unsigned int flags, int (*idle_fn)(struct cpuidle_device *, struct cpuidle_driver *, @@ -215,7 +216,7 @@ static inline void add_powernv_state(int index, const char *name, u64 psscr_val, u64 psscr_mask) { strlcpy(powernv_states[index].name, name, CPUIDLE_NAME_LEN); - strlcpy(powernv_states[index].desc, name, CPUIDLE_NAME_LEN); + strlcpy(powernv_states[index].desc, desc, CPUIDLE_DESC_LEN); powernv_states[index].flags = flags; powernv_states[index].target_residency = target_residency; powernv_states[index].exit_latency = exit_latency; @@ -250,6 +251,7 @@ static int powernv_add_idle_states(void) u64 psscr_val[CPUIDLE_STATE_MAX]; u64 psscr_mask[CPUIDLE_STATE_MAX]; const char *names[CPUIDLE_STATE_MAX]; + const char *descs[CPUIDLE_STATE_MAX]; 
u32 has_stop_states = 0; int i, rc; u32 supported_flags = pnv_get_supported_cpuidle_states(); @@ -311,6 +313,11 @@ static int powernv_add_idle_states(void) pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-names in DT\n"); goto out; } + if (of_property_read_string_array(power_mgt, + "ibm,cpu-idle-state-descs", descs, dt_idle_states) < 0) { + pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-descs in DT\n"); + goto out; + } /* * If the idle states use stop instruction, probe for psscr values @@ -414,10 +421,11 @@ static int powernv_add_idle_states(void) target_residency = 100; /* Add NAP state */ add_powernv_state(nr_idle_states, "Nap", + "stop processor execution", CPUIDLE_FLAG_NONE, nap_loop, target_residency, exit_latency, 0, 0); } else if (has_stop_states && !stops_timebase) { - add_powernv_state(nr_idle_states, names[i], + add_powernv_state(nr_idle_states, names[i], descs[i], CPUIDLE_FLAG_NONE, stop_loop, target_residency, exit_latency, psscr_val[i], psscr_mask[i]); @@ -434,11 +442,12 @@ static int powernv_add_idle_states(void) target_residency = 30; /* Add FASTSLEEP state */ add_powernv_state(nr_idle_states, "FastSleep", + "Core and L2 clock gating", CPUIDLE_FLAG_TIMER_STOP, fastsleep_loop, target_residency, exit_latency, 0, 0); } else if (has_stop_states && stops_timebase) { - add_powernv_state(nr_idle_states, names[i], + add_powernv_state(nr_idle_states, names[i], descs[i], CPUIDLE_FLAG_TIMER_STOP, stop_loop, target_residency, exit_latency,
[PATCH v3] cpufreq: powernv: Add support of frequency domain
Frequency-domain indicates group of CPUs that would share same frequency. It is detected using device-tree node "frequency-domain-indicator". frequency-domain-indicator is a bitmask which will have different value depending upon the generation of the processor. CPUs of the same chip for which the result of a bitwise AND between their PIR and the frequency-domain-indicator is the same share the same frequency. In this patch, we define hash-table indexed by the aforementioned bitwise ANDed value to store the cpumask of the CPUs sharing the same frequency domain. Further, the cpufreq policy will be created per frequency-domain So for POWER9, a cpufreq policy is created per quad while for POWER8 it is created per core. Governor decides frequency for each policy but multiple cores may come under same policy. In such case frequency needs to be set on each core sharing that policy. Signed-off-by: Abhishek Goel <hunt...@linux.vnet.ibm.com> --- Skiboot patch required for the corresponding device-tree changes have been posted here : http://patchwork.ozlabs.org/patch/862256/ drivers/cpufreq/powernv-cpufreq.c | 104 ++ 1 file changed, 95 insertions(+), 9 deletions(-) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index b6d7c4c..aab23a4 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -37,6 +37,7 @@ #include /* Required for cpu_sibling_mask() in UP configs */ #include #include +#include #define POWERNV_MAX_PSTATES256 #define PMSR_PSAFE_ENABLE (1UL << 30) @@ -130,6 +131,9 @@ enum throttle_reason_type { static int nr_chips; static DEFINE_PER_CPU(struct chip *, chip_info); +static u32 freq_domain_indicator; +static bool p9_occ_quirk; + /* * Note: * The set of pstates consists of contiguous integers. 
@@ -194,6 +198,38 @@ static inline void reset_gpstates(struct cpufreq_policy *policy) gpstates->last_gpstate_idx = 0; } +#define SIZE NR_CPUS +#define ORDER_FREQ_MAP ilog2(SIZE) + +static DEFINE_HASHTABLE(freq_domain_map, ORDER_FREQ_MAP); + +struct hashmap { + cpumask_t mask; + int chip_id; + u32 pir_key; + struct hlist_node hash_node; +}; + +static void insert(u32 key, int cpu) +{ + struct hashmap *data; + + hash_for_each_possible(freq_domain_map, data, hash_node, key%SIZE) { + if (data->chip_id == cpu_to_chip_id(cpu) && + data->pir_key == key) { + cpumask_set_cpu(cpu, &data->mask); + return; + } + } + + data = kzalloc(sizeof(*data), GFP_KERNEL); + hash_add(freq_domain_map, &data->hash_node, key%SIZE); + cpumask_set_cpu(cpu, &data->mask); + data->chip_id = cpu_to_chip_id(cpu); + data->pir_key = key; + +} + /* * Initialize the freq table based on data obtained * from the firmware passed via device-tree @@ -206,6 +242,7 @@ static int init_powernv_pstates(void) u32 len_ids, len_freqs; u32 pstate_min, pstate_max, pstate_nominal; u32 pstate_turbo, pstate_ultra_turbo; + u32 key; power_mgt = of_find_node_by_path("/ibm,opal/power-mgt"); if (!power_mgt) { @@ -246,9 +283,18 @@ static int init_powernv_pstates(void) else powernv_pstate_info.wof_enabled = true; + if (of_device_is_compatible(power_mgt, "freq-domain-v1") && + of_property_read_u32(power_mgt, "ibm,freq-domain-indicator", + &freq_domain_indicator)) + pr_warn("ibm,freq-domain-indicator not found\n"); + +if (of_device_is_compatible(power_mgt, "p9-occ-quirk")) + p9_occ_quirk = true; + next: pr_info("cpufreq pstate min %d nominal %d max %d\n", pstate_min, pstate_nominal, pstate_max); + pr_info("frequency domain indicator %d", freq_domain_indicator); pr_info("Workload Optimized Frequency is %s in the platform\n", (powernv_pstate_info.wof_enabled) ? 
"enabled" : "disabled"); @@ -276,6 +322,15 @@ static int init_powernv_pstates(void) return -ENODEV; } + if (freq_domain_indicator) { + hash_init(freq_domain_map); + for_each_possible_cpu(i) { + key = ((u32) get_hard_smp_processor_id(i) & + freq_domain_indicator); + insert(key, i); + } + } + powernv_pstate_info.nr_pstates = nr_pstates; pr_debug("NR PStates %d\n", nr_pstates); for (i = 0; i < nr_pstates; i++) { @@ -760,25 +815,56 @@ static int powernv_cpufreq_target_index(struct cpufreq_policy *policy, spin_unlock(&gpstates->gpstate_lock); -
[PATCH v2] cpufreq: powernv: Add support of frequency domain
Frequency-domain indicates group of CPUs that would share same frequency. It is detected using device-tree node "frequency-domain-indicator". frequency-domain-indicator is a bitmask which will have different value depending upon the generation of the processor. CPUs of the same chip for which the result of a bitwise AND between their PIR and the frequency-domain-indicator is the same share the same frequency. In this patch, we define hash-table indexed by the aforementioned bitwise ANDed value to store the cpumask of the CPUs sharing the same frequency domain. Further, the cpufreq policy will be created per frequency-domain So for POWER9, a cpufreq policy is created per quad while for POWER8 it is created per core. Governor decides frequency for each policy but multiple cores may come under same policy. In such case frequency needs to be set on each core sharing that policy. Signed-off-by: Abhishek Goel <hunt...@linux.vnet.ibm.com> --- drivers/cpufreq/powernv-cpufreq.c | 110 ++ 1 file changed, 100 insertions(+), 10 deletions(-) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index b6d7c4c..fd642bc 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -37,6 +37,7 @@ #include /* Required for cpu_sibling_mask() in UP configs */ #include #include +#include #define POWERNV_MAX_PSTATES256 #define PMSR_PSAFE_ENABLE (1UL << 30) @@ -130,6 +131,9 @@ static struct chip { static int nr_chips; static DEFINE_PER_CPU(struct chip *, chip_info); +static u32 freq_domain_indicator; +static u32 flag; + /* * Note: * The set of pstates consists of contiguous integers. 
@@ -194,6 +198,38 @@ static inline void reset_gpstates(struct cpufreq_policy *policy) gpstates->last_gpstate_idx = 0; } +#define SIZE NR_CPUS +#define ORDER_FREQ_MAP ilog2(SIZE) + +static DEFINE_HASHTABLE(freq_domain_map, ORDER_FREQ_MAP); + +struct hashmap { + cpumask_t mask; + int chip_id; + u32 pir_key; + struct hlist_node hash_node; +}; + +static void insert(u32 key, int cpu) +{ + struct hashmap *data; + + hash_for_each_possible(freq_domain_map, data, hash_node, key%SIZE) { + if (data->chip_id == cpu_to_chip_id(cpu) && + data->pir_key == key) { + cpumask_set_cpu(cpu, &data->mask); + return; + } + } + + data = kzalloc(sizeof(*data), GFP_KERNEL); + hash_add(freq_domain_map, &data->hash_node, key%SIZE); + cpumask_set_cpu(cpu, &data->mask); + data->chip_id = cpu_to_chip_id(cpu); + data->pir_key = key; + +} + /* * Initialize the freq table based on data obtained * from the firmware passed via device-tree @@ -206,7 +242,9 @@ static int init_powernv_pstates(void) u32 len_ids, len_freqs; u32 pstate_min, pstate_max, pstate_nominal; u32 pstate_turbo, pstate_ultra_turbo; + u32 key; + + flag = 0; power_mgt = of_find_node_by_path("/ibm,opal/power-mgt"); if (!power_mgt) { pr_warn("power-mgt node not found\n"); @@ -229,6 +267,17 @@ return -ENODEV; } + if (of_device_is_compatible(power_mgt, "freq-domain-v1") && + of_property_read_u32(power_mgt, "ibm,freq-domain-indicator", + &freq_domain_indicator)) { + pr_warn("ibm,freq-domain-indicator not found\n"); + freq_domain_indicator = 0; + } + + if (of_device_is_compatible(power_mgt, "P9-occ-quirk")) { + flag = 1; + } + if (of_property_read_u32(power_mgt, "ibm,pstate-ultra-turbo", &pstate_ultra_turbo)) { powernv_pstate_info.wof_enabled = false; @@ -249,6 +298,7 @@ next: pr_info("cpufreq pstate min %d nominal %d max %d\n", pstate_min, pstate_nominal, pstate_max); + pr_info("frequency domain indicator %d", freq_domain_indicator); pr_info("Workload Optimized Frequency is %s in the platform\n", 
(powernv_pstate_info.wof_enabled) ? "enabled" : "disabled"); @@ -276,6 +326,15 @@ static int init_powernv_pstates(void) return -ENODEV; } + if (freq_domain_indicator) { + hash_init(freq_domain_map); + for_each_possible_cpu(i) { + key = ((u32) get_hard_smp_processor_id(i) & + freq_domain_indicator); + insert(key, i); + } + } + powernv_pstate_info.nr_pstates = nr_pstates; pr_debug("NR PStates %d\n", n
[PATCH] cpufreq: powernv: Add support of frequency domain
Frequency-domain indicates group of CPUs that would share same frequency. It is detected using device-tree node "frequency-domain-indicator". frequency-domain-indicator is a bitmask which will have different value depending upon the generation of the processor. CPUs of the same chip for which the result of a bitwise AND between their PIR and the frequency-domain-indicator is the same share the same frequency. In this patch, we define hash-table indexed by the aforementioned bitwise ANDed value to store the cpumask of the CPUs sharing the same frequency domain. Further, the cpufreq policy will be created per frequency-domain So for POWER9, a cpufreq policy is created per quad while for POWER8 it is created per core. Governor decides frequency for each policy but multiple cores may come under same policy. In such case frequency needs to be set on each core sharing that policy. Signed-off-by: Abhishek Goel <hunt...@linux.vnet.ibm.com> --- drivers/cpufreq/powernv-cpufreq.c | 95 +++ 1 file changed, 87 insertions(+), 8 deletions(-) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index b6d7c4c..9384110 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -37,6 +37,7 @@ #include /* Required for cpu_sibling_mask() in UP configs */ #include #include +#include #define POWERNV_MAX_PSTATES256 #define PMSR_PSAFE_ENABLE (1UL << 30) @@ -130,6 +131,8 @@ static struct chip { static int nr_chips; static DEFINE_PER_CPU(struct chip *, chip_info); +static u32 freq_domain_indicator; + /* * Note: * The set of pstates consists of contiguous integers. 
@@ -194,6 +197,38 @@ static inline void reset_gpstates(struct cpufreq_policy *policy) gpstates->last_gpstate_idx = 0; } +#define SIZE NR_CPUS +#define ORDER_FREQ_MAP ilog2(SIZE) + +static DEFINE_HASHTABLE(freq_domain_map, ORDER_FREQ_MAP); + +struct hashmap { + cpumask_t mask; + int chip_id; + u32 pir_key; + struct hlist_node hash_node; +}; + +static void insert(u32 key, int cpu) +{ + struct hashmap *data; + + hash_for_each_possible(freq_domain_map, data, hash_node, key%SIZE) { + if (data->chip_id == cpu_to_chip_id(cpu) && + data->pir_key == key) { + cpumask_set_cpu(cpu, &data->mask); + return; + } + } + + data = kzalloc(sizeof(*data), GFP_KERNEL); + hash_add(freq_domain_map, &data->hash_node, key%SIZE); + cpumask_set_cpu(cpu, &data->mask); + data->chip_id = cpu_to_chip_id(cpu); + data->pir_key = key; + +} + /* * Initialize the freq table based on data obtained * from the firmware passed via device-tree @@ -206,6 +241,7 @@ static int init_powernv_pstates(void) u32 len_ids, len_freqs; u32 pstate_min, pstate_max, pstate_nominal; u32 pstate_turbo, pstate_ultra_turbo; + u32 key; power_mgt = of_find_node_by_path("/ibm,opal/power-mgt"); if (!power_mgt) { @@ -229,6 +265,13 @@ return -ENODEV; } + if (of_device_is_compatible(power_mgt, "freq-domain-v1") && + of_property_read_u32(power_mgt, "ibm,freq-domain-indicator", + &freq_domain_indicator)) { + pr_warn("ibm,freq-domain-indicator not found\n"); + freq_domain_indicator = 0; + } + if (of_property_read_u32(power_mgt, "ibm,pstate-ultra-turbo", &pstate_ultra_turbo)) { powernv_pstate_info.wof_enabled = false; @@ -249,6 +292,7 @@ next: pr_info("cpufreq pstate min %d nominal %d max %d\n", pstate_min, pstate_nominal, pstate_max); + pr_info("frequency domain indicator %d", freq_domain_indicator); pr_info("Workload Optimized Frequency is %s in the platform\n", (powernv_pstate_info.wof_enabled) ? 
"enabled" : "disabled"); @@ -276,6 +320,15 @@ static int init_powernv_pstates(void) return -ENODEV; } + if (freq_domain_indicator) { + hash_init(freq_domain_map); + for_each_possible_cpu(i) { + key = ((u32) get_hard_smp_processor_id(i) & + freq_domain_indicator); + insert(key, i); + } + } + powernv_pstate_info.nr_pstates = nr_pstates; pr_debug("NR PStates %d\n", nr_pstates); for (i = 0; i < nr_pstates; i++) { @@ -693,6 +746,8 @@ static int powernv_cpufreq_target_index(struct cpufreq_policy *policy, { struct powernv_smp_call_data freq_data; unsigned in