On Thu, 2025-12-18 at 17:21 +0200, Mika Kahola wrote: > From: Srinivas Pandruvada <[email protected]> > > With the RAPL PMU addition, there is a recursive locking when CPU > online > callback function calls rapl_package_add_pmu(). Here cpu_hotplug_lock > is already acquired by cpuhp_thread_fun() and rapl_package_add_pmu() > tries to acquire again. > > <4>[ 8.197433] ============================================ > <4>[ 8.197437] WARNING: possible recursive locking detected > <4>[ 8.197440] 6.19.0-rc1-lgci-xe-xe-4242-05b7c58b3367dca84+ #1 Not > tainted > <4>[ 8.197444] -------------------------------------------- > <4>[ 8.197447] cpuhp/0/20 is trying to acquire lock: > <4>[ 8.197450] ffffffff83487870 (cpu_hotplug_lock){++++}-{0:0}, at: > rapl_package_add_pmu+0x37/0x370 [intel_rapl_common] > <4>[ 8.197463] > but task is already holding lock: > <4>[ 8.197466] ffffffff83487870 (cpu_hotplug_lock){++++}-{0:0}, at: > cpuhp_thread_fun+0x6d/0x290 > <4>[ 8.197477] > other info that might help us debug this: > <4>[ 8.197480] Possible unsafe locking scenario: > > <4>[ 8.197483] CPU0 > <4>[ 8.197485] ---- > <4>[ 8.197487] lock(cpu_hotplug_lock); > <4>[ 8.197490] lock(cpu_hotplug_lock); > <4>[ 8.197493] > *** DEADLOCK *** > .. > .. > <4>[ 8.197542] __lock_acquire+0x146e/0x2790 > <4>[ 8.197548] lock_acquire+0xc4/0x2c0 > <4>[ 8.197550] ? rapl_package_add_pmu+0x37/0x370 [intel_rapl_common] > <4>[ 8.197556] cpus_read_lock+0x41/0x110 > <4>[ 8.197558] ? rapl_package_add_pmu+0x37/0x370 [intel_rapl_common] > <4>[ 8.197561] rapl_package_add_pmu+0x37/0x370 [intel_rapl_common] > <4>[ 8.197565] rapl_cpu_online+0x85/0x87 [intel_rapl_msr] > <4>[ 8.197568] ? __pfx_rapl_cpu_online+0x10/0x10 [intel_rapl_msr] > <4>[ 8.197570] cpuhp_invoke_callback+0x41f/0x6c0 > <4>[ 8.197573] ? cpuhp_thread_fun+0x6d/0x290 > <4>[ 8.197575] cpuhp_thread_fun+0x1e2/0x290 > <4>[ 8.197578] ? smpboot_thread_fn+0x26/0x290 > <4>[ 8.197581] smpboot_thread_fn+0x12f/0x290 > <4>[ 8.197584] ? __pfx_smpboot_thread_fn+0x10/0x10 > <4>[ 8.197586] kthread+0x11f/0x250 > <4>[ 8.197589] ? __pfx_kthread+0x10/0x10 > <4>[ 8.197592] ret_from_fork+0x344/0x3a0 > <4>[ 8.197595] ? __pfx_kthread+0x10/0x10 > <4>[ 8.197597] ret_from_fork_asm+0x1a/0x30 > <4>[ 8.197604] </TASK> > > Fix this issue in the same way as rapl powercap package domain is > added > from the same CPU online callback by introducing another interface > which > doesn't call cpus_read_lock(). Add rapl_package_add_pmu_locked() and > rapl_package_remove_pmu_locked() which don't call cpus_read_lock().
Acked-by: Jouni Högander <[email protected]> > > Fixes: 748d6ba43afd ("powercap: intel_rapl: Enable MSR-based RAPL PMU > support") > Reported-by: Borah, Chaitanya Kumar <[email protected]> > Closes: > https://lore.kernel.org/linux-pm/[email protected]/T/#u > Tested-by: Kuppuswamy Sathyanarayanan > <[email protected]> > Tested-by: RavitejaX Veesam <[email protected]> > Signed-off-by: Srinivas Pandruvada > <[email protected]> > Link: > https://patch.msgid.link/[email protected] > Signed-off-by: Rafael J. Wysocki <[email protected]> > Signed-off-by: Mika Kahola <[email protected]> > --- > drivers/powercap/intel_rapl_common.c | 24 ++++++++++++++++++------ > drivers/powercap/intel_rapl_msr.c | 4 ++-- > include/linux/intel_rapl.h | 4 ++++ > 3 files changed, 24 insertions(+), 8 deletions(-) > > diff --git a/drivers/powercap/intel_rapl_common.c > b/drivers/powercap/intel_rapl_common.c > index b9d87e56cbbc..3ff6da3bf4e6 100644 > --- a/drivers/powercap/intel_rapl_common.c > +++ b/drivers/powercap/intel_rapl_common.c > @@ -2032,7 +2032,7 @@ static int rapl_pmu_update(struct rapl_package > *rp) > return ret; > } > > -int rapl_package_add_pmu(struct rapl_package *rp) > +int rapl_package_add_pmu_locked(struct rapl_package *rp) > { > struct rapl_package_pmu_data *data = &rp->pmu_data; > int idx; > @@ -2040,8 +2040,6 @@ int rapl_package_add_pmu(struct rapl_package > *rp) > if (rp->has_pmu) > return -EEXIST; > > - guard(cpus_read_lock)(); > - > for (idx = 0; idx < rp->nr_domains; idx++) { > struct rapl_domain *rd = &rp->domains[idx]; > int domain = rd->id; > @@ -2091,17 +2089,23 @@ int rapl_package_add_pmu(struct rapl_package > *rp) > > return rapl_pmu_update(rp); > } > +EXPORT_SYMBOL_GPL(rapl_package_add_pmu_locked); > + > +int rapl_package_add_pmu(struct rapl_package *rp) > +{ > + guard(cpus_read_lock)(); > + > + return rapl_package_add_pmu_locked(rp); > +} > EXPORT_SYMBOL_GPL(rapl_package_add_pmu); > > -void rapl_package_remove_pmu(struct rapl_package *rp) > +void rapl_package_remove_pmu_locked(struct rapl_package *rp) > { > struct rapl_package *pos; > > if (!rp->has_pmu) > return; > > - guard(cpus_read_lock)(); > - > list_for_each_entry(pos, &rapl_packages, plist) { > /* PMU is still needed */ > if (pos->has_pmu && pos != rp) > @@ -2111,6 +2115,14 @@ void rapl_package_remove_pmu(struct > rapl_package *rp) > perf_pmu_unregister(&rapl_pmu.pmu); > memset(&rapl_pmu, 0, sizeof(struct rapl_pmu)); > } > +EXPORT_SYMBOL_GPL(rapl_package_remove_pmu_locked); > + > +void rapl_package_remove_pmu(struct rapl_package *rp) > +{ > + guard(cpus_read_lock)(); > + > + rapl_package_remove_pmu_locked(rp); > +} > EXPORT_SYMBOL_GPL(rapl_package_remove_pmu); > #endif > > diff --git a/drivers/powercap/intel_rapl_msr.c > b/drivers/powercap/intel_rapl_msr.c > index 0ce1096b6314..9a7e150b3536 100644 > --- a/drivers/powercap/intel_rapl_msr.c > +++ b/drivers/powercap/intel_rapl_msr.c > @@ -82,7 +82,7 @@ static int rapl_cpu_online(unsigned int cpu) > if (IS_ERR(rp)) > return PTR_ERR(rp); > if (rapl_msr_pmu) > - rapl_package_add_pmu(rp); > + rapl_package_add_pmu_locked(rp); > } > cpumask_set_cpu(cpu, &rp->cpumask); > return 0; > @@ -101,7 +101,7 @@ static int rapl_cpu_down_prep(unsigned int cpu) > lead_cpu = cpumask_first(&rp->cpumask); > if (lead_cpu >= nr_cpu_ids) { > if (rapl_msr_pmu) > - rapl_package_remove_pmu(rp); > + rapl_package_remove_pmu_locked(rp); > rapl_remove_package_cpuslocked(rp); > } else if (rp->lead_cpu == cpu) { > rp->lead_cpu = lead_cpu; > diff --git a/include/linux/intel_rapl.h b/include/linux/intel_rapl.h > index e9ade2ff4af6..f479ef5b3341 100644 > --- a/include/linux/intel_rapl.h > +++ b/include/linux/intel_rapl.h > @@ -214,10 +214,14 @@ void rapl_remove_package(struct rapl_package > *rp); > > #ifdef CONFIG_PERF_EVENTS > int rapl_package_add_pmu(struct rapl_package *rp); > +int rapl_package_add_pmu_locked(struct rapl_package *rp); > void rapl_package_remove_pmu(struct rapl_package *rp); > +void rapl_package_remove_pmu_locked(struct rapl_package *rp); > #else > static inline int rapl_package_add_pmu(struct rapl_package *rp) { > return 0; } > +static inline int rapl_package_add_pmu_locked(struct rapl_package > *rp) { return 0; } > static inline void rapl_package_remove_pmu(struct rapl_package *rp) > { } > +static inline void rapl_package_remove_pmu_locked(struct > rapl_package *rp) { } > #endif > > #endif /* __INTEL_RAPL_H__ */
