Re: [PATCHv8 2/2] watchdog/softlockup: report the most frequent interrupts

2024-02-20 Thread Bitao Hu

Hi,

On 2024/2/20 17:35, Thomas Gleixner wrote:

> On Tue, Feb 20 2024 at 00:19, Bitao Hu wrote:
>>  arch/mips/dec/setup.c|   2 +-
>>  arch/parisc/kernel/smp.c |   2 +-
>>  arch/powerpc/kvm/book3s_hv_rm_xics.c |   2 +-
>>  include/linux/irqdesc.h  |   9 ++-
>>  include/linux/kernel_stat.h  |   4 +
>>  kernel/irq/internals.h   |   2 +-
>>  kernel/irq/irqdesc.c |  34 ++--
>>  kernel/irq/proc.c|   9 +--
>
> This really wants to be split into two patches. Interrupt infrastructure
> first and then the actual usage site in the watchdog code.


Okay, I will split it into two patches.


Re: [PATCHv8 2/2] watchdog/softlockup: report the most frequent interrupts

2024-02-20 Thread Thomas Gleixner
On Tue, Feb 20 2024 at 00:19, Bitao Hu wrote:
>  arch/mips/dec/setup.c|   2 +-
>  arch/parisc/kernel/smp.c |   2 +-
>  arch/powerpc/kvm/book3s_hv_rm_xics.c |   2 +-
>  include/linux/irqdesc.h  |   9 ++-
>  include/linux/kernel_stat.h  |   4 +
>  kernel/irq/internals.h   |   2 +-
>  kernel/irq/irqdesc.c |  34 ++--
>  kernel/irq/proc.c|   9 +--

This really wants to be split into two patches. Interrupt infrastructure
first and then the actual usage site in the watchdog code.

Thanks,

tglx


[PATCHv8 2/2] watchdog/softlockup: report the most frequent interrupts

2024-02-19 Thread Bitao Hu
When the watchdog determines that the current soft lockup is due
to an interrupt storm based on CPU utilization, reporting the
most frequent interrupts could be good enough for further
troubleshooting.

Below is an example of an interrupt storm. The call tree does not
provide useful information, but we can analyze which interrupt
caused the soft lockup by comparing the counts of interrupts.

[ 2987.488075] watchdog: BUG: soft lockup - CPU#9 stuck for 23s! [kworker/9:1:214]
[ 2987.488607] CPU#9 Utilization every 4s during lockup:
[ 2987.488941]  #1:   0% system,  0% softirq,   100% hardirq, 0% idle
[ 2987.489357]  #2:   0% system,  0% softirq,   100% hardirq, 0% idle
[ 2987.489771]  #3:   0% system,  0% softirq,   100% hardirq, 0% idle
[ 2987.490186]  #4:   0% system,  0% softirq,   100% hardirq, 0% idle
[ 2987.490601]  #5:   0% system,  0% softirq,   100% hardirq, 0% idle
[ 2987.491034] CPU#9 Detect HardIRQ Time exceeds 50%. Most frequent HardIRQs:
[ 2987.491493]  #1: 330985  irq#7
[ 2987.491743]  #2: 5000    irq#10
[ 2987.492039]  #3: 9       irq#91
[ 2987.492318]  #4: 3       irq#118
...
[ 2987.492728] Call trace:
[ 2987.492729]  __do_softirq+0xa8/0x364

Signed-off-by: Bitao Hu 
---
 arch/mips/dec/setup.c|   2 +-
 arch/parisc/kernel/smp.c |   2 +-
 arch/powerpc/kvm/book3s_hv_rm_xics.c |   2 +-
 include/linux/irqdesc.h  |   9 ++-
 include/linux/kernel_stat.h  |   4 +
 kernel/irq/internals.h   |   2 +-
 kernel/irq/irqdesc.c |  34 ++--
 kernel/irq/proc.c|   9 +--
 kernel/watchdog.c| 115 ++-
 scripts/gdb/linux/interrupts.py  |   6 +-
 10 files changed, 159 insertions(+), 26 deletions(-)

diff --git a/arch/mips/dec/setup.c b/arch/mips/dec/setup.c
index 6c3704f51d0d..87f0a1436bf9 100644
--- a/arch/mips/dec/setup.c
+++ b/arch/mips/dec/setup.c
@@ -756,7 +756,7 @@ void __init arch_init_irq(void)
NULL))
pr_err("Failed to register fpu interrupt\n");
desc_fpu = irq_to_desc(irq_fpu);
-   fpu_kstat_irq = this_cpu_ptr(desc_fpu->kstat_irqs);
+   fpu_kstat_irq = this_cpu_ptr(&desc_fpu->kstat_irqs->cnt);
}
if (dec_interrupt[DEC_IRQ_CASCADE] >= 0) {
if (request_irq(dec_interrupt[DEC_IRQ_CASCADE], no_action,
diff --git a/arch/parisc/kernel/smp.c b/arch/parisc/kernel/smp.c
index 444154271f23..800eb64e91ad 100644
--- a/arch/parisc/kernel/smp.c
+++ b/arch/parisc/kernel/smp.c
@@ -344,7 +344,7 @@ static int smp_boot_one_cpu(int cpuid, struct task_struct 
*idle)
struct irq_desc *desc = irq_to_desc(i);
 
if (desc && desc->kstat_irqs)
-   *per_cpu_ptr(desc->kstat_irqs, cpuid) = 0;
+   *per_cpu_ptr(desc->kstat_irqs, cpuid) = (struct 
irqstat) { };
}
 #endif
 
diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c 
b/arch/powerpc/kvm/book3s_hv_rm_xics.c
index e42984878503..f2636414d82a 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_xics.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c
@@ -837,7 +837,7 @@ static inline void this_cpu_inc_rm(unsigned int __percpu 
*addr)
  */
 static void kvmppc_rm_handle_irq_desc(struct irq_desc *desc)
 {
-   this_cpu_inc_rm(desc->kstat_irqs);
+   this_cpu_inc_rm(&desc->kstat_irqs->cnt);
__this_cpu_inc(kstat.irqs_sum);
 }
 
diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
index d9451d456a73..2912b1998670 100644
--- a/include/linux/irqdesc.h
+++ b/include/linux/irqdesc.h
@@ -17,6 +17,11 @@ struct irq_desc;
 struct irq_domain;
 struct pt_regs;
 
+struct irqstat {
+   unsigned intcnt;
+   unsigned intref;
+};
+
 /**
  * struct irq_desc - interrupt descriptor
  * @irq_common_data:   per irq and chip data passed down to chip functions
@@ -55,7 +60,7 @@ struct pt_regs;
 struct irq_desc {
struct irq_common_data  irq_common_data;
struct irq_data irq_data;
-   unsigned int __percpu   *kstat_irqs;
+   struct irqstat __percpu *kstat_irqs;
irq_flow_handler_t  handle_irq;
struct irqaction*action;/* IRQ action list */
unsigned intstatus_use_accessors;
@@ -119,7 +124,7 @@ extern struct irq_desc irq_desc[NR_IRQS];
 static inline unsigned int irq_desc_kstat_cpu(struct irq_desc *desc,
  unsigned int cpu)
 {
-   return desc->kstat_irqs ? *per_cpu_ptr(desc->kstat_irqs, cpu) : 0;
+   return desc->kstat_irqs ? per_cpu(desc->kstat_irqs->cnt, cpu) : 0;
 }
 
 static inline struct irq_desc *irq_data_to_desc(struct irq_data *data)
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 9935f7ecbfb9..9cbb1361f957 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -79,6 +79,10 @@ static