This is intended to easily detect irq spinlock self-deadlocks like:

  spin_lock_irq(A);
  spin_lock_irq(B);
  spin_unlock_irq(B);
    IRQ {
      spin_lock(A); <- deadlocks
      spin_unlock(A);
    }
  spin_unlock_irq(A);

Recently we saw this kind of deadlock on our partner's node:

PID: 408      TASK: ffff8eee0870ca00  CPU: 36   COMMAND: "kworker/36:1H"
 #0 [fffffe3861831e60] crash_nmi_callback at ffffffff97269e31
 #1 [fffffe3861831e68] nmi_handle at ffffffff972300bb
 #2 [fffffe3861831eb0] default_do_nmi at ffffffff97e9e000
 #3 [fffffe3861831ed0] exc_nmi at ffffffff97e9e211
 #4 [fffffe3861831ef0] end_repeat_nmi at ffffffff98001639
    [exception RIP: native_queued_spin_lock_slowpath+638]
    RIP: ffffffff97eb31ae  RSP: ffffb1c8cd2a4d40  RFLAGS: 00000046
    RAX: 0000000000000000  RBX: ffff8f2dffb34780  RCX: 0000000000940000
    RDX: 000000000000002a  RSI: 0000000000ac0000  RDI: ffff8eaed4eb81c0
    RBP: ffff8eaed4eb81c0   R8: 0000000000000000   R9: ffff8f2dffaf3438
    R10: 0000000000000000  R11: 0000000000000000  R12: 0000000000000000
    R13: 0000000000000024  R14: 0000000000000000  R15: ffffd1c8bfb24b80
    ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
    <NMI exception stack>
 #5 [ffffb1c8cd2a4d40] native_queued_spin_lock_slowpath at ffffffff97eb31ae
 #6 [ffffb1c8cd2a4d60] _raw_spin_lock_irqsave at ffffffff97eb2730
 #7 [ffffb1c8cd2a4d70] __wake_up at ffffffff9737c02d
 #8 [ffffb1c8cd2a4da0] sbitmap_queue_wake_up at ffffffff9786c74d
 #9 [ffffb1c8cd2a4dc8] sbitmap_queue_clear at ffffffff9786cc97
    <IRQ stack>
    [exception RIP: _raw_spin_unlock_irq+20]
    RIP: ffffffff97eb2e84  RSP: ffffb1c8cd90fd18  RFLAGS: 00000283
    RAX: 0000000000000001  RBX: ffff8eafb68efb40  RCX: 0000000000000001
    RDX: 0000000000000008  RSI: 0000000000000061  RDI: ffff8eafb06c3c70
    RBP: ffff8eee7af43000   R8: ffff8eaed4eb81c8   R9: ffff8eaed4eb81c8
    R10: 0000000000000008  R11: 0000000000000008  R12: 0000000000000000
    R13: ffff8eafb06c3bd0  R14: ffff8eafb06c3bc0  R15: ffff8eaed4eb81c0
    ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018

Luckily it was already fixed in mainline by:
commit b313a8c83551 ("block: Fix lockdep warning in blk_mq_mark_tag_wait")

Currently if we are unlucky we may miss such a deadlock on our testing
system as it is racy and it depends on the specific interrupt handler
appearing at the right place and at the right time. So this patch tries
to detect the problem despite the absence of the interrupt.

If we see spin_lock_irq() called with interrupts already disabled, we can
assume it has a paired spin_unlock_irq() which would re-enable interrupts
where they should not be re-enabled. So we report a warning for it.

Same thing on spin_unlock_irq(): even if we were lucky and there was no
deadlock, let's report a warning if interrupts were already enabled.

Let's make this functionality catch one problem and then disable itself,
to avoid spamming the kernel log with warnings. Also let's add the sysctl
kernel.debug_spin_lock_irq_with_disabled_interrupts to re-enable it if
needed, and a configuration option
DEBUG_SPINLOCK_IRQ_WITH_DISABLED_INTERRUPTS_BY_DEFAULT to enable it by
default, in case we need this check active on boot.

Yes, lockdep can detect this if it sees both the interrupt stack and the
regular stack where we can get into an interrupt with the spinlock held.
But with this approach we can detect the problem even without ever getting
onto the interrupt stack. Also this functionality seems to be more
lightweight than lockdep as it does not need to maintain a lock dependency
graph.

Tested with https://github.com/Snorch/spinlock-irq-test-module.

LKML: https://lore.kernel.org/all/20250606095741.46775-1-ptikhomi...@virtuozzo.com/
https://virtuozzo.atlassian.net/browse/VSTOR-108245
Signed-off-by: Pavel Tikhomirov <ptikhomi...@virtuozzo.com>
---
 include/linux/spinlock.h  | 21 +++++++++++++++++++++
 kernel/locking/spinlock.c |  6 ++++++
 kernel/sysctl.c           |  9 +++++++++
 lib/Kconfig.debug         | 12 ++++++++++++
 4 files changed, 48 insertions(+)

diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index 31d3d747a9db..bb87b1c6e45b 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -371,8 +371,21 @@ do {								\
        raw_spin_lock_nest_lock(spinlock_check(lock), nest_lock);       \
 } while (0)
 
+#ifdef CONFIG_DEBUG_SPINLOCK
+DECLARE_STATIC_KEY_MAYBE(CONFIG_DEBUG_SPINLOCK_IRQ_WITH_DISABLED_INTERRUPTS_BY_DEFAULT,
+                        debug_spin_lock_irq_with_disabled_interrupts);
+#endif
+
 static __always_inline void spin_lock_irq(spinlock_t *lock)
 {
+#ifdef CONFIG_DEBUG_SPINLOCK
+	if (static_branch_unlikely(&debug_spin_lock_irq_with_disabled_interrupts)) {
+		if (raw_irqs_disabled()) {
+			static_branch_disable(&debug_spin_lock_irq_with_disabled_interrupts);
+			WARN(1, "spin_lock_irq() called with irqs disabled!\n");
+		}
+	}
+#endif
        raw_spin_lock_irq(&lock->rlock);
 }
 
@@ -398,6 +411,14 @@ static __always_inline void spin_unlock_bh(spinlock_t *lock)
 
 static __always_inline void spin_unlock_irq(spinlock_t *lock)
 {
+#ifdef CONFIG_DEBUG_SPINLOCK
+	if (static_branch_unlikely(&debug_spin_lock_irq_with_disabled_interrupts)) {
+		if (!raw_irqs_disabled()) {
+			static_branch_disable(&debug_spin_lock_irq_with_disabled_interrupts);
+			WARN(1, "spin_unlock_irq() called with irqs enabled!\n");
+		}
+	}
+#endif
        raw_spin_unlock_irq(&lock->rlock);
 }
 
diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c
index 8475a0794f8c..b01ef68cccf5 100644
--- a/kernel/locking/spinlock.c
+++ b/kernel/locking/spinlock.c
@@ -22,6 +22,12 @@
 #include <linux/debug_locks.h>
 #include <linux/export.h>
 
+#ifdef CONFIG_DEBUG_SPINLOCK
+DEFINE_STATIC_KEY_MAYBE(CONFIG_DEBUG_SPINLOCK_IRQ_WITH_DISABLED_INTERRUPTS_BY_DEFAULT,
+                       debug_spin_lock_irq_with_disabled_interrupts);
+EXPORT_SYMBOL(debug_spin_lock_irq_with_disabled_interrupts);
+#endif
+
 #ifdef CONFIG_MMIOWB
 #ifndef arch_mmiowb_state
 DEFINE_PER_CPU(struct mmiowb_state, __mmiowb_state);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 532a13d25394..50142e21f38c 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -68,6 +68,7 @@
 #include <linux/pid.h>
 #include <linux/delayacct.h>
 #include <linux/ve.h>
+#include <linux/spinlock.h>
 
 #include "../lib/kstrtox.h"
 
@@ -2273,6 +2274,14 @@ static struct ctl_table kern_table[] = {
                .proc_handler   = proc_dointvec_minmax,
                .extra1         = SYSCTL_ZERO,
        },
+#ifdef CONFIG_DEBUG_SPINLOCK
+       {
+		.procname	= "debug_spin_lock_irq_with_disabled_interrupts",
+               .data           = &debug_spin_lock_irq_with_disabled_interrupts,
+               .mode           = 0644,
+               .proc_handler   = proc_do_static_key,
+       },
+#endif
        { }
 };
 
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index a710b336058e..9add2f91a1ed 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1349,6 +1349,18 @@ config DEBUG_SPINLOCK
          best used in conjunction with the NMI watchdog so that spinlock
          deadlocks are also debuggable.
 
+config DEBUG_SPINLOCK_IRQ_WITH_DISABLED_INTERRUPTS_BY_DEFAULT
+       bool "Detect spin_(un)lock_irq() call with disabled(enabled) interrupts"
+       depends on DEBUG_SPINLOCK
+       help
+         Say Y here to detect spin_lock_irq() and spin_unlock_irq() calls
+         made with interrupts already disabled (resp. enabled). This helps
+         detect bugs where the code is not using the right locking
+         primitives, e.g. calling spin_lock_irq() twice in a row (on
+         different locks), which can re-enable interrupts where they
+         should stay disabled and lead to a deadlock.
+
+         Say N if you are unsure.
+
 config DEBUG_MUTEXES
        bool "Mutex debugging: basic checks"
        depends on DEBUG_KERNEL && !PREEMPT_RT
-- 
2.49.0

_______________________________________________
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

Reply via email to