There is a race window between the point when __spin_lock_debug() detects a spinlock lockup and the time when the CPU that caused the lockup receives its backtrace interrupt.
Before __spin_lock_debug() triggers trigger_all_cpu_backtrace() it calls spin_dump() to printk() the current state of the lock and the CPU backtrace. These printk() calls can take some time to print the messages to a serial console, for instance (we are not talking about the console_unlock() loop and a flood of messages from other CPUs, but just the spin_dump() printk() and a serial console). All those preparation steps can give the CPU that caused the lockup enough time to run away, so when it receives a backtrace interrupt it can look completely innocent. The patch extends `struct raw_spinlock' with an additional variable that stores the jiffies value of a successful do_raw_spin_lock() and checks in debug_spin_unlock() whether the spin_lock has been held for too long. So we will have a reliable backtrace from the CPU that locked up and a reliable backtrace from the CPU that caused the lockup. Missed spin_lock unlock deadline report (example): BUG: spinlock missed unlock deadline on CPU#0, bash/327 lock: lock.25562+0x0/0x60, .magic: dead4ead, .owner: bash/327, .owner_cpu: 0 CPU: 0 PID: 327 Comm: bash Call Trace: dump_stack+0x4f/0x65 spin_dump+0x8a/0x8f spin_bug+0x2b/0x2d do_raw_spin_unlock+0x92/0xa3 _raw_spin_unlock+0x27/0x44 ... 
Signed-off-by: Sergey Senozhatsky <sergey.senozhat...@gmail.com> --- include/linux/spinlock_types.h | 4 +++- kernel/locking/spinlock_debug.c | 5 +++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/include/linux/spinlock_types.h b/include/linux/spinlock_types.h index 73548eb13a5d..8972e56eeefb 100644 --- a/include/linux/spinlock_types.h +++ b/include/linux/spinlock_types.h @@ -25,6 +25,7 @@ typedef struct raw_spinlock { #ifdef CONFIG_DEBUG_SPINLOCK unsigned int magic, owner_cpu; void *owner; + unsigned long acquire_tstamp; #endif #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; @@ -45,7 +46,8 @@ typedef struct raw_spinlock { # define SPIN_DEBUG_INIT(lockname) \ .magic = SPINLOCK_MAGIC, \ .owner_cpu = -1, \ - .owner = SPINLOCK_OWNER_INIT, + .owner = SPINLOCK_OWNER_INIT, \ + .acquire_tstamp = 0, #else # define SPIN_DEBUG_INIT(lockname) #endif diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c index 0374a596cffa..daeab4bc86ff 100644 --- a/kernel/locking/spinlock_debug.c +++ b/kernel/locking/spinlock_debug.c @@ -12,6 +12,7 @@ #include <linux/debug_locks.h> #include <linux/delay.h> #include <linux/export.h> +#include <linux/jiffies.h> void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name, struct lock_class_key *key) @@ -27,6 +28,7 @@ void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name, lock->magic = SPINLOCK_MAGIC; lock->owner = SPINLOCK_OWNER_INIT; lock->owner_cpu = -1; + lock->acquire_tstamp = 0; } EXPORT_SYMBOL(__raw_spin_lock_init); @@ -90,6 +92,7 @@ static inline void debug_spin_lock_after(raw_spinlock_t *lock) { lock->owner_cpu = raw_smp_processor_id(); lock->owner = current; + lock->acquire_tstamp = jiffies; } static inline void debug_spin_unlock(raw_spinlock_t *lock) @@ -99,6 +102,8 @@ static inline void debug_spin_unlock(raw_spinlock_t *lock) SPIN_BUG_ON(lock->owner != current, lock, "wrong owner"); SPIN_BUG_ON(lock->owner_cpu != raw_smp_processor_id(), lock, "wrong CPU"); + 
SPIN_BUG_ON(time_after_eq(jiffies, lock->acquire_tstamp + HZ), + lock, "missed unlock deadline"); lock->owner = SPINLOCK_OWNER_INIT; lock->owner_cpu = -1; } -- 2.11.0