I'd like to commit this.  I've received positive reports from a few
amd64 users and an i386 and softraid user, and all of the locking bugs
exposed so far have already been fixed.

I plan to remove the "#define panic()" hacks and let future locking
problems actually panic; if anyone thinks they should stay in for a
bit longer, let me know.

ok?


On Mon, Sep 20, 2010 at 08:00:27PM -0700, Matthew Dempsky wrote:
> I'd really appreciate if people could test the diffs below and report
> back to me.  This adds some warnings about lock API misuse that has
> already caught two bugs in DRM.  Eventually, I want these to be panics
> instead, but using just warnings for now should make it easier for
> people to test.
> 
> Please apply the diff below to an up-to-date -current kernel and check
> dmesg for any stack backtraces.  If you see any, please send them to
> me.
> 
> I'd especially like reports from both i386 and amd64, and also from
> softraid users.
> 
> 
> More details on the diff: this adds per-processor mutex lock counting
> to i386 and amd64, and updates assertwaitok() to check that this value
> is 0.  They also now check that the process that calls mtx_leave() is
> the same that called mtx_enter().  rw_exit*() are changed to call
> rw_assert_{rd,wr}lock() as appropriate.  Finally, mi_switch() also
> calls assertwaitok() now.
> 
> 
> Index: kern/subr_xxx.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/subr_xxx.c,v
> retrieving revision 1.10
> diff -u -p -r1.10 subr_xxx.c
> --- kern/subr_xxx.c   21 Sep 2010 01:09:10 -0000      1.10
> +++ kern/subr_xxx.c   21 Sep 2010 01:45:59 -0000
> @@ -153,6 +153,9 @@ blktochr(dev_t dev)
>       return (NODEV);
>  }
>  
> +#include <ddb/db_output.h>
> +#define panic(x...) do { printf(x); db_stack_dump(); } while (0);
> +
>  /*
>   * Check that we're in a context where it's okay to sleep.
>   */
> @@ -160,4 +163,9 @@ void
>  assertwaitok(void)
>  {
>       splassert(IPL_NONE);
> +#ifdef __HAVE_CPU_MUTEX_LEVEL
> +     if (curcpu()->ci_mutex_level != 0)
> +             panic("assertwaitok: non-zero mutex count: %d",
> +                 curcpu()->ci_mutex_level);
> +#endif
>  }
> Index: kern/kern_rwlock.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/kern_rwlock.c,v
> retrieving revision 1.15
> diff -u -p -r1.15 kern_rwlock.c
> --- kern/kern_rwlock.c        13 Aug 2009 23:12:15 -0000      1.15
> +++ kern/kern_rwlock.c        21 Sep 2010 01:45:59 -0000
> @@ -107,6 +107,8 @@ rw_exit_read(struct rwlock *rwl)
>  {
>       unsigned long owner = rwl->rwl_owner;
>  
> +     rw_assert_rdlock(rwl);
> +
>       if (__predict_false((owner & RWLOCK_WAIT) ||
>           rw_cas(&rwl->rwl_owner, owner, owner - RWLOCK_READ_INCR)))
>               rw_exit(rwl);
> @@ -117,6 +119,8 @@ rw_exit_write(struct rwlock *rwl)
>  {
>       unsigned long owner = rwl->rwl_owner;
>  
> +     rw_assert_wrlock(rwl);
> +
>       if (__predict_false((owner & RWLOCK_WAIT) ||
>           rw_cas(&rwl->rwl_owner, owner, 0)))
>               rw_exit(rwl);
> @@ -236,6 +240,11 @@ rw_exit(struct rwlock *rwl)
>       int wrlock = owner & RWLOCK_WRLOCK;
>       unsigned long set;
>  
> +     if (wrlock)
> +             rw_assert_wrlock(rwl);
> +     else
> +             rw_assert_rdlock(rwl);
> +
>       do {
>               owner = rwl->rwl_owner;
>               if (wrlock)
> @@ -249,6 +258,10 @@ rw_exit(struct rwlock *rwl)
>               wakeup(rwl);
>  }
>  
> +#include <ddb/db_output.h>
> +#define panic(x...) do { printf(x); db_stack_dump(); } while (0);
> +
> +#ifdef DIAGNOSTIC
>  void
>  rw_assert_wrlock(struct rwlock *rwl)
>  {
> @@ -272,3 +285,4 @@ rw_assert_unlocked(struct rwlock *rwl)
>       if (rwl->rwl_owner != 0L)
>               panic("%s: lock held", rwl->rwl_name);
>  }
> +#endif
> Index: kern/sched_bsd.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/sched_bsd.c,v
> retrieving revision 1.23
> diff -u -p -r1.23 sched_bsd.c
> --- kern/sched_bsd.c  30 Jun 2010 22:38:17 -0000      1.23
> +++ kern/sched_bsd.c  21 Sep 2010 01:45:59 -0000
> @@ -356,6 +356,7 @@ mi_switch(void)
>       int sched_count;
>  #endif
>  
> +     assertwaitok();
>       KASSERT(p->p_stat != SONPROC);
>  
>       SCHED_ASSERT_LOCKED();
> Index: arch/amd64/include/cpu.h
> ===================================================================
> RCS file: /cvs/src/sys/arch/amd64/include/cpu.h,v
> retrieving revision 1.55
> diff -u -p -r1.55 cpu.h
> --- arch/amd64/include/cpu.h  11 Aug 2010 21:22:44 -0000      1.55
> +++ arch/amd64/include/cpu.h  21 Sep 2010 01:46:00 -0000
> @@ -87,6 +87,10 @@ struct cpu_info {
>       int             ci_idepth;
>       u_int32_t       ci_imask[NIPL];
>       u_int32_t       ci_iunmask[NIPL];
> +#ifdef DIAGNOSTIC
> +     int             ci_mutex_level;
> +#define __HAVE_CPU_MUTEX_LEVEL
> +#endif
>  
>       u_int           ci_flags;
>       u_int32_t       ci_ipis;
> Index: arch/amd64/amd64/genassym.cf
> ===================================================================
> RCS file: /cvs/src/sys/arch/amd64/amd64/genassym.cf,v
> retrieving revision 1.22
> diff -u -p -r1.22 genassym.cf
> --- arch/amd64/amd64/genassym.cf      1 Apr 2010 19:47:59 -0000       1.22
> +++ arch/amd64/amd64/genassym.cf      21 Sep 2010 01:46:00 -0000
> @@ -105,6 +105,9 @@ member    CPU_INFO_IDEPTH         ci_idepth
>  member       CPU_INFO_ISOURCES       ci_isources
>  member       CPU_INFO_IPENDING       ci_ipending
>  member       CPU_INFO_IUNMASK        ci_iunmask
> +ifdef DIAGNOSTIC
> +member       CPU_INFO_MUTEX_LEVEL    ci_mutex_level
> +endif
>  member       CPU_INFO_GDT            ci_gdt
>  
>  struct       intrsource
> Index: arch/amd64/amd64/mutex.S
> ===================================================================
> RCS file: /cvs/src/sys/arch/amd64/amd64/mutex.S,v
> retrieving revision 1.7
> diff -u -p -r1.7 mutex.S
> --- arch/amd64/amd64/mutex.S  13 Aug 2009 13:24:55 -0000      1.7
> +++ arch/amd64/amd64/mutex.S  21 Sep 2010 01:46:00 -0000
> @@ -62,6 +62,9 @@ ENTRY(mtx_enter)
>       cmpxchgq        %rcx, MTX_OWNER(%rdi)   # test_and_set(mtx->mtx_owner)
>       jne     2f
>       movl    %edx, MTX_OLDIPL(%rdi)
> +#ifdef DIAGNOSTIC
> +     incl    CPU_INFO_MUTEX_LEVEL(%rcx)
> +#endif
>       ret
>  
>       /* We failed to obtain the lock. splx, spin and retry. */
> @@ -103,6 +106,9 @@ ENTRY(mtx_enter_try)
>       cmpxchgq        %rcx, MTX_OWNER(%rdi)   # test_and_set(mtx->mtx_owner)
>       jne     2f
>       movl    %edx, MTX_OLDIPL(%rdi)
> +#ifdef DIAGNOSTIC
> +     incl    CPU_INFO_MUTEX_LEVEL(%rcx)
> +#endif
>       movq    $1, %rax
>       ret
>  
> @@ -122,12 +128,18 @@ ENTRY(mtx_enter_try)
>  #ifdef DIAGNOSTIC
>  3:   movq    $4f, %rdi
>       call    _C_LABEL(panic)
> -4:   .asciz  "mtx_enter: locking against myself"
> +4:   .asciz  "mtx_enter_try: locking against myself"
>  #endif
>  
>  
>  ENTRY(mtx_leave)
>       movq    %rdi, %rax
> +#ifdef DIAGNOSTIC
> +     movq    CPUVAR(SELF), %rcx
> +     cmpq    MTX_OWNER(%rax), %rcx
> +     jne     2f
> +     decl    CPU_INFO_MUTEX_LEVEL(%rcx)
> +#endif
>       xorq    %rcx, %rcx
>       movl    MTX_OLDIPL(%rax), %edi
>       movl    %ecx, MTX_OLDIPL(%rax)
> @@ -137,3 +149,9 @@ ENTRY(mtx_leave)
>       call    _C_LABEL(spllower)
>  1:
>       ret
> +
> +#ifdef DIAGNOSTIC
> +2:   movq    $3f, %rdi
> +     call    _C_LABEL(panic)
> +3:   .asciz  "mtx_leave: lock not held"
> +#endif
> Index: arch/i386/include/cpu.h
> ===================================================================
> RCS file: /cvs/src/sys/arch/i386/include/cpu.h,v
> retrieving revision 1.114
> diff -u -p -r1.114 cpu.h
> --- arch/i386/include/cpu.h   11 Aug 2010 21:22:44 -0000      1.114
> +++ arch/i386/include/cpu.h   21 Sep 2010 01:46:00 -0000
> @@ -106,6 +106,10 @@ struct cpu_info {
>       int             ci_idepth;
>       u_int32_t       ci_imask[NIPL];
>       u_int32_t       ci_iunmask[NIPL];
> +#ifdef DIAGNOSTIC
> +     int             ci_mutex_level;
> +#define __HAVE_CPU_MUTEX_LEVEL
> +#endif
>  
>       paddr_t         ci_idle_pcb_paddr; /* PA of idle PCB */
>       u_long          ci_flags;       /* flags; see below */
> Index: arch/i386/i386/genassym.cf
> ===================================================================
> RCS file: /cvs/src/sys/arch/i386/i386/genassym.cf,v
> retrieving revision 1.31
> diff -u -p -r1.31 genassym.cf
> --- arch/i386/i386/genassym.cf        3 Jul 2010 04:54:32 -0000       1.31
> +++ arch/i386/i386/genassym.cf        21 Sep 2010 01:46:00 -0000
> @@ -207,6 +207,9 @@ define    CPU_INFO_IUNMASK        offsetof(struct 
>  define       CPU_INFO_ILEVEL         offsetof(struct cpu_info, ci_ilevel)
>  define       CPU_INFO_IDEPTH         offsetof(struct cpu_info, ci_idepth)
>  define       CPU_INFO_ISOURCES       offsetof(struct cpu_info, ci_isources)
> +ifdef DIAGNOSTIC
> +define       CPU_INFO_MUTEX_LEVEL    offsetof(struct cpu_info, ci_mutex_level)
> +endif
>  define       CPU_INFO_CURPMAP        offsetof(struct cpu_info, ci_curpmap)
>  
>  struct pmap
> Index: arch/i386/i386/mutex.S
> ===================================================================
> RCS file: /cvs/src/sys/arch/i386/i386/mutex.S,v
> retrieving revision 1.7
> diff -u -p -r1.7 mutex.S
> --- arch/i386/i386/mutex.S    13 Aug 2009 13:24:55 -0000      1.7
> +++ arch/i386/i386/mutex.S    21 Sep 2010 01:46:00 -0000
> @@ -63,6 +63,9 @@ ENTRY(mtx_enter)
>       testl   %eax, %eax              # if (already held)
>       jnz     3f
>       movl    CPUVAR(SELF), %eax
> +#ifdef DIAGNOSTIC
> +     incl    CPU_INFO_MUTEX_LEVEL(%eax)
> +#endif
>       movl    %eax, MTX_OWNER(%ecx)
>       movl    %edx, MTX_OLDIPL(%ecx)
>       leave
> @@ -107,6 +110,9 @@ ENTRY(mtx_enter_try)
>       testl   %eax, %eax              # if (already held)
>       jnz     3f
>       movl    CPUVAR(SELF), %eax
> +#ifdef DIAGNOSTIC
> +     incl    CPU_INFO_MUTEX_LEVEL(%eax)
> +#endif
>       movl    %eax, MTX_OWNER(%ecx)
>       movl    %edx, MTX_OLDIPL(%ecx)
>       movl    $1, %eax
> @@ -130,7 +136,7 @@ ENTRY(mtx_enter_try)
>  #ifdef DIAGNOSTIC
>  4:   pushl   $5f
>       call    _C_LABEL(panic)
> -5:   .asciz  "mtx_enter: locking against myself"
> +5:   .asciz  "mtx_enter_try: locking against myself"
>  #endif
>  
>       
> @@ -138,6 +144,12 @@ ENTRY(mtx_leave)
>       pushl   %ebp
>       movl    %esp, %ebp
>       movl    SOFF(%ebp), %ecx
> +#ifdef DIAGNOSTIC
> +     movl    CPUVAR(SELF), %eax
> +     cmpl    %eax, MTX_OWNER(%ecx)
> +     jne     1f
> +     decl    CPU_INFO_MUTEX_LEVEL(%eax)
> +#endif
>       xorl    %eax, %eax
>       movl    %eax, MTX_OWNER(%ecx)
>       pushl   MTX_OLDIPL(%ecx)
> @@ -146,3 +158,9 @@ ENTRY(mtx_leave)
>       call    _C_LABEL(splx)
>       leave
>       ret
> +
> +#ifdef DIAGNOSTIC
> +1:   pushl   $2f
> +     call    _C_LABEL(panic)
> +2:   .asciz  "mtx_leave: lock not held"
> +#endif
> Index: sys/rwlock.h
> ===================================================================
> RCS file: /cvs/src/sys/sys/rwlock.h,v
> retrieving revision 1.12
> diff -u -p -r1.12 rwlock.h
> --- sys/rwlock.h      13 Aug 2009 21:22:29 -0000      1.12
> +++ sys/rwlock.h      21 Sep 2010 01:46:00 -0000
> @@ -95,9 +95,15 @@ void rw_enter_write(struct rwlock *);
>  void rw_exit_read(struct rwlock *);
>  void rw_exit_write(struct rwlock *);
>  
> +#ifdef DIAGNOSTIC
>  void rw_assert_wrlock(struct rwlock *);
>  void rw_assert_rdlock(struct rwlock *);
>  void rw_assert_unlocked(struct rwlock *);
> +#else
> +#define rw_assert_wrlock(rwl) ((void)0)
> +#define rw_assert_rdlock(rwl) ((void)0)
> +#define rw_assert_unlocked(rwl) ((void)0)
> +#endif
>  
>  int rw_enter(struct rwlock *, int);
>  void rw_exit(struct rwlock *);

Reply via email to