On 2/29/20, Visa Hankala <wild.prickly.sh...@gmail.com> wrote:
> There is a bug in how rw_exit_read() and rw_exit_write() use
> WITNESS_UNLOCK(). The fast paths call WITNESS_UNLOCK() after releasing
> the actual lock. This leads to a use-after-free in the checker if the
> lock is dynamically allocated and another thread happens to free it
> too early.
>
> The following diff splits rw_exit() into two separate functions, one
> which is the rw_exit(9) frontend, and the other, rw_do_exit(), which is
> common between rw_exit(9), rw_exit_read(9) and rw_exit_write(9). This
> makes it possible to call WITNESS_UNLOCK() before the actual lock
> release without turning the code into an #ifdef maze.
>
> The diff additionally does the following tweaks:
>
> * Invoke membar_exit_before_atomic() only once even if the slow path is
>   taken. The initial barrier is enough for the release semantics.
>
> * Read rwl_owner for rw_cas() as late as possible so that the CAS
>   operation would be less likely to fail. Note that in rw_exit() the
>   purpose of the read is different; it determines the type of the lock
>   (read / write).
>
> * Put the rw_cas() in rw_exit() (now rw_do_exit()) inside
>   __predict_false(). This compacts the resulting machine code a bit.
>
> The code could be simplified by removing the fast path from
> rw_exit_read() and rw_exit_write(), and always calling rw_do_exit().
> However, that would reduce performance a tiny amount. I saw about 0.5%
> increase in kernel build time in a test setup. The patch below does
> not seem to affect performance negatively.
>
> OK?

If the slowdown is repeatable it most likely happens because of the
immediate re-read in the loop in the slow path.

Also note performance bugs in unlock:
- *any* read unlock wakes up everyone, so if the lock has many blocked
waiters they will all wake up, only to likely go back to sleep. On
the other hand this may be an anomaly which sometimes improves
performance, because in corner cases it can mitigate the lack of
adaptive spinning
- the loop keeps re-reading the word instead of using the value
returned by cmpxchg; as it happens, the rw_cas macro explicitly throws it away

Splitting unlock paths between reader and writer cases is unavoidable
in the long term (to allow handling different wake up policies) and I
think this is a great opportunity to do it all the way.

That said, I propose you do roughly this:

static void __always_inline
rw_exit_read_impl(struct rwlock *rwl, unsigned long v)
{

        rw_assert_rdlock(rwl);
        WITNESS_UNLOCK(&rwl->rwl_lock_obj, 0);

        .... cas loop + wake up come here
}

void
rw_exit_read(struct rwlock *rwl)
{

        membar_exit_before_atomic();
        rw_exit_read_impl(rwl, rwl->rwl_owner);
}

void
rw_exit(struct rwlock *rwl)
{
        unsigned long v;

        membar_exit_before_atomic();
        v = rwl->rwl_owner;
        if (v & RWLOCK_WRLOCK)
                rw_exit_write_impl(rwl, v);
        else
                rw_exit_read_impl(rwl, v);
}

And similarly for write.

That is, the bottom routines assume the fence was posted and they got
a "fresh" value.

>
> Index: kern/kern_rwlock.c
> ===================================================================
> RCS file: src/sys/kern/kern_rwlock.c,v
> retrieving revision 1.44
> diff -u -p -r1.44 kern_rwlock.c
> --- kern/kern_rwlock.c        30 Nov 2019 11:19:17 -0000      1.44
> +++ kern/kern_rwlock.c        29 Feb 2020 16:24:59 -0000
> @@ -25,6 +25,8 @@
>  #include <sys/atomic.h>
>  #include <sys/witness.h>
>
> +void rw_do_exit(struct rwlock *, unsigned long);
> +
>  /* XXX - temporary measure until proc0 is properly aligned */
>  #define RW_PROC(p) (((long)p) & ~RWLOCK_MASK)
>
> @@ -129,31 +131,31 @@ rw_enter_write(struct rwlock *rwl)
>  void
>  rw_exit_read(struct rwlock *rwl)
>  {
> -     unsigned long owner = rwl->rwl_owner;
> +     unsigned long owner;
>
>       rw_assert_rdlock(rwl);
> +     WITNESS_UNLOCK(&rwl->rwl_lock_obj, 0);
>
>       membar_exit_before_atomic();
> +     owner = rwl->rwl_owner;
>       if (__predict_false((owner & RWLOCK_WAIT) ||
>           rw_cas(&rwl->rwl_owner, owner, owner - RWLOCK_READ_INCR)))
> -             rw_exit(rwl);
> -     else
> -             WITNESS_UNLOCK(&rwl->rwl_lock_obj, 0);
> +             rw_do_exit(rwl, 0);
>  }
>
>  void
>  rw_exit_write(struct rwlock *rwl)
>  {
> -     unsigned long owner = rwl->rwl_owner;
> +     unsigned long owner;
>
>       rw_assert_wrlock(rwl);
> +     WITNESS_UNLOCK(&rwl->rwl_lock_obj, LOP_EXCLUSIVE);
>
>       membar_exit_before_atomic();
> +     owner = rwl->rwl_owner;
>       if (__predict_false((owner & RWLOCK_WAIT) ||
>           rw_cas(&rwl->rwl_owner, owner, 0)))
> -             rw_exit(rwl);
> -     else
> -             WITNESS_UNLOCK(&rwl->rwl_lock_obj, LOP_EXCLUSIVE);
> +             rw_do_exit(rwl, RWLOCK_WRLOCK);
>  }
>
>  #ifdef DIAGNOSTIC
> @@ -314,22 +316,29 @@ retry:
>  void
>  rw_exit(struct rwlock *rwl)
>  {
> -     unsigned long owner = rwl->rwl_owner;
> -     int wrlock = owner & RWLOCK_WRLOCK;
> -     unsigned long set;
> +     unsigned long wrlock;
>
>       /* Avoid deadlocks after panic or in DDB */
>       if (panicstr || db_active)
>               return;
>
> +     wrlock = rwl->rwl_owner & RWLOCK_WRLOCK;
>       if (wrlock)
>               rw_assert_wrlock(rwl);
>       else
>               rw_assert_rdlock(rwl);
> -
>       WITNESS_UNLOCK(&rwl->rwl_lock_obj, wrlock ? LOP_EXCLUSIVE : 0);
>
>       membar_exit_before_atomic();
> +     rw_do_exit(rwl, wrlock);
> +}
> +
> +/* membar_exit_before_atomic() has to precede call of this function. */
> +void
> +rw_do_exit(struct rwlock *rwl, unsigned long wrlock)
> +{
> +     unsigned long owner, set;
> +
>       do {
>               owner = rwl->rwl_owner;
>               if (wrlock)
> @@ -337,7 +346,7 @@ rw_exit(struct rwlock *rwl)
>               else
>                       set = (owner - RWLOCK_READ_INCR) &
>                               ~(RWLOCK_WAIT|RWLOCK_WRWANT);
> -     } while (rw_cas(&rwl->rwl_owner, owner, set));
> +     } while (__predict_false(rw_cas(&rwl->rwl_owner, owner, set)));
>
>       if (owner & RWLOCK_WAIT)
>               wakeup(rwl);
>
>


-- 
Mateusz Guzik <mjguzik gmail.com>

Reply via email to