Re: Fix use of WITNESS_UNLOCK() in rw_exit_{read,write}()

2020-03-02 Thread Mateusz Guzik
Oops, sorry for the mix-up below. The e-mail got bounced to me by someone
and it used their 'From' instead of the original. Regardless, the
technical content stands. :)

On 3/2/20, Mateusz Guzik  wrote:
> On 2/29/20, Visa Hankala  wrote:
>> There is a bug in how rw_exit_read() and rw_exit_write() use
>> WITNESS_UNLOCK(). The fast paths call WITNESS_UNLOCK() after releasing
>> the actual lock. This leads to a use-after-free in the checker if the
>> lock is dynamically allocated and another thread happens to free it
>> too early.
>>
>> The following diff splits rw_exit() into two separate functions, one
>> which is the rw_exit(9) frontend, and the other, rw_do_exit(), which is
>> common between rw_exit(9), rw_exit_read(9) and rw_exit_write(9). This
>> makes it possible to call WITNESS_UNLOCK() before the actual lock
>> release without turning the code into an #ifdef maze.
>>
>> The diff additionally does the following tweaks:
>>
>> * Invoke membar_exit_before_atomic() only once even if the slow path is
>>   taken. The initial barrier is enough for the release semantics.
>>
>> * Read rwl_owner for rw_cas() as late as possible so that the CAS
>>   operation would be less likely to fail. Note that in rw_exit() the
>>   purpose of the read is different; it determines the type of the lock
>>   (read / write).
>>
>> * Put the rw_cas() in rw_exit() (now rw_do_exit()) inside
>>   __predict_false(). This compacts the resulting machine code a bit.
>>
>> The code could be simplified by removing the fast path from
>> rw_exit_read() and rw_exit_write(), and always calling rw_do_exit().
>> However, that would reduce performance a tiny amount. I saw about 0.5%
>> increase in kernel build time in a test setup. The patch below does
>> not seem to affect performance negatively.
>>
>> OK?
>
> If the slowdown is repeatable it most likely happens because of the
> immediate re-read in the loop in the slow path.
>
> Also note performance bugs in unlock:
> - *any* read unlock wakes up everyone, so if the lock has many blocked
> waiters they will all wake up only to most likely go back to sleep. On
> the other hand this may be an anomaly which sometimes improves
> performance, because in corner cases it can mitigate the lack of
> adaptive spinning.
> - the loop keeps re-reading the word instead of using the value
> returned by cmpxchg; as it happens, the rw_cas macro explicitly throws
> it away.
>
> Splitting unlock paths between reader and writer cases is unavoidable
> in the long term (to allow handling different wake up policies) and I
> think this is a great opportunity to do it all the way.
>
> That said, I propose you do roughly this:
>
> static void __always_inline
> rw_exit_read_impl(struct rwlock *rwl, unsigned long v)
> {
>
> 	rw_assert_rdlock(rwl);
> 	WITNESS_UNLOCK(&rwl->rwl_lock_obj, 0);
>
> 	/* cas loop + wake up come here */
> }
>
> void
> rw_exit_read(struct rwlock *rwl)
> {
>
> 	membar_exit_before_atomic();
> 	rw_exit_read_impl(rwl, rwl->rwl_owner);
> }
>
> void
> rw_exit(struct rwlock *rwl)
> {
> 	unsigned long v;
>
> 	membar_exit_before_atomic();
> 	v = rwl->rwl_owner;
> 	if (v & RWLOCK_WRLOCK)
> 		rw_exit_write_impl(rwl, v);
> 	else
> 		rw_exit_read_impl(rwl, v);
> }
>
> And similarly for write.
>
> That is, the bottom routines assume the fence was posted and they got
> a "fresh" value.
>
>>
>> Index: kern/kern_rwlock.c
>> ===
>> RCS file: src/sys/kern/kern_rwlock.c,v
>> retrieving revision 1.44
>> diff -u -p -r1.44 kern_rwlock.c
>> --- kern/kern_rwlock.c   30 Nov 2019 11:19:17 -  1.44
>> +++ kern/kern_rwlock.c   29 Feb 2020 16:24:59 -
>> @@ -25,6 +25,8 @@
>>  #include 
>>  #include 
>>
>> +void	rw_do_exit(struct rwlock *, unsigned long);
>> +
>>  /* XXX - temporary measure until proc0 is properly aligned */
>>  #define RW_PROC(p) (((long)p) & ~RWLOCK_MASK)
>>
>> @@ -129,31 +131,31 @@ rw_enter_write(struct rwlock *rwl)
>>  void
>>  rw_exit_read(struct rwlock *rwl)
>>  {
>> -	unsigned long owner = rwl->rwl_owner;
>> +	unsigned long owner;
>>
>>  	rw_assert_rdlock(rwl);
>> +	WITNESS_UNLOCK(&rwl->rwl_lock_obj, 0);
>>
>>  	membar_exit_before_atomic();
>> +	owner = rwl->rwl_owner;
>>  	if (__predict_false((owner & RWLOCK_WAIT) ||
>>  	    rw_cas(&rwl->rwl_owner, owner, owner - RWLOCK_READ_INCR)))
>> -		rw_exit(rwl);
>> -	else
>> -		WITNESS_UNLOCK(&rwl->rwl_lock_obj, 0);
>> +		rw_do_exit(rwl, 0);
>>  }
>>
>>  void
>>  rw_exit_write(struct rwlock *rwl)
>>  {
>> -	unsigned long owner = rwl->rwl_owner;
>> +	unsigned long owner;
>>
>>  	rw_assert_wrlock(rwl);
>> +	WITNESS_UNLOCK(&rwl->rwl_lock_obj, LOP_EXCLUSIVE);
>>
>>  	membar_exit_before_atomic();
>> +	owner = rwl->rwl_owner;
>>  	if (__predict_false((owner & RWLOCK_WAIT) ||
>>  	    rw_cas(&rwl->rwl_owner, owner, 0)))
>> - 

Re: Fix use of WITNESS_UNLOCK() in rw_exit_{read,write}()

2020-03-02 Thread Mateusz Guzik
On 2/29/20, Visa Hankala  wrote:
> There is a bug in how rw_exit_read() and rw_exit_write() use
> WITNESS_UNLOCK(). The fast paths call WITNESS_UNLOCK() after releasing
> the actual lock. This leads to a use-after-free in the checker if the
> lock is dynamically allocated and another thread happens to free it
> too early.
>
> The following diff splits rw_exit() into two separate functions, one
> which is the rw_exit(9) frontend, and the other, rw_do_exit(), which is
> common between rw_exit(9), rw_exit_read(9) and rw_exit_write(9). This
> makes it possible to call WITNESS_UNLOCK() before the actual lock
> release without turning the code into an #ifdef maze.
>
> The diff additionally does the following tweaks:
>
> * Invoke membar_exit_before_atomic() only once even if the slow path is
>   taken. The initial barrier is enough for the release semantics.
>
> * Read rwl_owner for rw_cas() as late as possible so that the CAS
>   operation would be less likely to fail. Note that in rw_exit() the
>   purpose of the read is different; it determines the type of the lock
>   (read / write).
>
> * Put the rw_cas() in rw_exit() (now rw_do_exit()) inside
>   __predict_false(). This compacts the resulting machine code a bit.
>
> The code could be simplified by removing the fast path from
> rw_exit_read() and rw_exit_write(), and always calling rw_do_exit().
> However, that would reduce performance a tiny amount. I saw about 0.5%
> increase in kernel build time in a test setup. The patch below does
> not seem to affect performance negatively.
>
> OK?

If the slowdown is repeatable it most likely happens because of the
immediate re-read in the loop in the slow path.

Also note performance bugs in unlock:
- *any* read unlock wakes up everyone, so if the lock has many blocked
waiters they will all wake up only to most likely go back to sleep. On
the other hand this may be an anomaly which sometimes improves
performance, because in corner cases it can mitigate the lack of
adaptive spinning.
- the loop keeps re-reading the word instead of using the value
returned by cmpxchg; as it happens, the rw_cas macro explicitly throws
it away (see the sketch below).
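
To illustrate the second point, a loop that recycles the CAS result
could look roughly like this. This is only a sketch: the function name
is made up, the wake up policy is unchanged, and it calls
atomic_cas_ulong() directly (assuming it returns the previous value of
the word) because the current rw_cas() macro discards that value. The
caller is expected to have issued membar_exit_before_atomic() already.

void
rw_do_exit_read_sketch(struct rwlock *rwl, unsigned long owner)
{
	unsigned long new, prev;

	for (;;) {
		new = (owner - RWLOCK_READ_INCR) &
		    ~(RWLOCK_WAIT|RWLOCK_WRWANT);
		prev = atomic_cas_ulong(&rwl->rwl_owner, owner, new);
		if (prev == owner)
			break;		/* CAS took effect */
		owner = prev;		/* reuse the result, no extra read */
	}

	if (owner & RWLOCK_WAIT)
		wakeup(rwl);
}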

Splitting unlock paths between reader and writer cases is unavoidable
in the long term (to allow handling different wake up policies) and I
think this is a great opportunity to do it all the way.

That said, I propose you do roughly this:

static void __always_inline
rw_exit_read_impl(struct rwlock *rwl, unsigned long v)
{

	rw_assert_rdlock(rwl);
	WITNESS_UNLOCK(&rwl->rwl_lock_obj, 0);

	/* cas loop + wake up come here */
}

void
rw_exit_read(struct rwlock *rwl)
{

	membar_exit_before_atomic();
	rw_exit_read_impl(rwl, rwl->rwl_owner);
}

void
rw_exit(struct rwlock *rwl)
{
	unsigned long v;

	membar_exit_before_atomic();
	v = rwl->rwl_owner;
	if (v & RWLOCK_WRLOCK)
		rw_exit_write_impl(rwl, v);
	else
		rw_exit_read_impl(rwl, v);
}

And similarly for write.

That is, the bottom routines assume the fence was posted and they got
a "fresh" value.

>
> Index: kern/kern_rwlock.c
> ===
> RCS file: src/sys/kern/kern_rwlock.c,v
> retrieving revision 1.44
> diff -u -p -r1.44 kern_rwlock.c
> --- kern/kern_rwlock.c30 Nov 2019 11:19:17 -  1.44
> +++ kern/kern_rwlock.c29 Feb 2020 16:24:59 -
> @@ -25,6 +25,8 @@
>  #include 
>  #include 
>
> +void rw_do_exit(struct rwlock *, unsigned long);
> +
>  /* XXX - temporary measure until proc0 is properly aligned */
>  #define RW_PROC(p) (((long)p) & ~RWLOCK_MASK)
>
> @@ -129,31 +131,31 @@ rw_enter_write(struct rwlock *rwl)
>  void
>  rw_exit_read(struct rwlock *rwl)
>  {
> - unsigned long owner = rwl->rwl_owner;
> + unsigned long owner;
>
>   rw_assert_rdlock(rwl);
> + WITNESS_UNLOCK(&rwl->rwl_lock_obj, 0);
>
>   membar_exit_before_atomic();
> + owner = rwl->rwl_owner;
>   if (__predict_false((owner & RWLOCK_WAIT) ||
>   rw_cas(&rwl->rwl_owner, owner, owner - RWLOCK_READ_INCR)))
> - rw_exit(rwl);
> - else
> - WITNESS_UNLOCK(&rwl->rwl_lock_obj, 0);
> + rw_do_exit(rwl, 0);
>  }
>
>  void
>  rw_exit_write(struct rwlock *rwl)
>  {
> - unsigned long owner = rwl->rwl_owner;
> + unsigned long owner;
>
>   rw_assert_wrlock(rwl);
> + WITNESS_UNLOCK(&rwl->rwl_lock_obj, LOP_EXCLUSIVE);
>
>   membar_exit_before_atomic();
> + owner = rwl->rwl_owner;
>   if (__predict_false((owner & RWLOCK_WAIT) ||
>   rw_cas(&rwl->rwl_owner, owner, 0)))
> - rw_exit(rwl);
> - else
> - WITNESS_UNLOCK(&rwl->rwl_lock_obj, LOP_EXCLUSIVE);
> + rw_do_exit(rwl, RWLOCK_WRLOCK);
>  }
>
>  #ifdef DIAGNOSTIC
> @@ -314,22 +316,29 @@ retry:
>  void
>  rw_exit(struct rwlock *rwl)
>  {
> - unsigned long owner = rwl->rwl_owner;
> - int wrlock = owner & RWLOCK_WRLOCK;
> - unsigned lo

Fix use of WITNESS_UNLOCK() in rw_exit_{read,write}()

2020-02-29 Thread Visa Hankala
There is a bug in how rw_exit_read() and rw_exit_write() use
WITNESS_UNLOCK(). The fast paths call WITNESS_UNLOCK() after releasing
the actual lock. This leads to a use-after-free in the checker if the
lock is dynamically allocated and another thread happens to free it
too early.
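
Roughly, the problematic interleaving is the following (hypothetical
example; struct foo, the thread functions and the malloc type are made
up, only the rwlock calls are real):

/*
 * Thread A holds a read lock on a dynamically allocated object;
 * thread B frees the object as soon as the lock can be taken.
 */
struct foo {
	struct rwlock	f_lock;
	/* ... */
};

void
thread_a(struct foo *fp)
{
	rw_enter_read(&fp->f_lock);
	/* ... */
	rw_exit_read(&fp->f_lock);
	/*
	 * The fast path's CAS releases f_lock before WITNESS_UNLOCK() is
	 * called.  If thread_b() runs in that window, the WITNESS_UNLOCK()
	 * call dereferences freed memory.
	 */
}

void
thread_b(struct foo *fp)
{
	rw_enter_write(&fp->f_lock);	/* succeeds once thread A's CAS lands */
	rw_exit_write(&fp->f_lock);
	free(fp, M_TEMP, sizeof(*fp));	/* the lock's memory is now gone */
}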

The following diff splits rw_exit() into two separate functions, one
which is the rw_exit(9) frontend, and the other, rw_do_exit(), which is
common between rw_exit(9), rw_exit_read(9) and rw_exit_write(9). This
makes it possible to call WITNESS_UNLOCK() before the actual lock
release without turning the code into an #ifdef maze.

The diff additionally does the following tweaks:

* Invoke membar_exit_before_atomic() only once even if the slow path is
  taken. The initial barrier is enough for the release semantics.

* Read rwl_owner for rw_cas() as late as possible so that the CAS
  operation would be less likely to fail. Note that in rw_exit() the
  purpose of the read is different; it determines the type of the lock
  (read / write).

* Put the rw_cas() in rw_exit() (now rw_do_exit()) inside
  __predict_false(). This compacts the resulting machine code a bit.

The code could be simplified by removing the fast path from
rw_exit_read() and rw_exit_write(), and always calling rw_do_exit().
However, that would reduce performance a tiny amount. I saw about 0.5%
increase in kernel build time in a test setup. The patch below does
not seem to affect performance negatively.

OK?

Index: kern/kern_rwlock.c
===
RCS file: src/sys/kern/kern_rwlock.c,v
retrieving revision 1.44
diff -u -p -r1.44 kern_rwlock.c
--- kern/kern_rwlock.c  30 Nov 2019 11:19:17 -  1.44
+++ kern/kern_rwlock.c  29 Feb 2020 16:24:59 -
@@ -25,6 +25,8 @@
 #include 
 #include 
 
+void   rw_do_exit(struct rwlock *, unsigned long);
+
 /* XXX - temporary measure until proc0 is properly aligned */
 #define RW_PROC(p) (((long)p) & ~RWLOCK_MASK)
 
@@ -129,31 +131,31 @@ rw_enter_write(struct rwlock *rwl)
 void
 rw_exit_read(struct rwlock *rwl)
 {
-   unsigned long owner = rwl->rwl_owner;
+   unsigned long owner;
 
rw_assert_rdlock(rwl);
+   WITNESS_UNLOCK(&rwl->rwl_lock_obj, 0);
 
membar_exit_before_atomic();
+   owner = rwl->rwl_owner;
if (__predict_false((owner & RWLOCK_WAIT) ||
rw_cas(&rwl->rwl_owner, owner, owner - RWLOCK_READ_INCR)))
-   rw_exit(rwl);
-   else
-   WITNESS_UNLOCK(&rwl->rwl_lock_obj, 0);
+   rw_do_exit(rwl, 0);
 }
 
 void
 rw_exit_write(struct rwlock *rwl)
 {
-   unsigned long owner = rwl->rwl_owner;
+   unsigned long owner;
 
rw_assert_wrlock(rwl);
+   WITNESS_UNLOCK(&rwl->rwl_lock_obj, LOP_EXCLUSIVE);
 
membar_exit_before_atomic();
+   owner = rwl->rwl_owner;
if (__predict_false((owner & RWLOCK_WAIT) ||
rw_cas(&rwl->rwl_owner, owner, 0)))
-   rw_exit(rwl);
-   else
-   WITNESS_UNLOCK(&rwl->rwl_lock_obj, LOP_EXCLUSIVE);
+   rw_do_exit(rwl, RWLOCK_WRLOCK);
 }
 
 #ifdef DIAGNOSTIC
@@ -314,22 +316,29 @@ retry:
 void
 rw_exit(struct rwlock *rwl)
 {
-   unsigned long owner = rwl->rwl_owner;
-   int wrlock = owner & RWLOCK_WRLOCK;
-   unsigned long set;
+   unsigned long wrlock;
 
/* Avoid deadlocks after panic or in DDB */
if (panicstr || db_active)
return;
 
+   wrlock = rwl->rwl_owner & RWLOCK_WRLOCK;
if (wrlock)
rw_assert_wrlock(rwl);
else
rw_assert_rdlock(rwl);
-
WITNESS_UNLOCK(&rwl->rwl_lock_obj, wrlock ? LOP_EXCLUSIVE : 0);
 
membar_exit_before_atomic();
+   rw_do_exit(rwl, wrlock);
+}
+
+/* membar_exit_before_atomic() has to precede call of this function. */
+void
+rw_do_exit(struct rwlock *rwl, unsigned long wrlock)
+{
+   unsigned long owner, set;
+
do {
owner = rwl->rwl_owner;
if (wrlock)
@@ -337,7 +346,7 @@ rw_exit(struct rwlock *rwl)
else
set = (owner - RWLOCK_READ_INCR) &
~(RWLOCK_WAIT|RWLOCK_WRWANT);
-   } while (rw_cas(&rwl->rwl_owner, owner, set));
+   } while (__predict_false(rw_cas(&rwl->rwl_owner, owner, set)));
 
if (owner & RWLOCK_WAIT)
wakeup(rwl);