Re: rwlock tweak

2017-10-16 Thread Mark Kettenis
> Date: Mon, 16 Oct 2017 11:53:30 +0200
> From: Martin Pieuchot 
> 
> On 16/10/17(Mon) 11:36, Mark Kettenis wrote:
> > > Date: Mon, 16 Oct 2017 10:52:09 +0200
> > > From: Martin Pieuchot 
> > > 
> > > As pointed out by Mateusz Guzik [0], on x86 the cmpxchg{q,l} used by
> > > rw_enter(9) and rw_exit(9) already include an implicit memory
> > > barrier, so we can avoid using an expensive explicit one by
> > > using the following variants.
> > > 
> > > [0] https://marc.info/?l=openbsd-tech&m=150765959923113&w=2
> > > 
> > > ok?
> > 
> > Is this really safe?  The atomic instructions are executed
> > conditionally...
> 
> I guess you're talking about the "if ()" dance in the "optimized"
> versions: rw_{enter,exit}_{read,write}().  If no cmpxchg{q,l} is
> executed in these functions, _rw_{enter,exit}() will be called,
> which contains another barrier and another CAS.

Then it should be fine.

> > > Index: kern/kern_rwlock.c
> > > ===================================================================
> > > RCS file: /cvs/src/sys/kern/kern_rwlock.c,v
> > > retrieving revision 1.31
> > > diff -u -p -r1.31 kern_rwlock.c
> > > --- kern/kern_rwlock.c  12 Oct 2017 09:19:45 -0000  1.31
> > > +++ kern/kern_rwlock.c  16 Oct 2017 08:24:27 -0000
> > > @@ -96,7 +96,7 @@ _rw_enter_read(struct rwlock *rwl LOCK_F
> > >   rw_cas(&rwl->rwl_owner, owner, owner + RWLOCK_READ_INCR)))
> > >   _rw_enter(rwl, RW_READ LOCK_FL_ARGS);
> > >   else {
> > > - membar_enter();
> > > + membar_enter_after_atomic();
> > >   WITNESS_CHECKORDER(&rwl->rwl_lock_obj, LOP_NEWORDER, file, line,
> > >   NULL);
> > >   WITNESS_LOCK(&rwl->rwl_lock_obj, 0, file, line);
> > > @@ -112,7 +112,7 @@ _rw_enter_write(struct rwlock *rwl LOCK_
> > >   RW_PROC(p) | RWLOCK_WRLOCK)))
> > >   _rw_enter(rwl, RW_WRITE LOCK_FL_ARGS);
> > >   else {
> > > - membar_enter();
> > > + membar_enter_after_atomic();
> > >   WITNESS_CHECKORDER(&rwl->rwl_lock_obj,
> > >   LOP_EXCLUSIVE | LOP_NEWORDER, file, line, NULL);
> > >   WITNESS_LOCK(&rwl->rwl_lock_obj, LOP_EXCLUSIVE, file, line);
> > > @@ -126,7 +126,7 @@ _rw_exit_read(struct rwlock *rwl LOCK_FL
> > >  
> > >   rw_assert_rdlock(rwl);
> > >  
> > > - membar_exit();
> > > + membar_exit_before_atomic();
> > >   if (__predict_false((owner & RWLOCK_WAIT) ||
> > >   rw_cas(&rwl->rwl_owner, owner, owner - RWLOCK_READ_INCR)))
> > >   _rw_exit(rwl LOCK_FL_ARGS);
> > > @@ -141,7 +141,7 @@ _rw_exit_write(struct rwlock *rwl LOCK_F
> > >  
> > >   rw_assert_wrlock(rwl);
> > >  
> > > - membar_exit();
> > > + membar_exit_before_atomic();
> > >   if (__predict_false((owner & RWLOCK_WAIT) ||
> > >   rw_cas(&rwl->rwl_owner, owner, 0)))
> > >   _rw_exit(rwl LOCK_FL_ARGS);
> > > @@ -261,7 +261,7 @@ retry:
> > >  
> > >   if (__predict_false(rw_cas(&rwl->rwl_owner, o, o + inc)))
> > >   goto retry;
> > > - membar_enter();
> > > + membar_enter_after_atomic();
> > >  
> > >   /*
> > >* If old lock had RWLOCK_WAIT and RWLOCK_WRLOCK set, it means we
> > > @@ -295,7 +295,7 @@ _rw_exit(struct rwlock *rwl LOCK_FL_VARS
> > >   WITNESS_UNLOCK(&rwl->rwl_lock_obj, wrlock ? LOP_EXCLUSIVE : 0,
> > >   file, line);
> > >  
> > > - membar_exit();
> > > + membar_exit_before_atomic();
> > >   do {
> > >   owner = rwl->rwl_owner;
> > >   if (wrlock)
> > > 
> > > 
> > 
> 



Re: rwlock tweak

2017-10-16 Thread Mike Belopuhov
On Mon, Oct 16, 2017 at 09:36 +0000, Mark Kettenis wrote:
> > Date: Mon, 16 Oct 2017 10:52:09 +0200
> > From: Martin Pieuchot 
> > 
> > As pointed out by Mateusz Guzik [0], on x86 the cmpxchg{q,l} used by
> > rw_enter(9) and rw_exit(9) already include an implicit memory
> > barrier, so we can avoid using an expensive explicit one by
> > using the following variants.
> > 
> > [0] https://marc.info/?l=openbsd-tech&m=150765959923113&w=2
> > 
> > ok?
> 
> Is this really safe?  The atomic instructions are executed
> conditionally...
>

I've been running without membars on amd64 for about 3 weeks now
and haven't noticed any adverse effects, on my laptop and on an
APU2 serving internet access for a hundred people.  It appears to
me that the access pattern in rwlock.c allows for it: none of the
reads that could be reordered actually matter.  But please take
this with a grain of salt.

A few people noted that a single membar per packet hugely affects
forwarding performance, so a macro-benchmark can be set up to
quantify the cost.
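
For a rough idea of the per-call saving, a crude userland
microbenchmark (not the forwarding macro-benchmark described above,
and cycle counts vary a lot by CPU) could look like this:

#include <stdio.h>
#include <stdint.h>
#include <x86intrin.h>		/* __rdtsc(), _mm_mfence() */

#define ITERS	10000000UL

int
main(void)
{
	volatile unsigned long word = 0;
	uint64_t t0, t1, t2;
	unsigned long i;

	t0 = __rdtsc();
	for (i = 0; i < ITERS; i++)
		/* lock cmpxchg: already a full fence on x86 */
		__sync_val_compare_and_swap(&word, 0UL, 0UL);
	t1 = __rdtsc();
	for (i = 0; i < ITERS; i++) {
		__sync_val_compare_and_swap(&word, 0UL, 0UL);
		_mm_mfence();	/* the explicit barrier being dropped */
	}
	t2 = __rdtsc();

	printf("cas alone:  %.1f cycles/iter\n", (double)(t1 - t0) / ITERS);
	printf("cas+mfence: %.1f cycles/iter\n", (double)(t2 - t1) / ITERS);
	return 0;
}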

> > Index: kern/kern_rwlock.c
> > ===================================================================
> > RCS file: /cvs/src/sys/kern/kern_rwlock.c,v
> > retrieving revision 1.31
> > diff -u -p -r1.31 kern_rwlock.c
> > --- kern/kern_rwlock.c  12 Oct 2017 09:19:45 -0000  1.31
> > +++ kern/kern_rwlock.c  16 Oct 2017 08:24:27 -0000
> > @@ -96,7 +96,7 @@ _rw_enter_read(struct rwlock *rwl LOCK_F
> > rw_cas(&rwl->rwl_owner, owner, owner + RWLOCK_READ_INCR)))
> > _rw_enter(rwl, RW_READ LOCK_FL_ARGS);
> > else {
> > -   membar_enter();
> > +   membar_enter_after_atomic();
> > WITNESS_CHECKORDER(&rwl->rwl_lock_obj, LOP_NEWORDER, file, line,
> > NULL);
> > WITNESS_LOCK(&rwl->rwl_lock_obj, 0, file, line);
> > @@ -112,7 +112,7 @@ _rw_enter_write(struct rwlock *rwl LOCK_
> > RW_PROC(p) | RWLOCK_WRLOCK)))
> > _rw_enter(rwl, RW_WRITE LOCK_FL_ARGS);
> > else {
> > -   membar_enter();
> > +   membar_enter_after_atomic();
> > WITNESS_CHECKORDER(&rwl->rwl_lock_obj,
> > LOP_EXCLUSIVE | LOP_NEWORDER, file, line, NULL);
> > WITNESS_LOCK(&rwl->rwl_lock_obj, LOP_EXCLUSIVE, file, line);
> > @@ -126,7 +126,7 @@ _rw_exit_read(struct rwlock *rwl LOCK_FL
> >  
> > rw_assert_rdlock(rwl);
> >  
> > -   membar_exit();
> > +   membar_exit_before_atomic();
> > if (__predict_false((owner & RWLOCK_WAIT) ||
> > rw_cas(&rwl->rwl_owner, owner, owner - RWLOCK_READ_INCR)))
> > _rw_exit(rwl LOCK_FL_ARGS);
> > @@ -141,7 +141,7 @@ _rw_exit_write(struct rwlock *rwl LOCK_F
> >  
> > rw_assert_wrlock(rwl);
> >  
> > -   membar_exit();
> > +   membar_exit_before_atomic();
> > if (__predict_false((owner & RWLOCK_WAIT) ||
> > rw_cas(&rwl->rwl_owner, owner, 0)))
> > _rw_exit(rwl LOCK_FL_ARGS);
> > @@ -261,7 +261,7 @@ retry:
> >  
> > if (__predict_false(rw_cas(&rwl->rwl_owner, o, o + inc)))
> > goto retry;
> > -   membar_enter();
> > +   membar_enter_after_atomic();
> >  
> > /*
> >  * If old lock had RWLOCK_WAIT and RWLOCK_WRLOCK set, it means we
> > @@ -295,7 +295,7 @@ _rw_exit(struct rwlock *rwl LOCK_FL_VARS
> > WITNESS_UNLOCK(&rwl->rwl_lock_obj, wrlock ? LOP_EXCLUSIVE : 0,
> > file, line);
> >  
> > -   membar_exit();
> > +   membar_exit_before_atomic();
> > do {
> > owner = rwl->rwl_owner;
> > if (wrlock)
> > 
> > 
> 



Re: rwlock tweak

2017-10-16 Thread Martin Pieuchot
On 16/10/17(Mon) 11:36, Mark Kettenis wrote:
> > Date: Mon, 16 Oct 2017 10:52:09 +0200
> > From: Martin Pieuchot 
> > 
> > As pointed out by Mateusz Guzik [0], on x86 the cmpxchg{q,l} used by
> > rw_enter(9) and rw_exit(9) already include an implicit memory
> > barrier, so we can avoid using an expensive explicit one by
> > using the following variants.
> > 
> > [0] https://marc.info/?l=openbsd-tech&m=150765959923113&w=2
> > 
> > ok?
> 
> Is this really safe?  The atomic instructions are executed
> conditionally...

I guess you're talking about the "if ()" dance in the "optimized"
versions: rw_{enter,exit}_{read,write}().  If no cmpxchg{q,l} is
executed in these functions, _rw_{enter,exit}() will be called,
which contains another barrier and another CAS.
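
In other words, simplified to its control flow (a sketch only; the
initial flag check is paraphrased from kern_rwlock.c and may differ
in detail):

void
rw_enter_read_sketch(struct rwlock *rwl)
{
	unsigned long owner = rwl->rwl_owner;

	if (__predict_false((owner & RWLOCK_WRLOCK) ||
	    rw_cas(&rwl->rwl_owner, owner, owner + RWLOCK_READ_INCR)))
		/* no CAS ran, or it failed: _rw_enter() performs its
		 * own CAS followed by its own barrier */
		_rw_enter(rwl, RW_READ);
	else
		/* only reached when the CAS above just ran and
		 * succeeded, so the cheap variant suffices */
		membar_enter_after_atomic();
}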

> > Index: kern/kern_rwlock.c
> > ===================================================================
> > RCS file: /cvs/src/sys/kern/kern_rwlock.c,v
> > retrieving revision 1.31
> > diff -u -p -r1.31 kern_rwlock.c
> > --- kern/kern_rwlock.c  12 Oct 2017 09:19:45 -0000  1.31
> > +++ kern/kern_rwlock.c  16 Oct 2017 08:24:27 -0000
> > @@ -96,7 +96,7 @@ _rw_enter_read(struct rwlock *rwl LOCK_F
> > rw_cas(&rwl->rwl_owner, owner, owner + RWLOCK_READ_INCR)))
> > _rw_enter(rwl, RW_READ LOCK_FL_ARGS);
> > else {
> > -   membar_enter();
> > +   membar_enter_after_atomic();
> > WITNESS_CHECKORDER(&rwl->rwl_lock_obj, LOP_NEWORDER, file, line,
> > NULL);
> > WITNESS_LOCK(&rwl->rwl_lock_obj, 0, file, line);
> > @@ -112,7 +112,7 @@ _rw_enter_write(struct rwlock *rwl LOCK_
> > RW_PROC(p) | RWLOCK_WRLOCK)))
> > _rw_enter(rwl, RW_WRITE LOCK_FL_ARGS);
> > else {
> > -   membar_enter();
> > +   membar_enter_after_atomic();
> > WITNESS_CHECKORDER(&rwl->rwl_lock_obj,
> > LOP_EXCLUSIVE | LOP_NEWORDER, file, line, NULL);
> > WITNESS_LOCK(&rwl->rwl_lock_obj, LOP_EXCLUSIVE, file, line);
> > @@ -126,7 +126,7 @@ _rw_exit_read(struct rwlock *rwl LOCK_FL
> >  
> > rw_assert_rdlock(rwl);
> >  
> > -   membar_exit();
> > +   membar_exit_before_atomic();
> > if (__predict_false((owner & RWLOCK_WAIT) ||
> > rw_cas(&rwl->rwl_owner, owner, owner - RWLOCK_READ_INCR)))
> > _rw_exit(rwl LOCK_FL_ARGS);
> > @@ -141,7 +141,7 @@ _rw_exit_write(struct rwlock *rwl LOCK_F
> >  
> > rw_assert_wrlock(rwl);
> >  
> > -   membar_exit();
> > +   membar_exit_before_atomic();
> > if (__predict_false((owner & RWLOCK_WAIT) ||
> > rw_cas(&rwl->rwl_owner, owner, 0)))
> > _rw_exit(rwl LOCK_FL_ARGS);
> > @@ -261,7 +261,7 @@ retry:
> >  
> > if (__predict_false(rw_cas(&rwl->rwl_owner, o, o + inc)))
> > goto retry;
> > -   membar_enter();
> > +   membar_enter_after_atomic();
> >  
> > /*
> >  * If old lock had RWLOCK_WAIT and RWLOCK_WRLOCK set, it means we
> > @@ -295,7 +295,7 @@ _rw_exit(struct rwlock *rwl LOCK_FL_VARS
> > WITNESS_UNLOCK(&rwl->rwl_lock_obj, wrlock ? LOP_EXCLUSIVE : 0,
> > file, line);
> >  
> > -   membar_exit();
> > +   membar_exit_before_atomic();
> > do {
> > owner = rwl->rwl_owner;
> > if (wrlock)
> > 
> > 
> 



Re: rwlock tweak

2017-10-16 Thread Mark Kettenis
> Date: Mon, 16 Oct 2017 10:52:09 +0200
> From: Martin Pieuchot 
> 
> As pointed out by Mateusz Guzik [0], on x86 the cmpxchg{q,l} used by
> rw_enter(9) and rw_exit(9) already include an implicit memory
> barrier, so we can avoid using an expensive explicit one by
> using the following variants.
> 
> [0] https://marc.info/?l=openbsd-tech&m=150765959923113&w=2
> 
> ok?

Is this really safe?  The atomic instructions are executed
conditionally...

> Index: kern/kern_rwlock.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/kern_rwlock.c,v
> retrieving revision 1.31
> diff -u -p -r1.31 kern_rwlock.c
> --- kern/kern_rwlock.c  12 Oct 2017 09:19:45 -0000  1.31
> +++ kern/kern_rwlock.c  16 Oct 2017 08:24:27 -0000
> @@ -96,7 +96,7 @@ _rw_enter_read(struct rwlock *rwl LOCK_F
>   rw_cas(&rwl->rwl_owner, owner, owner + RWLOCK_READ_INCR)))
>   _rw_enter(rwl, RW_READ LOCK_FL_ARGS);
>   else {
> - membar_enter();
> + membar_enter_after_atomic();
>   WITNESS_CHECKORDER(&rwl->rwl_lock_obj, LOP_NEWORDER, file, line,
>   NULL);
>   WITNESS_LOCK(&rwl->rwl_lock_obj, 0, file, line);
> @@ -112,7 +112,7 @@ _rw_enter_write(struct rwlock *rwl LOCK_
>   RW_PROC(p) | RWLOCK_WRLOCK)))
>   _rw_enter(rwl, RW_WRITE LOCK_FL_ARGS);
>   else {
> - membar_enter();
> + membar_enter_after_atomic();
>   WITNESS_CHECKORDER(&rwl->rwl_lock_obj,
>   LOP_EXCLUSIVE | LOP_NEWORDER, file, line, NULL);
>   WITNESS_LOCK(&rwl->rwl_lock_obj, LOP_EXCLUSIVE, file, line);
> @@ -126,7 +126,7 @@ _rw_exit_read(struct rwlock *rwl LOCK_FL
>  
>   rw_assert_rdlock(rwl);
>  
> - membar_exit();
> + membar_exit_before_atomic();
>   if (__predict_false((owner & RWLOCK_WAIT) ||
>   rw_cas(&rwl->rwl_owner, owner, owner - RWLOCK_READ_INCR)))
>   _rw_exit(rwl LOCK_FL_ARGS);
> @@ -141,7 +141,7 @@ _rw_exit_write(struct rwlock *rwl LOCK_F
>  
>   rw_assert_wrlock(rwl);
>  
> - membar_exit();
> + membar_exit_before_atomic();
>   if (__predict_false((owner & RWLOCK_WAIT) ||
>   rw_cas(&rwl->rwl_owner, owner, 0)))
>   _rw_exit(rwl LOCK_FL_ARGS);
> @@ -261,7 +261,7 @@ retry:
>  
>   if (__predict_false(rw_cas(&rwl->rwl_owner, o, o + inc)))
>   goto retry;
> - membar_enter();
> + membar_enter_after_atomic();
>  
>   /*
>* If old lock had RWLOCK_WAIT and RWLOCK_WRLOCK set, it means we
> @@ -295,7 +295,7 @@ _rw_exit(struct rwlock *rwl LOCK_FL_VARS
>   WITNESS_UNLOCK(&rwl->rwl_lock_obj, wrlock ? LOP_EXCLUSIVE : 0,
>   file, line);
>  
> - membar_exit();
> + membar_exit_before_atomic();
>   do {
>   owner = rwl->rwl_owner;
>   if (wrlock)
> 
> 



rwlock tweak

2017-10-16 Thread Martin Pieuchot
As pointed out by Mateusz Guzik [0], on x86 the cmpxchg{q,l} used by
rw_enter(9) and rw_exit(9) already include an implicit memory
barrier, so we can avoid using an expensive explicit one by
using the following variants.

[0] https://marc.info/?l=openbsd-tech&m=150765959923113&w=2

ok?
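
To make the idea concrete, here is a minimal sketch of how such
variants can be wired up (illustrative only, assuming a __membar()
helper along the lines of <sys/atomic.h>; the real headers may
differ):

#define __membar(_f)	do { __asm volatile(_f ::: "memory"); } while (0)

#ifdef __amd64__
/* machine-dependent override: a LOCK-prefixed cmpxchg is already a
 * full fence here, so only the compiler must be kept from reordering */
#define membar_enter_after_atomic()	__membar("")
#define membar_exit_before_atomic()	__membar("")
#endif

/* generic fallback: unchanged semantics, unchanged cost */
#ifndef membar_enter_after_atomic
#define membar_enter_after_atomic()	membar_enter()
#endif
#ifndef membar_exit_before_atomic
#define membar_exit_before_atomic()	membar_exit()
#endif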

Index: kern/kern_rwlock.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_rwlock.c,v
retrieving revision 1.31
diff -u -p -r1.31 kern_rwlock.c
--- kern/kern_rwlock.c  12 Oct 2017 09:19:45 -0000  1.31
+++ kern/kern_rwlock.c  16 Oct 2017 08:24:27 -0000
@@ -96,7 +96,7 @@ _rw_enter_read(struct rwlock *rwl LOCK_F
rw_cas(&rwl->rwl_owner, owner, owner + RWLOCK_READ_INCR)))
_rw_enter(rwl, RW_READ LOCK_FL_ARGS);
else {
-   membar_enter();
+   membar_enter_after_atomic();
WITNESS_CHECKORDER(&rwl->rwl_lock_obj, LOP_NEWORDER, file, line,
NULL);
WITNESS_LOCK(&rwl->rwl_lock_obj, 0, file, line);
@@ -112,7 +112,7 @@ _rw_enter_write(struct rwlock *rwl LOCK_
RW_PROC(p) | RWLOCK_WRLOCK)))
_rw_enter(rwl, RW_WRITE LOCK_FL_ARGS);
else {
-   membar_enter();
+   membar_enter_after_atomic();
WITNESS_CHECKORDER(&rwl->rwl_lock_obj,
LOP_EXCLUSIVE | LOP_NEWORDER, file, line, NULL);
WITNESS_LOCK(&rwl->rwl_lock_obj, LOP_EXCLUSIVE, file, line);
@@ -126,7 +126,7 @@ _rw_exit_read(struct rwlock *rwl LOCK_FL
 
rw_assert_rdlock(rwl);
 
-   membar_exit();
+   membar_exit_before_atomic();
if (__predict_false((owner & RWLOCK_WAIT) ||
rw_cas(&rwl->rwl_owner, owner, owner - RWLOCK_READ_INCR)))
_rw_exit(rwl LOCK_FL_ARGS);
@@ -141,7 +141,7 @@ _rw_exit_write(struct rwlock *rwl LOCK_F
 
rw_assert_wrlock(rwl);
 
-   membar_exit();
+   membar_exit_before_atomic();
if (__predict_false((owner & RWLOCK_WAIT) ||
rw_cas(&rwl->rwl_owner, owner, 0)))
_rw_exit(rwl LOCK_FL_ARGS);
@@ -261,7 +261,7 @@ retry:
 
if (__predict_false(rw_cas(&rwl->rwl_owner, o, o + inc)))
goto retry;
-   membar_enter();
+   membar_enter_after_atomic();
 
/*
 * If old lock had RWLOCK_WAIT and RWLOCK_WRLOCK set, it means we
@@ -295,7 +295,7 @@ _rw_exit(struct rwlock *rwl LOCK_FL_VARS
WITNESS_UNLOCK(&rwl->rwl_lock_obj, wrlock ? LOP_EXCLUSIVE : 0,
file, line);
 
-   membar_exit();
+   membar_exit_before_atomic();
do {
owner = rwl->rwl_owner;
if (wrlock)