Re: [PATCH] randomized delay in locking primitives, take 2

2016-08-02 Thread Alfred Perlstein

Why is

+struct lock_delay_config {
+u_int initial;
+u_int step;
+u_int min;
+u_int max;
+};

missing comments for its members?  Are they documented anywhere else?

-Alfred
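
For reference, the take-2 description later in the thread (an initial spin
budget autotuned from mp_ncpus, a step added on every lock_delay() invocation,
and min/max clamps on the randomized spin count) suggests comments roughly
along these lines -- an annotated sketch, not the committed header:

struct lock_delay_config {
	u_int initial;	/* starting backoff budget, autotuned to mp_ncpus * 25 at boot */
	u_int step;	/* added to the budget on each subsequent lock_delay() call */
	u_int min;	/* floor for the randomized spin count per call */
	u_int max;	/* ceiling for the backoff budget */
};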


On 7/31/16 5:41 AM, Mateusz Guzik wrote:

On Sun, Jul 31, 2016 at 01:49:28PM +0300, Konstantin Belousov wrote:
[snip]

After an irc discussion, the following was produced (also available at:
https://people.freebsd.org/~mjg/lock_backoff_complete4.diff):

Differences:
- uint64_t usage was converted to u_int (also see r303584)
- currently unused features (cap limit and return value) were removed
- lock_delay args got packed into a dedicated structure

Note this patch requires the tree to be at least at r303584.

diff --git a/sys/kern/kern_mutex.c b/sys/kern/kern_mutex.c
index 0555a78..9b07b8b 100644
--- a/sys/kern/kern_mutex.c
+++ b/sys/kern/kern_mutex.c
@@ -55,6 +55,7 @@ __FBSDID("$FreeBSD$");
  #include 
  #include 
  #include 
+#include 
  #include 
  #include 
  #include 
@@ -138,6 +139,36 @@ struct lock_class lock_class_mtx_spin = {
  #endif
  };
  
+#ifdef ADAPTIVE_MUTEXES

+static SYSCTL_NODE(_debug, OID_AUTO, mtx, CTLFLAG_RD, NULL, "mtx debugging");
+
+static struct lock_delay_config mtx_delay = {
+   .initial= 1000,
+   .step   = 500,
+   .min= 100,
+   .max= 5000,
+};
+
+SYSCTL_INT(_debug_mtx, OID_AUTO, delay_initial, CTLFLAG_RW, &mtx_delay.initial,
+0, "");
+SYSCTL_INT(_debug_mtx, OID_AUTO, delay_step, CTLFLAG_RW, &mtx_delay.step,
+0, "");
+SYSCTL_INT(_debug_mtx, OID_AUTO, delay_min, CTLFLAG_RW, &mtx_delay.min,
+0, "");
+SYSCTL_INT(_debug_mtx, OID_AUTO, delay_max, CTLFLAG_RW, &mtx_delay.max,
+0, "");
+
+static void
+mtx_delay_sysinit(void *dummy)
+{
+
+   mtx_delay.initial = mp_ncpus * 25;
+   mtx_delay.min = mp_ncpus * 5;
+   mtx_delay.max = mp_ncpus * 25 * 10;
+}
+LOCK_DELAY_SYSINIT(mtx_delay_sysinit);
+#endif
+
  /*
   * System-wide mutexes
   */
@@ -408,8 +439,10 @@ __mtx_lock_sleep(volatile uintptr_t *c, uintptr_t tid, int 
opts,
int contested = 0;
uint64_t waittime = 0;
  #endif
+#if defined(ADAPTIVE_MUTEXES) || defined(KDTRACE_HOOKS)
+   struct lock_delay_arg lda;
+#endif
  #ifdef KDTRACE_HOOKS
-   u_int spin_cnt = 0;
u_int sleep_cnt = 0;
int64_t sleep_time = 0;
int64_t all_time = 0;
@@ -418,6 +451,9 @@ __mtx_lock_sleep(volatile uintptr_t *c, uintptr_t tid, int 
opts,
if (SCHEDULER_STOPPED())
return;
  
+#if defined(ADAPTIVE_MUTEXES) || defined(KDTRACE_HOOKS)

+   lock_delay_arg_init(&lda, &mtx_delay);
+#endif
m = mtxlock2mtx(c);
  
  	if (mtx_owned(m)) {

@@ -451,7 +487,7 @@ __mtx_lock_sleep(volatile uintptr_t *c, uintptr_t tid, int 
opts,
if (m->mtx_lock == MTX_UNOWNED && _mtx_obtain_lock(m, tid))
break;
  #ifdef KDTRACE_HOOKS
-   spin_cnt++;
+   lda.spin_cnt++;
  #endif
  #ifdef ADAPTIVE_MUTEXES
/*
@@ -471,12 +507,8 @@ __mtx_lock_sleep(volatile uintptr_t *c, uintptr_t tid, int 
opts,
"spinning", "lockname:\"%s\"",
m->lock_object.lo_name);
while (mtx_owner(m) == owner &&
-   TD_IS_RUNNING(owner)) {
-   cpu_spinwait();
-#ifdef KDTRACE_HOOKS
-   spin_cnt++;
-#endif
-   }
+   TD_IS_RUNNING(owner))
+   lock_delay(&lda);
KTR_STATE0(KTR_SCHED, "thread",
sched_tdname((struct thread *)tid),
"running");
@@ -570,7 +602,7 @@ __mtx_lock_sleep(volatile uintptr_t *c, uintptr_t tid, int 
opts,
/*
 * Only record the loops spinning and not sleeping.
 */
-   if (spin_cnt > sleep_cnt)
+   if (lda.spin_cnt > sleep_cnt)
LOCKSTAT_RECORD1(adaptive__spin, m, all_time - sleep_time);
  #endif
  }
diff --git a/sys/kern/kern_rwlock.c b/sys/kern/kern_rwlock.c
index d4cae61..363b042 100644
--- a/sys/kern/kern_rwlock.c
+++ b/sys/kern/kern_rwlock.c
@@ -44,6 +44,7 @@ __FBSDID("$FreeBSD$");
  #include 
  #include 
  #include 
+#include 
  #include 
  #include 
  #include 
@@ -65,15 +66,6 @@ PMC_SOFT_DECLARE( , , lock, failed);
   */
  #define   rwlock2rw(c)(__containerof(c, struct rwlock, rw_lock))
  
-#ifdef ADAPTIVE_RWLOCKS

-static int rowner_retries = 10;
-static int rowner_loops = 1;
-static SYSCTL_NODE(_debug, OID_AUTO, rwlock, CTLFLAG_RD, NULL,
-"rwlock debugging");
-SYSCTL_INT(_debug_rwlock, OID_AUTO, retry, CTLFLAG_RW, &rowner_retries, 0, "");
-SYSCTL_INT(_debug_rwlock, OID_AUTO, loops, CTLFLAG_RW, &rowner_loops, 0, "");
-#endif
-
  #ifdef DDB
  #include 
  
@@ -100,6 +92,41 @@ struct lock_class lock_class_rw = {

  #endif
  };
  
+#ifdef ADAPTIVE_RWLOCKS


Re: [PATCH] randomized delay in locking primitives, take 2

2016-08-01 Thread John Baldwin
On Monday, August 01, 2016 10:08:43 PM Mateusz Guzik wrote:
> On Mon, Aug 01, 2016 at 11:37:50AM -0700, John Baldwin wrote:
> > On Sunday, July 31, 2016 02:41:13 PM Mateusz Guzik wrote:
> > > On Sun, Jul 31, 2016 at 01:49:28PM +0300, Konstantin Belousov wrote:
> > > [snip]
> > > 
> > > After an irc discussion, the following was produced (also available at:
> > > https://people.freebsd.org/~mjg/lock_backoff_complete4.diff):
> > > 
> > > Differences:
> > > - uint64_t usage was converted to u_int (also see r303584)
> > > - currently unused features (cap limit and return value) were removed
> > > - lock_delay args got packed into a dedicated structure
> > 
> > lock_delay_enabled declaration seems to be stale?
> > 
> 
> Oops, thanks.
> 
> > I would maybe just provide a "standard" lock_delay_init function that the
> > sysinit's use rather than duplicating the same exact code 3 times.  I'm
> > not sure we really want to use different tunables for different lock types
> > anyway.  (Alternatively we could even just have a single 'config' variable
> > that is a global.  We can always revisit this in the future if we find that
> > we need that granularity, but it would remove an extra pointer indirection
> > if you just had a single 'lock_delay_config' that was exported as a global
> > for now and initialized in a single SYSINIT.)
> > 
> 
> The per-lock type config is partially an artifact of the real version of
> the patch which has different configs per state of the lock, see loops
> with rowner_loops in the current implementation of rw and sx locks and
> this is where it mattered. It was cut off from this patch for simplicity
> (90% of the benefit for 10% of the work).
> 
> That said, when fine-tuned it does matter for "mere" spinning as well, but
> here I put very low values on purpose.
> 
> Putting them all in one config makes for a small compatibility issue,
> where debug.lock.delay_* sysctls would disappear later.
> 
> So I would prefer to just keep this as I don't think it matters much.

Ok.  This is also something we can change later if we need to.  It doesn't
affect the ABI, etc.

-- 
John Baldwin


Re: [PATCH] randomized delay in locking primitives, take 2

2016-08-01 Thread Mateusz Guzik
On Mon, Aug 01, 2016 at 11:37:50AM -0700, John Baldwin wrote:
> On Sunday, July 31, 2016 02:41:13 PM Mateusz Guzik wrote:
> > On Sun, Jul 31, 2016 at 01:49:28PM +0300, Konstantin Belousov wrote:
> > [snip]
> > 
> > After an irc discussion, the following was produced (also available at:
> > https://people.freebsd.org/~mjg/lock_backoff_complete4.diff):
> > 
> > Differences:
> > - uint64_t usage was converted to u_int (also see r303584)
> > - currently unused features (cap limit and return value) were removed
> > - lock_delay args got packed into a dedicated structure
> 
> lock_delay_enabled declaration seems to be stale?
> 

Oops, thanks.

> I would maybe just provide a "standard" lock_delay_init function that the
> sysinit's use rather than duplicating the same exact code 3 times.  I'm
> not sure we really want to use different tunables for different lock types
> anyway.  (Alternatively we could even just have a single 'config' variable
> that is a global.  We can always revisit this in the future if we find that
> we need that granularity, but it would remove an extra pointer indirection
> if you just had a single 'lock_delay_config' that was exported as a global
> for now and initialized in a single SYSINIT.)
> 

The per-lock type config is partially an artifact of the real version of
the patch which has different configs per state of the lock, see loops
with rowner_loops in the current implementation of rw and sx locks and
this is where it mattered. It was cut off from this patch for simplicity
(90% of the benefit for 10% of the work).

That said, when fine-tuned it does matter for "mere" spinning as well, but
here I put very low values on purpose.

Putting them all in one config makes for a small compatibility issue,
where debug.lock.delay_* sysctls would disappear later.

So I would prefer to just keep this as I don't think it matters much.

I have further optimisations to the primitives, not related to spinning. They
boil down to the fact that KDTRACE_HOOKS-enabled kernels contain an
unconditional function call to lockstat_nsecs even with the lock held.

> I think the idea is fine.  I'm less worried about the overhead of the
> divide as you are only doing it when you are contesting (so you are already
> sort of hosed anyway).  Long delays in checking the lock cookie can be
> bad (see my local APIC snafu which only polled once per microsecond).  I
> don't really think a divide is going to be that long?
> 

This should be perfectly fine. One could argue the time wasted should be
wasted efficiently, i.e. the more cpu_spinwait calls, the better, at least on
amd64.

-- 
Mateusz Guzik 


Re: [PATCH] randomized delay in locking primitives, take 2

2016-08-01 Thread John Baldwin
On Sunday, July 31, 2016 02:41:13 PM Mateusz Guzik wrote:
> On Sun, Jul 31, 2016 at 01:49:28PM +0300, Konstantin Belousov wrote:
> [snip]
> 
> After an irc discussion, the following was produced (also available at:
> https://people.freebsd.org/~mjg/lock_backoff_complete4.diff):
> 
> Differences:
> - uint64_t usage was converted to u_int (also see r303584)
> - currently unused features (cap limit and return value) were removed
> - lock_delay args got packed into a dedicated structure

lock_delay_enabled declaration seems to be stale?

I would maybe just provide a "standard" lock_delay_init function that the
sysinit's use rather than duplicating the same exact code 3 times.  I'm
not sure we really want to use different tunables for different lock types
anyway.  (Alternatively we could even just have a single 'config' variable
that is a global.  We can always revisit this in the future if we find that
we need that granularity, but it would remove an extra pointer indirection
if you just had a single 'lock_delay_config' that was exported as a global
for now and initialized in a single SYSINIT.)

I think the idea is fine.  I'm less worried about the overhead of the
divide as you are only doing it when you are contesting (so you are already
sort of hosed anyway).  Long delays in checking the lock cookie can be
bad (see my local APIC snafu which only polled once per microsecond).  I
don't really think a divide is going to be that long?

-- 
John Baldwin


Re: [PATCH] randomized delay in locking primitives, take 2

2016-08-01 Thread Mateusz Guzik
On Mon, Aug 01, 2016 at 09:38:09AM -0700, Maxim Sobolev wrote:
> On Sun, Jul 31, 2016 at 1:36 PM, Mateusz Guzik  wrote:
> 
> > On Sun, Jul 31, 2016 at 07:03:08AM -0700, Adrian Chadd wrote:
> > > Hi,
> > >
> > > Did you test on any 1, 2, 4, 8 cpu machines? just to see if there are
> > > any performance degradations on lower count CPUs?
> > >
> >
> > I did not test on machines which physically have that few CPUs, but I did
> > test the impact in a microbenchmark with 2 and 4 threads on the 80-way
> > machine. There was no difference.
> >
> 
> Well, arguably running 4 threads on an 80-way machine is not quite the same
> as running the same 4 threads on a 4-way or 8-way machine. Unless you
> actually bind your test threads to specific CPUs, on a bigger system the
> scheduler is free to migrate your thread to another CPU if all 4 are
> spinning, which might not be an option on a smaller box. I suggest you at the
> very least re-run your benchmark on a virtual machine with a small CPU count
> assigned; it should be quite easy to do so on your monster box.
> 

The test does bind threads to CPUs. Further, the delay is autotuned
based on the number of CPUs in the system. I will also stress once more that
the backoff is extremely small and we don't take full advantage of it,
specifically so that detrimental behaviour is very unlikely.

So here, even on the 80-way machine with an 80 * 25 initial delay, there was
no performance loss with only a few threads competing. The worry could
have been that it helps only if all CPU threads compete and hurts performance
otherwise.

An actual 4-way machine will have an initial delay of only 4 * 25 configured,
with a 4 * 25 * 10 upper limit. But note the actual delay is cpu_ticks() % that
value, so even that is often smaller.
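
For concreteness, plugging mp_ncpus = 4 into the mtx_delay_sysinit() arithmetic
above gives the following hypothetical values (illustrative, not measured):

	mtx_delay.initial = 4 * 25;		/* 100: starting backoff budget */
	mtx_delay.min     = 4 * 5;		/*  20: floor per lock_delay() call */
	mtx_delay.max     = 4 * 25 * 10;	/* 1000: ceiling for the budget */
	/* each call then spins roughly cpu_ticks() % budget times, never below min */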

tl;dr this is really a simple patch and at least on amd64 a pure win

-- 
Mateusz Guzik 


Re: [PATCH] randomized delay in locking primitives, take 2

2016-08-01 Thread Mateusz Guzik
On Mon, Aug 01, 2016 at 02:16:59PM +0300, Andriy Gapon wrote:
> 
> Mateusz,
> 
> just out of curiosity, have you tried to explore alternative spinlock
> implementations like a ticket lock?   It would be interesting to see if
> there are any improvements to be gained there.
> 

The patch is the simplest hack possible so that this has a chance of
getting into 11.0 as it helps /a lot/ even in the simplest form.

Revamping this stuff in a more "here to stay" manner is a much longer
endeavor.

-- 
Mateusz Guzik 


Re: [PATCH] randomized delay in locking primitives, take 2

2016-08-01 Thread Maxim Sobolev
On Sun, Jul 31, 2016 at 1:36 PM, Mateusz Guzik  wrote:

> On Sun, Jul 31, 2016 at 07:03:08AM -0700, Adrian Chadd wrote:
> > Hi,
> >
> > Did you test on any 1, 2, 4, 8 cpu machines? just to see if there are
> > any performance degradations on lower count CPUs?
> >
>
> I did not test on machines which physically have that few CPUs, but I did
> test the impact in a microbenchmark with 2 and 4 threads on the 80-way
> machine. There was no difference.
>

Well, arguably running 4 threads on an 80-way machine is not quite the same
as running the same 4 threads on a 4-way or 8-way machine. Unless you
actually bind your test threads to specific CPUs, on a bigger system the
scheduler is free to migrate your thread to another CPU if all 4 are
spinning, which might not be an option on a smaller box. I suggest you at the
very least re-run your benchmark on a virtual machine with a small CPU count
assigned; it should be quite easy to do so on your monster box.

-Maxim


Re: [PATCH] randomized delay in locking primitives, take 2

2016-08-01 Thread Slawa Olhovchenkov
On Mon, Aug 01, 2016 at 02:16:59PM +0300, Andriy Gapon wrote:

> 
> Mateusz,
> 
> just out of curiosity, have you tried to explore alternative spinlock
> implementations like a ticket lock?   It would be interesting to see if
> there are any improvements to be gained there.

An effective ticket lock implementation is, as I see it, limited to 256 (255?)
concurrent lockers. And in any case the delay may still need to be randomized
(the QPI bus on x86 has very limited performance).
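
For readers who have not seen the limitation being alluded to: a classic ticket
lock uses a "next ticket" counter and a "now serving" counter, and with 8-bit
counters the ticket space wraps once too many CPUs queue up at the same time.
A minimal user-space sketch of the idea (illustrative only, not a proposed
kernel implementation):

#include <stdatomic.h>
#include <stdint.h>

/*
 * Two 8-bit counters; correctness relies on never having more than 255
 * waiters queued at once, since the ticket counter would otherwise wrap
 * onto a still-outstanding ticket.
 */
struct ticket_lock {
	_Atomic uint8_t next;	/* next ticket to hand out */
	_Atomic uint8_t owner;	/* ticket currently being served */
};

static void
ticket_lock(struct ticket_lock *tl)
{
	uint8_t t = atomic_fetch_add(&tl->next, 1);	/* take a ticket */

	while (atomic_load(&tl->owner) != t)
		;	/* spin; a real lock would pause/back off here */
}

static void
ticket_unlock(struct ticket_lock *tl)
{
	atomic_fetch_add(&tl->owner, 1);	/* admit the next waiter */
}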


Re: [PATCH] randomized delay in locking primitives, take 2

2016-08-01 Thread Andriy Gapon

Mateusz,

just out of curiosity, have you tried to explore alternative spinlock
implementations like a ticket lock?   It would be interesting to see if
there are any improvements to be gained there.

-- 
Andriy Gapon


Re: [PATCH] randomized delay in locking primitives, take 2

2016-08-01 Thread Konstantin Belousov
On Sun, Jul 31, 2016 at 10:36:12PM +0200, Mateusz Guzik wrote:
> On Sun, Jul 31, 2016 at 07:03:08AM -0700, Adrian Chadd wrote:
> > Also, yeah, the MOD operator in each loop could get spendy on older
> > CPUs (eg my MIPS CPUs, older ARM stuff, etc.) Is it possible to
> > achieve much the same autotuning with pow2 operations instead of
> > divide/mod?
> > 
> 
> The % operation acts as a randomizer. It is optional and I'm happy to ifdef
> it based on the architecture. It does seem to be useful at least on
> amd64.
> 
> As a side note, exponential backoff is not used, in order to keep things
> smaller (see above). It is definitely subject to change later.

I asked for removal of the 64-bit divide in the backoff precalculation,
because the calculation is relatively intensive on 32-bit i386 or arm,
and the loop then consists of the calculation heating the CPU and only
afterwards starts issuing the PAUSE instructions, encoded as cpu_spinwait().

On the other hand, on e.g. MIPS this is not true. cpu_spinwait() on
MIPS is empty, so the backoff loop consists of heating the CPU anyway.
It does not matter whether we heat it with a divide instruction or a
branch instruction.

I do not know if this is fixable on MIPS, but it is definitely a bug on
ARM to have an empty cpu_spinwait().  It is an additional bug that ARM
MD code uses cpu_spinwait() in tight loops as well, without providing
an implementation.
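
For context, amd64's cpu_spinwait() boils down to the PAUSE instruction
mentioned above.  A non-empty ARM version could plausibly be built on the
architectural spin-loop hint, e.g. (hypothetical sketch for ARMv7 and later,
not what the tree does today):

#define	cpu_spinwait()	__asm __volatile("yield" ::: "memory")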



Re: [PATCH] randomized delay in locking primitives, take 2

2016-07-31 Thread K. Macy
On Sun, Jul 31, 2016 at 7:03 AM, Adrian Chadd  wrote:
> Hi,
>
> Did you test on any 1, 2, 4, 8 cpu machines? just to see if there are
> any performance degradations on lower count CPUs?


The adaptive spinning path will never run on a uniprocessor. Except
for potential i-cache displacement you're not going to actually see
any effect unless there is substantial contention. You'll need all
threads consistently contending for the same lock. A potential
workload to exercise this would be to run ncpu threads sending small
UDP packets on a driver with a legacy mutex protected IFQ interface.
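
A minimal user-space sketch of that kind of blaster is below; the target
address, port and thread count are placeholders, and CPU pinning, error
handling and the choice of a legacy-IFQ driver are left out (build with
-lpthread):

#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <pthread.h>
#include <string.h>
#include <unistd.h>

#define	NTHREADS	8		/* ideally ncpu */
#define	NPACKETS	1000000

static void *
blast(void *arg)
{
	struct sockaddr_in sin;
	char payload[32];
	int fd, i;

	fd = socket(AF_INET, SOCK_DGRAM, 0);
	if (fd == -1)
		return (NULL);
	memset(&sin, 0, sizeof(sin));
	sin.sin_family = AF_INET;
	sin.sin_port = htons(9);			/* discard port */
	sin.sin_addr.s_addr = inet_addr("192.0.2.1");	/* placeholder target */
	memset(payload, 0, sizeof(payload));

	for (i = 0; i < NPACKETS; i++)
		sendto(fd, payload, sizeof(payload), 0,
		    (struct sockaddr *)&sin, sizeof(sin));
	close(fd);
	return (NULL);
}

int
main(void)
{
	pthread_t td[NTHREADS];
	int i;

	for (i = 0; i < NTHREADS; i++)
		pthread_create(&td[i], NULL, blast, NULL);
	for (i = 0; i < NTHREADS; i++)
		pthread_join(td[i], NULL);
	return (0);
}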


> Also, yeah, the MOD operator in each loop could get spendy on older
> CPUs (eg my MIPS CPUs, older ARM stuff, etc.) Is it possible to
> achieve much the same autotuning with pow2 operations instead of
> divide/mod?
>
>
> -a


Re: [PATCH] randomized delay in locking primitives, take 2

2016-07-31 Thread Mateusz Guzik
On Sun, Jul 31, 2016 at 07:03:08AM -0700, Adrian Chadd wrote:
> Hi,
> 
> Did you test on any 1, 2, 4, 8 cpu machines? just to see if there are
> any performance degradations on lower count CPUs?
> 

I did not test on machines which physically have that few CPUs, but I did
test the impact in a microbenchmark with 2 and 4 threads on the 80-way
machine. There was no difference.

For this iteration of the patch, given limited time I tried to be very
conservative so as not to introduce additional latency. In fact I would
argue the patch is undertuned (as in, it can do better in certain
workloads).

That said, I think it is safe to use.

> Also, yeah, the MOD operator in each loop could get spendy on older
> CPUs (eg my MIPS CPUs, older ARM stuff, etc.) Is it possible to
> achieve much the same autotuning with pow2 operations instead of
> divide/mod?
> 

The % operation acts as a randomizer. It is optional and I'm happy to ifdef
it based on the architecture. It does seem to be useful at least on
amd64.

As a side note, exponential backoff is not used, in order to keep things
smaller (see above). It is definitely subject to change later.
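
For what it's worth, one shape the pow2 idea could take is sketched below; it
implies the exponential growth deliberately not used here, assumes
lock_delay_arg keeps the current delay and spin_cnt the way the patch does, and
max_pow2 would be precomputed at tuning time (illustrative only, not part of
the posted patch):

static void
lock_delay_pow2(struct lock_delay_arg *la, u_int max_pow2)
{
	u_int backoff, delay, i;

	delay = la->delay;
	if (delay == 0)
		delay = 32;		/* power-of-two starting budget */
	else if (delay < max_pow2)
		delay <<= 1;		/* double instead of adding a step */

	backoff = cpu_ticks() & (delay - 1);	/* mask replaces the '%' */
	for (i = 0; i < backoff; i++)
		cpu_spinwait();

	la->delay = delay;
	la->spin_cnt += backoff;	/* a real version would also honour min */
}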

-- 
Mateusz Guzik 


Re: [PATCH] randomized delay in locking primitives, take 2

2016-07-31 Thread Adrian Chadd
Hi,

Did you test on any 1, 2, 4, 8 cpu machines? just to see if there are
any performance degradations on lower count CPUs?

Also, yeah, the MOD operator in each loop could get spendy on older
CPUs (eg my MIPS CPUs, older ARM stuff, etc.) Is it possible to
achieve much the same autotuning with pow2 operations instead of
divide/mod?


-a


Re: [PATCH] randomized delay in locking primitives, take 2

2016-07-31 Thread Mateusz Guzik
On Sun, Jul 31, 2016 at 01:49:28PM +0300, Konstantin Belousov wrote:
[snip]

After an irc discussion, the following was produced (also available at:
https://people.freebsd.org/~mjg/lock_backoff_complete4.diff):

Differences:
- uint64_t usage was converted to u_int (also see r303584)
- currently unused features (cap limit and return value) were removed
- lock_delay args got packed into a dedicated structure

Note this patch requires the tree to be at least at r303584.

diff --git a/sys/kern/kern_mutex.c b/sys/kern/kern_mutex.c
index 0555a78..9b07b8b 100644
--- a/sys/kern/kern_mutex.c
+++ b/sys/kern/kern_mutex.c
@@ -55,6 +55,7 @@ __FBSDID("$FreeBSD$");
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -138,6 +139,36 @@ struct lock_class lock_class_mtx_spin = {
 #endif
 };
 
+#ifdef ADAPTIVE_MUTEXES
+static SYSCTL_NODE(_debug, OID_AUTO, mtx, CTLFLAG_RD, NULL, "mtx debugging");
+
+static struct lock_delay_config mtx_delay = {
+   .initial= 1000,
+   .step   = 500,
+   .min= 100,
+   .max= 5000,
+};
+
+SYSCTL_INT(_debug_mtx, OID_AUTO, delay_initial, CTLFLAG_RW, &mtx_delay.initial,
+0, "");
+SYSCTL_INT(_debug_mtx, OID_AUTO, delay_step, CTLFLAG_RW, &mtx_delay.step,
+0, "");
+SYSCTL_INT(_debug_mtx, OID_AUTO, delay_min, CTLFLAG_RW, &mtx_delay.min,
+0, "");
+SYSCTL_INT(_debug_mtx, OID_AUTO, delay_max, CTLFLAG_RW, &mtx_delay.max,
+0, "");
+
+static void
+mtx_delay_sysinit(void *dummy)
+{
+
+   mtx_delay.initial = mp_ncpus * 25;
+   mtx_delay.min = mp_ncpus * 5;
+   mtx_delay.max = mp_ncpus * 25 * 10;
+}
+LOCK_DELAY_SYSINIT(mtx_delay_sysinit);
+#endif
+
 /*
  * System-wide mutexes
  */
@@ -408,8 +439,10 @@ __mtx_lock_sleep(volatile uintptr_t *c, uintptr_t tid, int 
opts,
int contested = 0;
uint64_t waittime = 0;
 #endif
+#if defined(ADAPTIVE_MUTEXES) || defined(KDTRACE_HOOKS)
+   struct lock_delay_arg lda;
+#endif
 #ifdef KDTRACE_HOOKS
-   u_int spin_cnt = 0;
u_int sleep_cnt = 0;
int64_t sleep_time = 0;
int64_t all_time = 0;
@@ -418,6 +451,9 @@ __mtx_lock_sleep(volatile uintptr_t *c, uintptr_t tid, int 
opts,
if (SCHEDULER_STOPPED())
return;
 
+#if defined(ADAPTIVE_MUTEXES) || defined(KDTRACE_HOOKS)
+   lock_delay_arg_init(&lda, &mtx_delay);
+#endif
m = mtxlock2mtx(c);
 
if (mtx_owned(m)) {
@@ -451,7 +487,7 @@ __mtx_lock_sleep(volatile uintptr_t *c, uintptr_t tid, int 
opts,
if (m->mtx_lock == MTX_UNOWNED && _mtx_obtain_lock(m, tid))
break;
 #ifdef KDTRACE_HOOKS
-   spin_cnt++;
+   lda.spin_cnt++;
 #endif
 #ifdef ADAPTIVE_MUTEXES
/*
@@ -471,12 +507,8 @@ __mtx_lock_sleep(volatile uintptr_t *c, uintptr_t tid, int 
opts,
"spinning", "lockname:\"%s\"",
m->lock_object.lo_name);
while (mtx_owner(m) == owner &&
-   TD_IS_RUNNING(owner)) {
-   cpu_spinwait();
-#ifdef KDTRACE_HOOKS
-   spin_cnt++;
-#endif
-   }
+   TD_IS_RUNNING(owner))
+   lock_delay(&lda);
KTR_STATE0(KTR_SCHED, "thread",
sched_tdname((struct thread *)tid),
"running");
@@ -570,7 +602,7 @@ __mtx_lock_sleep(volatile uintptr_t *c, uintptr_t tid, int 
opts,
/*
 * Only record the loops spinning and not sleeping. 
 */
-   if (spin_cnt > sleep_cnt)
+   if (lda.spin_cnt > sleep_cnt)
LOCKSTAT_RECORD1(adaptive__spin, m, all_time - sleep_time);
 #endif
 }
diff --git a/sys/kern/kern_rwlock.c b/sys/kern/kern_rwlock.c
index d4cae61..363b042 100644
--- a/sys/kern/kern_rwlock.c
+++ b/sys/kern/kern_rwlock.c
@@ -44,6 +44,7 @@ __FBSDID("$FreeBSD$");
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -65,15 +66,6 @@ PMC_SOFT_DECLARE( , , lock, failed);
  */
 #definerwlock2rw(c)(__containerof(c, struct rwlock, rw_lock))
 
-#ifdef ADAPTIVE_RWLOCKS
-static int rowner_retries = 10;
-static int rowner_loops = 1;
-static SYSCTL_NODE(_debug, OID_AUTO, rwlock, CTLFLAG_RD, NULL,
-"rwlock debugging");
-SYSCTL_INT(_debug_rwlock, OID_AUTO, retry, CTLFLAG_RW, &rowner_retries, 0, "");
-SYSCTL_INT(_debug_rwlock, OID_AUTO, loops, CTLFLAG_RW, &rowner_loops, 0, "");
-#endif
-
 #ifdef DDB
 #include 
 
@@ -100,6 +92,41 @@ struct lock_class lock_class_rw = {
 #endif
 };
 
+#ifdef ADAPTIVE_RWLOCKS
+static int rowner_retries = 10;
+static int rowner_loops = 1;
+static SYSCTL_NODE(_debug, OID_AUTO, rwlock, CTLFLAG_RD, NULL,
+"rwlock debugging");
+SYSCTL_INT(_debug_rwlock, OID_AUTO, retry, CTLFLAG_RW, &rowner_retries, 0, "");
+SYSCTL_INT(_debug_rwlock, OID_AUTO, 

Re: [PATCH] randomized delay in locking primitives, take 2

2016-07-31 Thread Konstantin Belousov
On Sun, Jul 31, 2016 at 11:57:06AM +0200, Mateusz Guzik wrote:
> The patch can be found inline below and also here:
> https://people.freebsd.org/~mjg/lock_backoff_complete.diff
> 
> The previous version of the patch was posted here:
> https://lists.freebsd.org/pipermail/freebsd-current/2016-July/062320.html
> 
> This time around instead of a total hack I have something of reasonable
> quality.
> 
> Problem description and the way to fix it stands, to quote:
> > If the lock is contended, primitives like __mtx_lock_sleep will spin
> > checking if the owner is running or the lock was freed. The problem is
> > that once it is discovered that the lock is free, multiple CPUs are
> > likely to try to do the atomic op which will make it more costly for
> > everyone and throughput suffers.
> 
> > The standard thing to do is to have some sort of a randomized delay so
> > that this kind of behaviour is reduced.
> 
> So, this time around the lock_delay primitive accepts a configuration
> including the min/max spin count per iteration, the increment step per
> lock_delay invocation and the total spins allowed during one execution of
> the locking primitive. The code "autotunes" with mp_ncpus on boot. Current
> values were determined after testing the code on an 80-way and a 40-way
> system. There is definitely room for improvement there, but the biggest
> gain as it stands is sufficient to get the patch in.
> 
> Parameters are settable by lock type.
> 
> Converted locks are: mtx, sx, rwlocks.
> 
> Trivial loops spinning as long as the lock owner is running are
> modified. The rest was left untouched in order to keep the patch
> simpler.
> 
> The following locks were not touched: spin mutexes, thread locks and
> lockmgr. First two could be, but I don't have any suitable benchmarks
> for them. lockmgr has support for spinning, but is compiled out.
> 
> In terms of results, this time around I got my hands on an 80-way
> machine (pig1 in the testing cluster). Vanilla kernel performance drops
> significantly with increased contention. For instance, the following is
> make -j 80 buildkernel of 11.0 tree on a tmpfs:
>   7681.08s user 17040.02s system 7014% cpu 5:52.43 total
> 
> And this is with the patched kernel:
>   4585.85s user 2365.95s system 5675% cpu 2:02.48 total
> 
> Both results are reproducible.
> 
> I would like to get this in as fast as possible.
> 
> Note: lock_delay has a .cap parameter which is currently set to 0 and
> has no sysctl to control it. It is there for later when it will replace
> the unmodified spinning code in sx and rwlocks.
> 
> diff --git a/sys/kern/kern_mutex.c b/sys/kern/kern_mutex.c
> index 453add4..ca9319c 100644
> --- a/sys/kern/kern_mutex.c
> +++ b/sys/kern/kern_mutex.c
> @@ -55,6 +55,7 @@ __FBSDID("$FreeBSD$");
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -138,6 +139,37 @@ struct lock_class lock_class_mtx_spin = {
>  #endif
>  };
>  
> +#ifdef ADAPTIVE_MUTEXES
> +static SYSCTL_NODE(_debug, OID_AUTO, mtx, CTLFLAG_RD, NULL, "mtx debugging");
> +
> +static struct lock_delay_config mtx_delay = {
> + .initial= 1000,
> + .step   = 500,
> + .cap= 0,
> + .min= 100,
> + .max= 5000,
> +};
> +
> +SYSCTL_INT(_debug_mtx, OID_AUTO, delay_initial, CTLFLAG_RW,
> &mtx_delay.initial,
> +0, "");
> +SYSCTL_INT(_debug_mtx, OID_AUTO, delay_step, CTLFLAG_RW, &mtx_delay.step,
> +0, "");
> +SYSCTL_INT(_debug_mtx, OID_AUTO, delay_min, CTLFLAG_RW, &mtx_delay.min,
> +0, "");
> +SYSCTL_INT(_debug_mtx, OID_AUTO, delay_max, CTLFLAG_RW, &mtx_delay.max,
> +0, "");
> +
> +static void
> +mtx_delay_sysinit(void *dummy)
> +{
> +
> + mtx_delay.initial = mp_ncpus * 25;
> + mtx_delay.min = mp_ncpus * 5;
> + mtx_delay.max = mp_ncpus * 25 * 10;
> +}
> +LOCK_DELAY_SYSINIT(mtx_delay_sysinit);
> +#endif
> +
>  /*
>   * System-wide mutexes
>   */
> @@ -408,8 +440,11 @@ __mtx_lock_sleep(volatile uintptr_t *c, uintptr_t tid, 
> int opts,
>   int contested = 0;
>   uint64_t waittime = 0;
>  #endif
> -#ifdef KDTRACE_HOOKS
> +#if defined(ADAPTIVE_MUTEXES) || defined(KDTRACE_HOOKS)
>   uint64_t spin_cnt = 0;
> + uint64_t delay = 0;
> +#endif
> +#ifdef KDTRACE_HOOKS
>   uint64_t sleep_cnt = 0;
>   int64_t sleep_time = 0;
>   int64_t all_time = 0;
> @@ -471,12 +506,8 @@ __mtx_lock_sleep(volatile uintptr_t *c, uintptr_t tid, 
> int opts,
>   "spinning", "lockname:\"%s\"",
>   m->lock_object.lo_name);
>   while (mtx_owner(m) == owner &&
> - TD_IS_RUNNING(owner)) {
> - cpu_spinwait();
> -#ifdef KDTRACE_HOOKS
> - spin_cnt++;
> -#endif
> - }
> + TD_IS_RUNNING(owner))
> + lock_delay(&delay, &mtx_delay,
> &spin_cnt);
>

[PATCH] randomized delay in locking primitives, take 2

2016-07-31 Thread Mateusz Guzik
The patch can be found inline below and also here:
https://people.freebsd.org/~mjg/lock_backoff_complete.diff

The previous version of the patch was posted here:
https://lists.freebsd.org/pipermail/freebsd-current/2016-July/062320.html

This time around instead of a total hack I have something of reasonable
quality.

Problem description and the way to fix it stands, to quote:
> If the lock is contended, primitives like __mtx_lock_sleep will spin
> checking if the owner is running or the lock was freed. The problem is
> that once it is discovered that the lock is free, multiple CPUs are
> likely to try to do the atomic op which will make it more costly for
> everyone and throughput suffers.

> The standard thing to do is to have some sort of a randomized delay so
> that this kind of behaviour is reduced.

So, this time around the lock_delay primitive accepts a configuration
including the min/max spin count per iteration, the increment step per
lock_delay invocation and the total spins allowed during one execution of
the locking primitive. The code "autotunes" with mp_ncpus on boot. Current
values were determined after testing the code on an 80-way and a 40-way
system. There is definitely room for improvement there, but the biggest
gain as it stands is sufficient to get the patch in.
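
Spelled out, the description above maps onto a delay routine roughly like the
following; this is a sketch reconstructed from the prose here and the
cpu_ticks() % randomization discussed later in the thread, with the cap
handling omitted -- not the exact code in the diff below:

static void
lock_delay(uint64_t *delayp, struct lock_delay_config *lc, uint64_t *spin_cntp)
{
	uint64_t backoff, delay, i;

	delay = *delayp;
	if (delay == 0)
		delay = lc->initial;	/* first call: start from the autotuned budget */
	else {
		delay += lc->step;	/* grow linearly on every invocation... */
		if (delay > lc->max)
			delay = lc->max;	/* ...clamped at the maximum */
	}

	backoff = cpu_ticks() % delay;	/* randomize the spin count actually used */
	if (backoff < lc->min)
		backoff = lc->min;

	for (i = 0; i < backoff; i++)
		cpu_spinwait();		/* PAUSE (or the MD equivalent) */

	*delayp = delay;
	*spin_cntp += backoff;		/* feeds the KDTRACE_HOOKS accounting */
}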

Parameters are settable by lock type.

Converted locks are: mtx, sx, rwlocks.

Trivial loops spinning as long as the lock owner is running are
modified. The rest was left untouched in order to keep the patch
simpler.

The following locks were not touched: spin mutexes, thread locks and
lockmgr. First two could be, but I don't have any suitable benchmarks
for them. lockmgr has support for spinning, but is compiled out.

In terms of results, this time around I got my hands on an 80-way
machine (pig1 in the testing cluster). Vanilla kernel performance drops
significantly with increased contention. For instance, the following is
make -j 80 buildkernel of 11.0 tree on a tmpfs:
  7681.08s user 17040.02s system 7014% cpu 5:52.43 total

And this is with the patched kernel:
  4585.85s user 2365.95s system 5675% cpu 2:02.48 total

Both results are reproducible.

I would like to get this in as fast as possible.

Note: lock_delay has a .cap parameter which is currently set to 0 and
has no sysctl to control it. It is there for later when it will replace
the unmodified spinning code in sx and rwlocks.

diff --git a/sys/kern/kern_mutex.c b/sys/kern/kern_mutex.c
index 453add4..ca9319c 100644
--- a/sys/kern/kern_mutex.c
+++ b/sys/kern/kern_mutex.c
@@ -55,6 +55,7 @@ __FBSDID("$FreeBSD$");
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -138,6 +139,37 @@ struct lock_class lock_class_mtx_spin = {
 #endif
 };
 
+#ifdef ADAPTIVE_MUTEXES
+static SYSCTL_NODE(_debug, OID_AUTO, mtx, CTLFLAG_RD, NULL, "mtx debugging");
+
+static struct lock_delay_config mtx_delay = {
+   .initial= 1000,
+   .step   = 500,
+   .cap= 0,
+   .min= 100,
+   .max= 5000,
+};
+
+SYSCTL_INT(_debug_mtx, OID_AUTO, delay_initial, CTLFLAG_RW, &mtx_delay.initial,
+0, "");
+SYSCTL_INT(_debug_mtx, OID_AUTO, delay_step, CTLFLAG_RW, &mtx_delay.step,
+0, "");
+SYSCTL_INT(_debug_mtx, OID_AUTO, delay_min, CTLFLAG_RW, &mtx_delay.min,
+0, "");
+SYSCTL_INT(_debug_mtx, OID_AUTO, delay_max, CTLFLAG_RW, &mtx_delay.max,
+0, "");
+
+static void
+mtx_delay_sysinit(void *dummy)
+{
+
+   mtx_delay.initial = mp_ncpus * 25;
+   mtx_delay.min = mp_ncpus * 5;
+   mtx_delay.max = mp_ncpus * 25 * 10;
+}
+LOCK_DELAY_SYSINIT(mtx_delay_sysinit);
+#endif
+
 /*
  * System-wide mutexes
  */
@@ -408,8 +440,11 @@ __mtx_lock_sleep(volatile uintptr_t *c, uintptr_t tid, int 
opts,
int contested = 0;
uint64_t waittime = 0;
 #endif
-#ifdef KDTRACE_HOOKS
+#if defined(ADAPTIVE_MUTEXES) || defined(KDTRACE_HOOKS)
uint64_t spin_cnt = 0;
+   uint64_t delay = 0;
+#endif
+#ifdef KDTRACE_HOOKS
uint64_t sleep_cnt = 0;
int64_t sleep_time = 0;
int64_t all_time = 0;
@@ -471,12 +506,8 @@ __mtx_lock_sleep(volatile uintptr_t *c, uintptr_t tid, int 
opts,
"spinning", "lockname:\"%s\"",
m->lock_object.lo_name);
while (mtx_owner(m) == owner &&
-   TD_IS_RUNNING(owner)) {
-   cpu_spinwait();
-#ifdef KDTRACE_HOOKS
-   spin_cnt++;
-#endif
-   }
+   TD_IS_RUNNING(owner))
+   lock_delay(&delay, &mtx_delay,
&spin_cnt);
KTR_STATE0(KTR_SCHED, "thread",
sched_tdname((struct thread *)tid),
"running");
diff --git a/sys/kern/kern_rwlock.c b/sys/kern/kern_rwlock.c
index 496738d..cd93520 100644
---