Re: C mutex impl. for x86

2018-02-15 Thread Mateusz Guzik
On Wed, Dec 20, 2017 at 12:17:27PM +0100, Martin Pieuchot wrote:
> On 15/12/17(Fri) 22:03, Mateusz Guzik wrote:
> > > +void
> > > +__mtx_enter(struct mutex *mtx)
> > > +{
> > > +#ifdef MP_LOCKDEBUG
> > > + int nticks = __mp_lock_spinout;
> > > +#endif
> > > +
> > > + while (__mtx_enter_try(mtx) == 0) {
> > > + CPU_BUSY_CYCLE();
> >
> > So this is effectively __mtx_enter_try with a single pause in between.
> >
> > > +}
> > > +
> > > +int
> > > +__mtx_enter_try(struct mutex *mtx)
> > > +{
> > > + struct cpu_info *owner, *ci = curcpu();
> > > + int s;
> > > +
> > > + if (mtx->mtx_wantipl != IPL_NONE)
> > > + s = splraise(mtx->mtx_wantipl);
> > > +
> >
> > This is at least one read.
> >
> > > + owner = atomic_cas_ptr(&mtx->mtx_owner, NULL, ci);
> > > +#ifdef DIAGNOSTIC
> > > + if (__predict_false(owner == ci))
> > > + panic("mtx %p: locking against myself", mtx);
> > > +#endif
> > > + if (owner == NULL) {
> > > + membar_enter_after_atomic();
> > > + if (mtx->mtx_wantipl != IPL_NONE)
> > > + mtx->mtx_oldipl = s;
> >
> > This repeats the read done earlier.
> >
> > Since the caller loops, this is effectively a very inefficient lock of
> > the form:
> > while (!atomic_cas_ptr(...))
> > CPU_BUSY_CYCLE();
> >
> > + some reads in-between
> >

So how about the patch below? It booted fine and I did a make -j 8 build
of the kernel with it. Unfortunately the VM in question is running in too
volatile an environment for any kind of performance testing at the moment.
The impact should be minor anyway.

Note this is still a super basic lock with giant room for improvement.
I'm not committing to any kind of work in the area, but I may contribute
other code in the future if stuff of this sort is fine with you.

diff --git a/sys/kern/kern_lock.c b/sys/kern/kern_lock.c
index 64c5a1f6319..5c14f2b6720 100644
--- a/sys/kern/kern_lock.c
+++ b/sys/kern/kern_lock.c
@@ -253,34 +253,14 @@ __mtx_init(struct mutex *mtx, int wantipl)
 }

 #ifdef MULTIPROCESSOR
-void
-__mtx_enter(struct mutex *mtx)
-{
-#ifdef MP_LOCKDEBUG
-   int nticks = __mp_lock_spinout;
-#endif
-
-   while (__mtx_enter_try(mtx) == 0) {
-   CPU_BUSY_CYCLE();
-
-#ifdef MP_LOCKDEBUG
-   if (--nticks == 0) {
-   db_printf("%s: %p lock spun out", __func__, mtx);
-   db_enter();
-   nticks = __mp_lock_spinout;
-   }
-#endif
-   }
-}
-
 int
-__mtx_enter_try(struct mutex *mtx)
+__mtx_enter_common(struct mutex *mtx, int wantipl, struct cpu_info *ci)
 {
-   struct cpu_info *owner, *ci = curcpu();
+   struct cpu_info *owner;
int s;

-   if (mtx->mtx_wantipl != IPL_NONE)
-   s = splraise(mtx->mtx_wantipl);
+   if (wantipl != IPL_NONE)
+   s = splraise(wantipl);

owner = atomic_cas_ptr(&mtx->mtx_owner, NULL, ci);
 #ifdef DIAGNOSTIC
@@ -289,7 +269,7 @@ __mtx_enter_try(struct mutex *mtx)
 #endif
if (owner == NULL) {
membar_enter_after_atomic();
-   if (mtx->mtx_wantipl != IPL_NONE)
+   if (wantipl != IPL_NONE)
mtx->mtx_oldipl = s;
 #ifdef DIAGNOSTIC
ci->ci_mutex_level++;
@@ -297,11 +277,49 @@ __mtx_enter_try(struct mutex *mtx)
return (1);
}

-   if (mtx->mtx_wantipl != IPL_NONE)
+   if (wantipl != IPL_NONE)
splx(s);

return (0);
 }
+
+void
+__mtx_enter(struct mutex *mtx)
+{
+   struct cpu_info *ci = curcpu();
+   int wantipl = mtx->mtx_wantipl;
+#ifdef MP_LOCKDEBUG
+   int nticks = __mp_lock_spinout;
+#endif
+
+   for (;;) {
+   if (__mtx_enter_common(mtx, wantipl, ci))
+   break;
+
+   for (;;) {
+#ifdef MP_LOCKDEBUG
+   if (--nticks == 0) {
+   db_printf("%s: %p lock spun out", __func__, mtx);
+   db_enter();
+   nticks = __mp_lock_spinout;
+   }
+#endif
+   CPU_BUSY_CYCLE();
+   if (mtx->mtx_owner == NULL)
+   break;
+   }
+   }
+}
+
+int
+__mtx_enter_try(struct mutex *mtx)
+{
+   struct cpu_info *ci = curcpu();
+   int wantipl = mtx->mtx_wantipl;
+
+   return (__mtx_enter_common(mtx, wantipl, ci));
+}
+
 #else
 void
 __mtx_enter(struct mutex *mtx)

-- 
Mateusz Guzik 


Re: C mutex impl. for x86

2017-12-20 Thread Martin Pieuchot
On 15/12/17(Fri) 22:03, Mateusz Guzik wrote:
> [...] 
> However, contended behaviour is a regression compared to the asm
> variant.

Now that I've checked the files in, could you generate a diff with your
suggestions?

> From what I gather this is a step towards unifying all mutex
> implementations, hence the membar_* use.

Exactly.

> > +void
> > +__mtx_enter(struct mutex *mtx)
> > +{
> > +#ifdef MP_LOCKDEBUG
> > + int nticks = __mp_lock_spinout;
> > +#endif
> > +
> > + while (__mtx_enter_try(mtx) == 0) {
> > + CPU_BUSY_CYCLE();
> 
> So this is effectively __mtx_enter_try with a single pause in between.
> 
> > +}
> > +
> > +int
> > +__mtx_enter_try(struct mutex *mtx)
> > +{
> > + struct cpu_info *owner, *ci = curcpu();
> > + int s;
> > +
> > + if (mtx->mtx_wantipl != IPL_NONE)
> > + s = splraise(mtx->mtx_wantipl);
> > +
> 
> This is at least one read.
> 
> > + owner = atomic_cas_ptr(&mtx->mtx_owner, NULL, ci);
> > +#ifdef DIAGNOSTIC
> > + if (__predict_false(owner == ci))
> > + panic("mtx %p: locking against myself", mtx);
> > +#endif
> > + if (owner == NULL) {
> > + membar_enter_after_atomic();
> > + if (mtx->mtx_wantipl != IPL_NONE)
> > + mtx->mtx_oldipl = s;
> 
> This repeats the read done earlier.
> 
> Since the caller loops, this is effectively a very inefficient lock of
> the form:
> while (!atomic_cas_ptr(...))
> CPU_BUSY_CYCLE();
> 
> + some reads in-between
> 
> Assembly code would spin waiting for the lock to become free before
> playing with the spl level and attempting to lock. I don't know how
> contended your mutexes are right now, but this will have to be changed
> very quickly, at least to match the current asm behaviour, as more of
> the kernel gets unlocked. Going for ticket locks or backoff later will
> probably work well enough for the foreseeable future and will postpone
> the need to invest in anything fancy.
> 
> That said, proposed restructure is as follows (pseudo-code, no debug):
> 
> void
> __mtx_enter(struct mutex *mtx)
> {
> 	struct cpu_info *owner, *ci = curcpu();
> 	int want_ipl, s;
> 
> 	/* mark mtx_wantipl as volatile or add a read cast through
> 	 * one to force a read *here* and no re-reads later */
> 	want_ipl = mtx->mtx_wantipl;
> 
> 	for (;;) {
> 		if (want_ipl != IPL_NONE)
> 			s = splraise(want_ipl);
> 		owner = atomic_cas_ptr(&mtx->mtx_owner, NULL, ci);
> 		if (owner == NULL) {
> 			membar_enter_after_atomic();
> 			if (want_ipl != IPL_NONE)
> 				mtx->mtx_oldipl = s;
> 			break;
> 		}
> 
> 		if (want_ipl != IPL_NONE)
> 			splx(s);
> 
> 		do {
> 			CPU_BUSY_CYCLE();
> 		} while (mtx->mtx_owner != NULL);
> 	}
> }
> 
> I don't think partial duplication of try_enter can be avoided without
> serious ugliness.

Duplication is not a problem since we're going to have a single MI file
for all (most?) archs.

> > +void
> > +__mtx_leave(struct mutex *mtx)
> > +{
> > + int s;
> > +
> > + MUTEX_ASSERT_LOCKED(mtx);
> > +
> > +#ifdef DIAGNOSTIC
> > + curcpu()->ci_mutex_level--;
> > +#endif
> > +
> > + s = mtx->mtx_oldipl;
> > +#ifdef MULTIPROCESSOR
> > + membar_exit_before_atomic();
> > +#endif
> > + mtx->mtx_owner = NULL;
> > + if (mtx->mtx_wantipl != IPL_NONE)
> > + splx(s);
> > +}
> 
> It should be somewhat faster to read mtx_wantipl while the lock is still
> held - it is less likely that someone else dirtied the cacheline. Then
> mtx_oldipl can be only conditionally read.
> 
> This function does not do atomic ops, so membar_exit_before_atomic does
> not really fit, even though it happens to do the exact same thing the
> "correctly" named primitive would - just provide a compiler barrier on
> amd64/i386.
> 
> From what I gather you are trying to mimic illumos nomenclature, but
> they don't have an equivalent afaics. (perhaps Solaris grew one in the
> meantime?)
> 
> In FreeBSD an appropriate routine is named atomic_thread_fence_rel (see
> amd64/include/atomic.h) and I suggest just borrowing the api.

API changes are subject to bikeshedding so I'd suggest we concentrate on
the nice optimizations/improvements you're pointing out.

> A side note: you could probably shrink the ipl vars to make the lock
> smaller. It may even be doable to fit everything into the lock word,
> but I don't know how much sense playing with it would make.

One reason to have MI locks is also to make debugging easier.  I don't
know if shrinking the size of the structure goes in the same direction.
Time will tell.



Re: C mutex impl. for x86

2017-12-18 Thread Mark Kettenis
> Date: Mon, 18 Dec 2017 10:08:23 +0100
> From: Martin Pieuchot 
> 
> On 14/12/17(Thu) 16:06, Martin Pieuchot wrote:
> > Diff below moves amd64 and i386 mutex to the common C implementation.
> > 
> > The differences are:
> >   - membar_enter_after_atomic(9) instead of membar_enter(9), and
> >   - membar_exit_before_atomic(9) instead of membar_exit(9)
> > 
> > I'd appreciate any performance test to know if the performance
> > degradation is acceptable with these barriers.
> 
> Hrvoje Popovski confirmed there's no performance regression with this
> implementation on his forwarding setup.
> 
> So I'd like reviews and oks before improving stuff in tree.

The ultimate goal is still to make this MI code, isn't it?

ok kettenis@

> > Index: amd64/amd64/mutex.c
> > ===
> > RCS file: amd64/amd64/mutex.c
> > diff -N amd64/amd64/mutex.c
> > --- /dev/null   1 Jan 1970 00:00:00 -
> > +++ amd64/amd64/mutex.c 14 Dec 2017 14:49:59 -
> > @@ -0,0 +1,151 @@
> > +/* $OpenBSD: mutex.c,v 1.19 2017/09/11 09:52:15 mpi Exp $  */
> > +
> > +/*
> > + * Copyright (c) 2004 Artur Grabowski 
> > + * All rights reserved. 
> > + *
> > + * Redistribution and use in source and binary forms, with or without 
> > + * modification, are permitted provided that the following conditions 
> > + * are met: 
> > + *
> > + * 1. Redistributions of source code must retain the above copyright 
> > + *notice, this list of conditions and the following disclaimer. 
> > + * 2. The name of the author may not be used to endorse or promote products
> > + *derived from this software without specific prior written permission. 
> > + *
> > + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
> > + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
> > + * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
> > + * THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
> > + * EXEMPLARY, OR CONSEQUENTIAL  DAMAGES (INCLUDING, BUT NOT LIMITED TO,
> > + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
> > + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
> > + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
> > + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
> > + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
> > + */
> > +
> > +#include 
> > +#include 
> > +#include 
> > +#include 
> > +
> > +#include 
> > +
> > +#include 
> > +
> > +void
> > +__mtx_init(struct mutex *mtx, int wantipl)
> > +{
> > +   mtx->mtx_owner = NULL;
> > +   mtx->mtx_wantipl = wantipl;
> > +   mtx->mtx_oldipl = IPL_NONE;
> > +}
> > +
> > +#ifdef MULTIPROCESSOR
> > +#ifdef MP_LOCKDEBUG
> > +#ifndef DDB
> > +#error "MP_LOCKDEBUG requires DDB"
> > +#endif
> > +
> > +/* CPU-dependent timing, needs this to be settable from ddb. */
> > +extern int __mp_lock_spinout;
> > +#endif
> > +
> > +void
> > +__mtx_enter(struct mutex *mtx)
> > +{
> > +#ifdef MP_LOCKDEBUG
> > +   int nticks = __mp_lock_spinout;
> > +#endif
> > +
> > +   while (__mtx_enter_try(mtx) == 0) {
> > +   CPU_BUSY_CYCLE();
> > +
> > +#ifdef MP_LOCKDEBUG
> > +   if (--nticks == 0) {
> > +   db_printf("%s: %p lock spun out", __func__, mtx);
> > +   db_enter();
> > +   nticks = __mp_lock_spinout;
> > +   }
> > +#endif
> > +   }
> > +}
> > +
> > +int
> > +__mtx_enter_try(struct mutex *mtx)
> > +{
> > +   struct cpu_info *owner, *ci = curcpu();
> > +   int s;
> > +
> > +   if (mtx->mtx_wantipl != IPL_NONE)
> > +   s = splraise(mtx->mtx_wantipl);
> > +
> > +   owner = atomic_cas_ptr(&mtx->mtx_owner, NULL, ci);
> > +#ifdef DIAGNOSTIC
> > +   if (__predict_false(owner == ci))
> > +   panic("mtx %p: locking against myself", mtx);
> > +#endif
> > +   if (owner == NULL) {
> > +   membar_enter_after_atomic();
> > +   if (mtx->mtx_wantipl != IPL_NONE)
> > +   mtx->mtx_oldipl = s;
> > +#ifdef DIAGNOSTIC
> > +   ci->ci_mutex_level++;
> > +#endif
> > +   return (1);
> > +   }
> > +
> > +   if (mtx->mtx_wantipl != IPL_NONE)
> > +   splx(s);
> > +
> > +   return (0);
> > +}
> > +#else
> > +void
> > +__mtx_enter(struct mutex *mtx)
> > +{
> > +   struct cpu_info *ci = curcpu();
> > +
> > +#ifdef DIAGNOSTIC
> > +   if (__predict_false(mtx->mtx_owner == ci))
> > +   panic("mtx %p: locking against myself", mtx);
> > +#endif
> > +
> > +   if (mtx->mtx_wantipl != IPL_NONE)
> > +   mtx->mtx_oldipl = splraise(mtx->mtx_wantipl);
> > +
> > +   mtx->mtx_owner = ci;
> > +
> > +#ifdef DIAGNOSTIC
> > +   ci->ci_mutex_level++;
> > +#endif
> > +}
> > +
> > +int
> > +__mtx_enter_try(struct mutex *mtx)
> > +{
> > +   __mtx_enter(mtx);
> > +   return (1);
> > +}
> > +#endif
> > +
> > +void
> > 

Re: C mutex impl. for x86

2017-12-18 Thread Martin Pieuchot
On 14/12/17(Thu) 16:06, Martin Pieuchot wrote:
> Diff below moves amd64 and i386 mutex to the common C implementation.
> 
> The differences are:
>   - membar_enter_after_atomic(9) instead of membar_enter(9), and
>   - membar_exit_before_atomic(9) instead of membar_exit(9)
> 
> I'd appreciate any performance test to know if the performance
> degradation is acceptable with these barriers.

Hrvoje Popovski confirmed there's no performance regression with this
implementation on his forwarding setup.

So I'd like reviews and oks before improving stuff in tree.

> Index: amd64/amd64/mutex.c
> ===
> RCS file: amd64/amd64/mutex.c
> diff -N amd64/amd64/mutex.c
> --- /dev/null 1 Jan 1970 00:00:00 -
> +++ amd64/amd64/mutex.c   14 Dec 2017 14:49:59 -
> @@ -0,0 +1,151 @@
> +/*   $OpenBSD: mutex.c,v 1.19 2017/09/11 09:52:15 mpi Exp $  */
> +
> +/*
> + * Copyright (c) 2004 Artur Grabowski 
> + * All rights reserved. 
> + *
> + * Redistribution and use in source and binary forms, with or without 
> + * modification, are permitted provided that the following conditions 
> + * are met: 
> + *
> + * 1. Redistributions of source code must retain the above copyright 
> + *notice, this list of conditions and the following disclaimer. 
> + * 2. The name of the author may not be used to endorse or promote products
> + *derived from this software without specific prior written permission. 
> + *
> + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
> + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
> + * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
> + * THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
> + * EXEMPLARY, OR CONSEQUENTIAL  DAMAGES (INCLUDING, BUT NOT LIMITED TO,
> + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
> + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
> + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
> + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
> + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
> + */
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +#include 
> +
> +#include 
> +
> +void
> +__mtx_init(struct mutex *mtx, int wantipl)
> +{
> + mtx->mtx_owner = NULL;
> + mtx->mtx_wantipl = wantipl;
> + mtx->mtx_oldipl = IPL_NONE;
> +}
> +
> +#ifdef MULTIPROCESSOR
> +#ifdef MP_LOCKDEBUG
> +#ifndef DDB
> +#error "MP_LOCKDEBUG requires DDB"
> +#endif
> +
> +/* CPU-dependent timing, needs this to be settable from ddb. */
> +extern int __mp_lock_spinout;
> +#endif
> +
> +void
> +__mtx_enter(struct mutex *mtx)
> +{
> +#ifdef MP_LOCKDEBUG
> + int nticks = __mp_lock_spinout;
> +#endif
> +
> + while (__mtx_enter_try(mtx) == 0) {
> + CPU_BUSY_CYCLE();
> +
> +#ifdef MP_LOCKDEBUG
> + if (--nticks == 0) {
> + db_printf("%s: %p lock spun out", __func__, mtx);
> + db_enter();
> + nticks = __mp_lock_spinout;
> + }
> +#endif
> + }
> +}
> +
> +int
> +__mtx_enter_try(struct mutex *mtx)
> +{
> + struct cpu_info *owner, *ci = curcpu();
> + int s;
> +
> + if (mtx->mtx_wantipl != IPL_NONE)
> + s = splraise(mtx->mtx_wantipl);
> +
> + owner = atomic_cas_ptr(&mtx->mtx_owner, NULL, ci);
> +#ifdef DIAGNOSTIC
> + if (__predict_false(owner == ci))
> + panic("mtx %p: locking against myself", mtx);
> +#endif
> + if (owner == NULL) {
> + membar_enter_after_atomic();
> + if (mtx->mtx_wantipl != IPL_NONE)
> + mtx->mtx_oldipl = s;
> +#ifdef DIAGNOSTIC
> + ci->ci_mutex_level++;
> +#endif
> + return (1);
> + }
> +
> + if (mtx->mtx_wantipl != IPL_NONE)
> + splx(s);
> +
> + return (0);
> +}
> +#else
> +void
> +__mtx_enter(struct mutex *mtx)
> +{
> + struct cpu_info *ci = curcpu();
> +
> +#ifdef DIAGNOSTIC
> + if (__predict_false(mtx->mtx_owner == ci))
> + panic("mtx %p: locking against myself", mtx);
> +#endif
> +
> + if (mtx->mtx_wantipl != IPL_NONE)
> + mtx->mtx_oldipl = splraise(mtx->mtx_wantipl);
> +
> + mtx->mtx_owner = ci;
> +
> +#ifdef DIAGNOSTIC
> + ci->ci_mutex_level++;
> +#endif
> +}
> +
> +int
> +__mtx_enter_try(struct mutex *mtx)
> +{
> + __mtx_enter(mtx);
> + return (1);
> +}
> +#endif
> +
> +void
> +__mtx_leave(struct mutex *mtx)
> +{
> + int s;
> +
> + MUTEX_ASSERT_LOCKED(mtx);
> +
> +#ifdef DIAGNOSTIC
> + curcpu()->ci_mutex_level--;
> +#endif
> +
> + s = mtx->mtx_oldipl;
> +#ifdef MULTIPROCESSOR
> + membar_exit_before_atomic();
> +#endif
> + mtx->mtx_owner = NULL;
> + if (mtx->mtx_wantipl != IPL_NONE)
> + splx(s);
> +}
> Index: amd64/conf/files.amd64
> 

Re: C mutex impl. for x86

2017-12-15 Thread Mateusz Guzik
On Thu, Dec 14, 2017 at 04:06:41PM +0100, Martin Pieuchot wrote:
> Diff below moves amd64 and i386 mutex to the common C implementation.
>
> The differences are:
>   - membar_enter_after_atomic(9) instead of membar_enter(9), and
>   - membar_exit_before_atomic(9) instead of membar_exit(9)
>
> I'd appreciate any performance test to know if the performance
> degradation is acceptable with these barriers.
>

These are only compiler barriers and there should be effectively no
difference.
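
To illustrate (a sketch only, not the actual OpenBSD definition): on
amd64/i386 the locked RMW instruction already orders the surrounding
accesses, so a membar of this kind only has to stop compiler reordering,
e.g.:

/* Sketch: a pure compiler barrier, which is all x86 needs here. */
#define demo_membar_compiler_only()	__asm volatile("" ::: "memory")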

However, contended behaviour is a regression compared to the asm
variant.

From what I gather this is a step towards unifying all mutex
implementations, hence the membar_* use.

> +void
> +__mtx_enter(struct mutex *mtx)
> +{
> +#ifdef MP_LOCKDEBUG
> + int nticks = __mp_lock_spinout;
> +#endif
> +
> + while (__mtx_enter_try(mtx) == 0) {
> + CPU_BUSY_CYCLE();

So this is effectively __mtx_enter_try with a single pause in between.

> +}
> +
> +int
> +__mtx_enter_try(struct mutex *mtx)
> +{
> + struct cpu_info *owner, *ci = curcpu();
> + int s;
> +
> + if (mtx->mtx_wantipl != IPL_NONE)
> + s = splraise(mtx->mtx_wantipl);
> +

This is at least one read.

> + owner = atomic_cas_ptr(&mtx->mtx_owner, NULL, ci);
> +#ifdef DIAGNOSTIC
> + if (__predict_false(owner == ci))
> + panic("mtx %p: locking against myself", mtx);
> +#endif
> + if (owner == NULL) {
> + membar_enter_after_atomic();
> + if (mtx->mtx_wantipl != IPL_NONE)
> + mtx->mtx_oldipl = s;

This repeats the read done earlier.

Since the caller loops, this is effectively a very inefficient lock of
the form:
while (!atomic_cas_ptr(...))
CPU_BUSY_CYCLE();

+ some reads in-between

Assembly code would spin waiting for the lock to become free before
playing with the spl level and attempting to lock. I don't know how
contended your mutexes are right now, but this will have to be changed
very quickly, at least to match the current asm behaviour, as more of
the kernel gets unlocked. Going for ticket locks or backoff later will
probably work well enough for the foreseeable future and will postpone
the need to invest in anything fancy.
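
(For illustration only, "backoff" could be as simple as the sketch below;
the helper name and the cap are made up and nothing like it is part of the
proposed diff.)

static inline void
mtx_spin_backoff(struct mutex *mtx)
{
	int i, backoff = 1;

	/* Wait for the lock to look free, pausing longer after each check. */
	while (mtx->mtx_owner != NULL) {
		for (i = 0; i < backoff; i++)
			CPU_BUSY_CYCLE();
		if (backoff < 1024)	/* arbitrary cap */
			backoff <<= 1;
	}
}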

That said, proposed restructure is as follows (pseudo-code, no debug):

void
__mtx_enter(struct mutex *mtx)
{
	struct cpu_info *owner, *ci = curcpu();
	int want_ipl, s;

	/* mark mtx_wantipl as volatile or add a read cast through
	 * one to force a read *here* and no re-reads later */
	want_ipl = mtx->mtx_wantipl;

	for (;;) {
		if (want_ipl != IPL_NONE)
			s = splraise(want_ipl);
		owner = atomic_cas_ptr(&mtx->mtx_owner, NULL, ci);
		if (owner == NULL) {
			membar_enter_after_atomic();
			if (want_ipl != IPL_NONE)
				mtx->mtx_oldipl = s;
			break;
		}

		if (want_ipl != IPL_NONE)
			splx(s);

		do {
			CPU_BUSY_CYCLE();
		} while (mtx->mtx_owner != NULL);
	}
}

I don't think partial duplication of try_enter can be avoided without
serious ugliness.

> +void
> +__mtx_leave(struct mutex *mtx)
> +{
> + int s;
> +
> + MUTEX_ASSERT_LOCKED(mtx);
> +
> +#ifdef DIAGNOSTIC
> + curcpu()->ci_mutex_level--;
> +#endif
> +
> + s = mtx->mtx_oldipl;
> +#ifdef MULTIPROCESSOR
> + membar_exit_before_atomic();
> +#endif
> + mtx->mtx_owner = NULL;
> + if (mtx->mtx_wantipl != IPL_NONE)
> + splx(s);
> +}

It should be somewhat faster to read mtx_wantipl while the lock is still
held - it is less likely that someone else dirtied the cacheline. Then
mtx_oldipl can be only conditionally read.
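
Concretely, the leave path could be reshuffled along these lines (a sketch
of the suggestion, not a tested diff):

void
__mtx_leave(struct mutex *mtx)
{
	int wantipl, s;

	MUTEX_ASSERT_LOCKED(mtx);

#ifdef DIAGNOSTIC
	curcpu()->ci_mutex_level--;
#endif

	/* Read both fields while we still own the lock (and its cacheline). */
	wantipl = mtx->mtx_wantipl;
	if (wantipl != IPL_NONE)
		s = mtx->mtx_oldipl;

#ifdef MULTIPROCESSOR
	membar_exit_before_atomic();
#endif
	mtx->mtx_owner = NULL;

	if (wantipl != IPL_NONE)
		splx(s);
}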

This function does not do atomic ops, so membar_exit_before_atomic does
not really fit, even though it happens to do the exact same thing the
"correctly" named primitive would - just provide a compiler barrier on
amd64/i386.

From what I gather you are trying to mimic illumos nomenclature, but
they don't have an equivalent afaics. (perhaps Solaris grew one in the
meantime?)

In FreeBSD an appropriate routine is named atomic_thread_fence_rel (see
amd64/include/atomic.h) and I suggest just borrowing the api.
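
With such a name borrowed, the release in __mtx_leave would read as below
(atomic_thread_fence_rel is hypothetical on OpenBSD; this only shows the
naming):

	s = mtx->mtx_oldipl;
#ifdef MULTIPROCESSOR
	atomic_thread_fence_rel();	/* release fence before the plain store */
#endif
	mtx->mtx_owner = NULL;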

A side note: you could probably shrink the ipl vars to make the lock
smaller. It may even be doable to fit everything into the lock word,
but I don't know how much sense playing with it would make.
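
For the record, the shrinking would amount to something like the layout
below (purely hypothetical; the field widths would need checking against
the largest IPL value):

struct mutex_packed {
	volatile void	*mtx_owner;
	uint8_t		 mtx_wantipl;	/* IPL levels are small integers */
	uint8_t		 mtx_oldipl;
};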

-- 
Mateusz Guzik 


C mutex impl. for x86

2017-12-14 Thread Martin Pieuchot
Diff below moves amd64 and i386 mutex to the common C implementation.

The differences are:
  - membar_enter_after_atomic(9) instead of membar_enter(9), and
  - membar_exit_before_atomic(9) instead of membar_exit(9)

I'd appreciate any performance test to know if the performance
degradation is acceptable with these barriers.

Index: amd64/amd64/mutex.c
===
RCS file: amd64/amd64/mutex.c
diff -N amd64/amd64/mutex.c
--- /dev/null   1 Jan 1970 00:00:00 -
+++ amd64/amd64/mutex.c 14 Dec 2017 14:49:59 -
@@ -0,0 +1,151 @@
+/* $OpenBSD: mutex.c,v 1.19 2017/09/11 09:52:15 mpi Exp $  */
+
+/*
+ * Copyright (c) 2004 Artur Grabowski 
+ * All rights reserved. 
+ *
+ * Redistribution and use in source and binary forms, with or without 
+ * modification, are permitted provided that the following conditions 
+ * are met: 
+ *
+ * 1. Redistributions of source code must retain the above copyright 
+ *notice, this list of conditions and the following disclaimer. 
+ * 2. The name of the author may not be used to endorse or promote products
+ *derived from this software without specific prior written permission. 
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+ * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
+ * THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL  DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+#include 
+
+void
+__mtx_init(struct mutex *mtx, int wantipl)
+{
+   mtx->mtx_owner = NULL;
+   mtx->mtx_wantipl = wantipl;
+   mtx->mtx_oldipl = IPL_NONE;
+}
+
+#ifdef MULTIPROCESSOR
+#ifdef MP_LOCKDEBUG
+#ifndef DDB
+#error "MP_LOCKDEBUG requires DDB"
+#endif
+
+/* CPU-dependent timing, needs this to be settable from ddb. */
+extern int __mp_lock_spinout;
+#endif
+
+void
+__mtx_enter(struct mutex *mtx)
+{
+#ifdef MP_LOCKDEBUG
+   int nticks = __mp_lock_spinout;
+#endif
+
+   while (__mtx_enter_try(mtx) == 0) {
+   CPU_BUSY_CYCLE();
+
+#ifdef MP_LOCKDEBUG
+   if (--nticks == 0) {
+   db_printf("%s: %p lock spun out", __func__, mtx);
+   db_enter();
+   nticks = __mp_lock_spinout;
+   }
+#endif
+   }
+}
+
+int
+__mtx_enter_try(struct mutex *mtx)
+{
+   struct cpu_info *owner, *ci = curcpu();
+   int s;
+
+   if (mtx->mtx_wantipl != IPL_NONE)
+   s = splraise(mtx->mtx_wantipl);
+
+   owner = atomic_cas_ptr(&mtx->mtx_owner, NULL, ci);
+#ifdef DIAGNOSTIC
+   if (__predict_false(owner == ci))
+   panic("mtx %p: locking against myself", mtx);
+#endif
+   if (owner == NULL) {
+   membar_enter_after_atomic();
+   if (mtx->mtx_wantipl != IPL_NONE)
+   mtx->mtx_oldipl = s;
+#ifdef DIAGNOSTIC
+   ci->ci_mutex_level++;
+#endif
+   return (1);
+   }
+
+   if (mtx->mtx_wantipl != IPL_NONE)
+   splx(s);
+
+   return (0);
+}
+#else
+void
+__mtx_enter(struct mutex *mtx)
+{
+   struct cpu_info *ci = curcpu();
+
+#ifdef DIAGNOSTIC
+   if (__predict_false(mtx->mtx_owner == ci))
+   panic("mtx %p: locking against myself", mtx);
+#endif
+
+   if (mtx->mtx_wantipl != IPL_NONE)
+   mtx->mtx_oldipl = splraise(mtx->mtx_wantipl);
+
+   mtx->mtx_owner = ci;
+
+#ifdef DIAGNOSTIC
+   ci->ci_mutex_level++;
+#endif
+}
+
+int
+__mtx_enter_try(struct mutex *mtx)
+{
+   __mtx_enter(mtx);
+   return (1);
+}
+#endif
+
+void
+__mtx_leave(struct mutex *mtx)
+{
+   int s;
+
+   MUTEX_ASSERT_LOCKED(mtx);
+
+#ifdef DIAGNOSTIC
+   curcpu()->ci_mutex_level--;
+#endif
+
+   s = mtx->mtx_oldipl;
+#ifdef MULTIPROCESSOR
+   membar_exit_before_atomic();
+#endif
+   mtx->mtx_owner = NULL;
+   if (mtx->mtx_wantipl != IPL_NONE)
+   splx(s);
+}
Index: amd64/conf/files.amd64
===
RCS file: /cvs/src/sys/arch/amd64/conf/files.amd64,v
retrieving revision 1.91
diff -u -p -r1.91 files.amd64
--- amd64/conf/files.amd64  17 Oct 2017 14:25:35 -  1.91
+++ amd64/conf/files.amd64  14 Dec 2017 14:51:20 -
@@ -28,7 +28,7 @@ file  arch/amd64/amd64/fpu.c
 file   arch/amd64/amd64/softintr.c
 file   arch/amd64/amd64/i8259.c
 file   arch/amd64/amd64/cacheinfo.c
-file