Re: C mutex impl. for x86
On Wed, Dec 20, 2017 at 12:17:27PM +0100, Martin Pieuchot wrote: > On 15/12/17(Fri) 22:03, Mateusz Guzik wrote: > > > +void > > > +__mtx_enter(struct mutex *mtx) > > > +{ > > > +#ifdef MP_LOCKDEBUG > > > + int nticks = __mp_lock_spinout; > > > +#endif > > > + > > > + while (__mtx_enter_try(mtx) == 0) { > > > + CPU_BUSY_CYCLE(); > > > > So this is effectively __mtx_enter_try with single pause in-between. > > > > > +} > > > + > > > +int > > > +__mtx_enter_try(struct mutex *mtx) > > > +{ > > > + struct cpu_info *owner, *ci = curcpu(); > > > + int s; > > > + > > > + if (mtx->mtx_wantipl != IPL_NONE) > > > + s = splraise(mtx->mtx_wantipl); > > > + > > > > This is at least one read. > > > > > + owner = atomic_cas_ptr(&mtx->mtx_owner, NULL, ci); > > > +#ifdef DIAGNOSTIC > > > + if (__predict_false(owner == ci)) > > > + panic("mtx %p: locking against myself", mtx); > > > +#endif > > > + if (owner == NULL) { > > > + membar_enter_after_atomic(); > > > + if (mtx->mtx_wantipl != IPL_NONE) > > > + mtx->mtx_oldipl = s; > > > > This repeats the read done earlier. > > > > Since the caller loops, this is effectively a very inefficient lock of > > the form: > > while (!atomic_cas_ptr(...)) > > CPU_BUSY_CYCLE(); > > > > + some reads in-between > > So how about the patch below? It booted fine and I did a make -j 8 of the kernel build with it. Unfortunately the vm in question is running in too volatile an environment for any kind of performance testing atm. Impact should be minor anyway. Note this is still a super basic lock with giant room for improvement. I'm not committing to any kind of work in the area, but I may contribute other code in the future if stuff of this sort is fine with you. 
diff --git a/sys/kern/kern_lock.c b/sys/kern/kern_lock.c index 64c5a1f6319..5c14f2b6720 100644 --- a/sys/kern/kern_lock.c +++ b/sys/kern/kern_lock.c @@ -253,34 +253,14 @@ __mtx_init(struct mutex *mtx, int wantipl) } #ifdef MULTIPROCESSOR -void -__mtx_enter(struct mutex *mtx) -{ -#ifdef MP_LOCKDEBUG - int nticks = __mp_lock_spinout; -#endif - - while (__mtx_enter_try(mtx) == 0) { - CPU_BUSY_CYCLE(); - -#ifdef MP_LOCKDEBUG - int nticks = __mp_lock_spinout; -#endif - - while (__mtx_enter_try(mtx) == 0) { - CPU_BUSY_CYCLE(); - -#ifdef MP_LOCKDEBUG - if (--nticks == 0) { - db_printf("%s: %p lock spun out", __func__, mtx); - db_enter(); - nticks = __mp_lock_spinout; - } -#endif - } -} - int -__mtx_enter_try(struct mutex *mtx) +__mtx_enter_common(struct mutex *mtx, int wantipl, struct cpu_info *ci) { - struct cpu_info *owner, *ci = curcpu(); + struct cpu_info *owner; int s; - if (mtx->mtx_wantipl != IPL_NONE) - s = splraise(mtx->mtx_wantipl); + if (wantipl != IPL_NONE) + s = splraise(wantipl); owner = atomic_cas_ptr(>mtx_owner, NULL, ci); #ifdef DIAGNOSTIC @@ -289,7 +269,7 @@ __mtx_enter_try(struct mutex *mtx) #endif if (owner == NULL) { membar_enter_after_atomic(); - if (mtx->mtx_wantipl != IPL_NONE) + if (wantipl != IPL_NONE) mtx->mtx_oldipl = s; #ifdef DIAGNOSTIC ci->ci_mutex_level++; @@ -297,11 +277,49 @@ __mtx_enter_try(struct mutex *mtx) return (1); } - if (mtx->mtx_wantipl != IPL_NONE) + if (wantipl != IPL_NONE) splx(s); return (0); } + +void +__mtx_enter(struct mutex *mtx) +{ + struct cpu_info *ci = curcpu(); + int wantipl = mtx->mtx_wantipl; +#ifdef MP_LOCKDEBUG + int nticks = __mp_lock_spinout; +#endif + + for (;;) { + if (__mtx_enter_common(mtx, wantipl, ci)) + break; + + for (;;) { +#ifdef MP_LOCKDEBUG + if (--nticks == 0) { + db_printf("%s: %p lock spun out", __func__, mtx); + db_enter(); + nticks = __mp_lock_spinout; + } +#endif + CPU_BUSY_CYCLE(); + if (mtx->mtx_owner == NULL) + break; + } + } +} + +int +__mtx_enter_try(struct mutex *mtx) +{ + struct 
cpu_info *ci = curcpu(); + int wantipl = mtx->mtx_wantipl; + + return (__mtx_enter_common(mtx, wantipl, ci)); +} + #else void __mtx_enter(struct mutex *mtx) -- Mateusz Guzik
Re: C mutex impl. for x86
On 15/12/17(Fri) 22:03, Mateusz Guzik wrote: > [...] > However, contended behaviour is a regression compared to the asm > variant. Now that I checked the files in could you generate a diff with your suggestions? > From what I gather this is a step towards unifying all mutex > implementations, hence the membar_* use. Exactly. > > +void > > +__mtx_enter(struct mutex *mtx) > > +{ > > +#ifdef MP_LOCKDEBUG > > + int nticks = __mp_lock_spinout; > > +#endif > > + > > + while (__mtx_enter_try(mtx) == 0) { > > + CPU_BUSY_CYCLE(); > > So this is effectively __mtx_enter_try with single pause in-between. > > > +} > > + > > +int > > +__mtx_enter_try(struct mutex *mtx) > > +{ > > + struct cpu_info *owner, *ci = curcpu(); > > + int s; > > + > > + if (mtx->mtx_wantipl != IPL_NONE) > > + s = splraise(mtx->mtx_wantipl); > > + > > This is at least one read. > > > + owner = atomic_cas_ptr(>mtx_owner, NULL, ci); > > +#ifdef DIAGNOSTIC > > + if (__predict_false(owner == ci)) > > + panic("mtx %p: locking against myself", mtx); > > +#endif > > + if (owner == NULL) { > > + membar_enter_after_atomic(); > > + if (mtx->mtx_wantipl != IPL_NONE) > > + mtx->mtx_oldipl = s; > > This repeats the read done earlier. > > Since the caller loops, this is effectively a very inefficient lock of > the form: > while (!atomic_cas_ptr(...)) > CPU_BUSY_CYCLE(); > > + some reads in-between > > Assembly code would spin waiting for the lock to become free before > playing with spl level and attempting to lock. I don't know how > contended your mutexes are right now, but this will have to be very > quickly changed at least to current asm behaviour as more of the kernel > gets unlocked. Going for ticket locks or backoff later will probably > work fine enough for the foreseeable future and will postpone the need > to invest into anything fancy. 
> > That said, proposed restructure is as follows (pseudo-code, no debug): > > void > __mtx_enter(struct mutex *mtx) > { > int want_ipl, s; > > /* mark mtx_wantipl as volatile or add a read casted through > * one to force a read *here* and no re-reads later */ > want_ipl = mtx->mtx_wantipl; > > for (;;) { > if (want_ipl != IPL_NONE) > s = splraise(want_ipl); > owner = atomic_cas_ptr(>mtx_owner, NULL, ci); > if (owner == NULL) { > membar_enter_after_atomic(); > if (want_ipl != IPL_NONE) > mtx->mtx_oldipl = s; > break; > } > > if (want_ipl != IPL_NONE) > splx(s); > > do { > CPU_BUSY_CYCLE(); > } while (mtx->mtx_owner != NULL); > } > } > > I don't think partial duplication of try_enter can be avoided without > serious ugliness. Duplication is not a problem since we're going to have a single MI file for all (most?) archs. > > +void > > +__mtx_leave(struct mutex *mtx) > > +{ > > + int s; > > + > > + MUTEX_ASSERT_LOCKED(mtx); > > + > > +#ifdef DIAGNOSTIC > > + curcpu()->ci_mutex_level--; > > +#endif > > + > > + s = mtx->mtx_oldipl; > > +#ifdef MULTIPROCESSOR > > + membar_exit_before_atomic(); > > +#endif > > + mtx->mtx_owner = NULL; > > + if (mtx->mtx_wantipl != IPL_NONE) > > + splx(s); > > +} > > It should be somewhat faster to read mtx_wantipl while the lock is still > held - it is less likely that someone else dirtied the cacheline. Then > mtx_oldipl can be only conditionally read. > > This function does not do atomic ops, so membar_exit_before_atomic does > not really fit, even though it happens to do the exact same thing the > "correctly" named primitive would - just provide a compiler barrier on > amd64/i386. > > From what I gather you are trying to mimick illumos nomenclature, but > they don't have an equivalent afaics. (perhaps Solaris grew one in the > meantime?) > > In FreeBSD an appropriate routine is named atomic_thread_fence_rel (see > amd64/include/atomic.h) and I suggest just borrowing the api. 
API changes are subject to bikeshedding so I'd suggest we concentrate on the nice optimizations/improvements you're pointing out. > Side note is that you probably can shorten ipl to vars to make the lock > smaller. It can be doable to fit it into lock word, but I don't know how > much > sense would playing with it make. One reason to have MI locks is also to make debugging easier. I don't know if shrinking the size of the structure goes in the same direction. Time will tell.
Re: C mutex impl. for x86
> Date: Mon, 18 Dec 2017 10:08:23 +0100 > From: Martin Pieuchot> > On 14/12/17(Thu) 16:06, Martin Pieuchot wrote: > > Diff below moves amd64 and i386 mutex to the common C implementation. > > > > The differences are: > > - membar_enter_after_atomic(9) instead of membar_enter(9), and > > - membar_exit_before_atomic(9) instead of membar_exit(9) > > > > I'd appreciate any performance test to know if the performance > > degradation is acceptable with these barriers. > > Hrvoje Popovski confirmed there's not performance regression with this > implementation on his forwarding setup. > > So I'd like reviews and oks before improving stuff in tree. The ultimate goal is still to make this MI-code isn't it? ok kettenis@ > > Index: amd64/amd64/mutex.c > > === > > RCS file: amd64/amd64/mutex.c > > diff -N amd64/amd64/mutex.c > > --- /dev/null 1 Jan 1970 00:00:00 - > > +++ amd64/amd64/mutex.c 14 Dec 2017 14:49:59 - > > @@ -0,0 +1,151 @@ > > +/* $OpenBSD: mutex.c,v 1.19 2017/09/11 09:52:15 mpi Exp $ */ > > + > > +/* > > + * Copyright (c) 2004 Artur Grabowski > > + * All rights reserved. > > + * > > + * Redistribution and use in source and binary forms, with or without > > + * modification, are permitted provided that the following conditions > > + * are met: > > + * > > + * 1. Redistributions of source code must retain the above copyright > > + *notice, this list of conditions and the following disclaimer. > > + * 2. The name of the author may not be used to endorse or promote products > > + *derived from this software without specific prior written > > permission. > > + * > > + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED > > WARRANTIES, > > + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY > > + * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL > > + * THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, > > + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, > > + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR > > PROFITS; > > + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, > > + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR > > + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF > > + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. > > + */ > > + > > +#include > > +#include > > +#include > > +#include > > + > > +#include > > + > > +#include > > + > > +void > > +__mtx_init(struct mutex *mtx, int wantipl) > > +{ > > + mtx->mtx_owner = NULL; > > + mtx->mtx_wantipl = wantipl; > > + mtx->mtx_oldipl = IPL_NONE; > > +} > > + > > +#ifdef MULTIPROCESSOR > > +#ifdef MP_LOCKDEBUG > > +#ifndef DDB > > +#error "MP_LOCKDEBUG requires DDB" > > +#endif > > + > > +/* CPU-dependent timing, needs this to be settable from ddb. 
*/ > > +extern int __mp_lock_spinout; > > +#endif > > + > > +void > > +__mtx_enter(struct mutex *mtx) > > +{ > > +#ifdef MP_LOCKDEBUG > > + int nticks = __mp_lock_spinout; > > +#endif > > + > > + while (__mtx_enter_try(mtx) == 0) { > > + CPU_BUSY_CYCLE(); > > + > > +#ifdef MP_LOCKDEBUG > > + if (--nticks == 0) { > > + db_printf("%s: %p lock spun out", __func__, mtx); > > + db_enter(); > > + nticks = __mp_lock_spinout; > > + } > > +#endif > > + } > > +} > > + > > +int > > +__mtx_enter_try(struct mutex *mtx) > > +{ > > + struct cpu_info *owner, *ci = curcpu(); > > + int s; > > + > > + if (mtx->mtx_wantipl != IPL_NONE) > > + s = splraise(mtx->mtx_wantipl); > > + > > + owner = atomic_cas_ptr(>mtx_owner, NULL, ci); > > +#ifdef DIAGNOSTIC > > + if (__predict_false(owner == ci)) > > + panic("mtx %p: locking against myself", mtx); > > +#endif > > + if (owner == NULL) { > > + membar_enter_after_atomic(); > > + if (mtx->mtx_wantipl != IPL_NONE) > > + mtx->mtx_oldipl = s; > > +#ifdef DIAGNOSTIC > > + ci->ci_mutex_level++; > > +#endif > > + return (1); > > + } > > + > > + if (mtx->mtx_wantipl != IPL_NONE) > > + splx(s); > > + > > + return (0); > > +} > > +#else > > +void > > +__mtx_enter(struct mutex *mtx) > > +{ > > + struct cpu_info *ci = curcpu(); > > + > > +#ifdef DIAGNOSTIC > > + if (__predict_false(mtx->mtx_owner == ci)) > > + panic("mtx %p: locking against myself", mtx); > > +#endif > > + > > + if (mtx->mtx_wantipl != IPL_NONE) > > + mtx->mtx_oldipl = splraise(mtx->mtx_wantipl); > > + > > + mtx->mtx_owner = ci; > > + > > +#ifdef DIAGNOSTIC > > + ci->ci_mutex_level++; > > +#endif > > +} > > + > > +int > > +__mtx_enter_try(struct mutex *mtx) > > +{ > > + __mtx_enter(mtx); > > + return (1); > > +} > > +#endif > > + > > +void > >
Re: C mutex impl. for x86
On 14/12/17(Thu) 16:06, Martin Pieuchot wrote: > Diff below moves amd64 and i386 mutex to the common C implementation. > > The differences are: > - membar_enter_after_atomic(9) instead of membar_enter(9), and > - membar_exit_before_atomic(9) instead of membar_exit(9) > > I'd appreciate any performance test to know if the performance > degradation is acceptable with these barriers. Hrvoje Popovski confirmed there's not performance regression with this implementation on his forwarding setup. So I'd like reviews and oks before improving stuff in tree. > Index: amd64/amd64/mutex.c > === > RCS file: amd64/amd64/mutex.c > diff -N amd64/amd64/mutex.c > --- /dev/null 1 Jan 1970 00:00:00 - > +++ amd64/amd64/mutex.c 14 Dec 2017 14:49:59 - > @@ -0,0 +1,151 @@ > +/* $OpenBSD: mutex.c,v 1.19 2017/09/11 09:52:15 mpi Exp $ */ > + > +/* > + * Copyright (c) 2004 Artur Grabowski> + * All rights reserved. > + * > + * Redistribution and use in source and binary forms, with or without > + * modification, are permitted provided that the following conditions > + * are met: > + * > + * 1. Redistributions of source code must retain the above copyright > + *notice, this list of conditions and the following disclaimer. > + * 2. The name of the author may not be used to endorse or promote products > + *derived from this software without specific prior written permission. > + * > + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, > + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY > + * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL > + * THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, > + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, > + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR > PROFITS; > + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, > + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR > + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF > + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. > + */ > + > +#include > +#include > +#include > +#include > + > +#include > + > +#include > + > +void > +__mtx_init(struct mutex *mtx, int wantipl) > +{ > + mtx->mtx_owner = NULL; > + mtx->mtx_wantipl = wantipl; > + mtx->mtx_oldipl = IPL_NONE; > +} > + > +#ifdef MULTIPROCESSOR > +#ifdef MP_LOCKDEBUG > +#ifndef DDB > +#error "MP_LOCKDEBUG requires DDB" > +#endif > + > +/* CPU-dependent timing, needs this to be settable from ddb. */ > +extern int __mp_lock_spinout; > +#endif > + > +void > +__mtx_enter(struct mutex *mtx) > +{ > +#ifdef MP_LOCKDEBUG > + int nticks = __mp_lock_spinout; > +#endif > + > + while (__mtx_enter_try(mtx) == 0) { > + CPU_BUSY_CYCLE(); > + > +#ifdef MP_LOCKDEBUG > + if (--nticks == 0) { > + db_printf("%s: %p lock spun out", __func__, mtx); > + db_enter(); > + nticks = __mp_lock_spinout; > + } > +#endif > + } > +} > + > +int > +__mtx_enter_try(struct mutex *mtx) > +{ > + struct cpu_info *owner, *ci = curcpu(); > + int s; > + > + if (mtx->mtx_wantipl != IPL_NONE) > + s = splraise(mtx->mtx_wantipl); > + > + owner = atomic_cas_ptr(>mtx_owner, NULL, ci); > +#ifdef DIAGNOSTIC > + if (__predict_false(owner == ci)) > + panic("mtx %p: locking against myself", mtx); > +#endif > + if (owner == NULL) { > + membar_enter_after_atomic(); > + if (mtx->mtx_wantipl != IPL_NONE) > + mtx->mtx_oldipl = s; > +#ifdef DIAGNOSTIC > + ci->ci_mutex_level++; > +#endif > + return (1); > + } > + > + if (mtx->mtx_wantipl != IPL_NONE) > + splx(s); > + > 
+ return (0); > +} > +#else > +void > +__mtx_enter(struct mutex *mtx) > +{ > + struct cpu_info *ci = curcpu(); > + > +#ifdef DIAGNOSTIC > + if (__predict_false(mtx->mtx_owner == ci)) > + panic("mtx %p: locking against myself", mtx); > +#endif > + > + if (mtx->mtx_wantipl != IPL_NONE) > + mtx->mtx_oldipl = splraise(mtx->mtx_wantipl); > + > + mtx->mtx_owner = ci; > + > +#ifdef DIAGNOSTIC > + ci->ci_mutex_level++; > +#endif > +} > + > +int > +__mtx_enter_try(struct mutex *mtx) > +{ > + __mtx_enter(mtx); > + return (1); > +} > +#endif > + > +void > +__mtx_leave(struct mutex *mtx) > +{ > + int s; > + > + MUTEX_ASSERT_LOCKED(mtx); > + > +#ifdef DIAGNOSTIC > + curcpu()->ci_mutex_level--; > +#endif > + > + s = mtx->mtx_oldipl; > +#ifdef MULTIPROCESSOR > + membar_exit_before_atomic(); > +#endif > + mtx->mtx_owner = NULL; > + if (mtx->mtx_wantipl != IPL_NONE) > + splx(s); > +} > Index: amd64/conf/files.amd64 >
Re: C mutex impl. for x86
On Thu, Dec 14, 2017 at 04:06:41PM +0100, Martin Pieuchot wrote: > Diff below moves amd64 and i386 mutex to the common C implementation. > > The differences are: > - membar_enter_after_atomic(9) instead of membar_enter(9), and > - membar_exit_before_atomic(9) instead of membar_exit(9) > > I'd appreciate any performance test to know if the performance > degradation is acceptable with these barriers. > These are only compiler barriers and there should be effectively no difference. However, contended behaviour is a regression compared to the asm variant. >From what I gather this is a step towards unifying all mutex implementations, hence the membar_* use. > +void > +__mtx_enter(struct mutex *mtx) > +{ > +#ifdef MP_LOCKDEBUG > + int nticks = __mp_lock_spinout; > +#endif > + > + while (__mtx_enter_try(mtx) == 0) { > + CPU_BUSY_CYCLE(); So this is effectively __mtx_enter_try with single pause in-between. > +} > + > +int > +__mtx_enter_try(struct mutex *mtx) > +{ > + struct cpu_info *owner, *ci = curcpu(); > + int s; > + > + if (mtx->mtx_wantipl != IPL_NONE) > + s = splraise(mtx->mtx_wantipl); > + This is at least one read. > + owner = atomic_cas_ptr(>mtx_owner, NULL, ci); > +#ifdef DIAGNOSTIC > + if (__predict_false(owner == ci)) > + panic("mtx %p: locking against myself", mtx); > +#endif > + if (owner == NULL) { > + membar_enter_after_atomic(); > + if (mtx->mtx_wantipl != IPL_NONE) > + mtx->mtx_oldipl = s; This repeats the read done earlier. Since the caller loops, this is effectively a very inefficient lock of the form: while (!atomic_cas_ptr(...)) CPU_BUSY_CYCLE(); + some reads in-between Assembly code would spin waiting for the lock to become free before playing with spl level and attempting to lock. I don't know how contended your mutexes are right now, but this will have to be very quickly changed at least to current asm behaviour as more of the kernel gets unlocked. 
Going for ticket locks or backoff later will probably work fine enough for the foreseeable future and will postpone the need to invest into anything fancy. That said, proposed restructure is as follows (pseudo-code, no debug): void __mtx_enter(struct mutex *mtx) { int want_ipl, s; /* mark mtx_wantipl as volatile or add a read casted through * one to force a read *here* and no re-reads later */ want_ipl = mtx->mtx_wantipl; for (;;) { if (want_ipl != IPL_NONE) s = splraise(want_ipl); owner = atomic_cas_ptr(>mtx_owner, NULL, ci); if (owner == NULL) { membar_enter_after_atomic(); if (want_ipl != IPL_NONE) mtx->mtx_oldipl = s; break; } if (want_ipl != IPL_NONE) splx(s); do { CPU_BUSY_CYCLE(); } while (mtx->mtx_owner != NULL); } } I don't think partial duplication of try_enter can be avoided without serious ugliness. > +void > +__mtx_leave(struct mutex *mtx) > +{ > + int s; > + > + MUTEX_ASSERT_LOCKED(mtx); > + > +#ifdef DIAGNOSTIC > + curcpu()->ci_mutex_level--; > +#endif > + > + s = mtx->mtx_oldipl; > +#ifdef MULTIPROCESSOR > + membar_exit_before_atomic(); > +#endif > + mtx->mtx_owner = NULL; > + if (mtx->mtx_wantipl != IPL_NONE) > + splx(s); > +} It should be somewhat faster to read mtx_wantipl while the lock is still held - it is less likely that someone else dirtied the cacheline. Then mtx_oldipl can be only conditionally read. This function does not do atomic ops, so membar_exit_before_atomic does not really fit, even though it happens to do the exact same thing the "correctly" named primitive would - just provide a compiler barrier on amd64/i386. >From what I gather you are trying to mimick illumos nomenclature, but they don't have an equivalent afaics. (perhaps Solaris grew one in the meantime?) In FreeBSD an appropriate routine is named atomic_thread_fence_rel (see amd64/include/atomic.h) and I suggest just borrowing the api. Side note is that you probably can shorten ipl to vars to make the lock smaller. 
It may be doable to fit it into the lock word, but I don't know how much sense playing with it would make. -- Mateusz Guzik
C mutex impl. for x86
Diff below moves amd64 and i386 mutex to the common C implementation. The differences are: - membar_enter_after_atomic(9) instead of membar_enter(9), and - membar_exit_before_atomic(9) instead of membar_exit(9) I'd appreciate any performance test to know if the performance degradation is acceptable with these barriers. Index: amd64/amd64/mutex.c === RCS file: amd64/amd64/mutex.c diff -N amd64/amd64/mutex.c --- /dev/null 1 Jan 1970 00:00:00 - +++ amd64/amd64/mutex.c 14 Dec 2017 14:49:59 - @@ -0,0 +1,151 @@ +/* $OpenBSD: mutex.c,v 1.19 2017/09/11 09:52:15 mpi Exp $ */ + +/* + * Copyright (c) 2004 Artur Grabowski+ * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + *notice, this list of conditions and the following disclaimer. + * 2. The name of the author may not be used to endorse or promote products + *derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include +#include +#include +#include + +#include + +#include + +void +__mtx_init(struct mutex *mtx, int wantipl) +{ + mtx->mtx_owner = NULL; + mtx->mtx_wantipl = wantipl; + mtx->mtx_oldipl = IPL_NONE; +} + +#ifdef MULTIPROCESSOR +#ifdef MP_LOCKDEBUG +#ifndef DDB +#error "MP_LOCKDEBUG requires DDB" +#endif + +/* CPU-dependent timing, needs this to be settable from ddb. */ +extern int __mp_lock_spinout; +#endif + +void +__mtx_enter(struct mutex *mtx) +{ +#ifdef MP_LOCKDEBUG + int nticks = __mp_lock_spinout; +#endif + + while (__mtx_enter_try(mtx) == 0) { + CPU_BUSY_CYCLE(); + +#ifdef MP_LOCKDEBUG + if (--nticks == 0) { + db_printf("%s: %p lock spun out", __func__, mtx); + db_enter(); + nticks = __mp_lock_spinout; + } +#endif + } +} + +int +__mtx_enter_try(struct mutex *mtx) +{ + struct cpu_info *owner, *ci = curcpu(); + int s; + + if (mtx->mtx_wantipl != IPL_NONE) + s = splraise(mtx->mtx_wantipl); + + owner = atomic_cas_ptr(>mtx_owner, NULL, ci); +#ifdef DIAGNOSTIC + if (__predict_false(owner == ci)) + panic("mtx %p: locking against myself", mtx); +#endif + if (owner == NULL) { + membar_enter_after_atomic(); + if (mtx->mtx_wantipl != IPL_NONE) + mtx->mtx_oldipl = s; +#ifdef DIAGNOSTIC + ci->ci_mutex_level++; +#endif + return (1); + } + + if (mtx->mtx_wantipl != IPL_NONE) + splx(s); + + return (0); +} +#else +void +__mtx_enter(struct mutex *mtx) +{ + struct cpu_info *ci = curcpu(); + +#ifdef DIAGNOSTIC + if (__predict_false(mtx->mtx_owner == ci)) + panic("mtx %p: locking against myself", mtx); +#endif + + if (mtx->mtx_wantipl != IPL_NONE) + mtx->mtx_oldipl = splraise(mtx->mtx_wantipl); + + mtx->mtx_owner = ci; + +#ifdef DIAGNOSTIC + ci->ci_mutex_level++; +#endif +} + +int +__mtx_enter_try(struct mutex *mtx) +{ + __mtx_enter(mtx); + return (1); +} +#endif + +void +__mtx_leave(struct mutex *mtx) +{ + int s; + + MUTEX_ASSERT_LOCKED(mtx); + +#ifdef DIAGNOSTIC + curcpu()->ci_mutex_level--; +#endif + + s = mtx->mtx_oldipl; +#ifdef MULTIPROCESSOR + 
membar_exit_before_atomic(); +#endif + mtx->mtx_owner = NULL; + if (mtx->mtx_wantipl != IPL_NONE) + splx(s); +} Index: amd64/conf/files.amd64 === RCS file: /cvs/src/sys/arch/amd64/conf/files.amd64,v retrieving revision 1.91 diff -u -p -r1.91 files.amd64 --- amd64/conf/files.amd64 17 Oct 2017 14:25:35 - 1.91 +++ amd64/conf/files.amd64 14 Dec 2017 14:51:20 - @@ -28,7 +28,7 @@ file arch/amd64/amd64/fpu.c file arch/amd64/amd64/softintr.c file arch/amd64/amd64/i8259.c file arch/amd64/amd64/cacheinfo.c -file