Re: Please test: UVM fault unlocking (aka vmobjlock)

2021-11-29 Thread Martin Pieuchot
On 24/11/21(Wed) 11:16, Martin Pieuchot wrote:
> Diff below unlocks the bottom part of the UVM fault handler.  I'm
> interested in squashing the remaining bugs.  Please test with your usual
> setup & report back.

Thanks to all the testers, here's a new version that includes a bug fix.

Tests on !x86 architectures are much appreciated!

Thanks a lot,
Martin

diff --git sys/arch/amd64/conf/GENERIC.MP sys/arch/amd64/conf/GENERIC.MP
index bb842f6d96e..e5334c19eac 100644
--- sys/arch/amd64/conf/GENERIC.MP
+++ sys/arch/amd64/conf/GENERIC.MP
@@ -4,6 +4,6 @@ include "arch/amd64/conf/GENERIC"
 
 option MULTIPROCESSOR
 #optionMP_LOCKDEBUG
-#optionWITNESS
+option WITNESS
 
 cpu*   at mainbus?
diff --git sys/arch/i386/conf/GENERIC.MP sys/arch/i386/conf/GENERIC.MP
index 980a572b8fd..ef7ded61501 100644
--- sys/arch/i386/conf/GENERIC.MP
+++ sys/arch/i386/conf/GENERIC.MP
@@ -7,6 +7,6 @@ include "arch/i386/conf/GENERIC"
 
 option MULTIPROCESSOR  # Multiple processor support
 #optionMP_LOCKDEBUG
-#optionWITNESS
+option WITNESS
 
 cpu*   at mainbus?
diff --git sys/dev/pci/drm/i915/gem/i915_gem_shmem.c 
sys/dev/pci/drm/i915/gem/i915_gem_shmem.c
index ce8e2eca141..47b567087e7 100644
--- sys/dev/pci/drm/i915/gem/i915_gem_shmem.c
+++ sys/dev/pci/drm/i915/gem/i915_gem_shmem.c
@@ -268,8 +268,10 @@ shmem_truncate(struct drm_i915_gem_object *obj)
 #ifdef __linux__
shmem_truncate_range(file_inode(obj->base.filp), 0, (loff_t)-1);
 #else
+   rw_enter(obj->base.uao->vmobjlock, RW_WRITE);
obj->base.uao->pgops->pgo_flush(obj->base.uao, 0, obj->base.size,
PGO_ALLPAGES | PGO_FREE);
+   rw_exit(obj->base.uao->vmobjlock);
 #endif
obj->mm.madv = __I915_MADV_PURGED;
obj->mm.pages = ERR_PTR(-EFAULT);
diff --git sys/dev/pci/drm/radeon/radeon_ttm.c 
sys/dev/pci/drm/radeon/radeon_ttm.c
index eb879b5c72c..837a9f94298 100644
--- sys/dev/pci/drm/radeon/radeon_ttm.c
+++ sys/dev/pci/drm/radeon/radeon_ttm.c
@@ -1006,6 +1006,8 @@ radeon_ttm_fault(struct uvm_faultinfo *ufi, vaddr_t 
vaddr, vm_page_t *pps,
struct radeon_device *rdev;
int r;
 
+   KASSERT(rw_write_held(ufi->entry->object.uvm_obj->vmobjlock));
+
bo = (struct drm_gem_object *)ufi->entry->object.uvm_obj;
rdev = bo->dev->dev_private;
down_read(&rdev->pm.mclk_lock);
diff --git sys/uvm/uvm_aobj.c sys/uvm/uvm_aobj.c
index 20051d95dc1..a5c403ab67d 100644
--- sys/uvm/uvm_aobj.c
+++ sys/uvm/uvm_aobj.c
@@ -184,7 +184,7 @@ const struct uvm_pagerops aobj_pager = {
  * deadlock.
  */
 static LIST_HEAD(aobjlist, uvm_aobj) uao_list = 
LIST_HEAD_INITIALIZER(uao_list);
-static struct mutex uao_list_lock = MUTEX_INITIALIZER(IPL_NONE);
+static struct mutex uao_list_lock = MUTEX_INITIALIZER(IPL_MPFLOOR);
 
 
 /*
@@ -277,6 +277,7 @@ uao_find_swslot(struct uvm_object *uobj, int pageidx)
  * uao_set_swslot: set the swap slot for a page in an aobj.
  *
  * => setting a slot to zero frees the slot
+ * => object must be locked by caller
  * => we return the old slot number, or -1 if we failed to allocate
  *memory to record the new slot number
  */
@@ -286,7 +287,7 @@ uao_set_swslot(struct uvm_object *uobj, int pageidx, int 
slot)
struct uvm_aobj *aobj = (struct uvm_aobj *)uobj;
int oldslot;
 
-   KERNEL_ASSERT_LOCKED();
+   KASSERT(rw_write_held(uobj->vmobjlock) || uobj->uo_refs == 0);
KASSERT(UVM_OBJ_IS_AOBJ(uobj));
 
/*
@@ -358,7 +359,9 @@ uao_free(struct uvm_aobj *aobj)
struct uvm_object *uobj = &aobj->u_obj;
 
KASSERT(UVM_OBJ_IS_AOBJ(uobj));
+   KASSERT(rw_write_held(uobj->vmobjlock));
uao_dropswap_range(uobj, 0, 0);
+   rw_exit(uobj->vmobjlock);
 
if (UAO_USES_SWHASH(aobj)) {
/*
@@ -671,6 +674,7 @@ struct uvm_object *
 uao_create(vsize_t size, int flags)
 {
static struct uvm_aobj kernel_object_store;
+   static struct rwlock bootstrap_kernel_object_lock;
static int kobj_alloced = 0;
int pages = round_page(size) >> PAGE_SHIFT;
struct uvm_aobj *aobj;
@@ -742,6 +746,11 @@ uao_create(vsize_t size, int flags)
 * Initialise UVM object.
 */
uvm_obj_init(&aobj->u_obj, &aobj_pager, refs);
+   if (flags & UAO_FLAG_KERNOBJ) {
+   /* Use a temporary static lock for kernel_object. */
+   rw_init(&bootstrap_kernel_object_lock, "kobjlk");
+   uvm_obj_setlock(&aobj->u_obj, &bootstrap_kernel_object_lock);
+   }
 
/*
 * now that aobj is ready, add it to the global list
@@ -822,20 +831,20 @@ uao_detach(struct uvm_object *uobj)
 * involved in is complete), release any swap resources and free
 * the page itself.
 */
-   uvm_lock_pageq();
-   while((pg = RBT_ROOT(uvm_objtree, &uobj->memt)) != NULL) {
+   

Re: Rework UNIX sockets locking to be fine grained

2021-11-24 Thread Martin Pieuchot
On 22/11/21(Mon) 14:42, Vitaliy Makkoveev wrote:
> On Sat, Nov 20, 2021 at 03:12:31AM +0300, Vitaliy Makkoveev wrote:
> > Updated diff. Re-lock dances were simplified in the unix(4) sockets
> > layer.
> > 
> > Reference counters added to unix(4) sockets layer too. This makes 
> > pointer dereference of peer's control block always safe after re-lock.
> > 
> > The `unp_refs' list cleanup done in the unp_detach(). This removes the
> > case where the socket connected to our dying socket could be passed to
> > unp_disconnect() and the check of it's connection state became much
> > easier.
> >
> 
> Another re-lock simplification. We could enforce the lock order between
> the listening socket `head' and the socket `so' linked to it's `so_q0'
> or `so_q' to solock(head) -> solock(so).
> 
> This removes re-lock from accept(2) and the accepting socket couldn't be
> stolen by concurrent accept(2) thread. This removes re-lock from `so_q0'
> and `so_q' cleanup on dying listening socket.
> 
> The previous incarnation of this diff does re-lock in a half of
> doaccept(), soclose(), sofree() and soisconnected() calls. The current
> diff does not re-lock in doaccept() and soclose() and always so re-lock
> in sofree() and soisconnected().
> 
> I guess this is the latest simplification and this diff could be pushed
> forward.

This diff is really interesting.  It shows that the current locking
design needs to be reworked.

I don't think we should expose the locking strategy with a `persocket'
variable and then use if/else dances to decide whether one or two locks
need to be taken/released.  Instead, could we fold the TCP/UDP locking
into more generic functions?  For example connect() could be:

int
soconnect2(struct socket *so1, struct socket *so2)
{
int s, error;

s = solock_pair(so1, so2);
error = (*so1->so_proto->pr_usrreq)(so1, PRU_CONNECT2, NULL,
(struct mbuf *)so2, NULL, curproc);
sounlock_pair(so1, so2, s);
return (error);
}

And solock_pair() would do the right thing(tm) based on the socket type.
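
As a rough sketch of what I mean (untested; the AF_UNIX check and the
SL_LOCKED return value are my assumptions, the real dispatch could also
live behind solock() itself):

int
solock_pair(struct socket *so1, struct socket *so2)
{
	/* Sockets still serialized by the net lock only need one solock(). */
	if (so1->so_proto->pr_domain->dom_family != AF_UNIX)
		return (solock(so1));

	/* Per-socket locks: always lock in a stable order, e.g. by address. */
	if (so1 > so2) {
		struct socket *tmp = so1;
		so1 = so2;
		so2 = tmp;
	}
	solock(so1);
	solock(so2);
	return (SL_LOCKED);
}

That would keep the if/else dances in a single place instead of spreading
them over the various socket functions.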

Because in the end we want to prepare this layer to use per-socket locks
with TCP/UDP sockets as well.

Could something similar be done for doaccept()?

I'm wary of introducing reference counting.  Once there is reference
counting it tends to be abused.  It's not clear to me why it is being
added.  It looks like a way to work around lock ordering issues; could you
talk a bit about this?  Is there any alternative?

I also don't understand the problem behind:

> + unp_ref(unp2);
> + sounlock(so, SL_LOCKED);
> + solock(so2);
> + solock(so);
> +
> + /* Datagram socket could be reconnected due to re-lock. */
> + if (unp->unp_conn != unp2) {
> + sounlock(so2, SL_LOCKED);
> + unp_rele(unp2);
> + goto again;
> + }
> +
> + unp_rele(unp2);


It seems that doing an unlock/relock dance requires a lot of added
complexity; why is it done this way?

Thanks for dealing with this!

> Index: sys/kern/uipc_socket.c
> ===
> RCS file: /cvs/src/sys/kern/uipc_socket.c,v
> retrieving revision 1.269
> diff -u -p -r1.269 uipc_socket.c
> --- sys/kern/uipc_socket.c11 Nov 2021 16:35:09 -  1.269
> +++ sys/kern/uipc_socket.c22 Nov 2021 11:36:40 -
> @@ -52,6 +52,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  #ifdef DDB
>  #include 
> @@ -156,7 +157,9 @@ soalloc(int prflags)
>   so = pool_get(_pool, prflags);
>   if (so == NULL)
>   return (NULL);
> - rw_init(>so_lock, "solock");
> + rw_init_flags(>so_lock, "solock", RWL_DUPOK);
> + refcnt_init(>so_refcnt);
> +
>   return (so);
>  }
>  
> @@ -257,6 +260,8 @@ solisten(struct socket *so, int backlog)
>  void
>  sofree(struct socket *so, int s)
>  {
> + int persocket = solock_persocket(so);
> +
>   soassertlocked(so);
>  
>   if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) {
> @@ -264,16 +269,53 @@ sofree(struct socket *so, int s)
>   return;
>   }
>   if (so->so_head) {
> + struct socket *head = so->so_head;
> +
>   /*
>* We must not decommission a socket that's on the accept(2)
>* queue.  If we do, then accept(2) may hang after select(2)
>* indicated that the listening socket was ready.
>*/
> - if (!soqremque(so, 0)) {
> + if (so->so_onq == >so_q) {
>   sounlock(so, s);
>   return;
>   }
> +
> + if (persocket) {
> + /*
> +  * Concurrent close of `head' could
> +  * abort `so' due to re-lock.
> +  */
> + soref(so);
> + soref(head);
> +  

Please test: UVM fault unlocking (aka vmobjlock)

2021-11-24 Thread Martin Pieuchot
Diff below unlocks the bottom part of the UVM fault handler.  I'm
interested in squashing the remaining bugs.  Please test with your usual
setup & report back.

Thanks,
Martin

diff --git sys/arch/amd64/conf/GENERIC.MP sys/arch/amd64/conf/GENERIC.MP
index bb842f6d96e..e5334c19eac 100644
--- sys/arch/amd64/conf/GENERIC.MP
+++ sys/arch/amd64/conf/GENERIC.MP
@@ -4,6 +4,6 @@ include "arch/amd64/conf/GENERIC"
 
 option MULTIPROCESSOR
 #optionMP_LOCKDEBUG
-#optionWITNESS
+option WITNESS
 
 cpu*   at mainbus?
diff --git sys/arch/i386/conf/GENERIC.MP sys/arch/i386/conf/GENERIC.MP
index 980a572b8fd..ef7ded61501 100644
--- sys/arch/i386/conf/GENERIC.MP
+++ sys/arch/i386/conf/GENERIC.MP
@@ -7,6 +7,6 @@ include "arch/i386/conf/GENERIC"
 
 option MULTIPROCESSOR  # Multiple processor support
 #optionMP_LOCKDEBUG
-#optionWITNESS
+option WITNESS
 
 cpu*   at mainbus?
diff --git sys/dev/pci/drm/i915/gem/i915_gem_shmem.c 
sys/dev/pci/drm/i915/gem/i915_gem_shmem.c
index ce8e2eca141..47b567087e7 100644
--- sys/dev/pci/drm/i915/gem/i915_gem_shmem.c
+++ sys/dev/pci/drm/i915/gem/i915_gem_shmem.c
@@ -268,8 +268,10 @@ shmem_truncate(struct drm_i915_gem_object *obj)
 #ifdef __linux__
shmem_truncate_range(file_inode(obj->base.filp), 0, (loff_t)-1);
 #else
+   rw_enter(obj->base.uao->vmobjlock, RW_WRITE);
obj->base.uao->pgops->pgo_flush(obj->base.uao, 0, obj->base.size,
PGO_ALLPAGES | PGO_FREE);
+   rw_exit(obj->base.uao->vmobjlock);
 #endif
obj->mm.madv = __I915_MADV_PURGED;
obj->mm.pages = ERR_PTR(-EFAULT);
diff --git sys/dev/pci/drm/radeon/radeon_ttm.c 
sys/dev/pci/drm/radeon/radeon_ttm.c
index eb879b5c72c..837a9f94298 100644
--- sys/dev/pci/drm/radeon/radeon_ttm.c
+++ sys/dev/pci/drm/radeon/radeon_ttm.c
@@ -1006,6 +1006,8 @@ radeon_ttm_fault(struct uvm_faultinfo *ufi, vaddr_t 
vaddr, vm_page_t *pps,
struct radeon_device *rdev;
int r;
 
+   KASSERT(rw_write_held(ufi->entry->object.uvm_obj->vmobjlock));
+
bo = (struct drm_gem_object *)ufi->entry->object.uvm_obj;
rdev = bo->dev->dev_private;
down_read(&rdev->pm.mclk_lock);
diff --git sys/uvm/uvm_aobj.c sys/uvm/uvm_aobj.c
index 20051d95dc1..127218c4c40 100644
--- sys/uvm/uvm_aobj.c
+++ sys/uvm/uvm_aobj.c
@@ -31,7 +31,7 @@
 /*
  * uvm_aobj.c: anonymous memory uvm_object pager
  *
- * author: Chuck Silvers 
+* author: Chuck Silvers 
  * started: Jan-1998
  *
  * - design mostly from Chuck Cranor
@@ -184,7 +184,7 @@ const struct uvm_pagerops aobj_pager = {
  * deadlock.
  */
 static LIST_HEAD(aobjlist, uvm_aobj) uao_list = 
LIST_HEAD_INITIALIZER(uao_list);
-static struct mutex uao_list_lock = MUTEX_INITIALIZER(IPL_NONE);
+static struct mutex uao_list_lock = MUTEX_INITIALIZER(IPL_MPFLOOR);
 
 
 /*
@@ -277,6 +277,7 @@ uao_find_swslot(struct uvm_object *uobj, int pageidx)
  * uao_set_swslot: set the swap slot for a page in an aobj.
  *
  * => setting a slot to zero frees the slot
+ * => object must be locked by caller
  * => we return the old slot number, or -1 if we failed to allocate
  *memory to record the new slot number
  */
@@ -286,7 +287,7 @@ uao_set_swslot(struct uvm_object *uobj, int pageidx, int 
slot)
struct uvm_aobj *aobj = (struct uvm_aobj *)uobj;
int oldslot;
 
-   KERNEL_ASSERT_LOCKED();
+   KASSERT(rw_write_held(uobj->vmobjlock) || uobj->uo_refs == 0);
KASSERT(UVM_OBJ_IS_AOBJ(uobj));
 
/*
@@ -358,7 +359,9 @@ uao_free(struct uvm_aobj *aobj)
struct uvm_object *uobj = &aobj->u_obj;
 
KASSERT(UVM_OBJ_IS_AOBJ(uobj));
+   KASSERT(rw_write_held(uobj->vmobjlock));
uao_dropswap_range(uobj, 0, 0);
+   rw_exit(uobj->vmobjlock);
 
if (UAO_USES_SWHASH(aobj)) {
/*
@@ -671,6 +674,7 @@ struct uvm_object *
 uao_create(vsize_t size, int flags)
 {
static struct uvm_aobj kernel_object_store;
+   static struct rwlock bootstrap_kernel_object_lock;
static int kobj_alloced = 0;
int pages = round_page(size) >> PAGE_SHIFT;
struct uvm_aobj *aobj;
@@ -742,6 +746,11 @@ uao_create(vsize_t size, int flags)
 * Initialise UVM object.
 */
uvm_obj_init(&aobj->u_obj, &aobj_pager, refs);
+   if (flags & UAO_FLAG_KERNOBJ) {
+   /* Use a temporary static lock for kernel_object. */
+   rw_init(&bootstrap_kernel_object_lock, "kobjlk");
+   uvm_obj_setlock(&aobj->u_obj, &bootstrap_kernel_object_lock);
+   }
 
/*
 * now that aobj is ready, add it to the global list
@@ -822,20 +831,20 @@ uao_detach(struct uvm_object *uobj)
 * involved in is complete), release any swap resources and free
 * the page itself.
 */
-   uvm_lock_pageq();
-   while((pg = RBT_ROOT(uvm_objtree, &uobj->memt)) != NULL) {
+   rw_enter(uobj->vmobjlock, RW_WRITE);
+   while ((pg = RBT_ROOT(uvm_objtree, &uobj->memt)) != NULL) {
+   pmap_page_protect(pg, PROT_NONE);

Re: Retry sleep in poll/select

2021-11-18 Thread Martin Pieuchot
On 17/11/21(Wed) 09:51, Scott Cheloha wrote:
> > On Nov 17, 2021, at 03:22, Martin Pieuchot  wrote:
> > 
> > On 16/11/21(Tue) 13:55, Visa Hankala wrote:
> >> Currently, dopselect() and doppoll() call tsleep_nsec() without retry.
> >> cheloha@ asked if the functions should handle spurious wakeups. I guess
> >> such wakeups are unlikely with the nowake wait channel, but I am not
> >> sure if that is a safe guess.
> > 
> > I'm not sure I understand: are we afraid that a thread sleeping on `nowake'
> > can be awakened?  Is that the assumption here?
> 
> Yes, but I don't know how.

Then I'd suggest we start by understanding how this can happen, otherwise
I fear we are adding more complexity for reasons we don't understand.

> kettenis@ said spurious wakeups were
> possible on a similar loop in sigsuspend(2)
> so I mentioned this to visa@ off-list.

I don't understand how this can happen.

> If we added an assert to panic in wakeup(9)
> if the channel is &nowake, would that be
> sufficient?

I guess so.
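
For the record, the assert I picture is a one-liner at the top of
wakeup_n() (placement and wording are mine, `nowake' being the dummy
channel):

	/* Nothing must ever wake up the dummy `nowake' channel. */
	KASSERT(ident != &nowake);

If that ever fires we will know that spurious wakeups on `nowake' are real
and that the retry loop is justified.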

> Ideally if you sleep on &nowake you should
> never get a zero status from the sleep
> functions.  It should be impossible… if that
> is possible to ensure.



Re: Retry sleep in poll/select

2021-11-17 Thread Martin Pieuchot
On 16/11/21(Tue) 13:55, Visa Hankala wrote:
> Currently, dopselect() and doppoll() call tsleep_nsec() without retry.
> cheloha@ asked if the functions should handle spurious wakeups. I guess
> such wakeups are unlikely with the nowake wait channel, but I am not
> sure if that is a safe guess.

I'm not sure I understand: are we afraid that a thread sleeping on `nowake'
can be awakened?  Is that the assumption here?

> The following diff adds the retrying. The code is a bit arduous, so the
> retry loop is put in a separate function that both poll and select use.

Using a separate function makes sense anyway.

> Index: kern/sys_generic.c
> ===
> RCS file: src/sys/kern/sys_generic.c,v
> retrieving revision 1.141
> diff -u -p -r1.141 sys_generic.c
> --- kern/sys_generic.c16 Nov 2021 13:48:23 -  1.141
> +++ kern/sys_generic.c16 Nov 2021 13:50:08 -
> @@ -90,6 +90,7 @@ int dopselect(struct proc *, int, fd_set
>  int doppoll(struct proc *, struct pollfd *, u_int, struct timespec *,
>  const sigset_t *, register_t *);
>  void doselwakeup(struct selinfo *);
> +int selsleep(struct timespec *);
>  
>  int
>  iovec_copyin(const struct iovec *uiov, struct iovec **iovp, struct iovec 
> *aiov,
> @@ -664,19 +665,7 @@ dopselect(struct proc *p, int nd, fd_set
>* there's nothing to wait for.
>*/
>   if (nevents == 0 && ncollected == 0) {
> - uint64_t nsecs = INFSLP;
> -
> - if (timeout != NULL) {
> - if (!timespecisset(timeout))
> - goto done;
> - nsecs = MAX(1, MIN(TIMESPEC_TO_NSEC(timeout), MAXTSLP));
> - }
> - error = tsleep_nsec(&nowake, PSOCK | PCATCH, "kqsel", nsecs);
> - /* select is not restarted after signals... */
> - if (error == ERESTART)
> - error = EINTR;
> - if (error == EWOULDBLOCK)
> - error = 0;
> + error = selsleep(timeout);
>   goto done;
>   }
>  
> @@ -849,6 +838,46 @@ selfalse(dev_t dev, int events, struct p
>  }
>  
>  /*
> + * Sleep until a signal arrives or the optional timeout expires.
> + */
> +int
> +selsleep(struct timespec *timeout)
> +{
> + uint64_t end, now, nsecs;
> + int error;
> +
> + if (timeout != NULL) {
> + if (!timespecisset(timeout))
> + return (0);
> + now = getnsecuptime();
> + end = MIN(now + TIMESPEC_TO_NSEC(timeout), MAXTSLP);
> + if (end < now)
> + end = MAXTSLP;
> + }
> +
> + do {
> + if (timeout != NULL)
> + nsecs = MAX(1, end - now);
> + else
> + nsecs = INFSLP;
> + error = tsleep_nsec(&nowake, PSOCK | PCATCH, "selslp", nsecs);
> + if (timeout != NULL) {
> + now = getnsecuptime();
> + if (now >= end)
> + break;
> + }
> + } while (error == 0);
> +
> + /* poll/select is not restarted after signals... */
> + if (error == ERESTART)
> + error = EINTR;
> + if (error == EWOULDBLOCK)
> + error = 0;
> +
> + return (error);
> +}
> +
> +/*
>   * Record a select request.
>   */
>  void
> @@ -1158,19 +1187,7 @@ doppoll(struct proc *p, struct pollfd *f
>* there's nothing to wait for.
>*/
>   if (nevents == 0 && ncollected == 0) {
> - uint64_t nsecs = INFSLP;
> -
> - if (timeout != NULL) {
> - if (!timespecisset(timeout))
> - goto done;
> - nsecs = MAX(1, MIN(TIMESPEC_TO_NSEC(timeout), MAXTSLP));
> - }
> -
> - error = tsleep_nsec(&nowake, PSOCK | PCATCH, "kqpoll", nsecs);
> - if (error == ERESTART)
> - error = EINTR;
> - if (error == EWOULDBLOCK)
> - error = 0;
> + error = selsleep(timeout);
>   goto done;
>   }
>  
> 



Re: bt.5 document count()

2021-11-16 Thread Martin Pieuchot
On 16/11/21(Tue) 11:07, Claudio Jeker wrote:
> This documents count(). This function only works when used like this
>   @map[key] = count();
> But it is implemented and works. If used differently you get a syntax
> error which is not helpful. This is why I chose to document it like this.
> Another option would be to document the language (so it is clear where it
> is possible to use what). 

ok mpi@
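
For the archive, a small usage example, counting scheduler enqueues per
thread id (the probe name is from memory, adjust as needed):

	tracepoint:sched:enqueue
	{
		@enqueues[tid] = count();
	}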

> max(), min() and sum() are other functions that behave like this. Their
> documentation should also be adjusted IMO.
> 
> -- 
> :wq Claudio
> 
> Index: bt.5
> ===
> RCS file: /cvs/src/usr.sbin/btrace/bt.5,v
> retrieving revision 1.13
> diff -u -p -r1.13 bt.5
> --- bt.5  12 Nov 2021 16:57:24 -  1.13
> +++ bt.5  16 Nov 2021 09:50:52 -
> @@ -120,6 +120,11 @@ Functions:
>  .It Fn clear "@map"
>  Delete all (key, value) pairs from
>  .Va @map .
> +.It "@map[key]" = Fn count
> +Increment the value of
> +.Va key
> +from
> +.Va @map .
>  .It Fn delete "@map[key]"
>  Delete the pair indexed by
>  .Va key
> 



Re: poll/select: Lazy removal of knotes

2021-11-06 Thread Martin Pieuchot
On 06/11/21(Sat) 15:53, Visa Hankala wrote:
> On Fri, Nov 05, 2021 at 10:04:50AM +0100, Martin Pieuchot wrote:
> > New poll/select(2) implementation convert 'struct pollfd' and 'fdset' to
> > knotes (kqueue event descriptors) then pass them to the kqueue subsystem.
> > A knote is allocated, with kqueue_register(), for every read, write and
> > except condition watched on a given FD.  That means at most 3 allocations
> > might be necessary per FD.
> > 
> > The diff below reduce the overhead of per-syscall allocation/free of those
> > descriptors by leaving those which didn't trigger on the kqueue across
> > syscall.  Leaving knotes on the kqueue allows kqueue_register() to re-use
> > existing descriptor instead of re-allocating a new one.
> > 
> > With this knotes are now lazily removed.  The mechanism uses a serial
> > number which is incremented for every syscall that indicates if a knote
> > sitting in the kqueue is still valid or should be freed.
> > 
> > Note that performance improvements might not be visible with this diff
> > alone because kqueue_register() still pre-allocate a descriptor then drop
> > it.
> > 
> > visa@ already pointed out that the lazy removal logic could be integrated
> > in kqueue_scan() which would reduce the complexity of those two syscalls.
> > I'm arguing for doing this in a next step in-tree.
> 
> I think it would make more sense to add the removal logic to the scan
> function first as doing so would keep the code modifications more
> logical and simpler. This would also avoid the need to go through
> a temporary removal approach.

I totally support your effort and your design; however, I don't have the
time to do another round of testing/debugging.  So please, can you take
care of doing these cleanups afterwards?  If not, please send a full diff
and take over this feature; it's too much effort for me to work out of
tree.

> Index: kern/kern_event.c
> ===
> RCS file: src/sys/kern/kern_event.c,v
> retrieving revision 1.170
> diff -u -p -r1.170 kern_event.c
> --- kern/kern_event.c 6 Nov 2021 05:48:47 -   1.170
> +++ kern/kern_event.c 6 Nov 2021 15:31:04 -
> @@ -73,6 +73,7 @@ voidkqueue_terminate(struct proc *p, st
>  void KQREF(struct kqueue *);
>  void KQRELE(struct kqueue *);
>  
> +void kqueue_purge(struct proc *, struct kqueue *);
>  int  kqueue_sleep(struct kqueue *, struct timespec *);
>  
>  int  kqueue_read(struct file *, struct uio *, int);
> @@ -806,6 +807,22 @@ kqpoll_exit(void)
>  }
>  
>  void
> +kqpoll_done(unsigned int num)
> +{
> + struct proc *p = curproc;
> +
> + KASSERT(p->p_kq != NULL);
> +
> + if (p->p_kq_serial + num >= p->p_kq_serial) {
> + p->p_kq_serial += num;
> + } else {
> + /* Clear all knotes after serial wraparound. */
> + kqueue_purge(p, p->p_kq);
> + p->p_kq_serial = 1;
> + }
> +}
> +
> +void
>  kqpoll_dequeue(struct proc *p, int all)
>  {
>   struct knote marker;
> @@ -1383,6 +1400,15 @@ retry:
>  
>   mtx_leave(>kq_lock);
>  
> + /* Drop spurious events. */
> + if (p->p_kq == kq &&
> + p->p_kq_serial > (unsigned long)kn->kn_udata) {
> + filter_detach(kn);
> + knote_drop(kn, p);
> + mtx_enter(>kq_lock);
> + continue;
> + }
> +
>   memset(kevp, 0, sizeof(*kevp));
>   if (filter_process(kn, kevp) == 0) {
>   mtx_enter(>kq_lock);
> Index: kern/sys_generic.c
> ===
> RCS file: src/sys/kern/sys_generic.c,v
> retrieving revision 1.139
> diff -u -p -r1.139 sys_generic.c
> --- kern/sys_generic.c29 Oct 2021 15:52:44 -  1.139
> +++ kern/sys_generic.c6 Nov 2021 15:31:04 -
> @@ -730,8 +730,7 @@ done:
>   if (pibits[0] != (fd_set *)[0])
>   free(pibits[0], M_TEMP, 6 * ni);
>  
> - kqueue_purge(p, p->p_kq);
> - p->p_kq_serial += nd;
> + kqpoll_done(nd);
>  
>   return (error);
>  }
> @@ -1230,8 +1229,7 @@ bad:
>   if (pl != pfds)
>   free(pl, M_TEMP, sz);
>  
> - kqueue_purge(p, p->p_kq);
> - p->p_kq_serial += nfds;
> + kqpoll_done(nfds);
>  
>   return (error);
>  }
> @@ -1251,8 +1249,7 @@ ppollcollect(struct proc *p, struct keve
>   /*
>* Lazily delete spurious events.
>*
> -   

poll/select: Lazy removal of knotes

2021-11-05 Thread Martin Pieuchot
The new poll/select(2) implementation converts 'struct pollfd' and 'fd_set'
entries to knotes (kqueue event descriptors) and then passes them to the
kqueue subsystem.  A knote is allocated, with kqueue_register(), for every
read, write and except condition watched on a given FD.  That means at most
3 allocations might be necessary per FD.

The diff below reduces the overhead of the per-syscall allocation/free of
those descriptors by leaving the ones that didn't trigger on the kqueue
across syscalls.  Leaving knotes on the kqueue allows kqueue_register() to
re-use an existing descriptor instead of re-allocating a new one.

With this, knotes are now lazily removed.  The mechanism uses a serial
number, incremented on every syscall, which indicates whether a knote
sitting in the kqueue is still valid or should be freed.
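
Concretely, the registration side stamps each knote with the serial of the
current syscall through the udata field, and kqueue_scan() later drops the
ones left over from a previous syscall when it stumbles upon them.
Simplified (the stamping happens in pselregister()/ppollregister(), the
check is the one from the kern_event.c part of the diff):

	/* registration: remember which syscall this knote belongs to */
	kev.udata = (void *)p->p_kq_serial;

	/* scan: a smaller serial means a leftover knote, drop it lazily */
	if (p->p_kq == kq &&
	    p->p_kq_serial > (unsigned long)kn->kn_udata) {
		filter_detach(kn);
		knote_drop(kn, p);
		continue;
	}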

Note that performance improvements might not be visible with this diff
alone because kqueue_register() still pre-allocates a descriptor and then
drops it.

visa@ already pointed out that the lazy removal logic could be integrated
in kqueue_scan() which would reduce the complexity of those two syscalls.
I'm arguing for doing this in a next step in-tree.

Please test and review :)

Index: kern/sys_generic.c
===
RCS file: /cvs/src/sys/kern/sys_generic.c,v
retrieving revision 1.139
diff -u -p -r1.139 sys_generic.c
--- kern/sys_generic.c  29 Oct 2021 15:52:44 -  1.139
+++ kern/sys_generic.c  5 Nov 2021 08:11:05 -
@@ -598,7 +598,7 @@ sys_pselect(struct proc *p, void *v, reg
 
 int
 dopselect(struct proc *p, int nd, fd_set *in, fd_set *ou, fd_set *ex,
-struct timespec *timeout, const sigset_t *sigmask, register_t *retval)
+struct timespec *tsp, const sigset_t *sigmask, register_t *retval)
 {
struct kqueue_scan_state scan;
fd_mask bits[6];
@@ -666,10 +666,10 @@ dopselect(struct proc *p, int nd, fd_set
if (nevents == 0 && ncollected == 0) {
uint64_t nsecs = INFSLP;
 
-   if (timeout != NULL) {
-   if (!timespecisset(timeout))
+   if (tsp != NULL) {
+   if (!timespecisset(tsp))
goto done;
-   nsecs = MAX(1, MIN(TIMESPEC_TO_NSEC(timeout), MAXTSLP));
+   nsecs = MAX(1, MIN(TIMESPEC_TO_NSEC(tsp), MAXTSLP));
}
error = tsleep_nsec(&p->p_kq, PSOCK | PCATCH, "kqsel", nsecs);
/* select is not restarted after signals... */
@@ -682,28 +682,37 @@ dopselect(struct proc *p, int nd, fd_set
 
/* Collect at most `nevents' possibly waiting in kqueue_scan() */
kqueue_scan_setup(&scan, p->p_kq);
-   while (nevents > 0) {
+   while ((nevents - ncollected) > 0) {
struct kevent kev[KQ_NEVENTS];
int i, ready, count;
 
-   /* Maximum number of events per iteration */
-   count = MIN(nitems(kev), nevents);
-   ready = kqueue_scan(&scan, count, kev, timeout, p, &error);
+   /*
+* Maximum number of events per iteration.  Use the whole
+* array to gather as many spurious events as possible.
+*/
+   count = nitems(kev);
+   ready = kqueue_scan(&scan, count, kev, tsp, p, &error);
 #ifdef KTRACE
if (KTRPOINT(p, KTR_STRUCT))
ktrevent(p, kev, ready);
 #endif
-   /* Convert back events that are ready. */
+   /* Convert back events that are ready/delete spurious ones. */
for (i = 0; i < ready && error == 0; i++)
error = pselcollect(p, &kev[i], pobits, &ncollected);
+
/*
-* Stop if there was an error or if we had enough
-* space to collect all events that were ready.
+* Stop if there was an error or if we had enough space
+* to collect all non-spurious events that were ready.
 */
-   if (error || ready < count)
+   if (error || !ready || (ncollected > 0 && ready < count))
break;
 
-   nevents -= ready;
+   /*
+* If we only got spurious events try again repositioning
+* the marker.
+*/
+   if (ncollected == 0 && ((tsp == NULL) || timespecisset(tsp)))
+   scan.kqs_nevent = 0;
}
kqueue_scan_finish(&scan);
*retval = ncollected;
@@ -730,7 +739,7 @@ done:
if (pibits[0] != (fd_set *)&bits[0])
free(pibits[0], M_TEMP, 6 * ni);
 
-   kqueue_purge(p, p->p_kq);
+   /* Needed to remove events lazily. */
p->p_kq_serial += nd;
 
return (error);
@@ -759,7 +768,7 @@ pselregister(struct proc *p, fd_set *pib
DPRINTFN(2, "select fd %d mask %d serial %lu\n",
fd, msk, p->p_kq_serial);

Re: UNIX sockets: use vnode(9) lock to protect `v_socket' dereference

2021-11-05 Thread Martin Pieuchot
On 26/10/21(Tue) 14:12, Vitaliy Makkoveev wrote:
> Another step to make UNIX sockets locking fine grained.
> 
> The listening socket has the references from file descriptors layer and
> from the vnode(9) layer. This means when we close(2)'ing such socket it
> still referenced by concurrent thread through connect(2) path.
> 
> When we bind(2) UNIX socket we link it to vnode(9) by assigning
> `v_socket'. When we connect(2)'ing socket to the socket we previously
> bind(2)'ed we finding it by namei(9) and obtain it's reference through
> `v_socket'. This socket has no extra reference in file descriptor
> layer and could be closed by concurrent thread.
> 
> This time we have `unp_lock' rwlock(9) which protects the whole layer
> and the dereference of `v_socket' is safe. But with the fine grained
> locking the `v_socket' will not be protected by global lock. When we
> obtain the vnode(9) by namei(9) in connect(9) or bind(9) paths it is
> already exclusively locked by vlode(9) lock. But in unp_detach() which
> is called on the close(2)'ing socket we assume `unp_lock' protects
> `v_socket'.
> 
> I propose to use exclusive vnode(9) lock to protect `v_socket'. With the
> fine grained locking, the `v_socket' dereference in unp_bind() or
> unp_connect() threads will be safe because unp_detach() thread will wait
> the vnode(9) lock release. The vnode referenced by `unp_vnod' has
> reference counter bumped so it's dereference is also safe without
> `unp_lock' held.

This makes sense to me.  Using the vnode lock here seems the simplest
approach.
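
To make sure I understand the resulting pattern, the connect(2) side would
then look roughly like this (sketch only, the error labels are made up):

	/* vnode comes back locked from namei(9), so `v_socket' is stable */
	if (vp->v_type != VSOCK) {
		error = ENOTSOCK;
		goto put;
	}
	so2 = vp->v_socket;
	if (so2 == NULL) {
		error = ECONNREFUSED;
		goto put;
	}

with unp_detach() clearing `v_socket' only while holding the same
exclusive vnode lock, as in your diff.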

> The `i_lock' should be take before `unp_lock' and unp_detach() should
> release solock(). To prevent connections on this socket the
> 'SO_ACCEPTCONN' bit cleared in soclose().

This is done to prevent races when solock() is released inside soabort(),
right?  Is it the only one, or is more care needed?
Will this stay with per-socket locks or is this only necessary because of
the global `unp_lock'?

> Index: sys/kern/uipc_socket.c
> ===
> RCS file: /cvs/src/sys/kern/uipc_socket.c,v
> retrieving revision 1.265
> diff -u -p -r1.265 uipc_socket.c
> --- sys/kern/uipc_socket.c14 Oct 2021 23:05:10 -  1.265
> +++ sys/kern/uipc_socket.c26 Oct 2021 11:05:59 -
> @@ -315,6 +315,8 @@ soclose(struct socket *so, int flags)
>   /* Revoke async IO early. There is a final revocation in sofree(). */
>   sigio_free(>so_sigio);
>   if (so->so_options & SO_ACCEPTCONN) {
> + so->so_options &= ~SO_ACCEPTCONN;
> +
>   while ((so2 = TAILQ_FIRST(>so_q0)) != NULL) {
>   (void) soqremque(so2, 0);
>   (void) soabort(so2);
> Index: sys/kern/uipc_usrreq.c
> ===
> RCS file: /cvs/src/sys/kern/uipc_usrreq.c,v
> retrieving revision 1.150
> diff -u -p -r1.150 uipc_usrreq.c
> --- sys/kern/uipc_usrreq.c21 Oct 2021 22:11:07 -  1.150
> +++ sys/kern/uipc_usrreq.c26 Oct 2021 11:05:59 -
> @@ -474,20 +474,30 @@ void
>  unp_detach(struct unpcb *unp)
>  {
>   struct socket *so = unp->unp_socket;
> - struct vnode *vp = NULL;
> + struct vnode *vp = unp->unp_vnode;
>  
>   rw_assert_wrlock(_lock);
>  
>   LIST_REMOVE(unp, unp_link);
> - if (unp->unp_vnode) {
> +
> + if (vp) {
> + unp->unp_vnode = NULL;
> +
>   /*
> -  * `v_socket' is only read in unp_connect and
> -  * unplock prevents concurrent access.
> +  * Enforce `i_lock' -> `unp_lock' because fifo
> +  * subsystem requires it.
>*/
>  
> - unp->unp_vnode->v_socket = NULL;
> - vp = unp->unp_vnode;
> - unp->unp_vnode = NULL;
> + sounlock(so, SL_LOCKED);
> +
> + VOP_LOCK(vp, LK_EXCLUSIVE);
> + vp->v_socket = NULL;
> +
> + KERNEL_LOCK();
> + vput(vp);
> + KERNEL_UNLOCK();
> +
> + solock(so);
>   }
>  
>   if (unp->unp_conn)
> @@ -500,21 +510,6 @@ unp_detach(struct unpcb *unp)
>   pool_put(_pool, unp);
>   if (unp_rights)
>   task_add(systqmp, _gc_task);
> -
> - if (vp != NULL) {
> - /*
> -  * Enforce `i_lock' -> `unplock' because fifo subsystem
> -  * requires it. The socket can't be closed concurrently
> -  * because the file descriptor reference is
> -  * still hold.
> -  */
> -
> - sounlock(so, SL_LOCKED);
> - KERNEL_LOCK();
> - vrele(vp);
> - KERNEL_UNLOCK();
> - solock(so);
> - }
>  }
>  
>  int
> 



Re: UNIX sockets: make `unp_rights', `unp_msgcount' and `unp_file' atomic

2021-11-05 Thread Martin Pieuchot
On 30/10/21(Sat) 21:22, Vitaliy Makkoveev wrote:
> This completely removes global rwlock(9) from the unp_internalize() and
> unp_externalize() normal paths but only leaves it in unp_externalize()
> error path. Also we don't need to simultaneously hold both fdplock()
> and `unp_lock' in unp_internalize(). As non obvious profit this
> simplifies the future lock dances in the UNIX sockets layer.
> 
> It's safe to call fptounp() without `unp_lock' held. We always got this
> file descriptor by fd_getfile(9) so we always have the extra reference
> and this descriptor can't be closed by concurrent thread. Some sockets
> could be destroyed through 'PRU_ABORT' path but they don't have
> associated file descriptor and they are not accessible in the
> unp_internalize() path.
> 
> The `unp_file' access without `unp_lock' held is also safe. Each socket
> could have the only associated file descriptor and each file descriptor
> could have the only associated socket. We only assign `unp_file' in the
> unp_internalize() path where we got the socket by fd_getfile(9). This
> descriptor has the extra reference and couldn't be closed concurrently.
> We could override `unp_file' but with the same address because the
> associated file descriptor can't be changed so the address will be also
> the same. So while unp_gc() concurrently runs the dereference of
> non-NULL `unp_file' is always safe.

Using an atomic operation for `unp_msgcount' is ok with me, one comment
about `unp_rights' below.

> Index: sys/kern/uipc_usrreq.c
> ===
> RCS file: /cvs/src/sys/kern/uipc_usrreq.c,v
> retrieving revision 1.153
> diff -u -p -r1.153 uipc_usrreq.c
> --- sys/kern/uipc_usrreq.c30 Oct 2021 16:35:31 -  1.153
> +++ sys/kern/uipc_usrreq.c30 Oct 2021 18:41:25 -
> @@ -58,6 +58,7 @@
>   * Locks used to protect global data and struct members:
>   *  I   immutable after creation
>   *  U   unp_lock
> + *  a   atomic
>   */
>  struct rwlock unp_lock = RWLOCK_INITIALIZER("unplock");
>  
> @@ -99,7 +100,7 @@ SLIST_HEAD(,unp_deferral)  unp_deferred =
>   SLIST_HEAD_INITIALIZER(unp_deferred);
>  
>  ino_tunp_ino;/* [U] prototype for fake inode numbers */
> -int  unp_rights; /* [U] file descriptors in flight */
> +int  unp_rights; /* [a] file descriptors in flight */
>  int  unp_defer;  /* [U] number of deferred fp to close by the GC task */
>  int  unp_gcing;  /* [U] GC task currently running */
>  
> @@ -927,17 +928,16 @@ restart:
>*/
>   rp = (struct fdpass *)CMSG_DATA(cm);
>  
> - rw_enter_write(_lock);
>   for (i = 0; i < nfds; i++) {
>   struct unpcb *unp;
>  
>   fp = rp->fp;
>   rp++;
>   if ((unp = fptounp(fp)) != NULL)
> - unp->unp_msgcount--;
> - unp_rights--;
> + atomic_dec_long(>unp_msgcount);
>   }
> - rw_exit_write(_lock);
> +
> + atomic_sub_int(_rights, nfds);
>  
>   /*
>* Copy temporary array to message and adjust length, in case of
> @@ -985,13 +985,10 @@ unp_internalize(struct mbuf *control, st
>   return (EINVAL);
>   nfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) / sizeof (int);
>  
> - rw_enter_write(_lock);
> - if (unp_rights + nfds > maxfiles / 10) {
> - rw_exit_write(_lock);
> + if (atomic_add_int_nv(_rights, nfds) > maxfiles / 10) {
> + atomic_sub_int(_rights, nfds);

I don't think this is race free.  If two threads, T1 and T2, call
atomic_add at the same time, both might end up returning EMFILE even
though only the first one should.  This can happen if T1 exceeds
the limit and T2 does its atomic_add on the already-exceeded `unp_rights'
before T1 has done its atomic_sub.

I suggest using a mutex to protect `unp_rights' instead to solve this
issue.
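
Something like this is what I have in mind (the `unp_rights_mtx' name is
made up):

	mtx_enter(&unp_rights_mtx);
	if (unp_rights + nfds > maxfiles / 10) {
		mtx_leave(&unp_rights_mtx);
		return (EMFILE);
	}
	unp_rights += nfds;
	mtx_leave(&unp_rights_mtx);

That keeps the check and the increment atomic with respect to each other,
so a thread can no longer fail because of a reservation that is about to
be rolled back.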

>   return (EMFILE);
>   }
> - unp_rights += nfds;
> - rw_exit_write(_lock);
>  
>   /* Make sure we have room for the struct file pointers */
>  morespace:
> @@ -1031,7 +1028,6 @@ morespace:
>   ip = ((int *)CMSG_DATA(cm)) + nfds - 1;
>   rp = ((struct fdpass *)CMSG_DATA(cm)) + nfds - 1;
>   fdplock(fdp);
> - rw_enter_write(_lock);
>   for (i = 0; i < nfds; i++) {
>   memcpy(, ip, sizeof fd);
>   ip--;
> @@ -1056,15 +1052,13 @@ morespace:
>   rp->flags = fdp->fd_ofileflags[fd] & UF_PLEDGED;
>   rp--;
>   if ((unp = fptounp(fp)) != NULL) {
> + atomic_inc_long(>unp_msgcount);
>   unp->unp_file = fp;
> - unp->unp_msgcount++;
>   }
>   }
> - rw_exit_write(_lock);
>   fdpunlock(fdp);
>   return (0);
>  fail:
> - rw_exit_write(_lock);
>   fdpunlock(fdp);
>   if (fp != NULL)
>   FRELE(fp, p);
> @@ -1072,17 +1066,13 @@ fail:
>   for ( ; i > 

Re: Please test: full poll/select(2) switch

2021-10-29 Thread Martin Pieuchot
On 29/10/21(Fri) 14:48, Alexandre Ratchov wrote:
> On Fri, Oct 29, 2021 at 01:12:06PM +0100, Martin Pieuchot wrote:
> > On 29/10/21(Fri) 13:12, Alexandre Ratchov wrote:
> > > On Sat, Oct 23, 2021 at 10:40:56AM +0100, Martin Pieuchot wrote:
> > > > Diff below switches both poll(2) and select(2) to the kqueue-based
> > > > implementation.
> > > > 
> > > > In addition it switches libevent(3) to use poll(2) by default for
> > > > testing purposes.
> > > > 
> > > > I don't have any open bug left with this diff and I'm happily running
> > > > GNOME with it.  So I'd be happy if you could try to break it and report
> > > > back.
> > > > 
> > > 
> > > Without the below diff (copied from audio(4) driver), kernel panics
> > > upon the first MIDI input byte.
> > 
> > What is the panic?  The mutex is taken recursively, right?
> >  
> 
> Exactly, this is the "locking against myself", panic.
> 
> AFAIU, the interrupt handler grabs the audio_lock and calls
> midi_iintr(). It calls selwakeup(), which in turn calls
> filt_midiread(), which attempts to grab the audio_lock a second time.
> 
> > > ok? suggestion for a better fix?
> > 
> > Without seeing the panic, I'm guessing this is correct.
> > 
> > That suggests kevent(2) wasn't safe to use with midi(4).
> > 
> 
> Yes, this is the very first time midi(4) is used with kevent(2).

Then this is correct, thanks a lot.  Please go ahead, ok mpi@



Re: Please test: full poll/select(2) switch

2021-10-29 Thread Martin Pieuchot
On 29/10/21(Fri) 13:12, Alexandre Ratchov wrote:
> On Sat, Oct 23, 2021 at 10:40:56AM +0100, Martin Pieuchot wrote:
> > Diff below switches both poll(2) and select(2) to the kqueue-based
> > implementation.
> > 
> > In addition it switches libevent(3) to use poll(2) by default for
> > testing purposes.
> > 
> > I don't have any open bug left with this diff and I'm happily running
> > GNOME with it.  So I'd be happy if you could try to break it and report
> > back.
> > 
> 
> Without the below diff (copied from audio(4) driver), kernel panics
> upon the first MIDI input byte.

What is the panic?  The mutex is taken recursively, right?
 
> ok? suggestion for a better fix?

Without seeing the panic, I'm guessing this is correct.

That suggests kevent(2) wasn't safe to use with midi(4).

> Index: midi.c
> ===
> RCS file: /cvs/src/sys/dev/midi.c,v
> retrieving revision 1.48
> diff -u -p -r1.48 midi.c
> --- midi.c25 Dec 2020 12:59:52 -  1.48
> +++ midi.c29 Oct 2021 11:09:47 -
> @@ -386,9 +386,11 @@ filt_midiread(struct knote *kn, long hin
>   struct midi_softc *sc = (struct midi_softc *)kn->kn_hook;
>   int retval;
>  
> - mtx_enter(_lock);
> + if ((hint & NOTE_SUBMIT) == 0)
> + mtx_enter(_lock);
>   retval = !MIDIBUF_ISEMPTY(>inbuf);
> - mtx_leave(_lock);
> + if ((hint & NOTE_SUBMIT) == 0)
> + mtx_leave(_lock);
>  
>   return (retval);
>  }
> @@ -409,9 +411,11 @@ filt_midiwrite(struct knote *kn, long hi
>   struct midi_softc *sc = (struct midi_softc *)kn->kn_hook;
>   intretval;
>  
> - mtx_enter(_lock);
> + if ((hint & NOTE_SUBMIT) == 0)
> + mtx_enter(_lock);
>   retval = !MIDIBUF_ISFULL(>outbuf);
> - mtx_leave(_lock);
> + if ((hint & NOTE_SUBMIT) == 0)
> + mtx_leave(_lock);
>  
>   return (retval);
>  }
> 
> 



Re: uvm_km_pgremove() tweak

2021-10-24 Thread Martin Pieuchot
On 24/10/21(Sun) 14:49, Martin Pieuchot wrote:
> Here's another small tweak I could extract from the UVM unlocking diff.
> This doesn't introduce any functional change. uvm_km_pgremove() is used
> in only one place.

Updated diff that also moves the pmap_kremove() call into the intrsafe
variant to be coherent, as pointed out by kettenis@.  This also reduces
the differences with NetBSD.

ok?

Index: uvm/uvm_km.c
===
RCS file: /cvs/src/sys/uvm/uvm_km.c,v
retrieving revision 1.145
diff -u -p -r1.145 uvm_km.c
--- uvm/uvm_km.c15 Jun 2021 16:38:09 -  1.145
+++ uvm/uvm_km.c24 Oct 2021 14:08:42 -
@@ -239,8 +239,10 @@ uvm_km_suballoc(struct vm_map *map, vadd
  *the pages right away.(this gets called from uvm_unmap_...).
  */
 void
-uvm_km_pgremove(struct uvm_object *uobj, vaddr_t start, vaddr_t end)
+uvm_km_pgremove(struct uvm_object *uobj, vaddr_t startva, vaddr_t endva)
 {
+   const voff_t start = startva - vm_map_min(kernel_map);
+   const voff_t end = endva - vm_map_min(kernel_map);
struct vm_page *pp;
voff_t curoff;
int slot;
@@ -248,6 +250,7 @@ uvm_km_pgremove(struct uvm_object *uobj,
 
KASSERT(UVM_OBJ_IS_AOBJ(uobj));
 
+   pmap_remove(pmap_kernel(), startva, endva);
for (curoff = start ; curoff < end ; curoff += PAGE_SIZE) {
pp = uvm_pagelookup(uobj, curoff);
if (pp && pp->pg_flags & PG_BUSY) {
@@ -301,6 +304,7 @@ uvm_km_pgremove_intrsafe(vaddr_t start, 
panic("uvm_km_pgremove_intrsafe: no page");
uvm_pagefree(pg);
}
+   pmap_kremove(start, end - start);
 }
 
 /*
Index: uvm/uvm_map.c
===
RCS file: /cvs/src/sys/uvm/uvm_map.c,v
retrieving revision 1.278
diff -u -p -r1.278 uvm_map.c
--- uvm/uvm_map.c   5 Oct 2021 15:37:21 -   1.278
+++ uvm/uvm_map.c   24 Oct 2021 14:09:13 -
@@ -2116,8 +2116,8 @@ uvm_unmap_kill_entry(struct vm_map *map,
/* Nothing to be done for holes. */
} else if (map->flags & VM_MAP_INTRSAFE) {
KASSERT(vm_map_pmap(map) == pmap_kernel());
+
uvm_km_pgremove_intrsafe(entry->start, entry->end);
-   pmap_kremove(entry->start, entry->end - entry->start);
} else if (UVM_ET_ISOBJ(entry) &&
UVM_OBJ_IS_KERN_OBJECT(entry->object.uvm_obj)) {
KASSERT(vm_map_pmap(map) == pmap_kernel());
@@ -2155,10 +2155,8 @@ uvm_unmap_kill_entry(struct vm_map *map,
 * from the object.  offsets are always relative
 * to vm_map_min(kernel_map).
 */
-   pmap_remove(pmap_kernel(), entry->start, entry->end);
-   uvm_km_pgremove(entry->object.uvm_obj,
-   entry->start - vm_map_min(kernel_map),
-   entry->end - vm_map_min(kernel_map));
+   uvm_km_pgremove(entry->object.uvm_obj, entry->start,
+   entry->end);
 
/*
 * null out kernel_object reference, we've just



uvm_km_pgremove() tweak

2021-10-24 Thread Martin Pieuchot
Here's another small tweak I could extract from the UVM unlocking diff.
This doesn't introduce any functional change. uvm_km_pgremove() is used
in only one place.

Ok?

Index: uvm/uvm_km.c
===
RCS file: /cvs/src/sys/uvm/uvm_km.c,v
retrieving revision 1.145
diff -u -p -r1.145 uvm_km.c
--- uvm/uvm_km.c15 Jun 2021 16:38:09 -  1.145
+++ uvm/uvm_km.c24 Oct 2021 13:23:22 -
@@ -239,8 +239,10 @@ uvm_km_suballoc(struct vm_map *map, vadd
  *the pages right away.(this gets called from uvm_unmap_...).
  */
 void
-uvm_km_pgremove(struct uvm_object *uobj, vaddr_t start, vaddr_t end)
+uvm_km_pgremove(struct uvm_object *uobj, vaddr_t startva, vaddr_t endva)
 {
+   const voff_t start = startva - vm_map_min(kernel_map);
+   const voff_t end = endva - vm_map_min(kernel_map);
struct vm_page *pp;
voff_t curoff;
int slot;
@@ -248,6 +250,7 @@ uvm_km_pgremove(struct uvm_object *uobj,
 
KASSERT(UVM_OBJ_IS_AOBJ(uobj));
 
+   pmap_remove(pmap_kernel(), startva, endva);
for (curoff = start ; curoff < end ; curoff += PAGE_SIZE) {
pp = uvm_pagelookup(uobj, curoff);
if (pp && pp->pg_flags & PG_BUSY) {
Index: uvm/uvm_map.c
===
RCS file: /cvs/src/sys/uvm/uvm_map.c,v
retrieving revision 1.278
diff -u -p -r1.278 uvm_map.c
--- uvm/uvm_map.c   5 Oct 2021 15:37:21 -   1.278
+++ uvm/uvm_map.c   24 Oct 2021 13:24:21 -
@@ -2155,10 +2155,8 @@ uvm_unmap_kill_entry(struct vm_map *map,
 * from the object.  offsets are always relative
 * to vm_map_min(kernel_map).
 */
-   pmap_remove(pmap_kernel(), entry->start, entry->end);
-   uvm_km_pgremove(entry->object.uvm_obj,
-   entry->start - vm_map_min(kernel_map),
-   entry->end - vm_map_min(kernel_map));
+   uvm_km_pgremove(entry->object.uvm_obj, entry->start,
+   entry->end);
 
/*
 * null out kernel_object reference, we've just



More uvm_obj_destroy()

2021-10-23 Thread Martin Pieuchot
Diff below is extracted from the current UVM unlocking diff.  It adds a
couple of uvm_obj_destroy() calls and moves some uvm_obj_init() calls
around.

uvm_obj_destroy() will be used to release the memory of the, possibly
shared, lock allocated in uvm_obj_init().  When it is called the object
should no longer have any pages attached to it, which is why I added the
corresponding KASSERT().
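
In other words the function is expected to eventually grow into something
like this (sketch only; the lock-release helper named in the comment is
hypothetical):

	void
	uvm_obj_destroy(struct uvm_object *uo)
	{
		KASSERT(RBT_EMPTY(uvm_objtree, &uo->memt));

		/* Later: drop the (possibly shared) vmobjlock reference
		 * here, e.g. with an rw_obj_free()-style helper. */
	}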

The uvm_obj_init() calls have been moved to satisfy lock assertions and to
reduce differences with NetBSD.  The tricky one is for vnodes, which are
never freed.

Comments?  Oks?

Index: kern/vfs_subr.c
===
RCS file: /cvs/src/sys/kern/vfs_subr.c,v
retrieving revision 1.309
diff -u -p -r1.309 vfs_subr.c
--- kern/vfs_subr.c 21 Oct 2021 09:59:14 -  1.309
+++ kern/vfs_subr.c 23 Oct 2021 09:53:48 -
@@ -410,6 +410,7 @@ getnewvnode(enum vtagtype tag, struct mo
vp = pool_get(&vnode_pool, PR_WAITOK | PR_ZERO);
vp->v_uvm = pool_get(&uvm_vnode_pool, PR_WAITOK | PR_ZERO);
vp->v_uvm->u_vnode = vp;
+   uvm_obj_init(&vp->v_uvm->u_obj, &uvm_vnodeops, 0);
RBT_INIT(buf_rb_bufs, &vp->v_bufs_tree);
cache_tree_init(&vp->v_nc_tree);
TAILQ_INIT(&vp->v_cache_dst);
Index: uvm/uvm_aobj.c
===
RCS file: /cvs/src/sys/uvm/uvm_aobj.c,v
retrieving revision 1.99
diff -u -p -r1.99 uvm_aobj.c
--- uvm/uvm_aobj.c  28 Jun 2021 11:19:01 -  1.99
+++ uvm/uvm_aobj.c  23 Oct 2021 09:52:02 -
@@ -372,6 +372,7 @@ uao_free(struct uvm_aobj *aobj)
/*
 * finally free the aobj itself
 */
+   uvm_obj_destroy(uobj);
pool_put(&uvm_aobj_pool, aobj);
 }
 
Index: uvm/uvm_device.c
===
RCS file: /cvs/src/sys/uvm/uvm_device.c,v
retrieving revision 1.64
diff -u -p -r1.64 uvm_device.c
--- uvm/uvm_device.c29 Jun 2021 01:46:35 -  1.64
+++ uvm/uvm_device.c23 Oct 2021 09:49:16 -
@@ -182,6 +182,7 @@ udv_attach(dev_t device, vm_prot_t acces
mtx_leave(&udv_lock);
/* NOTE: we could sleep in the following malloc() */
udv = malloc(sizeof(*udv), M_TEMP, M_WAITOK);
+   uvm_obj_init(&udv->u_obj, &uvm_deviceops, 1);
mtx_enter(&udv_lock);
 
/*
@@ -199,6 +200,7 @@ udv_attach(dev_t device, vm_prot_t acces
 */
if (lcv) {
mtx_leave(&udv_lock);
+   uvm_obj_destroy(&udv->u_obj);
free(udv, M_TEMP, sizeof(*udv));
continue;
}
@@ -207,7 +209,6 @@ udv_attach(dev_t device, vm_prot_t acces
 * we have it!   init the data structures, add to list
 * and return.
 */
-   uvm_obj_init(&udv->u_obj, &uvm_deviceops, 1);
udv->u_flags = 0;
udv->u_device = device;
LIST_INSERT_HEAD(&udv_list, udv, u_list);
@@ -275,6 +276,8 @@ again:
if (udv->u_flags & UVM_DEVICE_WANTED)
wakeup(udv);
mtx_leave(&udv_lock);
+
+   uvm_obj_destroy(uobj);
free(udv, M_TEMP, sizeof(*udv));
 }
 
Index: uvm/uvm_object.c
===
RCS file: /cvs/src/sys/uvm/uvm_object.c,v
retrieving revision 1.21
diff -u -p -r1.21 uvm_object.c
--- uvm/uvm_object.c12 Oct 2021 18:16:51 -  1.21
+++ uvm/uvm_object.c23 Oct 2021 09:49:57 -
@@ -66,9 +66,13 @@ uvm_obj_init(struct uvm_object *uobj, co
uobj->uo_refs = refs;
 }
 
+/*
+ * uvm_obj_destroy: destroy UVM memory object.
+ */
 void
 uvm_obj_destroy(struct uvm_object *uo)
 {
+   KASSERT(RBT_EMPTY(uvm_objtree, &uo->memt));
 }
 
 #ifndef SMALL_KERNEL
Index: uvm/uvm_vnode.c
===
RCS file: /cvs/src/sys/uvm/uvm_vnode.c,v
retrieving revision 1.118
diff -u -p -r1.118 uvm_vnode.c
--- uvm/uvm_vnode.c 20 Oct 2021 06:35:40 -  1.118
+++ uvm/uvm_vnode.c 23 Oct 2021 09:56:32 -
@@ -229,7 +229,8 @@ uvn_attach(struct vnode *vp, vm_prot_t a
 #endif
 
/* now set up the uvn. */
-   uvm_obj_init(&uvn->u_obj, &uvm_vnodeops, 1);
+   KASSERT(uvn->u_obj.uo_refs == 0);
+   uvn->u_obj.uo_refs++;
oldflags = uvn->u_flags;
uvn->u_flags = UVM_VNODE_VALID|UVM_VNODE_CANPERSIST;
uvn->u_nio = 0;



Please test: full poll/select(2) switch

2021-10-23 Thread Martin Pieuchot
Diff below switches both poll(2) and select(2) to the kqueue-based
implementation.

In addition it switches libevent(3) to use poll(2) by default for
testing purposes.

I don't have any open bug left with this diff and I'm happily running
GNOME with it.  So I'd be happy if you could try to break it and report
back.

Index: lib/libevent/event.c
===
RCS file: /cvs/src/lib/libevent/event.c,v
retrieving revision 1.41
diff -u -p -r1.41 event.c
--- lib/libevent/event.c1 May 2019 19:14:25 -   1.41
+++ lib/libevent/event.c23 Oct 2021 09:36:10 -
@@ -53,9 +53,9 @@ extern const struct eventop kqops;
 
 /* In order of preference */
 static const struct eventop *eventops[] = {
-   &kqops,
&pollops,
&selectops,
+   &kqops,
NULL
 };
 
Index: sys/kern/sys_generic.c
===
RCS file: /cvs/src/sys/kern/sys_generic.c,v
retrieving revision 1.137
diff -u -p -r1.137 sys_generic.c
--- sys/kern/sys_generic.c  15 Oct 2021 06:59:57 -  1.137
+++ sys/kern/sys_generic.c  23 Oct 2021 09:14:59 -
@@ -55,6 +55,7 @@
 #include 
 #include 
 #include 
+#include 
 #ifdef KTRACE
 #include 
 #endif
@@ -66,8 +67,23 @@
 
 #include 
 
-int selscan(struct proc *, fd_set *, fd_set *, int, int, register_t *);
-void pollscan(struct proc *, struct pollfd *, u_int, register_t *);
+/*
+ * Debug values:
+ *  1 - print implementation errors, things that should not happen.
+ *  2 - print ppoll(2) information, somewhat verbose
+ *  3 - print pselect(2) and ppoll(2) information, very verbose
+ */
+int kqpoll_debug = 0;
+#define DPRINTFN(v, x...) if (kqpoll_debug > v) {  \
+   printf("%s(%d): ", curproc->p_p->ps_comm, curproc->p_tid);  \
+   printf(x);  \
+}
+
+int pselregister(struct proc *, fd_set *[], fd_set *[], int, int *, int *);
+int pselcollect(struct proc *, struct kevent *, fd_set *[], int *);
+int ppollregister(struct proc *, struct pollfd *, int, int *);
+int ppollcollect(struct proc *, struct kevent *, struct pollfd *, u_int);
+
 int pollout(struct pollfd *, struct pollfd *, u_int);
 int dopselect(struct proc *, int, fd_set *, fd_set *, fd_set *,
 struct timespec *, const sigset_t *, register_t *);
@@ -584,11 +600,10 @@ int
 dopselect(struct proc *p, int nd, fd_set *in, fd_set *ou, fd_set *ex,
 struct timespec *timeout, const sigset_t *sigmask, register_t *retval)
 {
+   struct kqueue_scan_state scan;
fd_mask bits[6];
fd_set *pibits[3], *pobits[3];
-   struct timespec elapsed, start, stop;
-   uint64_t nsecs;
-   int s, ncoll, error = 0;
+   int error, ncollected = 0, nevents = 0;
u_int ni;
 
if (nd < 0)
@@ -618,6 +633,8 @@ dopselect(struct proc *p, int nd, fd_set
pobits[2] = (fd_set *)&bits[5];
}
 
+   kqpoll_init();
+
 #definegetbits(name, x) \
if (name && (error = copyin(name, pibits[x], ni))) \
goto done;
@@ -636,43 +653,61 @@ dopselect(struct proc *p, int nd, fd_set
if (sigmask)
dosigsuspend(p, *sigmask &~ sigcantmask);
 
-retry:
-   ncoll = nselcoll;
-   atomic_setbits_int(&p->p_flag, P_SELECT);
-   error = selscan(p, pibits[0], pobits[0], nd, ni, retval);
-   if (error || *retval)
+   /* Register kqueue events */
+   error = pselregister(p, pibits, pobits, nd, &nevents, &ncollected);
+   if (error != 0)
goto done;
-   if (timeout == NULL || timespecisset(timeout)) {
-   if (timeout != NULL) {
-   getnanouptime(&start);
-   nsecs = MIN(TIMESPEC_TO_NSEC(timeout), MAXTSLP);
-   } else
-   nsecs = INFSLP;
-   s = splhigh();
-   if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
-   splx(s);
-   goto retry;
-   }
-   atomic_clearbits_int(&p->p_flag, P_SELECT);
-   error = tsleep_nsec(&selwait, PSOCK | PCATCH, "select", nsecs);
-   splx(s);
+
+   /*
+* The poll/select family of syscalls has been designed to
+* block when file descriptors are not available, even if
+* there's nothing to wait for.
+*/
+   if (nevents == 0 && ncollected == 0) {
+   uint64_t nsecs = INFSLP;
+
if (timeout != NULL) {
-   getnanouptime(&stop);
-   timespecsub(&stop, &start, &elapsed);
-   timespecsub(timeout, &elapsed, timeout);
-   if (timeout->tv_sec < 0)
-   timespecclear(timeout);
+   if (!timespecisset(timeout))
+   goto done;
+   nsecs = MAX(1, MIN(TIMESPEC_TO_NSEC(timeout), MAXTSLP));
}
-   if (error == 0 || error == EWOULDBLOCK)
-  

Re: xhci uhub on arm64: handle device in SS_INACTIVE state

2021-10-22 Thread Martin Pieuchot
On 17/10/21(Sun) 09:06, Christopher Zimmermann wrote:
> Hi,
> 
> on my RK3399, a usb device connected to the USB 3 port is not detected
> during boot because it is in SS_INACTIVE (0x00c0) state:
> 
> uhub3 at usb3 configuration 1 interface 0 "Generic xHCI root hub" rev
> 3.00/1.00 addr 1
> uhub3: uhub_attach
> uhub3: 2 ports with 2 removable, self powered
> uhub3: intr status=0
> usb_needs_explore: usb3: not exploring before first explore
> uhub3: uhub_explore
> uhub3: port 1 status=0x02a0 change=0x
> uhub3: port 2 status=0x02c0 change=0x0040
> usb_explore: usb3: first explore done
> xhci1: port=2 change=0x04
> uhub3: intr status=0
> uhub3: uhub_explore
> uhub3: port 2 status=0x02c0 change=0x0040
> xhci1: port=2 change=0x04
> uhub3: intr status=0
> uhub3: uhub_explore
> uhub3: port 2 status=0x02c0 change=0x0040
> 
> [...]
> 
> [turn the usb device off and on again]
> 
> uhub3: intr status=0
> uhub3: uhub_explore
> uhub3: port 2 status=0x0203 change=0x0001
> usbd_reset_port: port 2 reset done
> xhci1: port=2 change=0x04
> uhub3: intr status=0
> uhub3: port 2 status=0x0203 change=0x
> umass0 at uhub3 port 2 configuration 1 interface 0 "ATEC Dual Disk Drive" rev 
> 3.00/1.08 addr 2
> 
> This might be because u-boot-aarch64-2021.10 from packages left it in that
> state.
> I added this code to reset a device locked in such a state:

It's not clear to me whether this is a warm reset or not.  If the port is
in SS.Inactive it needs a warm reset, no?

If so, could you add a comment on top of the block?  I'd also suggest
moving the block after the "warm reset change" (BH_PORT_RESET); this
might matter and at least matches Linux's logic.

I wish you could add the logic to properly check if a warm reset is
required by checking the proper bits against the port number, but we
can rely on UPS_C_PORT_LINK_STATE for now and do that in a second step.
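
Concretely, the condition I'd expect us to end up with looks roughly like
this (sketch on top of your diff; whether UPS_PORT_LS_COMP_MOD belongs
here is exactly the bit I'd like double-checked):

	if (change & UPS_C_PORT_LINK_STATE &&
	    sc->sc_hub->speed == USB_SPEED_SUPER &&
	    (UPS_PORT_LS_GET(status) == UPS_PORT_LS_SS_INACTIVE ||
	     UPS_PORT_LS_GET(status) == UPS_PORT_LS_COMP_MOD) &&
	    !(status & UPS_CURRENT_CONNECT_STATUS)) {
		/* warm reset the port, body as in your diff */
	}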

Comments below:

> Index: uhub.c
> ===
> RCS file: /cvs/src/sys/dev/usb/uhub.c,v
> retrieving revision 1.95
> diff -u -p -r1.95 uhub.c
> --- uhub.c  31 Jul 2020 10:49:33 -  1.95
> +++ uhub.c  17 Oct 2021 06:44:14 -
> @@ -414,6 +414,24 @@ uhub_explore(struct usbd_device *dev)
> change |= UPS_C_CONNECT_STATUS;
> }
> 
> +   if (change & UPS_C_PORT_LINK_STATE &&
> +   UPS_PORT_LS_GET(status) == UPS_PORT_LS_SS_INACTIVE &&

This should check for the speed of the HUB: 

sc->sc_hub->speed == USB_SPEED_SUPER

Should we also check if the link state is UPS_PORT_LS_COMP_MOD?

> +   ! (status & UPS_CURRENT_CONNECT_STATUS)) {
   ^
 Please drop the space here

> +   DPRINTF("%s: port %d is in in SS_INACTIVE.Quiet 
> state. "
> + "Reset port.\n",
> + sc->sc_dev.dv_xname, port);
> +   usbd_clear_port_feature(sc->sc_hub, port,
> +   UHF_C_PORT_RESET);
> +
> +   if (usbd_reset_port(sc->sc_hub, port)) {
> +   printf("%s: port %d reset failed\n",
> + DEVNAME(sc), port);
> +   return (-1);
> +   }
> +
> +   change |= UPS_C_CONNECT_STATUS;
> +   }
> +
> if (change & UPS_C_BH_PORT_RESET &&
> sc->sc_hub->speed == USB_SPEED_SUPER) {
> usbd_clear_port_feature(sc->sc_hub, port,
> 
> 
> Now the device attaches during boot. A redundant second reset of the device
> is performed during uhub_port_connect():
> 
> uhub3 at usb3 configuration 1 interface 0 "Generic xHCI root hub" rev
> 3.00/1.00 addr 1
> uhub3: uhub_attach
> uhub3: 2 ports with 2 removable, self powered
> xhci1: port=2 change=0x04
> uhub3: intr status=0
> usb_needs_explore: usb3: not exploring before first explore
> uhub3: uhub_explore
> uhub3: port 1 status=0x02a0 change=0x
> uhub3: port 2 status=0x02c0 change=0x0040
> uhub3: port 2 is in in SS_INACTIVE.Quiet state. Reset port.
> usbd_reset_port: port 2 reset done
> usb_explore: usb3: first explore done
> xhci1: port=2 change=0x04
> uhub3: intr status=0
> uhub3: uhub_explore
> uhub3: port 2 status=0x0203 change=0x0031
> uhub3: uhub_port_connect
> usbd_reset_port: port 2 reset done
> xhci1: port=2 change=0x04
> uhub3: intr status=0
> uhub3: port 2 status=0x0203 change=0x
> umass0 at uhub3 port 2 configuration 1 interface 0 "ATEC Dual Disk Drive" rev 
> 3.00/1.08 addr 2
> 
> 
> OK to commit this diff? Or should this be done some other way?
> 
> 
> Christopher
> 



Re: Make pipe event filters MP-safe

2021-10-22 Thread Martin Pieuchot
On 22/10/21(Fri) 13:15, Visa Hankala wrote:
> This diff makes pipe event filters ready to run without the kernel lock.
> The code pattern in the callbacks is the same as in sockets. Pipes
> have a klist lock already.
> 
> So far, pipe event filters have used read-locking. The patch changes
> that to write-locking for clarity. This should not be a real loss,
> though, because the lock is fine-grained and there is little multiple-
> readers parallelism to be utilized.

The removal of the KERNEL_LOCK() in pipeselwakeup() makes me very happy.
As found with patrick@, this accounted for a non-negligible amount of
spinning time in:
  https://undeadly.org/features/2021/09/2ytHD+googlemap_arm64.svg

ok mpi@

> Index: kern/sys_pipe.c
> ===
> RCS file: src/sys/kern/sys_pipe.c,v
> retrieving revision 1.127
> diff -u -p -r1.127 sys_pipe.c
> --- kern/sys_pipe.c   22 Oct 2021 05:00:26 -  1.127
> +++ kern/sys_pipe.c   22 Oct 2021 12:17:57 -
> @@ -78,20 +78,30 @@ static const struct fileops pipeops = {
>  
>  void filt_pipedetach(struct knote *kn);
>  int  filt_piperead(struct knote *kn, long hint);
> +int  filt_pipereadmodify(struct kevent *kev, struct knote *kn);
> +int  filt_pipereadprocess(struct knote *kn, struct kevent *kev);
> +int  filt_piperead_common(struct knote *kn, struct pipe *rpipe);
>  int  filt_pipewrite(struct knote *kn, long hint);
> +int  filt_pipewritemodify(struct kevent *kev, struct knote *kn);
> +int  filt_pipewriteprocess(struct knote *kn, struct kevent *kev);
> +int  filt_pipewrite_common(struct knote *kn, struct pipe *rpipe);
>  
>  const struct filterops pipe_rfiltops = {
> - .f_flags= FILTEROP_ISFD,
> + .f_flags= FILTEROP_ISFD | FILTEROP_MPSAFE,
>   .f_attach   = NULL,
>   .f_detach   = filt_pipedetach,
>   .f_event= filt_piperead,
> + .f_modify   = filt_pipereadmodify,
> + .f_process  = filt_pipereadprocess,
>  };
>  
>  const struct filterops pipe_wfiltops = {
> - .f_flags= FILTEROP_ISFD,
> + .f_flags= FILTEROP_ISFD | FILTEROP_MPSAFE,
>   .f_attach   = NULL,
>   .f_detach   = filt_pipedetach,
>   .f_event= filt_pipewrite,
> + .f_modify   = filt_pipewritemodify,
> + .f_process  = filt_pipewriteprocess,
>  };
>  
>  /*
> @@ -362,9 +372,7 @@ pipeselwakeup(struct pipe *cpipe)
>   cpipe->pipe_state &= ~PIPE_SEL;
>   selwakeup(>pipe_sel);
>   } else {
> - KERNEL_LOCK();
> - KNOTE(>pipe_sel.si_note, NOTE_SUBMIT);
> - KERNEL_UNLOCK();
> + KNOTE(>pipe_sel.si_note, 0);
>   }
>  
>   if (cpipe->pipe_state & PIPE_ASYNC)
> @@ -929,45 +937,76 @@ filt_pipedetach(struct knote *kn)
>  }
>  
>  int
> -filt_piperead(struct knote *kn, long hint)
> +filt_piperead_common(struct knote *kn, struct pipe *rpipe)
>  {
> - struct pipe *rpipe = kn->kn_fp->f_data, *wpipe;
> - struct rwlock *lock = rpipe->pipe_lock;
> + struct pipe *wpipe;
> +
> + rw_assert_wrlock(rpipe->pipe_lock);
>  
> - if ((hint & NOTE_SUBMIT) == 0)
> - rw_enter_read(lock);
>   wpipe = pipe_peer(rpipe);
>  
>   kn->kn_data = rpipe->pipe_buffer.cnt;
>  
>   if ((rpipe->pipe_state & PIPE_EOF) || wpipe == NULL) {
> - if ((hint & NOTE_SUBMIT) == 0)
> - rw_exit_read(lock);
>   kn->kn_flags |= EV_EOF; 
>   if (kn->kn_flags & __EV_POLL)
>   kn->kn_flags |= __EV_HUP;
>   return (1);
>   }
>  
> - if ((hint & NOTE_SUBMIT) == 0)
> - rw_exit_read(lock);
> -
>   return (kn->kn_data > 0);
>  }
>  
>  int
> -filt_pipewrite(struct knote *kn, long hint)
> +filt_piperead(struct knote *kn, long hint)
>  {
> - struct pipe *rpipe = kn->kn_fp->f_data, *wpipe;
> - struct rwlock *lock = rpipe->pipe_lock;
> + struct pipe *rpipe = kn->kn_fp->f_data;
> +
> + return (filt_piperead_common(kn, rpipe));
> +}
> +
> +int
> +filt_pipereadmodify(struct kevent *kev, struct knote *kn)
> +{
> + struct pipe *rpipe = kn->kn_fp->f_data;
> + int active;
> +
> + rw_enter_write(rpipe->pipe_lock);
> + knote_modify(kev, kn);
> + active = filt_piperead_common(kn, rpipe);
> + rw_exit_write(rpipe->pipe_lock);
> +
> + return (active);
> +}
> +
> +int
> +filt_pipereadprocess(struct knote *kn, struct kevent *kev)
> +{
> + struct pipe *rpipe = kn->kn_fp->f_data;
> + int active;
> +
> + rw_enter_write(rpipe->pipe_lock);
> + if (kev != NULL && (kn->kn_flags & EV_ONESHOT))
> + active = 1;
> + else
> + active = filt_piperead_common(kn, rpipe);
> + if (active)
> + knote_submit(kn, kev);
> + rw_exit_write(rpipe->pipe_lock);
> +
> + return (active);
> +}
> +
> +int
> +filt_pipewrite_common(struct knote *kn, struct pipe *rpipe)
> +{
> + struct pipe *wpipe;
> +
> + 

Re: Set klist lock for sockets, v2

2021-10-22 Thread Martin Pieuchot
On 22/10/21(Fri) 13:11, Visa Hankala wrote:
> Here is another attempt to set klist lock for sockets. This is a revised
> version of a patch that I posted in January [1].
> 
> Using solock() for the klists is probably the easiest way at the time
> being. However, the lock is a potential point of contention because of
> the underlying big-lock design. The increase of overhead is related to
> adding and removing event registrations. With persistent registrations
> the overhead is unchanged.
> 
> As a result, socket and named FIFO event filters should be ready to run
> without the kernel lock. The f_event, f_modify and f_process callbacks
> should be MP-safe already.
> 
> [1] https://marc.info/?l=openbsd-tech=160986578724696
> 
> OK?

I've been running with this and unlocked sowakeup() for quite some time
now.

ok mpi@

> Index: kern/uipc_socket.c
> ===
> RCS file: src/sys/kern/uipc_socket.c,v
> retrieving revision 1.265
> diff -u -p -r1.265 uipc_socket.c
> --- kern/uipc_socket.c14 Oct 2021 23:05:10 -  1.265
> +++ kern/uipc_socket.c22 Oct 2021 12:17:57 -
> @@ -84,7 +84,7 @@ int filt_solistenprocess(struct knote *k
>  int  filt_solisten_common(struct knote *kn, struct socket *so);
>  
>  const struct filterops solisten_filtops = {
> - .f_flags= FILTEROP_ISFD,
> + .f_flags= FILTEROP_ISFD | FILTEROP_MPSAFE,
>   .f_attach   = NULL,
>   .f_detach   = filt_sordetach,
>   .f_event= filt_solisten,
> @@ -93,7 +93,7 @@ const struct filterops solisten_filtops 
>  };
>  
>  const struct filterops soread_filtops = {
> - .f_flags= FILTEROP_ISFD,
> + .f_flags= FILTEROP_ISFD | FILTEROP_MPSAFE,
>   .f_attach   = NULL,
>   .f_detach   = filt_sordetach,
>   .f_event= filt_soread,
> @@ -102,7 +102,7 @@ const struct filterops soread_filtops = 
>  };
>  
>  const struct filterops sowrite_filtops = {
> - .f_flags= FILTEROP_ISFD,
> + .f_flags= FILTEROP_ISFD | FILTEROP_MPSAFE,
>   .f_attach   = NULL,
>   .f_detach   = filt_sowdetach,
>   .f_event= filt_sowrite,
> @@ -111,7 +111,7 @@ const struct filterops sowrite_filtops =
>  };
>  
>  const struct filterops soexcept_filtops = {
> - .f_flags= FILTEROP_ISFD,
> + .f_flags= FILTEROP_ISFD | FILTEROP_MPSAFE,
>   .f_attach   = NULL,
>   .f_detach   = filt_sordetach,
>   .f_event= filt_soread,
> @@ -169,6 +169,8 @@ socreate(int dom, struct socket **aso, i
>   return (EPROTOTYPE);
>   so = pool_get(_pool, PR_WAITOK | PR_ZERO);
>   rw_init(>so_lock, "solock");
> + klist_init(>so_rcv.sb_sel.si_note, _klistops, so);
> + klist_init(>so_snd.sb_sel.si_note, _klistops, so);
>   sigio_init(>so_sigio);
>   TAILQ_INIT(>so_q0);
>   TAILQ_INIT(>so_q);
> @@ -258,6 +260,8 @@ sofree(struct socket *so, int s)
>   }
>   }
>   sigio_free(>so_sigio);
> + klist_free(>so_rcv.sb_sel.si_note);
> + klist_free(>so_snd.sb_sel.si_note);
>  #ifdef SOCKET_SPLICE
>   if (so->so_sp) {
>   if (issplicedback(so)) {
> @@ -2038,9 +2042,9 @@ soo_kqfilter(struct file *fp, struct kno
>  {
>   struct socket *so = kn->kn_fp->f_data;
>   struct sockbuf *sb;
> + int s;
>  
> - KERNEL_ASSERT_LOCKED();
> -
> + s = solock(so);
>   switch (kn->kn_filter) {
>   case EVFILT_READ:
>   if (so->so_options & SO_ACCEPTCONN)
> @@ -2058,10 +2062,12 @@ soo_kqfilter(struct file *fp, struct kno
>   sb = >so_rcv;
>   break;
>   default:
> + sounlock(so, s);
>   return (EINVAL);
>   }
>  
>   klist_insert_locked(>sb_sel.si_note, kn);
> + sounlock(so, s);
>  
>   return (0);
>  }
> @@ -2071,9 +2077,7 @@ filt_sordetach(struct knote *kn)
>  {
>   struct socket *so = kn->kn_fp->f_data;
>  
> - KERNEL_ASSERT_LOCKED();
> -
> - klist_remove_locked(>so_rcv.sb_sel.si_note, kn);
> + klist_remove(>so_rcv.sb_sel.si_note, kn);
>  }
>  
>  int
> @@ -2159,9 +2163,7 @@ filt_sowdetach(struct knote *kn)
>  {
>   struct socket *so = kn->kn_fp->f_data;
>  
> - KERNEL_ASSERT_LOCKED();
> -
> - klist_remove_locked(>so_snd.sb_sel.si_note, kn);
> + klist_remove(>so_snd.sb_sel.si_note, kn);
>  }
>  
>  int
> @@ -2284,6 +2286,36 @@ filt_solistenprocess(struct knote *kn, s
>   return (rv);
>  }
>  
> +void
> +klist_soassertlk(void *arg)
> +{
> + struct socket *so = arg;
> +
> + soassertlocked(so);
> +}
> +
> +int
> +klist_solock(void *arg)
> +{
> + struct socket *so = arg;
> +
> + return (solock(so));
> +}
> +
> +void
> +klist_sounlock(void *arg, int ls)
> +{
> + struct socket *so = arg;
> +
> + sounlock(so, ls);
> +}
> +
> +const struct klistops socket_klistops = {
> + .klo_assertlk   = klist_soassertlk,
> + .klo_lock   = 

POLLHUP vs EVFILT_EXCEPT semantic

2021-10-22 Thread Martin Pieuchot
Last year we added the new EVFILT_EXCEPT filter type to kqueue in
order to report conditions currently available via POLLPRI/POLLRDBAND
in poll(2) and select(2).

This new filter has been implemented in tty and socket by re-using the
existing kqueue's "read" filter.  This has a downside which is the filter
will also trigger if any data is available for reading.

This "feature" makes it impossible to correctly implement poll(2)'s
"empty" condition mode.  If no bit are set in the `events' pollfd
structure we still need to return POLLHUP.  But if the filter triggers
when there's data to read, it means POLLIN not POLLHUP.

So I'd like to change the existing EVFILT_EXCEPT filters to no longer
fire if there is something to read.  Diff below does that and adds a
new filter for FIFOs necessary for poll(2) support.

Ok?
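
For reference, the userland behaviour I'm trying to preserve looks like
this (illustration only, `wait_for_hangup' is a made-up example):

	#include <poll.h>

	/*
	 * A caller that asks for no event at all must still see POLLHUP
	 * when the other end goes away, and must not be woken up just
	 * because data happens to be readable.
	 */
	void
	wait_for_hangup(int fd)
	{
		struct pollfd pfd = { .fd = fd, .events = 0 };

		if (poll(&pfd, 1, INFTIM) == 1 && (pfd.revents & POLLHUP)) {
			/* the peer is gone; POLLIN alone must not get here */
		}
	}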

Index: kern/tty_pty.c
===
RCS file: /cvs/src/sys/kern/tty_pty.c,v
retrieving revision 1.108
diff -u -p -r1.108 tty_pty.c
--- kern/tty_pty.c  8 Feb 2021 09:18:30 -   1.108
+++ kern/tty_pty.c  22 Oct 2021 12:49:12 -
@@ -107,6 +107,7 @@ voidfilt_ptcrdetach(struct knote *);
 intfilt_ptcread(struct knote *, long);
 void   filt_ptcwdetach(struct knote *);
 intfilt_ptcwrite(struct knote *, long);
+intfilt_ptcexcept(struct knote *, long);
 
 static struct pt_softc **ptyarralloc(int);
 static int check_pty(int);
@@ -670,16 +671,6 @@ filt_ptcread(struct knote *kn, long hint
tp = pti->pt_tty;
kn->kn_data = 0;
 
-   if (kn->kn_sfflags & NOTE_OOB) {
-   /* If in packet or user control mode, check for data. */
-   if (((pti->pt_flags & PF_PKT) && pti->pt_send) ||
-   ((pti->pt_flags & PF_UCNTL) && pti->pt_ucntl)) {
-   kn->kn_fflags |= NOTE_OOB;
-   kn->kn_data = 1;
-   return (1);
-   }
-   return (0);
-   }
if (ISSET(tp->t_state, TS_ISOPEN)) {
if (!ISSET(tp->t_state, TS_TTSTOP))
kn->kn_data = tp->t_outq.c_cc;
@@ -731,6 +722,34 @@ filt_ptcwrite(struct knote *kn, long hin
return (kn->kn_data > 0);
 }
 
+int
+filt_ptcexcept(struct knote *kn, long hint)
+{
+   struct pt_softc *pti = (struct pt_softc *)kn->kn_hook;
+   struct tty *tp;
+
+   tp = pti->pt_tty;
+
+   if (kn->kn_sfflags & NOTE_OOB) {
+   /* If in packet or user control mode, check for data. */
+   if (((pti->pt_flags & PF_PKT) && pti->pt_send) ||
+   ((pti->pt_flags & PF_UCNTL) && pti->pt_ucntl)) {
+   kn->kn_fflags |= NOTE_OOB;
+   kn->kn_data = 1;
+   return (1);
+   }
+   return (0);
+   }
+   if (!ISSET(tp->t_state, TS_CARR_ON)) {
+   kn->kn_flags |= EV_EOF;
+   if (kn->kn_flags & __EV_POLL)
+   kn->kn_flags |= __EV_HUP;
+   return (1);
+   }
+
+   return (0);
+}
+
 const struct filterops ptcread_filtops = {
.f_flags= FILTEROP_ISFD,
.f_attach   = NULL,
@@ -749,7 +768,7 @@ const struct filterops ptcexcept_filtops
.f_flags= FILTEROP_ISFD,
.f_attach   = NULL,
.f_detach   = filt_ptcrdetach,
-   .f_event= filt_ptcread,
+   .f_event= filt_ptcexcept,
 };
 
 int
Index: kern/uipc_socket.c
===
RCS file: /cvs/src/sys/kern/uipc_socket.c,v
retrieving revision 1.265
diff -u -p -r1.265 uipc_socket.c
--- kern/uipc_socket.c  14 Oct 2021 23:05:10 -  1.265
+++ kern/uipc_socket.c  22 Oct 2021 12:49:12 -
@@ -78,6 +78,10 @@ int  filt_sowrite(struct knote *kn, long 
 intfilt_sowritemodify(struct kevent *kev, struct knote *kn);
 intfilt_sowriteprocess(struct knote *kn, struct kevent *kev);
 intfilt_sowrite_common(struct knote *kn, struct socket *so);
+intfilt_soexcept(struct knote *kn, long hint);
+intfilt_soexceptmodify(struct kevent *kev, struct knote *kn);
+intfilt_soexceptprocess(struct knote *kn, struct kevent *kev);
+intfilt_soexcept_common(struct knote *kn, struct socket *so);
 intfilt_solisten(struct knote *kn, long hint);
 intfilt_solistenmodify(struct kevent *kev, struct knote *kn);
 intfilt_solistenprocess(struct knote *kn, struct kevent *kev);
@@ -114,9 +118,9 @@ const struct filterops soexcept_filtops 
.f_flags= FILTEROP_ISFD,
.f_attach   = NULL,
.f_detach   = filt_sordetach,
-   .f_event= filt_soread,
-   .f_modify   = filt_soreadmodify,
-   .f_process  = filt_soreadprocess,
+   .f_event= filt_soexcept,
+   .f_modify   = filt_soexceptmodify,
+   .f_process  = filt_soexceptprocess,
 };
 
 #ifndef SOMINCONN
@@ -2089,13 +2093,7 @@ 

Re: vnode lock: remove VLOCKSWORK flag

2021-10-15 Thread Martin Pieuchot
On 15/10/21(Fri) 09:27, Sebastien Marie wrote:
> Hi,
> 
> The following diff removes VLOCKSWORK flag.

Nice.

> This flag is currently used to mark or unmark a vnode to actively
> check vnode locking semantic (when compiled with VFSLCKDEBUG).
>  
> Currently, VLOCKSWORK flag isn't properly set for several FS
> implementation which have full locking support, specially:
>  - cd9660
>  - udf
>  - fuse
>  - msdosfs
>  - tmpfs
> 
> Instead of using a particular flag, I propose to directly check if
> v_op->vop_islocked is nullop or not to activate or not the vnode
> locking checks.

I wonder if we shouldn't get rid of those checks and instead make
VOP_ISLOCKED() deal with that.

VOP_ISLOCKED() is inconsistent.  It returns the value of rrw_status(9)
or EOPNOTSUPP if `vop_islocked' is NULL.

But this is a change in behavior that has a broader scope, so it should
be done separately.

> Some alternate methods might be possible, like having a specific
> member inside struct vops. But it will only duplicate the fact that
> nullop is used as lock mecanism.
> 
> I also slightly changed ASSERT_VP_ISLOCKED(vp) macro:
> - evaluate vp argument only once
> - explicitly check if VOP_ISLOCKED() != LK_EXCLUSIVE (it might returns
>   error or 'locked by some else', and it doesn't mean "locked by me")
> - show the VOP_ISLOCKED returned code in panic message
> 
> Some code are using ASSERT_VP_ISLOCKED() like code. I kept them simple.
> 
> The direct impact on snapshots should be low as VFSLCKDEBUG isn't set
> by default.
> 
> Comments or OK ?

ok mpi@

> diff e44725a8dd99f82f94f37ecff5c0e710c4dba97e 
> /home/semarie/repos/openbsd/sys-clean
> blob - c752dd99e9ef62b05162cfeda67913ab5bccf06e
> file + kern/vfs_subr.c
> --- kern/vfs_subr.c
> +++ kern/vfs_subr.c
> @@ -1075,9 +1075,6 @@ vclean(struct vnode *vp, int flags, struct proc *p)
>   vp->v_op = _vops;
>   VN_KNOTE(vp, NOTE_REVOKE);
>   vp->v_tag = VT_NON;
> -#ifdef VFSLCKDEBUG
> - vp->v_flag &= ~VLOCKSWORK;
> -#endif
>   mtx_enter(_mtx);
>   vp->v_lflag &= ~VXLOCK;
>   if (vp->v_lflag & VXWANT) {
> @@ -1930,7 +1927,7 @@ vinvalbuf(struct vnode *vp, int flags, struct ucred *c
>   int s, error;
>  
>  #ifdef VFSLCKDEBUG
> - if ((vp->v_flag & VLOCKSWORK) && !VOP_ISLOCKED(vp))
> + if ((vp->v_op->vop_islocked != nullop) && !VOP_ISLOCKED(vp))
>   panic("%s: vp isn't locked, vp %p", __func__, vp);
>  #endif
>  
> blob - caf2dc327bfc2f5a001bcee80edd90938497ef99
> file + kern/vfs_vops.c
> --- kern/vfs_vops.c
> +++ kern/vfs_vops.c
> @@ -48,11 +48,15 @@
>  #include 
>  
>  #ifdef VFSLCKDEBUG
> -#define ASSERT_VP_ISLOCKED(vp) do {  \
> - if (((vp)->v_flag & VLOCKSWORK) && !VOP_ISLOCKED(vp)) { \
> - VOP_PRINT(vp);  \
> - panic("vp not locked"); \
> - }   \
> +#define ASSERT_VP_ISLOCKED(vp) do {  \
> + struct vnode *_vp = (vp);   \
> + int r;  \
> + if (_vp->v_op->vop_islocked == nullop)  \
> + break;  \
> + if ((r = VOP_ISLOCKED(_vp)) != LK_EXCLUSIVE) {  \
> + VOP_PRINT(_vp); \
> + panic("%s: vp not locked, vp %p, %d", __func__, _vp, r);\
> + }   \
>  } while (0)
>  #else
>  #define ASSERT_VP_ISLOCKED(vp)  /* nothing */
> blob - 81b900e83d2071d8450f35cfae42c6cb91f1a414
> file + nfs/nfs_node.c
> --- nfs/nfs_node.c
> +++ nfs/nfs_node.c
> @@ -133,9 +133,6 @@ loop:
>   }
>  
>   vp = nvp;
> -#ifdef VFSLCKDEBUG
> - vp->v_flag |= VLOCKSWORK;
> -#endif
>   rrw_init_flags(>n_lock, "nfsnode", RWL_DUPOK | RWL_IS_VNODE);
>   vp->v_data = np;
>   /* we now have an nfsnode on this vnode */
> blob - 3668f954a9aab3fd49ed5e41e7d4ab51b4bf0a90
> file + sys/vnode.h
> --- sys/vnode.h
> +++ sys/vnode.h
> @@ -146,8 +146,7 @@ struct vnode {
>  #define  VCLONED 0x0400  /* vnode was cloned */
>  #define  VALIASED0x0800  /* vnode has an alias */
> -#define  VLARVAL 0x1000  /* vnode data not yet set up by higher level */
> -#define  VLOCKSWORK  0x4000  /* FS supports locking discipline */
> -#define  VCLONE  0x8000  /* vnode is a clone */
> +#define  VCLONE  0x4000  /* vnode is a clone */
>  
>  /*
>   * (v_bioflag) Flags that may be manipulated by interrupt handlers
> blob - d859d216b40ebb2f5cce1eb5cf0becbfff21a638
> file + ufs/ext2fs/ext2fs_subr.c
> --- ufs/ext2fs/ext2fs_subr.c
> +++ ufs/ext2fs/ext2fs_subr.c
> @@ -170,9 +170,6 @@ ext2fs_vinit(struct mount *mp, struct vnode **vpp)
>   nvp->v_data = vp->v_data;
>  

poll(2) on top of kqueue

2021-10-14 Thread Martin Pieuchot
Diff below is the counterpart of the select(2) one I just committed to
make poll(2) and ppoll(2) use kqueue internally.

They use the same logic as select(2): convert pollfd into kqueue events
with EV_SET(2) then wait in kqueue_scan().

To make this implementation compatible with the existing poll(2) semantic  
I added a new specific kqueue-filter to FIFOs to handle the case where
POLLOUT is specified on a read-only event.  Thanks to millert@ for the
idea.  The regress sys/fifofs is passing with that.

As for the select(2) diff I'm currently interested in knowing if you
find any incompatibility with the current behavior. 

Thanks for testing,
Martin

Index: kern/sys_generic.c
===
RCS file: /cvs/src/sys/kern/sys_generic.c,v
retrieving revision 1.136
diff -u -p -r1.136 sys_generic.c
--- kern/sys_generic.c  14 Oct 2021 08:46:01 -  1.136
+++ kern/sys_generic.c  14 Oct 2021 09:00:22 -
@@ -81,6 +81,8 @@ int kqpoll_debug = 0;
 
 int pselregister(struct proc *, fd_set *[], int, int *);
 int pselcollect(struct proc *, struct kevent *, fd_set *[], int *);
+int ppollregister(struct proc *, struct pollfd *, int, int *);
+int ppollcollect(struct proc *, struct kevent *, struct pollfd *, u_int);
 
 int pollout(struct pollfd *, struct pollfd *, u_int);
 int dopselect(struct proc *, int, fd_set *, fd_set *, fd_set *,
@@ -769,6 +771,7 @@ pselregister(struct proc *p, fd_set *pib
/* FALLTHROUGH */
case EOPNOTSUPP:/* No underlying kqfilter */
case EINVAL:/* Unimplemented filter */
+   case EPERM: /* Specific to FIFO */
error = 0;
break;
case ENXIO: /* Device has been detached */
@@ -899,31 +902,132 @@ doselwakeup(struct selinfo *sip)
}
 }
 
-void
-pollscan(struct proc *p, struct pollfd *pl, u_int nfd, register_t *retval)
+int
+ppollregister_evts(struct proc *p, struct kevent *kevp, int nkev,
+struct pollfd *pl)
 {
-   struct filedesc *fdp = p->p_fd;
-   struct file *fp;
-   u_int i;
-   int n = 0;
+   int i, error, nevents = 0;
 
-   for (i = 0; i < nfd; i++, pl++) {
-   /* Check the file descriptor. */
-   if (pl->fd < 0) {
-   pl->revents = 0;
-   continue;
+   KASSERT(pl->revents == 0);
+
+#ifdef KTRACE
+   if (KTRPOINT(p, KTR_STRUCT))
+   ktrevent(p, kevp, nkev);
+#endif
+   for (i = 0; i < nkev; i++, kevp++) {
+again:
+   error = kqueue_register(p->p_kq, kevp, p);
+   switch (error) {
+   case 0:
+   nevents++;
+   break;
+   case EOPNOTSUPP:/* No underlying kqfilter */
+   case EINVAL:/* Unimplemented filter */
+   break;
+   case EBADF: /* Bad file descriptor */
+   pl->revents |= POLLNVAL;
+   break;
+   case EPERM: /* Specific to FIFO */
+   KASSERT(kevp->filter == EVFILT_WRITE);
+   if (nkev == 1) {
+   /*
+* If this is the only filter make sure
+* POLLHUP is passed to userland.
+*/
+   kevp->filter = EVFILT_EXCEPT;
+   goto again;
+   }
+   break;
+   case EPIPE: /* Specific to pipes */
+   KASSERT(kevp->filter == EVFILT_WRITE);
+   pl->revents |= POLLHUP;
+   break;
+   default:
+#ifdef DIAGNOSTIC
+   DPRINTFN(0, "poll err %lu fd %d revents %02x serial"
+   " %lu filt %d ERROR=%d\n",
+   ((unsigned long)kevp->udata - p->p_kq_serial),
+   pl->fd, pl->revents, p->p_kq_serial, kevp->filter,
+   error);
+#endif
+   /* FALLTHROUGH */
+   case ENXIO: /* Device has been detached */
+   pl->revents |= POLLERR;
+   break;
}
-   if ((fp = fd_getfile(fdp, pl->fd)) == NULL) {
-   pl->revents = POLLNVAL;
-   n++;
+   }
+
+   return (nevents);
+}
+
+/*
+ * Convert pollfd into kqueue events and register them on the
+ * per-thread queue.
+ *
+ * Return the number of pollfd that triggered at least one error and aren't
+ * completly monitored.  These pollfd should have the correponding error bit
+ * set in `revents'.
+ *
+ * At most 3 events can correspond to a single pollfd.
+ */
+int

Re: Switch to kqueue based select(2)

2021-10-13 Thread Martin Pieuchot
On 13/10/21(Wed) 11:41, Alexander Bluhm wrote:
> On Sat, Oct 02, 2021 at 09:10:13AM +0200, Martin Pieuchot wrote:
> > ok?
> 
> OK bluhm@
> 
> > +   /* Maxium number of events per iteration */
> 
> Maximum
> 
> > +int
> > +pselcollect(struct proc *p, struct kevent *kevp, fd_set *pobits[3],
> > +int *ncollected)
> > +{
> > +#ifdef DIAGNOSTIC
> > +   /* Filter out and lazily delete spurious events */
> > +   if ((unsigned long)kevp->udata != p->p_kq_serial) {
> > +   DPRINTFN(0, "select fd %u mismatched serial %lu\n",
> > +   (int)kevp->ident, p->p_kq_serial);
> > +   kevp->flags = EV_DISABLE|EV_DELETE;
> > +   kqueue_register(p->p_kq, kevp, p);
> > +   return (0);
> > +   }
> > +#endif
> 
> Why is it DIAGNOSTIC?  Either it should not happen, then call panic().
> Or it is a valid corner case, then remove #ifdef DIAGNOSTIC.
> 
> Different behavior with and without DIAGNOSTIC seems bad.

Indeed.  It should not be in DIAGNOSTIC; that's a leftover from a previous
iteration of the diff.  I'll fix both points before committing.

Thanks for the review.



Re: mi_switch() & setting `p_stat'

2021-10-03 Thread Martin Pieuchot
On 02/10/21(Sat) 21:09, Mark Kettenis wrote:
> > Date: Sat, 2 Oct 2021 20:35:41 +0200
> > From: Martin Pieuchot 
> > [...] 
> > There's no sleeping point but a call to wakeup().  This wakeup() is
> > supposed to wake a btrace(8) process.  But if the curproc, which just
> > added itself to the global sleep queue, ends up in the same bucket as
> > the btrace process, the KASSERT() line 565 of kern/kern_synch.c will
> > trigger:
> > 
> > /*
> >  * If the rwlock passed to rwsleep() is contended, the
> >  * CPU will end up calling wakeup() between sleep_setup()
> >  * and sleep_finish().
> >  */
> > if (p == curproc) {
> > KASSERT(p->p_stat == SONPROC);
> > continue;
> > }
> 
> Ah, right.  But that means the comment isn't accurate.  At least there
> are other cases that make us hit that codepath.
> 
> How useful is that KASSERT in catching actual bugs?

I added the KASSERT() to limit the scope of the check.  If the test is
true `curproc' is obviously on the CPU.  Its usefulness is questionable.

So a simpler fix would be to remove the assert; the diff below does that
and updates the comment, ok?

Index: kern/kern_synch.c
===
RCS file: /cvs/src/sys/kern/kern_synch.c,v
retrieving revision 1.179
diff -u -p -r1.179 kern_synch.c
--- kern/kern_synch.c   9 Sep 2021 18:41:39 -   1.179
+++ kern/kern_synch.c   3 Oct 2021 08:48:28 -
@@ -558,14 +558,11 @@ wakeup_n(const volatile void *ident, int
for (p = TAILQ_FIRST(qp); p != NULL && n != 0; p = pnext) {
pnext = TAILQ_NEXT(p, p_runq);
/*
-* If the rwlock passed to rwsleep() is contended, the
-* CPU will end up calling wakeup() between sleep_setup()
-* and sleep_finish().
+* This happens if wakeup(9) is called after enqueuing
+* itself on the sleep queue and both `ident' collide.
 */
-   if (p == curproc) {
-   KASSERT(p->p_stat == SONPROC);
+   if (p == curproc)
continue;
-   }
 #ifdef DIAGNOSTIC
if (p->p_stat != SSLEEP && p->p_stat != SSTOP)
panic("wakeup: p_stat is %d", (int)p->p_stat);



Re: mi_switch() & setting `p_stat'

2021-10-02 Thread Martin Pieuchot
On 02/10/21(Sat) 20:24, Mark Kettenis wrote:
> > Date: Sat, 2 Oct 2021 19:55:49 +0200
> > From: Martin Pieuchot 
> > 
> > When a thread running on a CPU schedules itself out, it does the following
> > (pseudo_code):
> > 
> > SCHED_LOCK()
> > curproc->p_stat = SSLEEP;
> > // some more operations
> > mi_switch()
> > 
> > The problem with this is that any instrumentation between setting `p_stat'
> > and cpu_switchto() is incorrect because 'curproc' is still being executed
> > and is not yet sleeping.  Its `p_stat' should be SONPROC and not SSLEEP.
> 
> Hmm, well, we're holding the scheduler lock, so nothing should really
> look at our state at this point...

I added many TRACEPOINT() to investigate the scheduler's behaviour.  They
look at those states.

> > It is possible to reproduce the problem with the following btrace(8) script:
> > 
> >   tracepoint:sched:enqueue { printf("%d -> enqueue (%d)\n", arg0, arg1); }
> >   tracepoint:sched:dequeue { printf("%d <- dequeue (%d)\n", arg0, arg1); }
> >   tracepoint:sched:on__cpu { printf("%d -- on cpu (%d)\n", tid, pid); }
> > 
> > At which point the KASSERT() in wakeup_n() triggers if `curproc' is going to
> > sleep and its sleep channel collides with the running btrace(8) program:
> > 
> >   dt_prov_static_hook() at dt_prov_static_hook+0xe4
> >   remrunqueue() at remrunqueue+0x1a4
> >   sched_chooseproc() at sched_chooseproc+0x200
> >   mi_switch() at mi_switch+0x178
> >   sleep_finish() at sleep_finish+0x1d0
> >   tsleep() at tsleep+0x100
> >   biowait() at biowait+0x4c
> >   ffs_read() at ffs_read+0x1c0
> >   VOP_READ() at VOP_READ+0x44
> >   vn_read() at vn_read+0x84
> >   dofilereadv() at dofilereadv+0x8c
> >   sys_read() at sys_read+0x5c
> 
> which suggests that something fishy is going on here.  Did we
> accidentally introduce a sleeping point in the scheduler?

There's no sleeping point but a call to wakeup().  This wakeup() is
supposed to wake a btrace(8) process.  But if the curproc, which just
added itself to the global sleep queue, ends up in the same bucket as
the btrace process, the KASSERT() line 565 of kern/kern_synch.c will
trigger:

/*
 * If the rwlock passed to rwsleep() is contended, the
 * CPU will end up calling wakeup() between sleep_setup()
 * and sleep_finish().
 */
if (p == curproc) {
KASSERT(p->p_stat == SONPROC);
continue;
}

> > To fix this we should set `p_stat' as late a possible, diff below does that
> > just before calling cpu_switchto().
> > 
> > Note that there's an exception for SRUN because setrunqueue() change 
> > `p_stat'
> > to indicate the thread is on a queue.  I'll discuss that in an upcoming 
> > diff.
> > 
> > ok?
> 
> I'm not necessarily against this diff, but it may hide bugs.  And...

Updated diff that uses a char.

Index: kern/kern_sched.c
===
RCS file: /cvs/src/sys/kern/kern_sched.c,v
retrieving revision 1.73
diff -u -p -r1.73 kern_sched.c
--- kern/kern_sched.c   9 Sep 2021 18:41:39 -   1.73
+++ kern/kern_sched.c   2 Oct 2021 17:00:52 -
@@ -144,10 +144,9 @@ sched_idle(void *v)
 */
SCHED_LOCK(s);
cpuset_add(_idle_cpus, ci);
-   p->p_stat = SSLEEP;
p->p_cpu = ci;
atomic_setbits_int(>p_flag, P_CPUPEG);
-   mi_switch();
+   mi_switch(SSLEEP);
cpuset_del(_idle_cpus, ci);
SCHED_UNLOCK(s);
 
@@ -159,8 +158,7 @@ sched_idle(void *v)
struct proc *dead;
 
SCHED_LOCK(s);
-   p->p_stat = SSLEEP;
-   mi_switch();
+   mi_switch(SSLEEP);
SCHED_UNLOCK(s);
 
while ((dead = LIST_FIRST(>spc_deadproc))) {
@@ -625,7 +623,7 @@ sched_peg_curproc(struct cpu_info *ci)
atomic_setbits_int(>p_flag, P_CPUPEG);
setrunqueue(ci, p, p->p_usrpri);
p->p_ru.ru_nvcsw++;
-   mi_switch();
+   mi_switch(SRUN);
SCHED_UNLOCK(s);
 }
 
Index: kern/kern_synch.c
===
RCS file: /cvs/src/sys/kern/kern_synch.c,v
retrieving revision 1.179
diff -u -p -r1.179 kern_synch.c
--- kern/kern_synch.c   9 Sep 2021 18:41:39 -   1.179
+++ kern/kern_synch.c   2 Oct 2021 17:00:52 -
@@ -421,10 +421,9 @@ sleep_finish(struct sleep_state *sls, in
}
 
if (do_sleep) {
-   p->p_stat = SSLEEP;
p-

mi_switch() & setting `p_stat'

2021-10-02 Thread Martin Pieuchot
When a thread running on a CPU schedules itself out, it does the following
(pseudo_code):

SCHED_LOCK()
curproc->p_stat = SSLEEP;
// some more operations
mi_switch()

The problem with this is that any instrumentation between setting `p_stat'
and cpu_switchto() is incorrect because 'curproc' is still being executed
and is not yet sleeping.  Its `p_stat' should be SONPROC and not SSLEEP.

It is possible to reproduce the problem with the following btrace(8) script:

  tracepoint:sched:enqueue { printf("%d -> enqueue (%d)\n", arg0, arg1); }
  tracepoint:sched:dequeue { printf("%d <- dequeue (%d)\n", arg0, arg1); }
  tracepoint:sched:on__cpu { printf("%d -- on cpu (%d)\n", tid, pid); }

At which point the KASSERT() in wakeup_n() triggers if `curproc' is going to
sleep and its sleep channel collides with the running btrace(8) program:

  dt_prov_static_hook() at dt_prov_static_hook+0xe4
  remrunqueue() at remrunqueue+0x1a4
  sched_chooseproc() at sched_chooseproc+0x200
  mi_switch() at mi_switch+0x178
  sleep_finish() at sleep_finish+0x1d0
  tsleep() at tsleep+0x100
  biowait() at biowait+0x4c
  ffs_read() at ffs_read+0x1c0
  VOP_READ() at VOP_READ+0x44
  vn_read() at vn_read+0x84
  dofilereadv() at dofilereadv+0x8c
  sys_read() at sys_read+0x5c

To fix this we should set `p_stat' as late as possible; the diff below does
that just before calling cpu_switchto().

Note that there's an exception for SRUN because setrunqueue() changes
`p_stat' to indicate the thread is on a queue.  I'll discuss that in an
upcoming diff.

ok?
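
To illustrate the intent, here is a rough sketch of where the assignment
ends up (sketch only, the real change is the sched_bsd.c hunk of the diff
and mi_switch() obviously does more than this):

	void
	mi_switch(int newstat)
	{
		struct proc *p = curproc, *nextproc;

		/* accounting and run queue handling stay as they are */
		nextproc = sched_chooseproc();

		/*
		 * Any tracepoint firing up to this point still sees
		 * p_stat == SONPROC, which matches reality.  SRUN is
		 * already set by setrunqueue(), see the note above.
		 */
		if (newstat != SRUN)
			p->p_stat = newstat;

		cpu_switchto(p, nextproc);
	}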

Index: kern/kern_sched.c
===
RCS file: /cvs/src/sys/kern/kern_sched.c,v
retrieving revision 1.73
diff -u -p -r1.73 kern_sched.c
--- kern/kern_sched.c   9 Sep 2021 18:41:39 -   1.73
+++ kern/kern_sched.c   2 Oct 2021 17:00:52 -
@@ -144,10 +144,9 @@ sched_idle(void *v)
 */
SCHED_LOCK(s);
cpuset_add(_idle_cpus, ci);
-   p->p_stat = SSLEEP;
p->p_cpu = ci;
atomic_setbits_int(>p_flag, P_CPUPEG);
-   mi_switch();
+   mi_switch(SSLEEP);
cpuset_del(_idle_cpus, ci);
SCHED_UNLOCK(s);
 
@@ -159,8 +158,7 @@ sched_idle(void *v)
struct proc *dead;
 
SCHED_LOCK(s);
-   p->p_stat = SSLEEP;
-   mi_switch();
+   mi_switch(SSLEEP);
SCHED_UNLOCK(s);
 
while ((dead = LIST_FIRST(>spc_deadproc))) {
@@ -625,7 +623,7 @@ sched_peg_curproc(struct cpu_info *ci)
atomic_setbits_int(>p_flag, P_CPUPEG);
setrunqueue(ci, p, p->p_usrpri);
p->p_ru.ru_nvcsw++;
-   mi_switch();
+   mi_switch(SRUN);
SCHED_UNLOCK(s);
 }
 
Index: kern/kern_synch.c
===
RCS file: /cvs/src/sys/kern/kern_synch.c,v
retrieving revision 1.179
diff -u -p -r1.179 kern_synch.c
--- kern/kern_synch.c   9 Sep 2021 18:41:39 -   1.179
+++ kern/kern_synch.c   2 Oct 2021 17:00:52 -
@@ -421,10 +421,9 @@ sleep_finish(struct sleep_state *sls, in
}
 
if (do_sleep) {
-   p->p_stat = SSLEEP;
p->p_ru.ru_nvcsw++;
SCHED_ASSERT_LOCKED();
-   mi_switch();
+   mi_switch(SSLEEP);
} else {
unsleep(p);
}
@@ -603,7 +602,7 @@ sys_sched_yield(struct proc *p, void *v,
newprio = max(newprio, q->p_runpri);
setrunqueue(p->p_cpu, p, newprio);
p->p_ru.ru_nvcsw++;
-   mi_switch();
+   mi_switch(SRUN);
SCHED_UNLOCK(s);
 
return (0);
Index: kern/kern_sig.c
===
RCS file: /cvs/src/sys/kern/kern_sig.c,v
retrieving revision 1.283
diff -u -p -r1.283 kern_sig.c
--- kern/kern_sig.c 28 Sep 2021 10:00:18 -  1.283
+++ kern/kern_sig.c 2 Oct 2021 17:00:52 -
@@ -1347,7 +1347,6 @@ proc_stop(struct proc *p, int sw)
SCHED_ASSERT_LOCKED();
 #endif
 
-   p->p_stat = SSTOP;
atomic_clearbits_int(>ps_flags, PS_WAITED);
atomic_setbits_int(>ps_flags, PS_STOPPED);
atomic_setbits_int(>p_flag, P_SUSPSIG);
@@ -1357,7 +1356,7 @@ proc_stop(struct proc *p, int sw)
 */
softintr_schedule(proc_stop_si);
if (sw)
-   mi_switch();
+   mi_switch(SSTOP);
 }
 
 /*
@@ -1979,8 +1978,7 @@ single_thread_check_locked(struct proc *
}
 
/* not exiting and don't need to unwind, so suspend */
-   p->p_stat = SSTOP;
-   mi_switch();
+   mi_switch(SSTOP);
} while (pr->ps_single != NULL);
}
 
Index: kern/sched_bsd.c
===
RCS file: /cvs/src/sys/kern/sched_bsd.c,v
retrieving revision 1.69

Re: rtfree(): "rt->rt_refcnt > 0" assertion

2021-10-02 Thread Martin Pieuchot
On 15/09/21(Wed) 01:23, Vitaliy Makkoveev wrote:
> We have weird `rt_refcnt' check in rtfree():

> 
> 497 rtfree(struct rtentry *rt)
> 498 {
>   ...
> 504 refcnt = (int)atomic_dec_int_nv(>rt_refcnt);
> 505 if (refcnt <= 0) {
> 506 KASSERT(!ISSET(rt->rt_flags, RTF_UP));
> 507 KASSERT(!RT_ROOT(rt));
> 508 atomic_dec_int();
> 509 if (refcnt < 0) {
> 510 printf("rtfree: %p not freed (neg refs)\n", rt);
> 511 return;
> 512 }  
> 
> We underflow `rt_refcnt' when we missed to get reference to this `rt' or
> we did extra release. This is the bug which should be exposed. But
> according current code this `rt' is just leaked. Also it's easy to miss
> this error condition because we only print error message.

Yes, all of this is intentional.  If you screw up reference counting, a
leak is better than a crash that you can't debug.  At least people can
report there is a leak.

> I propose to put "rt->rt_refcnt > 0" assertion before we decrement
> `rt_refcnt'. This makes reference counting errors more notable when they
> are. Also I changed `rt_refcnt' definition to unsigned integer.

Such an assert isn't safe because the value can be modified by another
thread.  You can only assert on the value of a private variable or if
you're under a critical section.
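
To illustrate with a buggy extra rtfree() while `rt_refcnt' is 1 (sketch):

	/*
	 * CPU0: KASSERT(rt->rt_refcnt > 0)	-> reads 1, passes
	 * CPU1: KASSERT(rt->rt_refcnt > 0)	-> reads 1, passes
	 * CPU0: atomic_dec_int_nv() == 0	-> frees the route
	 * CPU1: atomic_dec_int_nv() wraps	-> use-after-free, never caught
	 *
	 * Only the private snapshot returned by the atomic op (the local
	 * `refcnt' in the current code) can be checked reliably.
	 */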

> I didn't find any "rtfree: ... not freed (neg refs)" report, so it
> looks like we can't hit this assertion, but I like to commit this
> diff after release. But nothing stops to test it and provide feedback :)

We did hit it many many times in the past.  Please do not change this.

> Index: sys/net/route.c
> ===
> RCS file: /cvs/src/sys/net/route.c,v
> retrieving revision 1.399
> diff -u -p -r1.399 route.c
> --- sys/net/route.c   25 May 2021 22:45:09 -  1.399
> +++ sys/net/route.c   14 Sep 2021 21:47:11 -
> @@ -496,20 +496,15 @@ rtref(struct rtentry *rt)
>  void
>  rtfree(struct rtentry *rt)
>  {
> - int  refcnt;
> -
>   if (rt == NULL)
>   return;
>  
> - refcnt = (int)atomic_dec_int_nv(>rt_refcnt);
> - if (refcnt <= 0) {
> + KASSERT(rt->rt_refcnt > 0);
> +
> + if (atomic_dec_int_nv(>rt_refcnt) == 0) {
>   KASSERT(!ISSET(rt->rt_flags, RTF_UP));
>   KASSERT(!RT_ROOT(rt));
>   atomic_dec_int();
> - if (refcnt < 0) {
> - printf("rtfree: %p not freed (neg refs)\n", rt);
> - return;
> - }
>  
>   KERNEL_LOCK();
>   rt_timer_remove_all(rt);
> Index: sys/net/route.h
> ===
> RCS file: /cvs/src/sys/net/route.h,v
> retrieving revision 1.185
> diff -u -p -r1.185 route.h
> --- sys/net/route.h   17 Mar 2021 09:05:42 -  1.185
> +++ sys/net/route.h   14 Sep 2021 21:47:11 -
> @@ -113,7 +113,7 @@ struct rtentry {
>   struct rt_kmetrics rt_rmx;  /* metrics used by rx'ing protocols */
>   unsigned int rt_ifidx;  /* the answer: interface to use */
>   unsigned int rt_flags;  /* up/down?, host/net */
> - int  rt_refcnt; /* # held references */
> + unsigned int rt_refcnt; /* # held references */
>   int  rt_plen;   /* prefix length */
>   uint16_t rt_labelid;/* route label ID */
>   uint8_t  rt_priority;   /* routing priority to use */
> 



Switch to kqueue based select(2)

2021-10-02 Thread Martin Pieuchot
As discussed during k2k21 I'd like to switch to the new select(2)
implementation early during this release cycle.  I'd like to first
make sure there's no regression for select(2) and poll(2) then work
towards improving the latency and removing the contention on those
syscalls. 

This has been largely tested but I'd be happy to have more info; at least
espie@ volunteered to put it in through a bulk ;)

ok?

Index: kern/sys_generic.c
===
RCS file: /cvs/src/sys/kern/sys_generic.c,v
retrieving revision 1.135
diff -u -p -r1.135 sys_generic.c
--- kern/sys_generic.c  8 Jan 2021 09:29:04 -   1.135
+++ kern/sys_generic.c  1 Oct 2021 20:01:44 -
@@ -55,6 +55,7 @@
 #include 
 #include 
 #include 
+#include 
 #ifdef KTRACE
 #include 
 #endif
@@ -66,8 +67,21 @@
 
 #include 
 
-int selscan(struct proc *, fd_set *, fd_set *, int, int, register_t *);
-void pollscan(struct proc *, struct pollfd *, u_int, register_t *);
+/*
+ * Debug values:
+ *  1 - print implementation errors, things that should not happen.
+ *  2 - print ppoll(2) information, somewhat verbose
+ *  3 - print pselect(2) and ppoll(2) information, very verbose
+ */
+int kqpoll_debug = 0;
+#define DPRINTFN(v, x...) if (kqpoll_debug > v) {  \
+   printf("%s(%d): ", curproc->p_p->ps_comm, curproc->p_tid);  \
+   printf(x);  \
+}
+
+int pselregister(struct proc *, fd_set *[], int, int *);
+int pselcollect(struct proc *, struct kevent *, fd_set *[], int *);
+
 int pollout(struct pollfd *, struct pollfd *, u_int);
 int dopselect(struct proc *, int, fd_set *, fd_set *, fd_set *,
 struct timespec *, const sigset_t *, register_t *);
@@ -584,11 +598,10 @@ int
 dopselect(struct proc *p, int nd, fd_set *in, fd_set *ou, fd_set *ex,
 struct timespec *timeout, const sigset_t *sigmask, register_t *retval)
 {
+   struct kqueue_scan_state scan;
fd_mask bits[6];
fd_set *pibits[3], *pobits[3];
-   struct timespec elapsed, start, stop;
-   uint64_t nsecs;
-   int s, ncoll, error = 0;
+   int error, ncollected = 0, nevents = 0;
u_int ni;
 
if (nd < 0)
@@ -618,6 +631,8 @@ dopselect(struct proc *p, int nd, fd_set
pobits[2] = (fd_set *)[5];
}
 
+   kqpoll_init();
+
 #definegetbits(name, x) \
if (name && (error = copyin(name, pibits[x], ni))) \
goto done;
@@ -636,43 +651,61 @@ dopselect(struct proc *p, int nd, fd_set
if (sigmask)
dosigsuspend(p, *sigmask &~ sigcantmask);
 
-retry:
-   ncoll = nselcoll;
-   atomic_setbits_int(>p_flag, P_SELECT);
-   error = selscan(p, pibits[0], pobits[0], nd, ni, retval);
-   if (error || *retval)
+   /* Register kqueue events */
+   error = pselregister(p, pibits, nd, );
+   if (error != 0)
goto done;
-   if (timeout == NULL || timespecisset(timeout)) {
-   if (timeout != NULL) {
-   getnanouptime();
-   nsecs = MIN(TIMESPEC_TO_NSEC(timeout), MAXTSLP);
-   } else
-   nsecs = INFSLP;
-   s = splhigh();
-   if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
-   splx(s);
-   goto retry;
-   }
-   atomic_clearbits_int(>p_flag, P_SELECT);
-   error = tsleep_nsec(, PSOCK | PCATCH, "select", nsecs);
-   splx(s);
+
+   /*
+* The poll/select family of syscalls has been designed to
+* block when file descriptors are not available, even if
+* there's nothing to wait for.
+*/
+   if (nevents == 0) {
+   uint64_t nsecs = INFSLP;
+
if (timeout != NULL) {
-   getnanouptime();
-   timespecsub(, , );
-   timespecsub(timeout, , timeout);
-   if (timeout->tv_sec < 0)
-   timespecclear(timeout);
+   if (!timespecisset(timeout))
+   goto done;
+   nsecs = MAX(1, MIN(TIMESPEC_TO_NSEC(timeout), MAXTSLP));
}
-   if (error == 0 || error == EWOULDBLOCK)
-   goto retry;
+   error = tsleep_nsec(>p_kq, PSOCK | PCATCH, "kqsel", nsecs);
+   /* select is not restarted after signals... */
+   if (error == ERESTART)
+   error = EINTR;
+   if (error == EWOULDBLOCK)
+   error = 0;
+   goto done;
}
-done:
-   atomic_clearbits_int(>p_flag, P_SELECT);
-   /* select is not restarted after signals... */
-   if (error == ERESTART)
-   error = EINTR;
-   if (error == EWOULDBLOCK)
-   error = 0;
+
+   /* Collect at 

PGO_NOWAIT & time to fault again

2021-10-02 Thread Martin Pieuchot
Diff below brings back the fix for the deadlock between uvn_io() and
uvn_flush() (uvm/uvm_vnode.c r1.110) in a way that doesn't introduce a
lock ordering issue with the inode lock.

This solution makes a thread return VM_PAGER_AGAIN and restart the fault
if there's some contention on the underlying vnode. This approach had
previously been reverted because tb@ and robert@ reported that chrome &
firefox were starting very slowly with it.

This delay when starting a highly contended multi-threaded process is due
to the 1sec delay in uvm_fault_lower() if VM_PAGER_AGAIN is returned.
So the diff below works around this by using a very small value.  It is
not clear to me what the correct value should be there; using 5msec
basically reduces the 'sleep' time to the overhead of inserting itself on
the global sleep queue.

This has been tested as part of the bigger UVM unlocking diff but I'd
appreciate more specific tests.

I won't put this in before snapshots are build again ;)

Comments?  Oks?

Index: uvm/uvm_fault.c
===
RCS file: /cvs/src/sys/uvm/uvm_fault.c,v
retrieving revision 1.120
diff -u -p -r1.120 uvm_fault.c
--- uvm/uvm_fault.c 26 Mar 2021 13:40:05 -  1.120
+++ uvm/uvm_fault.c 2 Oct 2021 06:31:48 -
@@ -1260,7 +1260,7 @@ uvm_fault_lower(struct uvm_faultinfo *uf
 
if (result == VM_PAGER_AGAIN) {
tsleep_nsec(, PVM, "fltagain2",
-   SEC_TO_NSEC(1));
+   MSEC_TO_NSEC(5));
return ERESTART;
}
 
Index: uvm/uvm_pager.h
===
RCS file: /cvs/src/sys/uvm/uvm_pager.h,v
retrieving revision 1.32
diff -u -p -r1.32 uvm_pager.h
--- uvm/uvm_pager.h 12 Mar 2021 14:15:49 -  1.32
+++ uvm/uvm_pager.h 2 Oct 2021 06:31:48 -
@@ -111,6 +111,7 @@ struct uvm_pagerops {
 #define PGO_LOCKED 0x040   /* fault data structures are locked [get] */
 #define PGO_PDFREECLUST0x080   /* daemon's free cluster flag [uvm_pager_put] */
 #define PGO_REALLOCSWAP0x100   /* reallocate swap area [pager_dropcluster] */
+#define PGO_NOWAIT 0x200   /* do not wait for inode lock */
 
 /* page we are not interested in getting */
 #define PGO_DONTCARE ((struct vm_page *) -1L)  /* [get only] */
Index: uvm/uvm_vnode.c
===
RCS file: /cvs/src/sys/uvm/uvm_vnode.c,v
retrieving revision 1.114
diff -u -p -r1.114 uvm_vnode.c
--- uvm/uvm_vnode.c 16 Jun 2021 09:02:21 -  1.114
+++ uvm/uvm_vnode.c 2 Oct 2021 06:31:48 -
@@ -90,9 +90,6 @@ intuvn_io(struct uvm_vnode *, vm_page
 int uvn_put(struct uvm_object *, vm_page_t *, int, boolean_t);
 voiduvn_reference(struct uvm_object *);
 
-int uvm_vnode_lock(struct uvm_vnode *);
-voiduvm_vnode_unlock(struct uvm_vnode *);
-
 /*
  * master pager structure
  */
@@ -878,16 +875,11 @@ uvn_cluster(struct uvm_object *uobj, vof
 int
 uvn_put(struct uvm_object *uobj, struct vm_page **pps, int npages, int flags)
 {
-   struct uvm_vnode *uvn = (struct uvm_vnode *)uobj;
int retval;
 
KERNEL_ASSERT_LOCKED();
 
-   retval = uvm_vnode_lock(uvn);
-   if (retval)
-   return retval;
-   retval = uvn_io(uvn, pps, npages, flags, UIO_WRITE);
-   uvm_vnode_unlock(uvn);
+   retval = uvn_io((struct uvm_vnode*)uobj, pps, npages, flags, UIO_WRITE);
 
return retval;
 }
@@ -905,10 +897,9 @@ int
 uvn_get(struct uvm_object *uobj, voff_t offset, struct vm_page **pps,
 int *npagesp, int centeridx, vm_prot_t access_type, int advice, int flags)
 {
-   struct uvm_vnode *uvn = (struct uvm_vnode *)uobj;
voff_t current_offset;
struct vm_page *ptmp;
-   int lcv, result, gotpages, retval;
+   int lcv, result, gotpages;
boolean_t done;
 
KERNEL_ASSERT_LOCKED();
@@ -983,18 +974,6 @@ uvn_get(struct uvm_object *uobj, voff_t 
}
 
/*
-* Before getting non-resident pages which must be populate with data
-* using I/O on the backing vnode, lock the same vnode. Such pages are
-* about to be allocated and busied (i.e. PG_BUSY) by the current
-* thread. Allocating and busying the page(s) before acquiring the
-* vnode lock could cause a deadlock with uvn_flush() which acquires the
-* vnode lock before waiting on pages to become unbusy and then flushed.
-*/
-   retval = uvm_vnode_lock(uvn);
-   if (retval)
-   return retval;
-
-   /*
 * step 2: get non-resident or busy pages.
 * data structures are unlocked.
 *
@@ -1080,15 +1059,14 @@ uvn_get(struct uvm_object *uobj, voff_t 
 * we have a "fake/busy/clean" page that we just allocated.  do
 

i386: pmap_collect()

2021-10-01 Thread Martin Pieuchot
Diff below turns i386's pmap_collect() into a noop like it is on
amd64/arm64/powerpc64...  This is part of the UVM unlocking diff and
might no longer be necessary now that pmap_extract() has been fixed.

So I'd like to know if we want to align i386's behavior with the other
archs, which should help us debug MI issues, or if I should drop this diff.

Since sthen@ tested it, I'd be in favor of putting it in.  ok?

Index: arch/i386/i386/pmap.c
===
RCS file: /cvs/src/sys/arch/i386/i386/pmap.c,v
retrieving revision 1.217
diff -u -p -r1.217 pmap.c
--- arch/i386/i386/pmap.c   11 Sep 2021 18:08:32 -  1.217
+++ arch/i386/i386/pmap.c   1 Oct 2021 17:04:20 -
@@ -2259,13 +2259,6 @@ pmap_unwire_86(struct pmap *pmap, vaddr_
 void
 pmap_collect(struct pmap *pmap)
 {
-   /*
-* free all of the pt pages by removing the physical mappings
-* for its entire address space.
-*/
-
-   pmap_do_remove(pmap, VM_MIN_ADDRESS, VM_MAX_ADDRESS,
-   PMAP_REMOVE_SKIPWIRED);
 }
 
 /*



Unref/free amap w/o KERNEL_LOCK()

2021-10-01 Thread Martin Pieuchot
amap operations are already serialized by their own lock so it is
possible to free them w/o holding the KERNEL_LOCK().  This has been
tested by many as part of the UVM unlocking diff.

ok?

Index: uvm/uvm_map.c
===
RCS file: /cvs/src/sys/uvm/uvm_map.c,v
retrieving revision 1.277
diff -u -p -r1.277 uvm_map.c
--- uvm/uvm_map.c   17 Jun 2021 16:10:39 -  1.277
+++ uvm/uvm_map.c   1 Oct 2021 17:02:29 -
@@ -1570,9 +1570,15 @@ uvm_unmap_detach(struct uvm_map_deadq *d
int waitok = flags & UVM_PLA_WAITOK;
 
TAILQ_FOREACH_SAFE(entry, deadq, dfree.deadq, tmp) {
+   /* Drop reference to amap, if we've got one. */
+   if (entry->aref.ar_amap)
+   amap_unref(entry->aref.ar_amap,
+   entry->aref.ar_pageoff,
+   atop(entry->end - entry->start),
+   flags & AMAP_REFALL);
+
/* Skip entries for which we have to grab the kernel lock. */
-   if (entry->aref.ar_amap || UVM_ET_ISSUBMAP(entry) ||
-   UVM_ET_ISOBJ(entry))
+   if (UVM_ET_ISSUBMAP(entry) || UVM_ET_ISOBJ(entry))
continue;
 
TAILQ_REMOVE(deadq, entry, dfree.deadq);
@@ -1586,13 +1592,6 @@ uvm_unmap_detach(struct uvm_map_deadq *d
while ((entry = TAILQ_FIRST(deadq)) != NULL) {
if (waitok)
uvm_pause();
-   /* Drop reference to amap, if we've got one. */
-   if (entry->aref.ar_amap)
-   amap_unref(entry->aref.ar_amap,
-   entry->aref.ar_pageoff,
-   atop(entry->end - entry->start),
-   flags & AMAP_REFALL);
-
/* Drop reference to our backing object, if we've got one. */
if (UVM_ET_ISSUBMAP(entry)) {
/* ... unlikely to happen, but play it safe */



wakeup_n() w/o DIAGNOSTIC fix

2021-09-09 Thread Martin Pieuchot
The check to avoid a panic for contended rwlock(9) should be outside of
#ifdef DIAGNOSTIC.

ok?

Index: kern//kern_synch.c
===
RCS file: /cvs/src/sys/kern/kern_synch.c,v
retrieving revision 1.177
diff -u -p -r1.177 kern_synch.c
--- kern//kern_synch.c  4 Mar 2021 09:02:37 -   1.177
+++ kern//kern_synch.c  9 Sep 2021 15:01:07 -
@@ -556,7 +556,6 @@ wakeup_n(const volatile void *ident, int
qp = [LOOKUP(ident)];
for (p = TAILQ_FIRST(qp); p != NULL && n != 0; p = pnext) {
pnext = TAILQ_NEXT(p, p_runq);
-#ifdef DIAGNOSTIC
/*
 * If the rwlock passed to rwsleep() is contended, the
 * CPU will end up calling wakeup() between sleep_setup()
@@ -566,6 +565,7 @@ wakeup_n(const volatile void *ident, int
KASSERT(p->p_stat == SONPROC);
continue;
}
+#ifdef DIAGNOSTIC
if (p->p_stat != SSLEEP && p->p_stat != SSTOP)
panic("wakeup: p_stat is %d", (int)p->p_stat);
 #endif



Re: mutex(9): initialize some more mutexes before use?

2021-09-08 Thread Martin Pieuchot
On 07/09/21(Tue) 14:19, Patrick Wildt wrote:
> Hi,
> 
> I was playing around a little with the mutex code and found that on
> arm64 there some uninitialized mutexes out there.
> 
> I think the arm64 specific one is comparatively easy to solve.  We
> either initialize the mtx when we initialize the rest of the pmap, or
> we move it into the global definition of those.  I opted for the former
> version.

Is the kernel pmap mutex supposed to be used?  On i386 it isn't, so the
mutex's IPL is set to -1 and we added a KASSERT() in splraise() to spot
any mistake.
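
The pattern is roughly the following (sketch, not the literal i386 code):

	/* give the never-to-be-used kernel pmap mutex an impossible IPL */
	mtx_init(&pmap_kernel()->pm_mtx, -1);

	/* and at the top of splraise(), catch any mtx_enter() on it */
	KASSERT(ncpl >= IPL_NONE);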

> The other one prolly needs more discussion/debugging.  So uvm_init()
> calls first pmap_init() and then uvm_km_page_init().  The latter does
> initialize the mutex, but arm64's pmap_init() already uses pools, which
> uses km_alloc, which then uses that mutex.  Now one easy fix would be
> to just initialize the definition right away instead of during runtime.
> 
> But there might be the question if arm64's pmap is allowed to use pools
> and km_alloc during pmap_init.

That's a common question for the family of pmaps calling pool_setlowat()
in pmap_init().  That's where pool_prime() is called from.

> #0  0xff800073f984 in mtx_enter (mtx=0xff8000f3b048 ) 
> at /usr/src/sys/kern/kern_lock.c:281
> #1  0xff8000937e6c in km_alloc (sz= dwarf expression opcode 0xa3>, kv=0xff8000da6a30 , 
> kp=0xff8000da6a48 , kd=0xff8000e934d8)
> at /usr/src/sys/uvm/uvm_km.c:899
> #2  0xff800084d804 in pool_page_alloc (pp= Unhandled dwarf expression opcode 0xa3>, flags= Unhandled dwarf expression opcode 0xa3>,
> slowdown= 0xa3>) at /usr/src/sys/kern/subr_pool.c:1633
> #3  0xff800084f8dc in pool_allocator_alloc (pp=0xff8000ea6e40 
> , flags=65792, slowdown=0xff80026cd098) at 
> /usr/src/sys/kern/subr_pool.c:1602
> #4  0xff800084ef08 in pool_p_alloc (pp=0xff8000ea6e40 
> , flags=2, slowdown=0xff8000e9359c) at 
> /usr/src/sys/kern/subr_pool.c:926
> #5  0xff800084f808 in pool_prime (pp=, n= variable: Unhandled dwarf expression opcode 0xa3>) at 
> /usr/src/sys/kern/subr_pool.c:896
> #6  0xff800048c20c in pmap_init () at 
> /usr/src/sys/arch/arm64/arm64/pmap.c:1682
> #7  0xff80009384dc in uvm_init () at /usr/src/sys/uvm/uvm_init.c:118
> #8  0xff800048e664 in main (framep= dwarf expression opcode 0xa3>) at /usr/src/sys/kern/init_main.c:235
> 
> diff --git a/sys/arch/arm64/arm64/pmap.c b/sys/arch/arm64/arm64/pmap.c
> index 79a344cc84e..f070f4540ec 100644
> --- a/sys/arch/arm64/arm64/pmap.c
> +++ b/sys/arch/arm64/arm64/pmap.c
> @@ -1308,10 +1308,12 @@ pmap_bootstrap(long kvo, paddr_t lpt1, long 
> kernelstart, long kernelend,
>   pmap_kernel()->pm_vp.l1 = (struct pmapvp1 *)va;
>   pmap_kernel()->pm_privileged = 1;
>   pmap_kernel()->pm_asid = 0;
> + mtx_init(_kernel()->pm_mtx, IPL_VM);
>  
>   pmap_tramp.pm_vp.l1 = (struct pmapvp1 *)va + 1;
>   pmap_tramp.pm_privileged = 1;
>   pmap_tramp.pm_asid = 0;
> + mtx_init(_tramp.pm_mtx, IPL_VM);
>  
>   /* Mark ASID 0 as in-use. */
>   pmap_asid[0] |= (3U << 0);
> diff --git a/sys/uvm/uvm_km.c b/sys/uvm/uvm_km.c
> index 4a60377e9d7..e77afeda832 100644
> --- a/sys/uvm/uvm_km.c
> +++ b/sys/uvm/uvm_km.c
> @@ -644,7 +644,7 @@ uvm_km_page_lateinit(void)
>   * not zero filled.
>   */
>  
> -struct uvm_km_pages uvm_km_pages;
> +struct uvm_km_pages uvm_km_pages = { .mtx = MUTEX_INITIALIZER(IPL_VM) };
>  
>  void uvm_km_createthread(void *);
>  void uvm_km_thread(void *);
> @@ -664,7 +664,6 @@ uvm_km_page_init(void)
>   int len, bulk;
>   vaddr_t addr;
>  
> - mtx_init(_km_pages.mtx, IPL_VM);
>   if (!uvm_km_pages.lowat) {
>   /* based on physmem, calculate a good value here */
>   uvm_km_pages.lowat = physmem / 256;
> 



Re: [please test] amd64: schedule clock interrupts against system clock

2021-09-07 Thread Martin Pieuchot
On 07/09/21(Tue) 21:47, Patrick Wildt wrote:
> Am Tue, Sep 07, 2021 at 02:43:22PM +0200 schrieb Patrick Wildt:
> > Am Mon, Sep 06, 2021 at 09:43:29PM +0200 schrieb Patrick Wildt:
> > > Am Fri, Jul 30, 2021 at 07:55:29PM +0200 schrieb Alexander Bluhm:
> > > > On Mon, Jul 26, 2021 at 08:12:39AM -0500, Scott Cheloha wrote:
> > > > > On Fri, Jun 25, 2021 at 06:09:27PM -0500, Scott Cheloha wrote:
> > > > > 1 month bump.  I really appreciate the tests I've gotten so far, thank
> > > > > you.
> > > > 
> > > > On my Xeon machine it works and all regress tests pass.
> > > > 
> > > > But it fails on my old Opteron machine.  It hangs after attaching
> > > > cpu1.
> > > 
> > > This seems to be caused by contention on the mutex in i8254's gettick().
> > > 
> > > With Scott's diff, delay_func is i8254_delay() on that old AMD machine.
> > > Its gettick() implementation uses a mutex to protect I/O access to the
> > > i8254.
> > > 
> > > When secondary CPUs come up, they will wait for CPU0 to let them boot up
> > > further by checking for a flag:
> > > 
> > >   /*
> > >* We need to wait until we can identify, otherwise dmesg
> > >* output will be messy.
> > >*/
> > >   while ((ci->ci_flags & CPUF_IDENTIFY) == 0)
> > >   delay(10);
> > > 
> > > Now that machine has 3 secondary cores that are spinning like that.  At
> > > the same time CPU0 waits for the core to come up:
> > > 
> > >   /* wait for it to identify */
> > >   for (i = 200; (ci->ci_flags & CPUF_IDENTIFY) && i > 0; i--)
> > >   delay(10);
> > > 
> > > That means we have 3-4 cores spinning just to be able to delay().  Our
> > > mutex implementation isn't fair, which means whoever manages to claim
> > > the free mutex wins.  Now if CPU2 and CPU3 are spinning all the time,
> > > CPU1 identifies and needs delay() and CPU0 waits for CPU1, maybe the
> > > one that needs to make progress never gets it.
> > > 
> > > I changed those delay(10) in cpu_hatch() to CPU_BUSY_CYCLE() and it went
> > > ahead a bit better instead of hanging forever.
> > > 
> > > Then I remembered an idea something from years ago: fair kernel mutexes,
> > > so basically mutexes implemented as ticket lock, like our kerne lock.
> > > 
> > > I did a quick diff, which probably contains a million bugs, but with
> > > this bluhm's machine boots up well.
> > > 
> > > I'm not saying this is the solution, but it might be.
> > > 
> > > Patrick
> > 
> > Cleaned the diff up a little, changes since last time:
> > 
> > * Rename the struct members to be the same as mplock.
> > * Change the code to use ticket/user numbers like mplock.  This
> >   has one obvious downside: If a mutex is not initialized, trying
> >   to get this mutex will result in a hang.  At least that just let
> >   me find some uninitialized mutexes.
> > * More consistent use of the 'ci' variable.
> > * Definitely compiles with/without DIAGNOSTIC.
> > * Made sure mtx_enter() still has the membar.
> > * No need for READ_ONCE() when members are volatile.
> > 
> > Apart from being fair, this diff also changes the behaviour while
> > spinning for a lock.  Previously mtx_enter called mtx_enter_try
> > in a loop until it got the lock.  mtx_enter_try does splraise,
> > try lock, splx.  This diff currently spins with the SPL raised,
> > so that's a change in behaviour.  I'm sure I can change the diff
> > to splraise/splx while looping, if we prefer that behaviour.
> > 
> > Patrick

This change makes sense on its own as the contention is switching away
from KERNEL_LOCK() to mutexes.

Note that hppa has its own mutex implementation in case somebody wants
to keep it in sync.
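
For the archives, the ticket idea is roughly the following (my own sketch,
not Patrick's diff; the two counters and their names are made up):

	struct mutex {
		volatile unsigned int mtx_ticket;	/* next ticket handed out */
		volatile unsigned int mtx_cur;		/* ticket currently served */
		int mtx_wantipl;
		int mtx_oldipl;
	};

	/* both counters start at 0 in mtx_init() */

	void
	mtx_enter(struct mutex *mtx)
	{
		unsigned int t;
		int s;

		s = splraise(mtx->mtx_wantipl);
		t = atomic_inc_int_nv(&mtx->mtx_ticket) - 1;	/* take a ticket */
		while (mtx->mtx_cur != t)			/* wait for our turn */
			CPU_BUSY_CYCLE();
		membar_enter();
		mtx->mtx_oldipl = s;
	}

	void
	mtx_leave(struct mutex *mtx)
	{
		int s = mtx->mtx_oldipl;

		membar_exit();
		mtx->mtx_cur++;					/* serve the next ticket */
		splx(s);
	}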
 
> make -j17 seems to have used less system time, so that seemed to have
> made the machine slightly faster:
> 
> old: make -j17  1160.01s user 3244.58s system 1288% cpu 5:41.96 total
> new: make -j17  1171.80s user 3059.67s system 1295% cpu 5:26.65 total

Is it with -current or with the UVM unlocking diff that puts more
pressure on mutexes?

> I'll change the diff to do splraise/splx while looping, to make the
> behaviour more similar to before, and then re-do my testing.

That'd be nice.  You could also start a new thread to get the attention
of more people; maybe dlg@, visa@ or kettenis@ have an opinion on this.



Re: fix iwx(4) firmware loading during resume

2021-09-07 Thread Martin Pieuchot
On 07/09/21(Tue) 18:03, Stefan Sperling wrote:
> On Tue, Sep 07, 2021 at 05:16:52PM +0200, Martin Pieuchot wrote:
> > On 07/09/21(Tue) 15:48, Stefan Sperling wrote:
> > > This patch makes iwx(4) resume reliably for me.
> > > 
> > > There were missing splnet() calls which leads to an obvious race
> > > between the interrupt handler and the code which triggers firmware
> > > loading and then sleeps to wait for confirmation.
> > > This patch adds the missing splnet().
> > > 
> > > However, even with splnet() protection added I need to add the
> > > following two lines of code to make the patch work reliably:
> > > 
> > >   /* wait for the firmware to load */
> > >   for (w = 0; !sc->sc_uc.uc_intr && w < 10; w++) {
> > >   err = tsleep_nsec(>sc_uc, 0, "iwxuc", MSEC_TO_NSEC(100));
> > > + /* XXX This should not be needed, should it: */
> > > + if (err == EWOULDBLOCK && sc->sc_uc.uc_intr)
> > > + err = 0;
> > >   }
> > >   if (err || !sc->sc_uc.uc_ok)
> > >   printf("%s: could not load firmware, %d\n", DEVNAME(sc), err);
> > > 
> > > Which seems odd. I would expect tsleep to return EWOULDBLOCK only when
> > > the interrupt handler did not already set uc_intr and call wakeup().
> > 
> > That suggests the timeout fires before the wakeup(9).
> 
> Yes, it does.
> 
> But how could uc_intr already be set to 1 in that case?

Is it set before the timeout fires or before tsleep(9) returns?



Re: fix iwx(4) firmware loading during resume

2021-09-07 Thread Martin Pieuchot
On 07/09/21(Tue) 15:48, Stefan Sperling wrote:
> This patch makes iwx(4) resume reliably for me.
> 
> There were missing splnet() calls which leads to an obvious race
> between the interrupt handler and the code which triggers firmware
> loading and then sleeps to wait for confirmation.
> This patch adds the missing splnet().
> 
> However, even with splnet() protection added I need to add the
> following two lines of code to make the patch work reliably:
> 
>   /* wait for the firmware to load */
>   for (w = 0; !sc->sc_uc.uc_intr && w < 10; w++) {
>   err = tsleep_nsec(>sc_uc, 0, "iwxuc", MSEC_TO_NSEC(100));
> + /* XXX This should not be needed, should it: */
> + if (err == EWOULDBLOCK && sc->sc_uc.uc_intr)
> + err = 0;
>   }
>   if (err || !sc->sc_uc.uc_ok)
>   printf("%s: could not load firmware, %d\n", DEVNAME(sc), err);
> 
> Which seems odd. I would expect tsleep to return EWOULDBLOCK only when
> the interrupt handler did not already set uc_intr and call wakeup().

That suggests the timeout fires before the wakeup(9).  You can check
that by using wakeup_n() and printing how many threads have been woken up.

> However, here tsleep returns with EWOULDBLOCK, after the interrupt did
> occur and firmware has reported that it is alive, so both uc_intr and uc_ok
> are set. This only seems to happen during resume (in a task scheduled
> during DVACT_WAKEUP), but not during autoconf or regular ifconfig down/up.

That suggests 100msec might be too small.  Did you try with a bigger
value?  Or is there something special happening during DVACT_WAKEUP?

> Am I missing something? Am I adding splnet() in the wrong place?

The splnet() you're adding ensures (with the KERNEL_LOCK()) that no
wakeup(9) will happen until you go to sleep.  I believe that's what you want.
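
To spell out the race being closed, a minimal sketch (names follow the
iwx(4) code quoted above; this is illustrative, not the actual patch):

	int
	iwx_wait_fw_alive_sketch(struct iwx_softc *sc)
	{
		int s, w, err = 0;

		s = splnet();
		sc->sc_uc.uc_intr = 0;
		/* ...kick off the firmware load here... */
		for (w = 0; !sc->sc_uc.uc_intr && w < 10; w++) {
			/*
			 * The interrupt handler cannot run between the check
			 * above and the sleep below: IPL_NET is blocked and
			 * the KERNEL_LOCK() is held, so its wakeup(9) cannot
			 * fire before we are actually asleep.
			 */
			err = tsleep_nsec(&sc->sc_uc, 0, "iwxuc",
			    MSEC_TO_NSEC(100));
		}
		splx(s);
		return err;
	}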

> Does this patch work reliably for anyone without the above change?
> 
> diff c05ef66598a004c1e173a5d0fd4cdf5403f2ad99 /usr/src
> blob - 428a299e7a2d859aa68a934d4048d843be1835ce
> file + sys/dev/pci/if_iwx.c
> --- sys/dev/pci/if_iwx.c
> +++ sys/dev/pci/if_iwx.c
> @@ -3350,6 +3350,8 @@ iwx_load_firmware(struct iwx_softc *sc)
>   struct iwx_fw_sects *fws;
>   int err, w;
>  
> + splassert(IPL_NET);
> +
>   sc->sc_uc.uc_intr = 0;
>  
>   fws = &sc->sc_fw.fw_sects[IWX_UCODE_TYPE_REGULAR];
> @@ -3362,6 +3364,9 @@ iwx_load_firmware(struct iwx_softc *sc)
>   /* wait for the firmware to load */
>   for (w = 0; !sc->sc_uc.uc_intr && w < 10; w++) {
>   err = tsleep_nsec(&sc->sc_uc, 0, "iwxuc", MSEC_TO_NSEC(100));
> + /* XXX This should not be needed, should it: */
> + if (err == EWOULDBLOCK && sc->sc_uc.uc_intr)
> + err = 0;
>   }
>   if (err || !sc->sc_uc.uc_ok)
>   printf("%s: could not load firmware, %d\n", DEVNAME(sc), err);
> @@ -3466,7 +3471,7 @@ iwx_run_init_mvm_ucode(struct iwx_softc *sc, int readn
>   struct iwx_init_extended_cfg_cmd init_cfg = {
>   .init_flags = htole32(IWX_INIT_NVM),
>   };
> - int err;
> + int err, s;
>  
>   if ((sc->sc_flags & IWX_FLAG_RFKILL) && !readnvm) {
>   printf("%s: radio is disabled by hardware switch\n",
> @@ -3474,10 +3479,12 @@ iwx_run_init_mvm_ucode(struct iwx_softc *sc, int readn
>   return EPERM;
>   }
>  
> + s = splnet();
>   sc->sc_init_complete = 0;
>   err = iwx_load_ucode_wait_alive(sc);
>   if (err) {
>   printf("%s: failed to load init firmware\n", DEVNAME(sc));
> + splx(s);
>   return err;
>   }
>  
> @@ -3487,22 +3494,28 @@ iwx_run_init_mvm_ucode(struct iwx_softc *sc, int readn
>*/
>   err = iwx_send_cmd_pdu(sc, IWX_WIDE_ID(IWX_SYSTEM_GROUP,
>   IWX_INIT_EXTENDED_CFG_CMD), 0, sizeof(init_cfg), &init_cfg);
> - if (err)
> + if (err) {
> + splx(s);
>   return err;
> + }
>  
>   err = iwx_send_cmd_pdu(sc, IWX_WIDE_ID(IWX_REGULATORY_AND_NVM_GROUP,
>   IWX_NVM_ACCESS_COMPLETE), 0, sizeof(nvm_complete), &nvm_complete);
> - if (err)
> + if (err) {
> + splx(s);
>   return err;
> + }
>  
>   /* Wait for the init complete notification from the firmware. */
>   while ((sc->sc_init_complete & wait_flags) != wait_flags) {
>   err = tsleep_nsec(&sc->sc_init_complete, 0, "iwxinit",
>   SEC_TO_NSEC(2));
> - if (err)
> + if (err) {
> + splx(s);
>   return err;
> + }
>   }
> -
> + splx(s);
>   if (readnvm) {
>   err = iwx_nvm_get(sc);
>   if (err) {
> 



Re: Analyse of kernel lock contention

2021-09-07 Thread Martin Pieuchot
On 06/09/21(Mon) 17:30, Martin Pieuchot wrote:
> [...]
> 3) 2ytHD+make-j17+kqpoll_unlocked_arm64.svg
> ===

This should be:

3) 2ytHD+googlemap_arm64.svg


> The intent of this test is to expose where the contention is for a heavy
> multi-threaded process workload.  We didn't care much about idle time;
> it is much more about low latency, that is how "smooth" desktop apps can
> run, in other words what happens in the kernel.
> 
>   - UVM fault unlocking is "good enough" for such workload and all the
> contention is due to syscalls
> 
>   - If we look at time spent in kernel, 37% is spent spinning on the
> KERNEL_LOCK() and 12% on the SCHED_LOCK().  So almost half of %sys
> time is spinning.
> 
> . futex(2) for FUTEX_WAIT exposes most of it.  It spins on the
>   KERNEL_LOCK() because sleeping with PCATCH requires it, then it
>   spins on the SCHED_LOCK() to put itself on the sleep queue.
> 
> . kevent(2), poll(2), and DRM ioctl(2) are responsible for a lot
>   of KERNEL_LOCK() contention in this workload 
> 
> . NET_LOCK() contention in poll(2) and kqueue(2) generate a lot of
>   sleeps which, together with a lot of futex(2) make the SCHED_LOCK()
>   contention bad.



Re: Fix: tcp_output window calculation error

2021-09-05 Thread Martin Pieuchot
On 22/07/21(Thu) 15:03, Jan Klemkow wrote:
> Hi,
> 
> This calculation of the receive window has a logic error:
> 
> If win is 0 it will be overwritten by (rcv_adv - rcv_nxt).  Thus, win
> will be (rcv_adv - rcv_nxt) even if it's below (sb_hiwat / 4).

Why is this a problem?

> We could just remove the dead (sb_hiwat / 4) code, or reorder the
> conditions to keep the original feature.
> 
> OK?
> 
> bye,
> Jan
> 
> Index: netinet/tcp_output.c
> ===
> RCS file: /cvs/src/sys/netinet/tcp_output.c,v
> retrieving revision 1.130
> diff -u -p -r1.130 tcp_output.c
> --- netinet/tcp_output.c  8 Feb 2021 19:37:15 -   1.130
> +++ netinet/tcp_output.c  22 Jul 2021 12:33:13 -
> @@ -812,12 +812,12 @@ send:
>* Calculate receive window.  Don't shrink window,
>* but avoid silly window syndrome.
>*/
> - if (win < (long)(so->so_rcv.sb_hiwat / 4) && win < (long)tp->t_maxseg)
> - win = 0;
>   if (win > (long)TCP_MAXWIN << tp->rcv_scale)
>   win = (long)TCP_MAXWIN << tp->rcv_scale;
>   if (win < (long)(int32_t)(tp->rcv_adv - tp->rcv_nxt))
>   win = (long)(int32_t)(tp->rcv_adv - tp->rcv_nxt);
> + if (win < (long)(so->so_rcv.sb_hiwat / 4) && win < (long)tp->t_maxseg)
> + win = 0;
>   if (flags & TH_RST)
>   win = 0;
>   th->th_win = htons((u_int16_t) (win>>tp->rcv_scale));
> 
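
To make the ordering question concrete, a standalone walkthrough with
made-up numbers (illustrative only, not taken from a real trace; the
TCP_MAXWIN clamp is omitted since it doesn't matter here):

	#include <stdio.h>

	int
	main(void)
	{
		long hiwat = 65536, maxseg = 1460;	/* sb_hiwat, t_maxseg */
		long adv = 1000;			/* rcv_adv - rcv_nxt */
		long win = 800;				/* space left in the receive buffer */
		long cur, reord;

		/* current order: silly window check first, "don't shrink" last */
		cur = win;
		if (cur < hiwat / 4 && cur < maxseg)
			cur = 0;
		if (cur < adv)
			cur = adv;			/* overwrites the 0 */

		/* proposed order: "don't shrink" first, silly window check last */
		reord = win;
		if (reord < adv)
			reord = adv;
		if (reord < hiwat / 4 && reord < maxseg)
			reord = 0;

		printf("current order advertises %ld, reordered advertises %ld\n",
		    cur, reord);			/* prints 1000 and 0 */
		return 0;
	}

With the current order the zeroing never survives once rcv_adv is ahead
of rcv_nxt, which is the dead code Jan points at; with the reordering a
small window really is advertised as zero.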



pmap & buffer cache dummy pagers

2021-09-02 Thread Martin Pieuchot
Diff below introduces two dummy pagers for subsystems that manipulate UVM
objects that are 'special'.  Those pagers will be used to enforce checks
in functions that expect a lock to be held, like:

KASSERT(obj == NULL || UVM_OBJ_IS_PMAP(obj) ||
rw_write_held(obj->vmobjlock));
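
(The uvm_object.h hunk in the diff below is truncated, so for clarity
here is the assumed shape of the new check macros, mirroring the
existing UVM_OBJ_IS_AOBJ():)

	#define UVM_OBJ_IS_PMAP(uobj)		((uobj)->pgops == &pmap_pager)
	#define UVM_OBJ_IS_BUFCACHE(uobj)	((uobj)->pgops == &bufcache_pager)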

They are also used, in the diff below, to document which routines expect
such objects and a serialization offered by the KERNEL_LOCK().  More
examples can be seen in my WIP unlocking diff.

The idea is taken from NetBSD, which also uses such a dummy pager for some
of its pmaps.  I don't believe there's a need to change anything with
these usages of the uvm_obj_* API for the moment but at the same time it
helps me to have such implicit documentation.

ok?

Index: arch/amd64/amd64/pmap.c
===
RCS file: /cvs/src/sys/arch/amd64/amd64/pmap.c,v
retrieving revision 1.145
diff -u -p -r1.145 pmap.c
--- arch/amd64/amd64/pmap.c 18 Jun 2021 06:17:28 -  1.145
+++ arch/amd64/amd64/pmap.c 2 Sep 2021 19:55:57 -
@@ -671,7 +671,7 @@ pmap_bootstrap(paddr_t first_avail, padd
 
kpm = pmap_kernel();
for (i = 0; i < PTP_LEVELS - 1; i++) {
-   uvm_obj_init(&kpm->pm_obj[i], NULL, 1);
+   uvm_obj_init(&kpm->pm_obj[i], &pmap_pager, 1);
kpm->pm_ptphint[i] = NULL;
}
memset(&kpm->pm_list, 0, sizeof(kpm->pm_list));  /* pm_list not used */
@@ -1307,7 +1307,7 @@ pmap_create(void)
 
/* init uvm_object */
for (i = 0; i < PTP_LEVELS - 1; i++) {
-   uvm_obj_init(&pmap->pm_obj[i], NULL, 1);
+   uvm_obj_init(&pmap->pm_obj[i], &pmap_pager, 1);
pmap->pm_ptphint[i] = NULL;
}
pmap->pm_stats.wired_count = 0;
Index: arch/hppa/hppa/pmap.c
===
RCS file: /cvs/src/sys/arch/hppa/hppa/pmap.c,v
retrieving revision 1.175
diff -u -p -r1.175 pmap.c
--- arch/hppa/hppa/pmap.c   16 Jun 2021 09:02:21 -  1.175
+++ arch/hppa/hppa/pmap.c   2 Sep 2021 19:54:23 -
@@ -496,7 +496,7 @@ pmap_bootstrap(vaddr_t vstart)
 */
kpm = &kernel_pmap_store;
bzero(kpm, sizeof(*kpm));
-   uvm_obj_init(&kpm->pm_obj, NULL, 1);
+   uvm_obj_init(&kpm->pm_obj, &pmap_pager, 1);
kpm->pm_space = HPPA_SID_KERNEL;
kpm->pm_pid = HPPA_PID_KERNEL;
kpm->pm_pdir_pg = NULL;
@@ -678,7 +678,7 @@ pmap_create(void)
 
mtx_init(&pmap->pm_mtx, IPL_VM);

-   uvm_obj_init(&pmap->pm_obj, NULL, 1);
+   uvm_obj_init(&pmap->pm_obj, &pmap_pager, 1);
 
for (space = 1 + arc4random_uniform(hppa_sid_max);
pmap_sdir_get(space); space = (space + 1) % hppa_sid_max);
Index: arch/i386/i386/pmap.c
===
RCS file: /cvs/src/sys/arch/i386/i386/pmap.c,v
retrieving revision 1.214
diff -u -p -r1.214 pmap.c
--- arch/i386/i386/pmap.c   16 Jun 2021 09:02:21 -  1.214
+++ arch/i386/i386/pmap.c   2 Sep 2021 19:55:57 -
@@ -963,7 +963,7 @@ pmap_bootstrap(vaddr_t kva_start)
kpm = pmap_kernel();
mtx_init(&kpm->pm_mtx, -1); /* must not be used */
mtx_init(&kpm->pm_apte_mtx, IPL_VM);
-   uvm_obj_init(&kpm->pm_obj, NULL, 1);
+   uvm_obj_init(&kpm->pm_obj, &pmap_pager, 1);
bzero(&kpm->pm_list, sizeof(kpm->pm_list));  /* pm_list not used */
kpm->pm_pdir = (vaddr_t)(proc0.p_addr->u_pcb.pcb_cr3 + KERNBASE);
kpm->pm_pdirpa = proc0.p_addr->u_pcb.pcb_cr3;
@@ -1348,7 +1348,7 @@ pmap_create(void)
mtx_init(&pmap->pm_apte_mtx, IPL_VM);

/* init uvm_object */
-   uvm_obj_init(&pmap->pm_obj, NULL, 1);
+   uvm_obj_init(&pmap->pm_obj, &pmap_pager, 1);
pmap->pm_stats.wired_count = 0;
pmap->pm_stats.resident_count = 1;  /* count the PDP allocd below */
pmap->pm_ptphint = NULL;
Index: uvm/uvm_object.c
===
RCS file: /cvs/src/sys/uvm/uvm_object.c,v
retrieving revision 1.19
diff -u -p -r1.19 uvm_object.c
--- uvm/uvm_object.c16 Jun 2021 09:02:21 -  1.19
+++ uvm/uvm_object.c2 Sep 2021 20:00:03 -
@@ -41,6 +41,16 @@
 
 #include 
 
+/* Dummy object used by some pmaps for sanity checks. */
+const struct uvm_pagerops pmap_pager = {
+   /* nothing */
+};
+
+/* Dummy object used by the buffer cache for sanity checks. */
+const struct uvm_pagerops bufcache_pager = {
+   /* nothing */
+};
+
 /* We will fetch this page count per step */
 #defineFETCH_PAGECOUNT 16
 
@@ -159,6 +169,9 @@ uvm_obj_free(struct uvm_object *uobj)
 {
struct vm_page *pg;
struct pglist pgl;
+
+   KASSERT(UVM_OBJ_IS_BUFCACHE(uobj));
+   KERNEL_ASSERT_LOCKED();
 
TAILQ_INIT(&pgl);
/*
Index: uvm/uvm_object.h
===
RCS file: /cvs/src/sys/uvm/uvm_object.h,v
retrieving revision 1.26
diff -u -p -r1.26 uvm_object.h
--- uvm/uvm_object.h16 Jun 2021 09:02:21 -  1.26

i386 ioapic mtx not initialized

2021-09-02 Thread Martin Pieuchot
Seen with WITNESS; this has already been fixed in amd64, the diff below
backports the fix, ok?

ioapic0 at mainbus0: apid 2 pa 0xfec0witness: lock_object uninitialized: 
0xd8841440
Starting stack trace...
witness_checkorder(f5547000,fec01000,fec0,d1820adc,d03fb01e) at 
witness_checkorder+0x85 [/home/os/openbsd/sys/kern/subr_witness.c:2497]
witness_checkorder(d8841440,9,0) at witness_checkorder+0x85 
[/home/os/openbsd/sys/kern/subr_witness.c:2497]
mtx_enter(d8841434) at mtx_enter+0x1c 
[/home/os/openbsd/sys/kern/kern_lock.c:262]
ioapic_attach(d884a040,d8841400,d1820b84) at ioapic_attach+0xe0 
[/home/os/openbsd/sys/arch/i386/i386/ioapic.c:125]
config_attach(d884a040,d0e31314,d1820b84,d068f190) at config_attach+0x18a 
[/home/os/openbsd/sys/kern/subr_autoconf.c:403]
config_found_sm(d884a040,d1820b84,d068f190,0) at config_found_sm+0x29 
[/home/os/openbsd/sys/kern/subr_autoconf.c:313]
acpimadt_attach(d8840400,d88bc2c0,d1820c78) at acpimadt_attach+0x34c 
[/home/os/openbsd/sys/dev/acpi/acpimadt.c:0]
config_attach(d8840400,d0e32574,d1820c78,d07ddd90) at config_attach+0x18a 
[/home/os/openbsd/sys/kern/subr_autoconf.c:403]
config_found_sm(d8840400,d1820c78,d07ddd90,d07e0280) at config_found_sm+0x29 
[/home/os/openbsd/sys/kern/subr_autoconf.c:313]
acpi_attach_common(d8840400,f0120) at acpi_attach_common+0x585 
[/home/os/openbsd/sys/dev/acpi/acpi.c:1207]
acpi_attach(d884a080,d8840400,d1820dd0) at acpi_attach+0x2c 
[/home/os/openbsd/sys/arch/i386/i386/acpi_machdep.c:112]
config_attach(d884a080,d0e32734,d1820dd0,d09d73d0) at config_attach+0x18a 
[/home/os/openbsd/sys/kern/subr_autoconf.c:403]
config_found_sm(d884a080,d1820dd0,d09d73d0,0) at config_found_sm+0x29 
[/home/os/openbsd/sys/kern/subr_autoconf.c:313]
biosattach(d884a040,d884a080,d1820ec0) at biosattach+0x181 
[/home/os/openbsd/sys/arch/i386/i386/bios.c:392]
config_attach(d884a040,d0e31274,d1820ec0,d04d3db0) at config_attach+0x18a 
[/home/os/openbsd/sys/kern/subr_autoconf.c:403]
config_found_sm(d884a040,d1820ec0,d04d3db0,0) at config_found_sm+0x29 
[/home/os/openbsd/sys/kern/subr_autoconf.c:313]
mainbus_attach(0,d884a040,0) at mainbus_attach+0x54 
[/home/os/openbsd/sys/arch/i386/i386/mainbus.c:157]
config_attach(0,d0e2ec34,0,0) at config_attach+0x18a 
[/home/os/openbsd/sys/kern/subr_autoconf.c:403]
config_rootfound(d0c28d4d,0) at config_rootfound+0xaf 
[/home/os/openbsd/sys/kern/subr_autoconf.c:328]
cpu_configure(3327f5e4,181e000,182d000,1821000,0) at cpu_configure+0x4c 
[/home/os/openbsd/sys/arch/i386/i386/autoconf.c:156]
main(0,0,0,0,0) at main+0x342 [/home/os/openbsd/sys/kern/init_main.c:377]
End of stack trace.

Index: i386/ioapic.c
===
RCS file: /cvs/src/sys/arch/i386/i386/ioapic.c,v
retrieving revision 1.41
diff -u -p -r1.41 ioapic.c
--- i386/ioapic.c   25 Aug 2018 16:09:29 -  1.41
+++ i386/ioapic.c   2 Sep 2021 07:27:16 -
@@ -309,6 +309,10 @@ ioapic_attach(struct device *parent, str
sc->sc_reg = (volatile u_int32_t *)(bh + IOAPIC_REG);
sc->sc_data = (volatile u_int32_t *)(bh + IOAPIC_DATA);
 
+#ifdef MULTIPROCESSOR
+   mtx_init(&sc->sc_pic.pic_mutex, IPL_NONE);
+#endif
+
ver_sz = ioapic_read(sc, IOAPIC_VER);
sc->sc_apic_vers = (ver_sz & IOAPIC_VER_MASK) >> IOAPIC_VER_SHIFT;
sc->sc_apic_sz = (ver_sz & IOAPIC_MAX_MASK) >> IOAPIC_MAX_SHIFT;



Incorrect IPL when pool_get(9) is called under rwlock

2021-09-01 Thread Martin Pieuchot
syzkaller reported [0] the following lock ordering issue:

db{0}> trace
db_enter() at db_enter+0x18 sys/arch/amd64/amd64/db_interface.c:440
panic(82464b8f) at panic+0x177 sys/kern/subr_prf.c:202
witness_checkorder(82838c20,9,0) at witness_checkorder+0x11eb 
sys/kern/subr_witness.c:833
__mp_lock(82838a18) at __mp_lock+0xa1 read_rflags machine/cpufunc.h:195 
[inline]
__mp_lock(82838a18) at __mp_lock+0xa1 intr_disable 
machine/cpufunc.h:216 [inline]
__mp_lock(82838a18) at __mp_lock+0xa1 sys/kern/kern_lock.c:142
intr_handler(80002123ad80,80255d80) at intr_handler+0x5e 
sys/arch/amd64/amd64/intr.c:532
Xintr_ioapic_edge20_untramp() at Xintr_ioapic_edge20_untramp+0x18f
Xspllower() at Xspllower+0x19
mtx_enter_try(829b8d10) at mtx_enter_try+0x100
mtx_enter(829b8d10) at mtx_enter+0x4b sys/kern/kern_lock.c:266
pool_get(829b8d10,9) at pool_get+0xbf sys/kern/subr_pool.c:581
vm_create(80b29000,8000211922a8) at vm_create+0x261 
sys/arch/amd64/amd64/vmm.c:1526
vmmioctl(a00,c5005601,80b29000,1,8000211922a8) at vmmioctl+0x1f2
VOP_IOCTL(fd806e213830,c5005601,80b29000,1,fd807f7d8840,8000211922a8)
 at VOP_IOCTL+0x9a sys/kern/vfs_vops.c:295
vn_ioctl(fd806e4aca28,c5005601,80b29000,8000211922a8) at 
vn_ioctl+0xba sys/kern/vfs_vnops.c:531
sys_ioctl(8000211922a8,80002123b398,80002123b3e0) at sys_ioctl+0x4a2


The issue is that pool_get(9) at line 1526 is done after grabbing the
`vm_lock'.  If an interrupt needing the KERNEL_LOCK() occurs at that
moment the above mentioned lock ordering problem could cause a
deadlock.  

To prevent such issues we generally mark the pool with IPL_MPFLOOR.

[0] 
https://syzkaller.appspot.com/bug?id=c73756cc996a58a625da35fbaa90ba6b9e0c60dc

Index: arch/amd64/amd64/vmm.c
===
RCS file: /cvs/src/sys/arch/amd64/amd64/vmm.c,v
retrieving revision 1.287
diff -u -p -r1.287 vmm.c
--- arch/amd64/amd64/vmm.c  31 Aug 2021 17:40:59 -  1.287
+++ arch/amd64/amd64/vmm.c  1 Sep 2021 06:45:38 -
@@ -430,7 +430,7 @@ vmm_attach(struct device *parent, struct
 
pool_init(&vm_pool, sizeof(struct vm), 0, IPL_NONE, PR_WAITOK,
"vmpool", NULL);
-   pool_init(&vcpu_pool, sizeof(struct vcpu), 64, IPL_NONE, PR_WAITOK,
+   pool_init(&vcpu_pool, sizeof(struct vcpu), 64, IPL_MPFLOOR, PR_WAITOK,
"vcpupl", NULL);
 
vmm_softc = sc;



Re: systat(1) counter overflow

2021-08-30 Thread Martin Pieuchot
On 13/07/21(Tue) 00:55, Anindya Mukherjee wrote:
> On Sat, Jul 03, 2021 at 11:20:42AM +0100, Stuart Henderson wrote:
> > On 2021/07/03 01:09, Anindya Mukherjee wrote:
> > > Thanks for the discussion. This has been very illuminating. I have been 
> > > digging
> > > around in /usr/src/ and ignoring the atomic architectures (where I got 
> > > stuck) it
> > > looks like it should be possible to use uint64_t everywhere. I'm playing 
> > > with
> > > some changes on my machine to see if I can get at least systat(1) and 
> > > vmstat(8)
> > > to work with uint64_t. The ddb printer (uvmexp_print)is another consumer.
> > > 
> > > If it works in the base system then ideally every relevant port should be
> > > updated to be consistent. That is indeed quite a big change; more than I
> > > realised so thanks for setting me straight on that.
> > 
> > We have coped with bigger changes in structs like this before,
> > it didn't used to be too difficult, but that was before go...
> > 
> 
> Hi,
> 
> I have been running for a week with the following diff. This is just a POC and
> hence there are a few ugly hacks. So far top(1), systat(1), and vmstat(8) seem
> to be happy. I haven't hit the 32-bit overflow point for any counters yet but
> the counts look right. I have completely ignored ports, but it looks like the
> base system can run with this change. This was mostly to satisfy my curiosity.

Thanks for your work.

There's no guarantee that a 64bit value can be updated atomically on 32bit
architectures, so we can't follow this road.

It seems to me that this issue requires a bit more investigation.  The
existing "struct uvmexp" contains multiple fields with multiple purposes:

1. constants (pagesize, pagemask, pageshift, ...)

2. counters frequently updated, only incremented and only used
   for accounting purposes (faults, pageins, fltnoram, ...)

3. counters rarely updated, incremented and decremented
   (swpgonly, nswapdev, ...)

4. global variables, that are incremented/decremented and used for
   making decisions (wired, free, zeropages...)

I don't believe all of them need to be of type uint64_t in the kernel.

It's also not clear to me if keeping "struct uvmexp" as it is but
bumping the size to 64bit is the best way forward.  Did you look at
other userland consumers of "struct uvmexp"?  Which fields do they care
about?  If it is the way forward we could simply use a different layout
in the kernel and do the conversion during the syscall.
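
For the frequently-updated, increment-only counters (category 2 above),
one possible direction is per-CPU counters folded into 64-bit values at
read time.  A rough sketch, assuming the existing counters_alloc(9)
API; the names below are made up for illustration:

	#include <sys/percpu.h>

	enum uvm_exp_counters {
		uvmexp_c_faults,
		uvmexp_c_pageins,
		uvmexp_c_fltnoram,
		uvmexp_c_ncounters
	};

	struct cpumem *uvmexp_counters;

	void
	uvmexp_counters_init(void)
	{
		uvmexp_counters = counters_alloc(uvmexp_c_ncounters);
	}

	/* hot path: lockless per-CPU increment, no 64-bit atomics needed */
	#define uvmexp_count(c)		counters_inc(uvmexp_counters, (c))

	/* sysctl/ddb path: fold the per-CPU values into 64-bit totals */
	void
	uvmexp_counters_fold(uint64_t out[uvmexp_c_ncounters])
	{
		counters_read(uvmexp_counters, out, uvmexp_c_ncounters);
	}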

> ? usr.bin/systat/vim_session
> Index: sys/arch/amd64/include/atomic.h
> ===
> RCS file: /cvs/src/sys/arch/amd64/include/atomic.h,v
> retrieving revision 1.21
> diff -u -p -r1.21 atomic.h
> --- sys/arch/amd64/include/atomic.h   11 Mar 2021 11:16:55 -  1.21
> +++ sys/arch/amd64/include/atomic.h   13 Jul 2021 07:42:51 -
> @@ -150,6 +150,14 @@ _atomic_inc_long(volatile unsigned long 
>  #define atomic_inc_long(_p) _atomic_inc_long(_p)
>  
>  static inline void
> +_atomic_inc_uint64(volatile uint64_t *p)
> +{
> + __asm volatile(_LOCK " incq %0"
> + : "+m" (*p));
> +}
> +#define atomic_inc_uint64(_p) _atomic_inc_uint64(_p)
> +
> +static inline void
>  _atomic_dec_int(volatile unsigned int *p)
>  {
>   __asm volatile(_LOCK " decl %0"
> @@ -166,6 +174,14 @@ _atomic_dec_long(volatile unsigned long 
>  #define atomic_dec_long(_p) _atomic_dec_long(_p)
>  
>  static inline void
> +_atomic_dec_uint64(volatile uint64_t *p)
> +{
> + __asm volatile(_LOCK " decq %0"
> + : "+m" (*p));
> +}
> +#define atomic_dec_uint64(_p) _atomic_dec_uint64(_p)
> +
> +static inline void
>  _atomic_add_int(volatile unsigned int *p, unsigned int v)
>  {
>   __asm volatile(_LOCK " addl %1,%0"
> @@ -182,6 +198,15 @@ _atomic_add_long(volatile unsigned long 
>   : "a" (v));
>  }
>  #define atomic_add_long(_p, _v) _atomic_add_long(_p, _v)
> +
> +static inline void
> +_atomic_add_uint64(volatile uint64_t *p, uint64_t v)
> +{
> + __asm volatile(_LOCK " addq %1,%0"
> + : "+m" (*p)
> + : "a" (v));
> +}
> +#define atomic_add_uint64(_p, _v) _atomic_add_uint64(_p, _v)
>  
>  static inline void
>  _atomic_sub_int(volatile unsigned int *p, unsigned int v)
> Index: sys/sys/sysctl.h
> ===
> RCS file: /cvs/src/sys/sys/sysctl.h,v
> retrieving revision 1.218
> diff -u -p -r1.218 sysctl.h
> --- sys/sys/sysctl.h  17 May 2021 17:54:31 -  1.218
> +++ sys/sys/sysctl.h  13 Jul 2021 07:42:52 -
> @@ -38,7 +38,8 @@
>  #ifndef _SYS_SYSCTL_H_
>  #define  _SYS_SYSCTL_H_
>  
> -#include <uvm/uvmexp.h>
> +/*#include <uvm/uvmexp.h>*/
> +#include "/usr/src/sys/uvm/uvmexp.h"
>  
>  /*
>   * Definitions for sysctl call.  The sysctl call uses a hierarchical name
> Index: sys/uvm/uvm_anon.c
> ===
> RCS file: /cvs/src/sys/uvm/uvm_anon.c,v
> retrieving revision 1.54
> diff -u -p -r1.54 uvm_anon.c
> --- 

Kill SYSCALL_DEBUG

2021-08-30 Thread Martin Pieuchot
Now that dt(4) and btrace(8) are enabled by default and provide a nice
and flexible way to debug syscalls on GENERIC kernels, should we get rid
of the SYSCALL_DEBUG mechanism?

Note that the auto-generated kern/syscalls.c providing the `syscallnames'
array is still needed to build btrace(8).

ok?

Index: kern/exec_elf.c
===
RCS file: /cvs/src/sys/kern/exec_elf.c,v
retrieving revision 1.160
diff -u -p -r1.160 exec_elf.c
--- kern/exec_elf.c 10 Mar 2021 10:21:47 -  1.160
+++ kern/exec_elf.c 30 Aug 2021 07:19:33 -
@@ -107,9 +107,6 @@ int elf_os_pt_note_name(Elf_Note *);
 intelf_os_pt_note(struct proc *, struct exec_package *, Elf_Ehdr *, int *);
 
 extern char sigcode[], esigcode[], sigcoderet[];
-#ifdef SYSCALL_DEBUG
-extern char *syscallnames[];
-#endif
 
 /* round up and down to page boundaries. */
 #define ELF_ROUND(a, b)(((a) + (b) - 1) & ~((b) - 1))
@@ -135,11 +132,7 @@ struct emul emul_elf = {
SYS_syscall,
SYS_MAXSYSCALL,
sysent,
-#ifdef SYSCALL_DEBUG
-   syscallnames,
-#else
NULL,
-#endif
(sizeof(AuxInfo) * ELF_AUX_ENTRIES / sizeof(char *)),
elf_copyargs,
setregs,
Index: kern/kern_xxx.c
===
RCS file: /cvs/src/sys/kern/kern_xxx.c,v
retrieving revision 1.36
diff -u -p -r1.36 kern_xxx.c
--- kern/kern_xxx.c 2 Apr 2019 11:00:22 -   1.36
+++ kern/kern_xxx.c 30 Aug 2021 07:19:17 -
@@ -84,75 +84,3 @@ __stack_smash_handler(char func[], int d
panic("smashed stack in %s", func);
 }
 #endif
-
-#ifdef SYSCALL_DEBUG
-#include 
-
-#defineSCDEBUG_CALLS   0x0001  /* show calls */
-#defineSCDEBUG_RETURNS 0x0002  /* show returns */
-#defineSCDEBUG_ALL 0x0004  /* even syscalls that are 
implemented */
-#defineSCDEBUG_SHOWARGS0x0008  /* show arguments to calls */
-
-intscdebug = SCDEBUG_CALLS|SCDEBUG_RETURNS|SCDEBUG_SHOWARGS;
-
-void
-scdebug_call(struct proc *p, register_t code, const register_t args[])
-{
-   struct process *pr;
-   struct sysent *sy;
-   struct emul *em;
-   int i;
-
-   if (!(scdebug & SCDEBUG_CALLS))
-   return;
-
-   pr = p->p_p;
-   em = pr->ps_emul;
-   sy = &em->e_sysent[code];
-   if (!(scdebug & SCDEBUG_ALL || code < 0 || code >= em->e_nsysent ||
-sy->sy_call == sys_nosys))
-   return;
-
-   printf("proc %d (%s): %s num ", pr->ps_pid, pr->ps_comm, em->e_name);
-   if (code < 0 || code >= em->e_nsysent)
-   printf("OUT OF RANGE (%ld)", code);
-   else {
-   printf("%ld call: %s", code, em->e_syscallnames[code]);
-   if (scdebug & SCDEBUG_SHOWARGS) {
-   printf("(");
-   for (i = 0; i < sy->sy_argsize / sizeof(register_t);
-   i++)
-   printf("%s0x%lx", i == 0 ? "" : ", ", args[i]);
-   printf(")");
-   }
-   }
-   printf("\n");
-}
-
-void
-scdebug_ret(struct proc *p, register_t code, int error,
-const register_t retval[])
-{
-   struct process *pr;
-   struct sysent *sy;
-   struct emul *em;
-
-   if (!(scdebug & SCDEBUG_RETURNS))
-   return;
-
-   pr = p->p_p;
-   em = pr->ps_emul;
-   sy = &em->e_sysent[code];
-   if (!(scdebug & SCDEBUG_ALL || code < 0 || code >= em->e_nsysent ||
-   sy->sy_call == sys_nosys))
-   return;
-   
-   printf("proc %d (%s): %s num ", pr->ps_pid, pr->ps_comm, em->e_name);
-   if (code < 0 || code >= em->e_nsysent)
-   printf("OUT OF RANGE (%ld)", code);
-   else
-   printf("%ld ret: err = %d, rv = 0x%lx,0x%lx", code,
-   error, retval[0], retval[1]);
-   printf("\n");
-}
-#endif /* SYSCALL_DEBUG */
Index: kern/init_main.c
===
RCS file: /cvs/src/sys/kern/init_main.c,v
retrieving revision 1.308
diff -u -p -r1.308 init_main.c
--- kern/init_main.c30 Jun 2021 12:21:02 -  1.308
+++ kern/init_main.c30 Aug 2021 07:17:55 -
@@ -155,9 +155,6 @@ voidpool_gc_pages(void *);
 void   percpu_init(void);
 
 extern char sigcode[], esigcode[], sigcoderet[];
-#ifdef SYSCALL_DEBUG
-extern char *syscallnames[];
-#endif
 
 struct emul emul_native = {
"native",
@@ -165,11 +162,7 @@ struct emul emul_native = {
SYS_syscall,
SYS_MAXSYSCALL,
sysent,
-#ifdef SYSCALL_DEBUG
-   syscallnames,
-#else
NULL,
-#endif
0,
copyargs,
setregs,
Index: sys/systm.h
===
RCS file: /cvs/src/sys/sys/systm.h,v
retrieving revision 1.154
diff -u -p -r1.154 systm.h
--- sys/systm.h 2 Jun 2021 00:39:25 

Re: ucc(4): consumer control keyboard device driver

2021-08-18 Thread Martin Pieuchot
On 18/08/21(Wed) 17:50, Mark Kettenis wrote:
> > Date: Tue, 17 Aug 2021 20:13:41 +0200
> > From: Anton Lindqvist 
> > 
> > Hi,
> > 
> > Here's a new driver for USB HID Consumer Control keyboards. Such
> > keyboard is a pseudo device which is used to expose audio and
> > application launch keys. My prime motivation is to get the volume mute,
> > increment and decrement keys to just work on my keyboard without the
> > need to use usbhidaction(1).
> > 
> > ucc(4) attaches a wskbd(4) keyboard "on top" making it appear like an
> > ordinary keyboard, which also makes it possible to inject key
> > press/release input. It supports both translating and raw mode making it
> > compatible with the ordinary console and X11.
> > 
> > My keyboard for instance exposes 42 keys in its input report. I only
> > care about the volume and audio related ones and therefore only added
> > mappings for those. Additional mappings should be trivial to add if
> > desired.
> > 
> > Testing would be much appreciated.
> > 
> > Comments? OK?
> 
> So the downside of this is that you get a separate wskbd(4) device for
> these.  This will be transparent for most users thanks to wsmux(4),
> but it does mean that doing a multi-seat wscons setup becomes a little
> bit more involved.  That's fine with me as I don't think that's an
> important use case for OpenBSD.
>
> This looks reasonable to me.

I agree.  I am also very happy to see a kernel driver that makes things
work out-of-the-box.

Regarding the introduction of a separate wskbd(4) this can be seen as an
intermediate step.  Having this logic in ukbd(4) implies revisiting the
way reportIDs are mapped to USB drivers, which is still a bit of a hack
when it comes to supporting multiple of them.  Having a simpler driver
like ucc(4) can help us figure out how to support more "special" keys
without having to deal with the HID logic at the same time.

It would be great if users of usbhidaction(1) could tell us if this
introduces any regression and/or if other keys could be supported.

I'm definitively ok with this direction.

> > diff --git share/man/man4/Makefile share/man/man4/Makefile
> > index 6a0ecb20653..63b33660159 100644
> > --- share/man/man4/Makefile
> > +++ share/man/man4/Makefile
> > @@ -84,7 +84,7 @@ MAN=  aac.4 abcrtc.4 abl.4 ac97.4 acphy.4 acrtc.4 \
> > tlphy.4 thmc.4 tpm.4 tpmr.4 tqphy.4 trm.4 trunk.4 tsl.4 tty.4 \
> > tun.4 tap.4 twe.4 \
> > txp.4 txphy.4 uaudio.4 uark.4 uath.4 ubcmtp.4 uberry.4 ubsa.4 \
> > -   ubsec.4 ucom.4 uchcom.4 ucrcom.4 ucycom.4 ukspan.4 uslhcom.4 \
> > +   ubsec.4 ucc.4 ucom.4 uchcom.4 ucrcom.4 ucycom.4 ukspan.4 uslhcom.4 \
> > udav.4 udcf.4 udl.4 udp.4 udsbr.4 \
> > uftdi.4 ugen.4 ugl.4 ugold.4 uguru.4 uhci.4 uhid.4 uhidev.4 uhidpp.4 \
> > uipaq.4 ujoy.4 uk.4 ukbd.4 \
> > diff --git share/man/man4/ucc.4 share/man/man4/ucc.4
> > new file mode 100644
> > index 000..413c88aa6af
> > --- /dev/null
> > +++ share/man/man4/ucc.4
> > @@ -0,0 +1,45 @@
> > +.\"$OpenBSD$
> > +.\"
> > +.\" Copyright (c) 2021 Anton Lindqvist 
> > +.\"
> > +.\" Permission to use, copy, modify, and distribute this software for any
> > +.\" purpose with or without fee is hereby granted, provided that the above
> > +.\" copyright notice and this permission notice appear in all copies.
> > +.\"
> > +.\" THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL 
> > WARRANTIES
> > +.\" WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
> > +.\" MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
> > +.\" ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
> > +.\" WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
> > +.\" ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
> > +.\" OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
> > +.\"
> > +.Dd $Mdocdate$
> > +.Dt UCC 4
> > +.Os
> > +.Sh NAME
> > +.Nm ucc
> > +.Nd Consumer Control keyboards
> > +.Sh SYNOPSIS
> > +.Cd "ucc* at uhidev?"
> > +.Cd "wsbkd* at ucc? mux 1"
> > +.Sh DESCRIPTION
> > +The
> > +.Nm
> > +driver provides support for Consumer Control pseudo keyboards, often used 
> > to
> > +expose audio and application launch keys.
> > +.Sh SEE ALSO
> > +.Xr intro 4 ,
> > +.Xr uhidev 4 ,
> > +.Xr usb 4 ,
> > +.Xr wskbd 4
> > +.Sh HISTORY
> > +The
> > +.Nm
> > +driver first appeared in
> > +.Ox 7.0 .
> > +.Sh AUTHORS
> > +The
> > +.Nm
> > +driver was written by
> > +.An Anton Lindqvist Aq Mt an...@openbsd.org .
> > diff --git share/man/man4/uhidev.4 share/man/man4/uhidev.4
> > index 02252789a3f..d398c564bd5 100644
> > --- share/man/man4/uhidev.4
> > +++ share/man/man4/uhidev.4
> > @@ -37,6 +37,7 @@
> >  .Sh SYNOPSIS
> >  .Cd "uhidev*  at uhub?"
> >  .Cd "fido*at uhidev?"
> > +.Cd "ucc* at uhidev?"
> >  .Cd "ucycom*  at uhidev?"
> >  .Cd "ugold*   at uhidev?"
> >  .Cd "uhid*at uhidev?"
> > @@ -72,6 +73,7 @@ only dispatches data to them based on the report id.
> >  .Sh SEE 

Re: Do not spin on the NET_LOCK() in kqueue

2021-08-02 Thread Martin Pieuchot
On 29/07/21(Thu) 15:36, Alexander Bluhm wrote:
> > > New diff fixing a locking dance pointed out by visa@.
> 
> Not tested this one yet.  But here is a combination of all the
> others.
> 
> http://bluhm.genua.de/perform/results/2021-07-27T07:41:29Z/perform.html

Thanks for testing.

These tests show that the contention is moving around.  But as it is now
there is no visible effect on having solock-free kqueue filters.  So I'm
retracting this diff for now as it only postpones what we really care
about: removing the KERNEL_LOCK() from sowakeup().



Re: Do not spin on the NET_LOCK() in kqueue

2021-07-29 Thread Martin Pieuchot
On 26/07/21(Mon) 09:23, Martin Pieuchot wrote:
> On 26/07/21(Mon) 08:55, Martin Pieuchot wrote:
> > On 21/07/21(Wed) 10:18, Martin Pieuchot wrote:
> > > On 11/07/21(Sun) 14:45, Visa Hankala wrote:
> > > > On Sat, Jul 10, 2021 at 05:26:57PM +0200, Martin Pieuchot wrote:
> > > > > One of the reasons for the drop of performances in the kqueue-based
> > > > > poll/select is the fact that kqueue filters are called up to 3 times
> > > > > per syscall and that they all spin on the NET_LOCK() for TCP/UDP
> > > > > packets.
> > > > > 
> > > > > Diff below is a RFC for improving the situation.
> > > > > 
> > > > > socket kqueue filters mainly check for the amount of available items 
> > > > > to
> > > > > read/write.  This involves comparing various socket buffer fields 
> > > > > (sb_cc,
> > > > > sb_lowat, etc).  The diff below introduces a new mutex to serialize
> > > > > updates of those fields with reads in the kqueue filters.
> > > > > 
> > > > > Since these fields are always modified with the socket lock held, 
> > > > > either
> > > > > the mutex or the solock are enough to have a coherent view of them.
> > > > > Note that either of these locks is necessary only if multiple fields
> > > > > have to be read (like in sbspace()).
> > > > > 
> > > > > Other per-socket fields accessed in the kqueue filters are never
> > > > > combined (with &&) to determine a condition.  So assuming it is fine 
> > > > > to
> > > > > read register-sized fields w/o the socket lock we can safely remove it
> > > > > there.
> > > > > 
> > > > > Could such mutex also be used to serialize klist updates?
> > > > 
> > > > I think the lock should be such that it can serialize socket klists.
> > > > 
> > > > As the main motivator for this change is kqueue, the viability of using
> > > > the mutex for the klist locking should be checked now. The mutex has to
> > > > be held whenever calling KNOTE() on sb_sel.si_note, or selwakeup() on
> > > > sb_sel. Then the socket f_event callbacks will not need to lock the
> > > > mutex themselves.
> > > > 
> > > > I had a diff that serialized socket klists using solock(). It did not
> > > > work well because it increased lock contention, especially when using
> > > > kqueue as backend for poll(2) and select(2). The diff is not even
> > > > correct any longer since recent changes to socket locking have
> > > > introduced new lock order constraints that conflict with it.
> > > 
> > > Updated diff below does that.  It also uses a single per-socket mutex as
> > > suggested by bluhm@.
> > > 
> > Note that as long as poll(2) & select(2) use the current implementation a
> > > KERNEL_LOCK()/UNLOCK() dance is necessary in sowakeup().  The goal of
> > > this change combined with the poll/select rewrite is to get rid of this
> > > dance.
> > 
> > Updated diff after recent commits, more comments?  Oks?
> 
> Previous diff had a double mtx_enter() in filt_fifowrite_common(), this
> one uses the *locked() version of sbspace() to prevent it.

New diff fixing a locking dance pointed out by visa@.

Index: kern/uipc_socket.c
===
RCS file: /cvs/src/sys/kern/uipc_socket.c,v
retrieving revision 1.264
diff -u -p -r1.264 uipc_socket.c
--- kern/uipc_socket.c  26 Jul 2021 05:51:13 -  1.264
+++ kern/uipc_socket.c  29 Jul 2021 07:31:32 -
@@ -84,7 +84,7 @@ int   filt_solistenprocess(struct knote *k
 intfilt_solisten_common(struct knote *kn, struct socket *so);
 
 const struct filterops solisten_filtops = {
-   .f_flags= FILTEROP_ISFD,
+   .f_flags= FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach   = NULL,
.f_detach   = filt_sordetach,
.f_event= filt_solisten,
@@ -93,7 +93,7 @@ const struct filterops solisten_filtops 
 };
 
 const struct filterops soread_filtops = {
-   .f_flags= FILTEROP_ISFD,
+   .f_flags= FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach   = NULL,
.f_detach   = filt_sordetach,
.f_event= filt_soread,
@@ -102,7 +102,7 @@ const struct filterops soread_filtops = 
 };
 
 const struct filterops sowrite_filtops = {
-   .f_flags= FILTEROP_ISFD,
+   .f_flags= FILTEROP_ISFD | FILTEROP_MPSA

Re: new kqueue-based select(2) implementation

2021-07-26 Thread Martin Pieuchot
On 21/07/21(Wed) 09:19, Martin Pieuchot wrote:
> On 23/06/21(Wed) 15:53, Alexander Bluhm wrote:
> > On Wed, Jun 23, 2021 at 11:40:18AM +0200, Martin Pieuchot wrote:
> > > Our previous attempt [0] to replace the current select(2) implementation
> > > has been reverted due to non-acceptable latency increase on sockets [1].
> > 
> > I have measured the performance difference.
> > 
> > http://bluhm.genua.de/perform/results/2021-06-21T09%3A44%3A18Z/perform.html
> > 
> > Worst 20% throughput drop is in 'iperf3 -c10.3.45.35 -u -b10G -w1m
> > -t10 -R' which can be seen here.
> > 
> > http://bluhm.genua.de/perform/results/2021-06-21T09%3A44%3A18Z/gnuplot/udp.html
> > 
> > Note that iperf3 calls select(2) multiple times per UDP packet.
> > 
> > As a new feature I have links to btrace kstack flame graphs in the
> > table.
> 
> Thanks a lot for the tests.  The FlameGraphs have shown that lazy
> removal wasn't working correctly.  Updated diff below now works as
> expected.
> 
> I'm aware of the throughput drop in the UDP iperf3 test; this is not a
> real-world scenario, so I don't consider it a blocker.  However it is
> very useful to check the contention on the NET_LOCK() in select(2).  I'm
> working on this issue on another thread, but there's an interdependency
> between the two diffs, due to lock ordering. 

Updated diff after recent commits from visa@.

Index: kern/sys_generic.c
===
RCS file: /cvs/src/sys/kern/sys_generic.c,v
retrieving revision 1.135
diff -u -p -r1.135 sys_generic.c
--- kern/sys_generic.c  8 Jan 2021 09:29:04 -   1.135
+++ kern/sys_generic.c  26 Jul 2021 06:56:22 -
@@ -55,6 +55,7 @@
 #include 
 #include 
 #include 
+#include 
 #ifdef KTRACE
 #include 
 #endif
@@ -66,8 +67,21 @@
 
 #include 
 
-int selscan(struct proc *, fd_set *, fd_set *, int, int, register_t *);
-void pollscan(struct proc *, struct pollfd *, u_int, register_t *);
+/*
+ * Debug values:
+ *  1 - print implementation errors, things that should not happen.
+ *  2 - print ppoll(2) information, somewhat verbose
+ *  3 - print pselect(2) and ppoll(2) information, very verbose
+ */
+int kqpoll_debug = 0;
+#define DPRINTFN(v, x...) if (kqpoll_debug > v) {  \
+   printf("%s(%d): ", curproc->p_p->ps_comm, curproc->p_tid);  \
+   printf(x);  \
+}
+
+int pselregister(struct proc *, fd_set *[], int, int *);
+int pselcollect(struct proc *, struct kevent *, fd_set *[], int *);
+
 int pollout(struct pollfd *, struct pollfd *, u_int);
 int dopselect(struct proc *, int, fd_set *, fd_set *, fd_set *,
 struct timespec *, const sigset_t *, register_t *);
@@ -582,13 +596,12 @@ sys_pselect(struct proc *p, void *v, reg
 
 int
 dopselect(struct proc *p, int nd, fd_set *in, fd_set *ou, fd_set *ex,
-struct timespec *timeout, const sigset_t *sigmask, register_t *retval)
+struct timespec *tsp, const sigset_t *sigmask, register_t *retval)
 {
+   struct kqueue_scan_state scan;
fd_mask bits[6];
fd_set *pibits[3], *pobits[3];
-   struct timespec elapsed, start, stop;
-   uint64_t nsecs;
-   int s, ncoll, error = 0;
+   int error, n, ncollected = 0, nevents = 0;
u_int ni;
 
if (nd < 0)
@@ -618,6 +631,8 @@ dopselect(struct proc *p, int nd, fd_set
pobits[2] = (fd_set *)&bits[5];
}
 
+   kqpoll_init();
+
 #definegetbits(name, x) \
if (name && (error = copyin(name, pibits[x], ni))) \
goto done;
@@ -636,43 +651,65 @@ dopselect(struct proc *p, int nd, fd_set
if (sigmask)
dosigsuspend(p, *sigmask &~ sigcantmask);
 
-retry:
-   ncoll = nselcoll;
-   atomic_setbits_int(&p->p_flag, P_SELECT);
-   error = selscan(p, pibits[0], pobits[0], nd, ni, retval);
-   if (error || *retval)
+   /* Register kqueue events */
+   error = pselregister(p, pibits, nd, &nevents);
+   if (error != 0)
goto done;
-   if (timeout == NULL || timespecisset(timeout)) {
-   if (timeout != NULL) {
-   getnanouptime(&start);
-   nsecs = MIN(TIMESPEC_TO_NSEC(timeout), MAXTSLP);
-   } else
-   nsecs = INFSLP;
-   s = splhigh();
-   if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
-   splx(s);
-   goto retry;
-   }
-   atomic_clearbits_int(&p->p_flag, P_SELECT);
-   error = tsleep_nsec(&selwait, PSOCK | PCATCH, "select", nsecs);
-   splx(s);
-   if (timeout != NULL) {
-   getnanouptime(&stop);
-   timespecsub(&stop, &start, &elapsed);
-  

Re: Do not spin on the NET_LOCK() in kqueue

2021-07-26 Thread Martin Pieuchot
On 26/07/21(Mon) 08:55, Martin Pieuchot wrote:
> On 21/07/21(Wed) 10:18, Martin Pieuchot wrote:
> > On 11/07/21(Sun) 14:45, Visa Hankala wrote:
> > > On Sat, Jul 10, 2021 at 05:26:57PM +0200, Martin Pieuchot wrote:
> > > > One of the reasons for the drop of performances in the kqueue-based
> > > > poll/select is the fact that kqueue filters are called up to 3 times
> > > > per syscall and that they all spin on the NET_LOCK() for TCP/UDP
> > > > packets.
> > > > 
> > > > Diff below is a RFC for improving the situation.
> > > > 
> > > > socket kqueue filters mainly check for the amount of available items to
> > > > read/write.  This involves comparing various socket buffer fields 
> > > > (sb_cc,
> > > > sb_lowat, etc).  The diff below introduces a new mutex to serialize
> > > > updates of those fields with reads in the kqueue filters.
> > > > 
> > > > Since these fields are always modified with the socket lock held, either
> > > > the mutex or the solock are enough to have a coherent view of them.
> > > > Note that either of these locks is necessary only if multiple fields
> > > > have to be read (like in sbspace()).
> > > > 
> > > > Other per-socket fields accessed in the kqueue filters are never
> > > > combined (with &&) to determine a condition.  So assuming it is fine to
> > > > read register-sized fields w/o the socket lock we can safely remove it
> > > > there.
> > > > 
> > > > Could such mutex also be used to serialize klist updates?
> > > 
> > > I think the lock should be such that it can serialize socket klists.
> > > 
> > > As the main motivator for this change is kqueue, the viability of using
> > > the mutex for the klist locking should be checked now. The mutex has to
> > > be held whenever calling KNOTE() on sb_sel.si_note, or selwakeup() on
> > > sb_sel. Then the socket f_event callbacks will not need to lock the
> > > mutex themselves.
> > > 
> > > I had a diff that serialized socket klists using solock(). It did not
> > > work well because it increased lock contention, especially when using
> > > kqueue as backend for poll(2) and select(2). The diff is not even
> > > correct any longer since recent changes to socket locking have
> > > introduced new lock order constraints that conflict with it.
> > 
> > Updated diff below does that.  It also uses a single per-socket mutex as
> > suggested by bluhm@.
> > 
> > Note that as long as poll(2) & select(2) use the current implementation a
> > KERNEL_LOCK()/UNLOCK() dance is necessary in sowakeup().  The goal of
> > this change combined with the poll/select rewrite is to get rid of this
> > dance.
> 
> Updated diff after recent commits, more comments?  Oks?

Previous diff had a double mtx_enter() in filt_fifowrite_common(), this
one uses the *locked() version of sbspace() to prevent it.
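
The pattern, sketched (names assumed from the description above, not
copied from the diff): the filter already holds the per-socket mutex, so
it must call the variant that does not take it again:

	long
	sbspace_locked(struct socket *so, struct sockbuf *sb)
	{
		MUTEX_ASSERT_LOCKED(&so->so_mtx);
		return lmin(sb->sb_hiwat - sb->sb_cc,
		    sb->sb_mbmax - sb->sb_mbcnt);
	}

	long
	sbspace(struct socket *so, struct sockbuf *sb)
	{
		long ret;

		mtx_enter(&so->so_mtx);
		ret = sbspace_locked(so, sb);
		mtx_leave(&so->so_mtx);
		return ret;
	}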


Index: kern/uipc_socket.c
===
RCS file: /cvs/src/sys/kern/uipc_socket.c,v
retrieving revision 1.264
diff -u -p -r1.264 uipc_socket.c
--- kern/uipc_socket.c  26 Jul 2021 05:51:13 -  1.264
+++ kern/uipc_socket.c  26 Jul 2021 07:20:58 -
@@ -84,7 +84,7 @@ int   filt_solistenprocess(struct knote *k
 intfilt_solisten_common(struct knote *kn, struct socket *so);
 
 const struct filterops solisten_filtops = {
-   .f_flags= FILTEROP_ISFD,
+   .f_flags= FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach   = NULL,
.f_detach   = filt_sordetach,
.f_event= filt_solisten,
@@ -93,7 +93,7 @@ const struct filterops solisten_filtops 
 };
 
 const struct filterops soread_filtops = {
-   .f_flags= FILTEROP_ISFD,
+   .f_flags= FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach   = NULL,
.f_detach   = filt_sordetach,
.f_event= filt_soread,
@@ -102,7 +102,7 @@ const struct filterops soread_filtops = 
 };
 
 const struct filterops sowrite_filtops = {
-   .f_flags= FILTEROP_ISFD,
+   .f_flags= FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach   = NULL,
.f_detach   = filt_sowdetach,
.f_event= filt_sowrite,
@@ -111,7 +111,7 @@ const struct filterops sowrite_filtops =
 };
 
 const struct filterops soexcept_filtops = {
-   .f_flags= FILTEROP_ISFD,
+   .f_flags= FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach   = NULL,
.f_detach   = filt_sordetach,
.f_event= filt_soread,
@@ -181,6 +181,9 @@ socr

Re: Do not spin on the NET_LOCK() in kqueue

2021-07-26 Thread Martin Pieuchot
On 21/07/21(Wed) 10:18, Martin Pieuchot wrote:
> On 11/07/21(Sun) 14:45, Visa Hankala wrote:
> > On Sat, Jul 10, 2021 at 05:26:57PM +0200, Martin Pieuchot wrote:
> > > One of the reasons for the drop of performances in the kqueue-based
> > > poll/select is the fact that kqueue filters are called up to 3 times
> > > per syscall and that they all spin on the NET_LOCK() for TCP/UDP
> > > packets.
> > > 
> > > Diff below is a RFC for improving the situation.
> > > 
> > > socket kqueue filters mainly check for the amount of available items to
> > > read/write.  This involves comparing various socket buffer fields (sb_cc,
> > > sb_lowat, etc).  The diff below introduces a new mutex to serialize
> > > updates of those fields with reads in the kqueue filters.
> > > 
> > > Since these fields are always modified with the socket lock held, either
> > > the mutex or the solock are enough to have a coherent view of them.
> > > Note that either of these locks is necessary only if multiple fields
> > > have to be read (like in sbspace()).
> > > 
> > > Other per-socket fields accessed in the kqueue filters are never
> > > combined (with &&) to determine a condition.  So assuming it is fine to
> > > read register-sized fields w/o the socket lock we can safely remove it
> > > there.
> > > 
> > > Could such mutex also be used to serialize klist updates?
> > 
> > I think the lock should be such that it can serialize socket klists.
> > 
> > As the main motivator for this change is kqueue, the viability of using
> > the mutex for the klist locking should be checked now. The mutex has to
> > be held whenever calling KNOTE() on sb_sel.si_note, or selwakeup() on
> > sb_sel. Then the socket f_event callbacks will not need to lock the
> > mutex themselves.
> > 
> > I had a diff that serialized socket klists using solock(). It did not
> > work well because it increased lock contention, especially when using
> > kqueue as backend for poll(2) and select(2). The diff is not even
> > correct any longer since recent changes to socket locking have
> > introduced new lock order constraints that conflict with it.
> 
> Updated diff below does that.  It also uses a single per-socket mutex as
> suggested by bluhm@.
> 
> Note that as long as poll(2) & select(2) use the current implementation a
> KERNEL_LOCK()/UNLOCK() dance is necessary in sowakeup().  The goal of
> this change combined with the poll/select rewrite is to get rid of this
> dance.

Updated diff after recent commits, more comments?  Oks?

Index: kern/uipc_socket.c
===
RCS file: /cvs/src/sys/kern/uipc_socket.c,v
retrieving revision 1.264
diff -u -p -r1.264 uipc_socket.c
--- kern/uipc_socket.c  26 Jul 2021 05:51:13 -  1.264
+++ kern/uipc_socket.c  26 Jul 2021 05:57:45 -
@@ -84,7 +84,7 @@ int   filt_solistenprocess(struct knote *k
 intfilt_solisten_common(struct knote *kn, struct socket *so);
 
 const struct filterops solisten_filtops = {
-   .f_flags= FILTEROP_ISFD,
+   .f_flags= FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach   = NULL,
.f_detach   = filt_sordetach,
.f_event= filt_solisten,
@@ -93,7 +93,7 @@ const struct filterops solisten_filtops 
 };
 
 const struct filterops soread_filtops = {
-   .f_flags= FILTEROP_ISFD,
+   .f_flags= FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach   = NULL,
.f_detach   = filt_sordetach,
.f_event= filt_soread,
@@ -102,7 +102,7 @@ const struct filterops soread_filtops = 
 };
 
 const struct filterops sowrite_filtops = {
-   .f_flags= FILTEROP_ISFD,
+   .f_flags= FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach   = NULL,
.f_detach   = filt_sowdetach,
.f_event= filt_sowrite,
@@ -111,7 +111,7 @@ const struct filterops sowrite_filtops =
 };
 
 const struct filterops soexcept_filtops = {
-   .f_flags= FILTEROP_ISFD,
+   .f_flags= FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach   = NULL,
.f_detach   = filt_sordetach,
.f_event= filt_soread,
@@ -181,6 +181,9 @@ socreate(int dom, struct socket **aso, i
so->so_egid = p->p_ucred->cr_gid;
so->so_cpid = p->p_p->ps_pid;
so->so_proto = prp;
+   mtx_init(&so->so_mtx, IPL_MPFLOOR);
+   klist_init_mutex(&so->so_snd.sb_sel.si_note, &so->so_mtx);
+   klist_init_mutex(&so->so_rcv.sb_sel.si_note, &so->so_mtx);
so->so_snd.sb_timeo_nsecs = INFSLP;
so->so_rcv.sb_timeo_nsecs = INF

Re: Pass "socket *" to sballoc/sbfree & co

2021-07-25 Thread Martin Pieuchot
On 24/07/21(Sat) 11:43, Martin Pieuchot wrote:
> Diff below adds an extra argument, a pointer to the socket corresponding
> to the buffer given to: sballoc(), sbfree(), sbcompress(), sbcheck() and
> sbdroprecord().
> 
> This pointer will be used to assert for or grab a per-socket lock.
> 
> There is no functional change in this diff.  Its goal is to simplify the
> review (and possible revert) of the change introducing `so_mtx' [0].
> 
> Note that this diff includes the removal of sbinsertoob() sent
> previously.

Updated diff now that sbinsertoob() has been removed.

> [0] https://marc.info/?l=openbsd-tech=162685565421248=2

ok?

Index: kern/uipc_socket.c
===
RCS file: /cvs/src/sys/kern/uipc_socket.c,v
retrieving revision 1.263
diff -u -p -r1.263 uipc_socket.c
--- kern/uipc_socket.c  28 May 2021 16:24:53 -  1.263
+++ kern/uipc_socket.c  25 Jul 2021 14:14:28 -
@@ -860,7 +860,7 @@ dontblock:
*paddr = m_copym(m, 0, m->m_len, M_NOWAIT);
m = m->m_next;
} else {
-   sbfree(&so->so_rcv, m);
+   sbfree(so, &so->so_rcv, m);
if (paddr) {
*paddr = m;
so->so_rcv.sb_mb = m->m_next;
@@ -884,7 +884,7 @@ dontblock:
*controlp = m_copym(m, 0, m->m_len, M_NOWAIT);
m = m->m_next;
} else {
-   sbfree(&so->so_rcv, m);
+   sbfree(so, &so->so_rcv, m);
so->so_rcv.sb_mb = m->m_next;
m->m_nextpkt = m->m_next = NULL;
cm = m;
@@ -984,7 +984,7 @@ dontblock:
orig_resid = 0;
} else {
nextrecord = m->m_nextpkt;
-   sbfree(&so->so_rcv, m);
+   sbfree(so, &so->so_rcv, m);
if (mp) {
*mp = m;
mp = >m_next;
@@ -1065,7 +1065,7 @@ dontblock:
if (m && pr->pr_flags & PR_ATOMIC) {
flags |= MSG_TRUNC;
if ((flags & MSG_PEEK) == 0)
-   (void) sbdroprecord(&so->so_rcv);
+   (void) sbdroprecord(so, &so->so_rcv);
}
if ((flags & MSG_PEEK) == 0) {
if (m == NULL) {
@@ -1452,7 +1452,7 @@ somove(struct socket *so, int wait)
while (m && m->m_type == MT_CONTROL)
m = m->m_next;
if (m == NULL) {
-   sbdroprecord(&so->so_rcv);
+   sbdroprecord(so, &so->so_rcv);
if (so->so_proto->pr_flags & PR_WANTRCVD && so->so_pcb)
(so->so_proto->pr_usrreq)(so, PRU_RCVD, NULL,
NULL, NULL, NULL);
@@ -1492,7 +1492,7 @@ somove(struct socket *so, int wait)
 * that the whole first record can be processed.
 */
m = so->so_rcv.sb_mb;
-   sbfree(&so->so_rcv, m);
+   sbfree(so, &so->so_rcv, m);
so->so_rcv.sb_mb = m_free(m);
sbsync(&so->so_rcv, nextrecord);
}
@@ -1502,7 +1502,7 @@ somove(struct socket *so, int wait)
 */
m = so->so_rcv.sb_mb;
while (m && m->m_type == MT_CONTROL) {
-   sbfree(&so->so_rcv, m);
+   sbfree(so, &so->so_rcv, m);
so->so_rcv.sb_mb = m_free(m);
m = so->so_rcv.sb_mb;
sbsync(&so->so_rcv, nextrecord);
@@ -1541,7 +1541,7 @@ somove(struct socket *so, int wait)
so->so_rcv.sb_datacc -= size;
} else {
*mp = so->so_rcv.sb_mb;
-   sbfree(&so->so_rcv, *mp);
+   sbfree(so, &so->so_rcv, *mp);
so->so_rcv.sb_mb = (*mp)->m_next;
sbsync(&so->so_rcv, nextrecord);
}
@@ -1550,7 +1550,7 @@ somove(struct socket *so, int wait)
 
SBLASTRECORDCHK(&so->so_rcv, "somove 3");
SBLASTMBUFCHK(&so->so_rcv, "somove 3");
-   SBCHECK(&so->so_rcv);
+   SBCHECK(so, &so->so_rcv);
if (m == NULL)
goto release;
m->m_nextpkt = NULL;
Index: kern/uipc_socket2.c
===
RCS file: /cvs/src/sys/kern/uipc_socket2.c,v
retrieving revision 1.112
diff -u -p -r1.112 uipc_socket2.c
--- kern/uipc_socket2.c 25 Jul 2021 14:13:47 -  1.112
+++ kern/uipc_socket2.c 25 Jul 2021 14:14:28 -
@@ -654,7 +654,7 @@ sbappend(struct 

Pass "socket *" to sballoc/sbfree & co

2021-07-24 Thread Martin Pieuchot
Diff below adds an extra argument, a pointer to the socket corresponding
to the buffer given to: sballoc(), sbfree(), sbcompress(), sbcheck() and
sbdroprecord().

This pointer will be used to assert for or grab a per-socket lock.

There is no functional change in this diff.  Its goal is to simplify the
review (and possible revert) of the change introducing `so_mtx' [0].

Note that this diff includes the removal of sbinsertoob() sent
previously.

ok?

[0] https://marc.info/?l=openbsd-tech=162685565421248=2

Index: kern/uipc_socket.c
===
RCS file: /cvs/src/sys/kern/uipc_socket.c,v
retrieving revision 1.263
diff -u -p -r1.263 uipc_socket.c
--- kern/uipc_socket.c  28 May 2021 16:24:53 -  1.263
+++ kern/uipc_socket.c  24 Jul 2021 09:32:10 -
@@ -860,7 +860,7 @@ dontblock:
*paddr = m_copym(m, 0, m->m_len, M_NOWAIT);
m = m->m_next;
} else {
-   sbfree(&so->so_rcv, m);
+   sbfree(so, &so->so_rcv, m);
if (paddr) {
*paddr = m;
so->so_rcv.sb_mb = m->m_next;
@@ -884,7 +884,7 @@ dontblock:
*controlp = m_copym(m, 0, m->m_len, M_NOWAIT);
m = m->m_next;
} else {
-   sbfree(&so->so_rcv, m);
+   sbfree(so, &so->so_rcv, m);
so->so_rcv.sb_mb = m->m_next;
m->m_nextpkt = m->m_next = NULL;
cm = m;
@@ -984,7 +984,7 @@ dontblock:
orig_resid = 0;
} else {
nextrecord = m->m_nextpkt;
-   sbfree(&so->so_rcv, m);
+   sbfree(so, &so->so_rcv, m);
if (mp) {
*mp = m;
mp = >m_next;
@@ -1065,7 +1065,7 @@ dontblock:
if (m && pr->pr_flags & PR_ATOMIC) {
flags |= MSG_TRUNC;
if ((flags & MSG_PEEK) == 0)
-   (void) sbdroprecord(&so->so_rcv);
+   (void) sbdroprecord(so, &so->so_rcv);
}
if ((flags & MSG_PEEK) == 0) {
if (m == NULL) {
@@ -1452,7 +1452,7 @@ somove(struct socket *so, int wait)
while (m && m->m_type == MT_CONTROL)
m = m->m_next;
if (m == NULL) {
-   sbdroprecord(&so->so_rcv);
+   sbdroprecord(so, &so->so_rcv);
if (so->so_proto->pr_flags & PR_WANTRCVD && so->so_pcb)
(so->so_proto->pr_usrreq)(so, PRU_RCVD, NULL,
NULL, NULL, NULL);
@@ -1492,7 +1492,7 @@ somove(struct socket *so, int wait)
 * that the whole first record can be processed.
 */
m = so->so_rcv.sb_mb;
-   sbfree(&so->so_rcv, m);
+   sbfree(so, &so->so_rcv, m);
so->so_rcv.sb_mb = m_free(m);
sbsync(&so->so_rcv, nextrecord);
}
@@ -1502,7 +1502,7 @@ somove(struct socket *so, int wait)
 */
m = so->so_rcv.sb_mb;
while (m && m->m_type == MT_CONTROL) {
-   sbfree(&so->so_rcv, m);
+   sbfree(so, &so->so_rcv, m);
so->so_rcv.sb_mb = m_free(m);
m = so->so_rcv.sb_mb;
sbsync(&so->so_rcv, nextrecord);
@@ -1541,7 +1541,7 @@ somove(struct socket *so, int wait)
so->so_rcv.sb_datacc -= size;
} else {
*mp = so->so_rcv.sb_mb;
-   sbfree(&so->so_rcv, *mp);
+   sbfree(so, &so->so_rcv, *mp);
so->so_rcv.sb_mb = (*mp)->m_next;
sbsync(&so->so_rcv, nextrecord);
}
@@ -1550,7 +1550,7 @@ somove(struct socket *so, int wait)
 
SBLASTRECORDCHK(&so->so_rcv, "somove 3");
SBLASTMBUFCHK(&so->so_rcv, "somove 3");
-   SBCHECK(&so->so_rcv);
+   SBCHECK(so, &so->so_rcv);
if (m == NULL)
goto release;
m->m_nextpkt = NULL;
Index: kern/uipc_socket2.c
===
RCS file: /cvs/src/sys/kern/uipc_socket2.c,v
retrieving revision 1.111
diff -u -p -r1.111 uipc_socket2.c
--- kern/uipc_socket2.c 7 Jun 2021 09:10:32 -   1.111
+++ kern/uipc_socket2.c 24 Jul 2021 09:34:21 -
@@ -654,7 +654,7 @@ sbappend(struct socket *so, struct sockb
 */
sb->sb_lastrecord = m;
}
-   sbcompress(sb, m, n);
+   sbcompress(so, sb, m, n);
SBLASTRECORDCHK(sb, "sbappend 2");
 }
 
@@ -673,7 +673,7 @@ sbappendstream(struct socket *so, struct
 
SBLASTMBUFCHK(sb, __func__);
 
-   sbcompress(sb, m, sb->sb_mbtail);
+   sbcompress(so, sb, m, sb->sb_mbtail);
 

Kill sbinsertoob()

2021-07-24 Thread Martin Pieuchot
This function is unused; killing it means less refactoring is needed to
switch to a per-socket mutex serializing event notifications.

ok?

Index: kern/uipc_socket2.c
===
RCS file: /cvs/src/sys/kern/uipc_socket2.c,v
retrieving revision 1.111
diff -u -p -r1.111 uipc_socket2.c
--- kern/uipc_socket2.c 7 Jun 2021 09:10:32 -   1.111
+++ kern/uipc_socket2.c 24 Jul 2021 09:24:01 -
@@ -737,55 +737,6 @@ sbappendrecord(struct socket *so, struct
 }
 
 /*
- * As above except that OOB data
- * is inserted at the beginning of the sockbuf,
- * but after any other OOB data.
- */
-void
-sbinsertoob(struct sockbuf *sb, struct mbuf *m0)
-{
-   struct mbuf *m, **mp;
-
-   if (m0 == NULL)
-   return;
-
-   SBLASTRECORDCHK(sb, "sbinsertoob 1");
-
-   for (mp = &sb->sb_mb; (m = *mp) != NULL; mp = &((*mp)->m_nextpkt)) {
-   again:
-   switch (m->m_type) {
-
-   case MT_OOBDATA:
-   continue;   /* WANT next train */
-
-   case MT_CONTROL:
-   if ((m = m->m_next) != NULL)
-   goto again; /* inspect THIS train further */
-   }
-   break;
-   }
-   /*
-* Put the first mbuf on the queue.
-* Note this permits zero length records.
-*/
-   sballoc(sb, m0);
-   m0->m_nextpkt = *mp;
-   if (*mp == NULL) {
-   /* m0 is actually the new tail */
-   sb->sb_lastrecord = m0;
-   }
-   *mp = m0;
-   m = m0->m_next;
-   m0->m_next = NULL;
-   if (m && (m0->m_flags & M_EOR)) {
-   m0->m_flags &= ~M_EOR;
-   m->m_flags |= M_EOR;
-   }
-   sbcompress(sb, m, m0);
-   SBLASTRECORDCHK(sb, "sbinsertoob 2");
-}
-
-/*
  * Append address and data, and optionally, control (ancillary) data
  * to the receive queue of a socket.  If present,
  * m0 must include a packet header with total length.
Index: sys/socketvar.h
===
RCS file: /cvs/src/sys/sys/socketvar.h,v
retrieving revision 1.98
diff -u -p -r1.98 socketvar.h
--- sys/socketvar.h 7 Jun 2021 09:10:32 -   1.98
+++ sys/socketvar.h 24 Jul 2021 09:23:59 -
@@ -293,7 +293,6 @@ struct mbuf *
 void   sbdrop(struct socket *, struct sockbuf *, int);
 void   sbdroprecord(struct sockbuf *);
 void   sbflush(struct socket *, struct sockbuf *);
-void   sbinsertoob(struct sockbuf *, struct mbuf *);
 void   sbrelease(struct socket *, struct sockbuf *);
 intsbcheckreserve(u_long, u_long);
 intsbchecklowmem(void);



Re: Do not spin on the NET_LOCK() in kqueue

2021-07-21 Thread Martin Pieuchot
On 11/07/21(Sun) 14:45, Visa Hankala wrote:
> On Sat, Jul 10, 2021 at 05:26:57PM +0200, Martin Pieuchot wrote:
> > One of the reasons for the drop of performances in the kqueue-based
> > poll/select is the fact that kqueue filters are called up to 3 times
> > per syscall and that they all spin on the NET_LOCK() for TCP/UDP
> > packets.
> > 
> > Diff below is a RFC for improving the situation.
> > 
> > socket kqueue filters mainly check for the amount of available items to
> > read/write.  This involves comparing various socket buffer fields (sb_cc,
> > sb_lowat, etc).  The diff below introduces a new mutex to serialize
> > updates of those fields with reads in the kqueue filters.
> > 
> > Since these fields are always modified with the socket lock held, either
> > the mutex or the solock are enough to have a coherent view of them.
> > Note that either of these locks is necessary only if multiple fields
> > have to be read (like in sbspace()).
> > 
> > Other per-socket fields accessed in the kqueue filters are never
> > combined (with &&) to determine a condition.  So assuming it is fine to
> > read register-sized fields w/o the socket lock we can safely remove it
> > there.
> > 
> > Could such mutex also be used to serialize klist updates?
> 
> I think the lock should be such that it can serialize socket klists.
> 
> As the main motivator for this change is kqueue, the viability of using
> the mutex for the klist locking should be checked now. The mutex has to
> be held whenever calling KNOTE() on sb_sel.si_note, or selwakeup() on
> sb_sel. Then the socket f_event callbacks will not need to lock the
> mutex themselves.
> 
> I had a diff that serialized socket klists using solock(). It did not
> work well because it increased lock contention, especially when using
> kqueue as backend for poll(2) and select(2). The diff is not even
> correct any longer since recent changes to socket locking have
> introduced new lock order constraints that conflict with it.

Updated diff below does that.  It also uses a single per-socket mutex as
suggested by bluhm@.

Note that as long as poll(2) & select(2) use the current implementation,
a KERNEL_LOCK()/UNLOCK() dance is necessary in sowakeup().  The goal of
this change, combined with the poll/select rewrite, is to get rid of
this dance.
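
For reference, the dance is roughly the following (a sketch only, not
the exact code from the diff):

    /*
     * sowakeup() sketch: the sockbuf fields are covered by the new
     * mutex, but selwakeup() is not MP-safe yet, hence the dance.
     */
    mtx_enter(&sb->sb_mtx);
    sb->sb_flags &= ~SB_SEL;
    mtx_leave(&sb->sb_mtx);
    KERNEL_LOCK();
    selwakeup(&sb->sb_sel);
    KERNEL_UNLOCK();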

Comments?

Index: kern/kern_event.c
===
RCS file: /cvs/src/sys/kern/kern_event.c,v
retrieving revision 1.167
diff -u -p -r1.167 kern_event.c
--- kern/kern_event.c   16 Jun 2021 14:26:30 -  1.167
+++ kern/kern_event.c   21 Jul 2021 07:21:56 -
@@ -1884,6 +1884,9 @@ knote_dequeue(struct knote *kn)
 void
 knote_modify(const struct kevent *kev, struct knote *kn)
 {
+   if ((kn->kn_fop->f_flags & FILTEROP_MPSAFE) == 0)
+   KERNEL_ASSERT_LOCKED();
+
kn->kn_sfflags = kev->fflags;
kn->kn_sdata = kev->data;
kn->kn_udata = kev->udata;
@@ -1897,6 +1900,9 @@ knote_modify(const struct kevent *kev, s
 void
 knote_submit(struct knote *kn, struct kevent *kev)
 {
+   if ((kn->kn_fop->f_flags & FILTEROP_MPSAFE) == 0)
+   KERNEL_ASSERT_LOCKED();
+
if (kev != NULL) {
*kev = kn->kn_kevent;
if (kn->kn_flags & EV_CLEAR) {
Index: kern/uipc_socket.c
===
RCS file: /cvs/src/sys/kern/uipc_socket.c,v
retrieving revision 1.263
diff -u -p -r1.263 uipc_socket.c
--- kern/uipc_socket.c  28 May 2021 16:24:53 -  1.263
+++ kern/uipc_socket.c  21 Jul 2021 07:21:56 -
@@ -84,7 +84,7 @@ int   filt_solistenprocess(struct knote *k
 intfilt_solisten_common(struct knote *kn, struct socket *so);
 
 const struct filterops solisten_filtops = {
-   .f_flags= FILTEROP_ISFD,
+   .f_flags= FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach   = NULL,
.f_detach   = filt_sordetach,
.f_event= filt_solisten,
@@ -93,7 +93,7 @@ const struct filterops solisten_filtops 
 };
 
 const struct filterops soread_filtops = {
-   .f_flags= FILTEROP_ISFD,
+   .f_flags= FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach   = NULL,
.f_detach   = filt_sordetach,
.f_event= filt_soread,
@@ -102,7 +102,7 @@ const struct filterops soread_filtops = 
 };
 
 const struct filterops sowrite_filtops = {
-   .f_flags= FILTEROP_ISFD,
+   .f_flags= FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach   = NULL,
.f_detach   = filt_sowdetach,
.f_event= filt_sowrite,
@@ -111,7 +111,7 @@ const struct filterops sowrite_filtops =
 };
 
 const struct filterops soexcept_filtops = {
- 

Re: new kqueue-based select(2) implementation

2021-07-21 Thread Martin Pieuchot
On 23/06/21(Wed) 15:53, Alexander Bluhm wrote:
> On Wed, Jun 23, 2021 at 11:40:18AM +0200, Martin Pieuchot wrote:
> > Our previous attempt [0] to replace the current select(2) implementation
> > has been reverted due to non-acceptable latency increase on sockets [1].
> 
> I have measured the performance difference.
> 
> http://bluhm.genua.de/perform/results/2021-06-21T09%3A44%3A18Z/perform.html
> 
> Worst 20% throughput drop is in 'iperf3 -c10.3.45.35 -u -b10G -w1m
> -t10 -R' which can be seen here.
> 
> http://bluhm.genua.de/perform/results/2021-06-21T09%3A44%3A18Z/gnuplot/udp.html
> 
> Note that iperf3 calls select(2) multiple times per UDP packet.
> 
> As a new feature I have links to btrace kstack flame graphs in the
> table.

Thanks a lot for the tests.  The FlameGraphs have shown that lazy
removal wasn't working correctly.  Updated diff below now works as
expected.

I'm aware of the throughput drop in the UDP iperf3 test; this is not a
real-world scenario so I don't consider it a blocker.  However it is
very useful to check the contention on the NET_LOCK() in select(2).  I'm
working on this issue in another thread, but there's an interdependency
between the two diffs due to lock ordering.

Comments?

Index: kern/kern_event.c
===
RCS file: /cvs/src/sys/kern/kern_event.c,v
retrieving revision 1.167
diff -u -p -r1.167 kern_event.c
--- kern/kern_event.c   16 Jun 2021 14:26:30 -  1.167
+++ kern/kern_event.c   13 Jul 2021 07:21:03 -
@@ -92,7 +92,7 @@ void  kqueue_do_check(struct kqueue *kq, 
 #define kqueue_check(kq)   do {} while (0)
 #endif
 
-void   kqpoll_dequeue(struct proc *p);
+void   kqpoll_dequeue(struct proc *p, int all);
 
 static int filter_attach(struct knote *kn);
 static voidfilter_detach(struct knote *kn);
@@ -720,12 +720,12 @@ kqpoll_init(void)
 
if (p->p_kq != NULL) {
/*
-* Discard any knotes that have been enqueued after
+* Discard any badfd knotes that have been enqueued after
 * previous scan.
-* This prevents accumulation of enqueued badfd knotes
-* in case scan does not make progress for some reason.
+* This prevents them from accumulating in case
+* scan does not make progress for some reason.
 */
-   kqpoll_dequeue(p);
+   kqpoll_dequeue(p, 0);
return;
}
 
@@ -747,7 +747,7 @@ kqpoll_exit(void)
 
kqueue_purge(p, p->p_kq);
/* Clear any detached knotes that remain in the queue. */
-   kqpoll_dequeue(p);
+   kqpoll_dequeue(p, 1);
kqueue_terminate(p, p->p_kq);
KASSERT(p->p_kq->kq_refs == 1);
KQRELE(p->p_kq);
@@ -755,33 +755,50 @@ kqpoll_exit(void)
 }
 
 void
-kqpoll_dequeue(struct proc *p)
+kqpoll_dequeue(struct proc *p, int all)
 {
+   struct knote marker;
struct knote *kn;
struct kqueue *kq = p->p_kq;
 
+   /* Bail out early without locking if the queue appears empty. */
+   if (kq->kq_count == 0)
+   return;
+
+   memset(&marker, 0, sizeof(marker));
+   marker.kn_filter = EVFILT_MARKER;
+   marker.kn_status = KN_PROCESSING;
+
mtx_enter(&kq->kq_lock);
-   while ((kn = TAILQ_FIRST(&kq->kq_head)) != NULL) {
+   kn = TAILQ_FIRST(&kq->kq_head);
+   while (kn != NULL) {
/* This kqueue should not be scanned by other threads. */
KASSERT(kn->kn_filter != EVFILT_MARKER);
 
-   if (!knote_acquire(kn, NULL, 0)) {
-   /* knote_acquire() has released kq_lock. */
-   mtx_enter(&kq->kq_lock);
+   if (all == 0 && (kn->kn_status & KN_ATTACHED)) {
+   kn = TAILQ_NEXT(kn, kn_tqe);
continue;
}
 
-   kqueue_check(kq);
-   TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
-   kn->kn_status &= ~KN_QUEUED;
-   kq->kq_count--;
-   mtx_leave(&kq->kq_lock);
+   TAILQ_INSERT_BEFORE(kn, &marker, kn_tqe);
+
+   if (!knote_acquire(kn, NULL, 0)) {
+   /* knote_acquire() has released kq_lock. */
+   } else {
+   kqueue_check(kq);
+   TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
+   kn->kn_status &= ~KN_QUEUED;
+   kq->kq_count--;
+   mtx_leave(&kq->kq_lock);
 
-   filter_detach(kn);
-   knote_drop(kn, p);
+   filter_detach(kn);
+   knote_drop(kn, p);
+   }
 
mtx_enter(&kq->kq_lock);
kqueue_check(kq);
+ 

Re: forwarding in parallel ipsec workaround

2021-07-21 Thread Martin Pieuchot
On 20/07/21(Tue) 15:46, Alexander Bluhm wrote:
> On Tue, Jul 20, 2021 at 02:26:02PM +0200, Alexander Bluhm wrote:
> > > Note that having multiple threads competing for an exclusive rwlock will
> > > generate unnecessary wakeup/sleep cycles every time the lock is released.
> > > It is valuable to keep this in mind as it might add extra latency when
> > > processing packets.
> > 
> > Of course.  What do you recommend?
> 
> We may have another alternative.
> 
> - Always use a shared net lock but also aquire kernel lock.

Using an exclusive NET_LOCK() seems better; that's what we have now.

My point is just that if we're using an exclusive NET_LOCK() we lose the
gain of having multiple softnet threads, so reducing the number of
threads would be better.  That said, if it isn't trivial to do so, I'd
rather spend the time making IPsec work with parallel threads.

Another reason for not using an exclusive lock in the softnet thread is
to be able to execute read ioctls at the same time as the forwarding
path.  This has been reverted due to a bug elsewhere last year and never
got activated again.  But somebody might want to revisit this, 'cause
doing ifconfig(8) on a busy machine was hanging for a very long time.



Re: forwarding in parallel ipsec workaround

2021-07-21 Thread Martin Pieuchot
On 20/07/21(Tue) 14:26, Alexander Bluhm wrote:
> On Tue, Jul 20, 2021 at 10:08:09AM +0200, Martin Pieuchot wrote:
> > On 19/07/21(Mon) 17:53, Alexander Bluhm wrote:
> > > Hi,
> > > 
> > > I found why the IPsec workaround did not work.
> > > 
> > > At init time we set ifiq->ifiq_softnet = net_tq(ifp->if_index +
> > > idx), but the workaround modifies net_tq() at runtime.  Modifying
> > > net_tq() at runtime is bad anyway as task_add() and task_del() could
> > > be called with different task queues.
> > > 
> > > So better use exclusive lock if IPsec is in use.  For me this is
> > > running stable.
> > 
> > Note that having multiple threads competing for an exclusive rwlock will
> > generate unnecessary wakeup/sleep cycles every time the lock is released.  
> > It is valuable to keep this in mind as it might add extra latency when
> > processing packets.
> 
> Of course.  What do you recommend?

What you find easier to make progress with :)

> - Develop outside of the tree until all problems are fixed.
> - Delay work on parallel forwarding until IPsec is MP safe.
> - Accept a possible slowdown of IPsec.  In my measurements it gets
>   faster even with the exclusive lock.
> - Concentrate on making IPsec faster.  By removing the crypto
>   queues you gain much more performance than the exclusive lock may
>   cost.  Did you see the massive kernel locks in my graph?
>   
> http://bluhm.genua.de/perform/results/latest/patch-sys-ip-multiqueue.1/btrace/ssh_perform%40lt13_iperf3_-c10.4.56.36_-P10_-t10_-R-btrace-kstack.0.svg

I didn't see this, nice to know.

> - Make ARP MP safe.  Currently we need the kernel lock there or
>   it crashes.  This creates latency for all kind of packets.
> - Convert the rwlock in pf to mutex.  I think your argument counts
>   much more there.  But I cannot prove it.

My argument about what?  Didn't we all agree about converting the rwlock
to a mutex for now?

> My plan is to commit what we have and improve where most pain is.
> This makes incremental steps easier.

Makes sense.



Re: forwarding in parallel ipsec workaround

2021-07-20 Thread Martin Pieuchot
On 19/07/21(Mon) 17:53, Alexander Bluhm wrote:
> Hi,
> 
> I found why the IPsec workaround did not work.
> 
> At init time we set ifiq->ifiq_softnet = net_tq(ifp->if_index +
> idx), but the workaround modifies net_tq() at runtime.  Modifying
> net_tq() at runtime is bad anyway as task_add() and task_del() could
> be called with different task queues.
> 
> So better use exclusive lock if IPsec is in use.  For me this is
> running stable.

Note that having multiple threads competing for an exclusive rwlock will
generate unnecessary wakeup/sleep cycles every time the lock is released.  
It is valuable to keep this in mind as it might add extra latency when
processing packets.

> Index: net/if.c
> ===
> RCS file: /data/mirror/openbsd/cvs/src/sys/net/if.c,v
> retrieving revision 1.642
> diff -u -p -r1.642 if.c
> --- net/if.c  30 Jun 2021 13:23:33 -  1.642
> +++ net/if.c  19 Jul 2021 14:51:31 -
> @@ -109,6 +109,10 @@
>  #include 
>  #endif
>  
> +#ifdef IPSEC
> +#include 
> +#endif
> +
>  #ifdef MPLS
>  #include 
>  #endif
> @@ -238,7 +242,7 @@ int   ifq_congestion;
>  
>  int   netisr;
>  
> -#define  NET_TASKQ   1
> +#define  NET_TASKQ   4
>  struct taskq *nettqmp[NET_TASKQ];
>  
>  struct task if_input_task_locked = TASK_INITIALIZER(if_netisr, NULL);
> @@ -815,6 +819,7 @@ void
>  if_input_process(struct ifnet *ifp, struct mbuf_list *ml)
>  {
>   struct mbuf *m;
> + int exclusive_lock = 0;
>  
>   if (ml_empty(ml))
>   return;
> @@ -834,10 +839,25 @@ if_input_process(struct ifnet *ifp, stru
>* to PF globals, pipex globals, unicast and multicast addresses
>* lists and the socket layer.
>*/
> - NET_LOCK();
> +
> + /*
> +  * XXXSMP IPsec data structures are not ready to be
> +  * accessed by multiple Network threads in parallel.
> +  */
> + if (ipsec_in_use)
> + exclusive_lock = 1;
> + if (exclusive_lock)
> + NET_LOCK();
> + else
> + NET_RLOCK_IN_SOFTNET();
> +
>   while ((m = ml_dequeue(ml)) != NULL)
>   (*ifp->if_input)(ifp, m);
> - NET_UNLOCK();
> +
> + if (exclusive_lock)
> + NET_UNLOCK();
> + else
> + NET_RUNLOCK_IN_SOFTNET();
>  }
>  
>  void
> @@ -895,6 +915,12 @@ if_netisr(void *unused)
>   KERNEL_UNLOCK();
>   }
>  #endif
> + if (n & (1 << NETISR_IP))
> + ipintr();
> +#ifdef INET6
> + if (n & (1 << NETISR_IPV6))
> + ip6intr();
> +#endif
>  #if NPPP > 0
>   if (n & (1 << NETISR_PPP)) {
>   KERNEL_LOCK();
> @@ -3311,17 +3337,14 @@ unhandled_af(int af)
>   panic("unhandled af %d", af);
>  }
>  
> -/*
> - * XXXSMP This tunable is here to work around the fact that IPsec
> - * globals aren't ready to be accessed by multiple threads in
> - * parallel.
> - */
> -int   nettaskqs = NET_TASKQ;
> -
>  struct taskq *
>  net_tq(unsigned int ifindex)
>  {
>   struct taskq *t = NULL;
> + static int nettaskqs;
> +
> + if (nettaskqs == 0)
> + nettaskqs = min(NET_TASKQ, ncpus);
>  
>   t = nettqmp[ifindex % nettaskqs];
>  
> Index: net/if_ethersubr.c
> ===
> RCS file: /data/mirror/openbsd/cvs/src/sys/net/if_ethersubr.c,v
> retrieving revision 1.275
> diff -u -p -r1.275 if_ethersubr.c
> --- net/if_ethersubr.c7 Jul 2021 20:19:01 -   1.275
> +++ net/if_ethersubr.c19 Jul 2021 14:32:48 -
> @@ -222,7 +222,10 @@ ether_resolve(struct ifnet *ifp, struct 
>  
>   switch (af) {
>   case AF_INET:
> + KERNEL_LOCK();
> + /* XXXSMP there is a MP race in arpresolve() */
>   error = arpresolve(ifp, rt, m, dst, eh->ether_dhost);
> + KERNEL_UNLOCK();
>   if (error)
>   return (error);
>   eh->ether_type = htons(ETHERTYPE_IP);
> @@ -245,7 +248,10 @@ ether_resolve(struct ifnet *ifp, struct 
>   break;
>  #ifdef INET6
>   case AF_INET6:
> + KERNEL_LOCK();
> + /* XXXSMP there is a MP race in nd6_resolve() */
>   error = nd6_resolve(ifp, rt, m, dst, eh->ether_dhost);
> + KERNEL_UNLOCK();
>   if (error)
>   return (error);
>   eh->ether_type = htons(ETHERTYPE_IPV6);
> Index: net/ifq.c
> ===
> RCS file: /data/mirror/openbsd/cvs/src/sys/net/ifq.c,v
> retrieving revision 1.44
> diff -u -p -r1.44 ifq.c
> --- net/ifq.c 9 Jul 2021 01:22:05 -   1.44
> +++ net/ifq.c 19 Jul 2021 14:32:48 -
> @@ -243,7 +243,7 @@ void
>  ifq_init(struct ifqueue *ifq, struct ifnet *ifp, unsigned int idx)
>  {
>   ifq->ifq_if = ifp;
> - ifq->ifq_softnet = 

Re: Do not spin on the NET_LOCK() in kqueue

2021-07-10 Thread Martin Pieuchot
On 10/07/21(Sat) 21:53, Vitaliy Makkoveev wrote:
> Hi,
> 
> In filt_solisten_common() you touches `so_qlen’ only. It’s not
> related to buffer and not protected by introduced `sb_mtx’ so
> the solock() replacement in filt_solisten*() is wrong.
> 
> However, in filt_solisten_common() you only checks is
> `so_qlen’ != 0 condition and such check could be performed lockless.
> I propose you to commit this by separate diff.

The dance is there because the knote list needs a lock, and that lock
could be this mutex.

> > @@ -208,8 +208,10 @@ uipc_usrreq(struct socket *so, int req, struct mbuf 
> > *m, struct mbuf *nam,
> >  * Adjust backpressure on sender
> >  * and wakeup any waiting to write.
> >  */
> > +   mtx_enter(&so2->so_snd.sb_mtx);
> > so2->so_snd.sb_mbcnt = so->so_rcv.sb_mbcnt;
> > so2->so_snd.sb_cc = so->so_rcv.sb_cc;
> > +   mtx_leave(&so2->so_snd.sb_mtx);
> > sowwakeup(so2);
> > break;
> > 
> 
> This is 'PRU_RCVD’ case, so you hold solock() on `so’ and it’s receive
> buffer is locked by sblock(). Is it assumed `sb_mbcnt’ and `sb_cc’
> modification of `so_rcv’ protected by solock()? Should the both buffers
> be locked here? I’m asking because you only remove solock() from kqueue(9)
> path and the solock() still serialises the rest of sockets, but you are
> going to reduce solock().
> 
> The same question for 'PRU_SEND’ case.

In this case the fields of "so2" are modified.  Modifications are
always serialized by the solock.  The mutex is there to prevent another
thread running the kqueue filters from reading between the updates of
`sb_mbcnt' and `sb_cc'.
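
A kqueue filter that wants a coherent view of both values then takes
the same mutex, e.g. (sketch):

    mtx_enter(&so2->so_snd.sb_mtx);
    cc = so2->so_snd.sb_cc;        /* both fields come from the     */
    mbcnt = so2->so_snd.sb_mbcnt;  /* same update, never half of it */
    mtx_leave(&so2->so_snd.sb_mtx);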

> > @@ -284,8 +286,10 @@ uipc_usrreq(struct socket *so, int req, struct mbuf 
> > *m, struct mbuf *nam,
> > sbappendrecord(so2, &so2->so_rcv, m);
> > else
> > sbappend(so2, &so2->so_rcv, m);
> > +   mtx_enter(&so2->so_snd.sb_mtx);
> > so->so_snd.sb_mbcnt = so2->so_rcv.sb_mbcnt;
> > so->so_snd.sb_cc = so2->so_rcv.sb_cc;
> > +   mtx_leave(&so2->so_snd.sb_mtx);
> > if (so2->so_rcv.sb_cc > 0)
> > sorwakeup(so2);
> 
> 
> Since you touch 'so2->so_rcv’ content here, you want to lock it instead
> of 'so2->so_snd’, right?

Indeed, that's a mistake, thanks!



Do not spin on the NET_LOCK() in kqueue

2021-07-10 Thread Martin Pieuchot
One of the reasons for the drop of performances in the kqueue-based
poll/select is the fact that kqueue filters are called up to 3 times
per syscall and that they all spin on the NET_LOCK() for TCP/UDP
packets.

Diff below is a RFC for improving the situation.

socket kqueue filters mainly check for the amount of available items to
read/write.  This involves comparing various socket buffer fields (sb_cc,
sb_lowat, etc).  The diff below introduces a new mutex to serialize
updates of those fields with reads in the kqueue filters.

Since these fields are always modified with the socket lock held, either
the mutex or the solock are enough to have a coherent view of them.
Note that either of these locks is necessary only if multiple fields
have to be read (like in sbspace()).
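
Concretely the difference looks like this (a sketch, using the sb_mtx
member introduced below):

    /* single register-sized field: the check itself needs no lock */
    if (so->so_qlen != 0)
        return (1);

    /* several fields combined: take the mutex for a coherent snapshot */
    mtx_enter(&so->so_rcv.sb_mtx);
    space = (long)so->so_rcv.sb_hiwat - (long)so->so_rcv.sb_cc;
    mtx_leave(&so->so_rcv.sb_mtx);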

Other per-socket fields accessed in the kqueue filters are never
combined (with &&) to determine a condition.  So assuming it is fine to
read register-sized fields w/o the socket lock we can safely remove it
there.

Could such mutex also be used to serialize klist updates?

Comments?

diff --git sys/kern/uipc_socket.c sys/kern/uipc_socket.c
index dce20208828..d1cb9f4fc3b 100644
--- sys/kern/uipc_socket.c
+++ sys/kern/uipc_socket.c
@@ -181,6 +181,8 @@ socreate(int dom, struct socket **aso, int type, int proto)
so->so_egid = p->p_ucred->cr_gid;
so->so_cpid = p->p_p->ps_pid;
so->so_proto = prp;
+   mtx_init(&so->so_snd.sb_mtx, IPL_MPFLOOR);
+   mtx_init(&so->so_rcv.sb_mtx, IPL_MPFLOOR);
so->so_snd.sb_timeo_nsecs = INFSLP;
so->so_rcv.sb_timeo_nsecs = INFSLP;
 
@@ -276,7 +278,9 @@ sofree(struct socket *so, int s)
}
}
 #endif /* SOCKET_SPLICE */
+   mtx_enter(&so->so_snd.sb_mtx);
sbrelease(so, &so->so_snd);
+   mtx_leave(&so->so_snd.sb_mtx);
sorflush(so);
sounlock(so, s);
 #ifdef SOCKET_SPLICE
@@ -1019,8 +1023,10 @@ dontblock:
*mp = m_copym(m, 0, len, M_WAIT);
m->m_data += len;
m->m_len -= len;
+   mtx_enter(&so->so_rcv.sb_mtx);
so->so_rcv.sb_cc -= len;
so->so_rcv.sb_datacc -= len;
+   mtx_leave(&so->so_rcv.sb_mtx);
}
}
if (so->so_oobmark) {
@@ -1537,8 +1543,10 @@ somove(struct socket *so, int wait)
}
so->so_rcv.sb_mb->m_data += size;
so->so_rcv.sb_mb->m_len -= size;
+   mtx_enter(&so->so_rcv.sb_mtx);
so->so_rcv.sb_cc -= size;
so->so_rcv.sb_datacc -= size;
+   mtx_leave(&so->so_rcv.sb_mtx);
} else {
*mp = so->so_rcv.sb_mb;
sbfree(&so->so_rcv, *mp);
@@ -1777,30 +1785,40 @@ sosetopt(struct socket *so, int level, int optname, 
struct mbuf *m)
case SO_SNDBUF:
if (so->so_state & SS_CANTSENDMORE)
return (EINVAL);
+   mtx_enter(&so->so_snd.sb_mtx);
if (sbcheckreserve(cnt, so->so_snd.sb_wat) ||
sbreserve(so, &so->so_snd, cnt))
-   return (ENOBUFS);
-   so->so_snd.sb_wat = cnt;
+   error = ENOBUFS;
+   if (error == 0)
+   so->so_snd.sb_wat = cnt;
+   mtx_leave(&so->so_snd.sb_mtx);
break;
 
case SO_RCVBUF:
if (so->so_state & SS_CANTRCVMORE)
return (EINVAL);
+   mtx_enter(&so->so_rcv.sb_mtx);
if (sbcheckreserve(cnt, so->so_rcv.sb_wat) ||
sbreserve(so, &so->so_rcv, cnt))
-   return (ENOBUFS);
-   so->so_rcv.sb_wat = cnt;
+   error = ENOBUFS;
+   if (error == 0)
+   so->so_rcv.sb_wat = cnt;
+   mtx_leave(&so->so_rcv.sb_mtx);
break;
 
case SO_SNDLOWAT:
+   mtx_enter(&so->so_snd.sb_mtx);
so->so_snd.sb_lowat =
(cnt > so->so_snd.sb_hiwat) ?
so->so_snd.sb_hiwat : cnt;
+   mtx_leave(&so->so_snd.sb_mtx);
break;
case SO_RCVLOWAT:
+   mtx_enter(&so->so_rcv.sb_mtx);

Re: pthread_cond, futex(2) & ECANCELED

2021-07-10 Thread Martin Pieuchot
On 19/01/20(Sun) 14:44, Martin Pieuchot wrote:
> On 18/01/20(Sat) 14:16, Martin Pieuchot wrote:
> > When futex(2) got imported it didn't return ECANCELED.  This was changed
> > later with futex-based semaphores.
> > 
> > This modification introduced a behavior change in pthread_cond_*wait(3).
> > The diff below restores the previous behavior by treating ECANCELED like
> > EINTR.
> > 
> > Note that the __thrsleep(2) version also doesn't completely check for
> > ECANCELED, this diff also changes that.
> 
> Updated version below includes a check missed previously in the
> __thrsleep(2) based implementation, pointed out by visa@

I still have these M in my tree, any ok?

Index: thread/rthread_cond.c
===
RCS file: /cvs/src/lib/libc/thread/rthread_cond.c,v
retrieving revision 1.5
diff -u -p -r1.5 rthread_cond.c
--- thread/rthread_cond.c   29 Jan 2019 17:40:26 -  1.5
+++ thread/rthread_cond.c   18 Jan 2020 13:10:56 -
@@ -109,13 +109,13 @@ _rthread_cond_timedwait(pthread_cond_t c
* we should just go back to sleep without changing state
* (timeouts, etc).
*/
-   } while ((error == EINTR) &&
+   } while ((error == EINTR || error == ECANCELED) &&
   (tib->tib_canceled == 0 || (tib->tib_cantcancel & CANCEL_DISABLED)));
 
/* if timeout or canceled, make note of that */
if (error == ETIMEDOUT)
rv = ETIMEDOUT;
-   else if (error == EINTR)
+   else if (error == EINTR || error == ECANCELED)
canceled = 1;
 
pthread_mutex_lock(mutexp);
Index: thread/rthread_sync.c
===
RCS file: /cvs/src/lib/libc/thread/rthread_sync.c,v
retrieving revision 1.5
diff -u -p -r1.5 rthread_sync.c
--- thread/rthread_sync.c   24 Apr 2018 16:28:42 -  1.5
+++ thread/rthread_sync.c   19 Jan 2020 09:50:06 -
@@ -407,7 +407,7 @@ pthread_cond_timedwait(pthread_cond_t *c
/* if timeout or canceled, make note of that */
if (error == EWOULDBLOCK)
rv = ETIMEDOUT;
-   else if (error == EINTR)
+   else if (error == EINTR || error == ECANCELED)
canceled = 1;
 
/* transfer between the queues */
@@ -544,7 +544,7 @@ pthread_cond_wait(pthread_cond_t *condp,
assert(self->blocking_cond == cond);
 
/* if canceled, make note of that */
-   if (error == EINTR)
+   if (error == EINTR || error == ECANCELED)
canceled = 1;
 
/* transfer between the queues */



Re: gprof: Profiling a multi-threaded application

2021-07-10 Thread Martin Pieuchot
Hello Yuichiro, thanks for your work!

> On 2021/06/16 16:34, Yuichiro NAITO wrote:
> > When I compile a multi-threaded application with '-pg' option, I always get 
> > no
> > results in gmon.out. With bad luck, my application gets SIGSEGV while 
> > running.
> > Because the buffer that holds number of caller/callee count is the only one
> > in the process and will be broken by multi-threaded access.
> > 
> > I get the idea to solve this problem from NetBSD. NetBSD has individual 
> > buffers
> > for each thread and merges them at the end of profiling.

Note that the kernel uses a similar approach but doesn't merge the
buffers; instead it generates a file for each CPU.

> > 
> > NetBSD stores the reference to the individual buffer by 
> > pthread_setspecific(3).
> > I think it causes infinite recursive call if whole libc library (except
> > mcount.c) is compiled with -pg.
> > 
> > The compiler generates '_mcount' function call at the beginning of every
> > functions. If '_mcount' calls pthread_getspecific(3) to get the individual
> > buffer, pthread_getspecific() calls '_mcount' again and causes infinite
> > recursion.
> > 
> > NetBSD prevents from infinite recursive call by checking a global variable. 
> > But
> > this approach also prevents simultaneously call of '_mcount' on a 
> > multi-threaded
> > application. It makes a little bit wrong results of profiling.
> > 
> > So I added a pointer to the buffer in `struct pthread` that can be 
> > accessible
> > via macro call as same as pthread_self(3). This approach can prevent of
> > infinite recursive call of '_mcount'.

Not calling a libc function for this makes sense.  However I'm not
convinced that accessing `tib_thread' before _rthread_init() has been
called is safe.

I'm not sure what the cleanest place for the per-thread buffer is;
I'd suggest you ask guenther@ about this.

Maybe the initialization can be done outside of _mcount()?
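
To illustrate the idea (this is not code from the diff and
`tib_gmonparam' is a made-up member name), the per-thread lookup could
avoid any profiled function call along these lines:

    /* sketch: fetch the per-thread buffer without calling into libc */
    static struct gmonparam *
    _gmon_thread_param(void)
    {
        struct tib *tib = TIB_GET();

        if (tib->tib_gmonparam == NULL)     /* hypothetical member */
            tib->tib_gmonparam = _gmon_alloc();
        return (tib->tib_gmonparam);
    }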

> > I obtained merging function from NetBSD that is called in '_mcleanup' 
> > function.
> > Merging function needs to walk through all the individual buffers,
> > I added SLIST_ENTRY member in 'struct gmonparam' to make a list of the 
> > buffers.
> > And also added '#ifndef _KERNEL' for the SLIST_ENTRY member not to be used 
> > for
> > the kernel.
> > 
> > But I still use pthread_getspecific(3) for that can call destructor when
> > a thread is terminated. The individual buffer is moved to free list to reuse
> > for a new thread.
> 
> Here is a patch for this approach.

I have various comments:

We choose not to use C++ style comments (// comment) in the tree; could
you fix yours?

There are two copies of mcount.c; the other one lives in
sys/lib/libkern.  Could you keep them in sync?

gmon.c is only compiled in userland and doesn't need #ifndef _KERNEL

In libc there are also the _MUTEX_LOCK/UNLOCK() macros that can be used
instead of calling pthread_mutex* directly.  This might help reduce the
differences
between ST and MT paths.
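
i.e. something along these lines (sketch; the lock variable may need a
different type to match what the macros expect):

    _MUTEX_LOCK(&_gmonlock);
    SLIST_INSERT_HEAD(&_gmonfree, p, next);
    _MUTEX_UNLOCK(&_gmonlock);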

> diff --git a/lib/libc/gmon/gmon.c b/lib/libc/gmon/gmon.c
> index 1ce0a1c289e..6887f4f5987 100644
> --- a/lib/libc/gmon/gmon.c
> +++ b/lib/libc/gmon/gmon.c
> @@ -42,6 +42,16 @@
>  
>  struct gmonparam _gmonparam = { GMON_PROF_OFF };
>  
> +#ifndef _KERNEL
> +#include 
> +
> +SLIST_HEAD(, gmonparam) _gmonfree = SLIST_HEAD_INITIALIZER(_gmonfree);
> +SLIST_HEAD(, gmonparam) _gmoninuse = SLIST_HEAD_INITIALIZER(_gmoninuse);
> +pthread_mutex_t _gmonlock = PTHREAD_MUTEX_INITIALIZER;
> +pthread_key_t _gmonkey;
> +struct gmonparam _gmondummy;
> +#endif
> +
>  static int   s_scale;
>  /* see profil(2) where this is describe (incorrectly) */
>  #define  SCALE_1_TO_10x1L
> @@ -52,6 +62,13 @@ PROTO_NORMAL(moncontrol);
>  PROTO_DEPRECATED(monstartup);
>  static int hertz(void);
>  
> +#ifndef _KERNEL
> +static void _gmon_destructor(void *);
> +struct gmonparam *_gmon_alloc(void);
> +static void _gmon_merge(void);
> +static void _gmon_merge_two(struct gmonparam *, struct gmonparam *);
> +#endif
> +
>  void
>  monstartup(u_long lowpc, u_long highpc)
>  {
> @@ -114,6 +131,11 @@ monstartup(u_long lowpc, u_long highpc)
>   } else
>   s_scale = SCALE_1_TO_1;
>  
> +#ifndef _KERNEL
> + _gmondummy.state = GMON_PROF_BUSY;
> + pthread_key_create(&_gmonkey, _gmon_destructor);
> +#endif
> +
>   moncontrol(1);
>   return;
>  
> @@ -134,6 +156,194 @@ mapfailed:
>  }
>  __strong_alias(_monstartup,monstartup);
>  
> +#ifndef _KERNEL
> +static void
> +_gmon_destructor(void *arg)
> +{
> + struct gmonparam *p = arg, *q, **prev;
> +
> + if (p == &_gmondummy)
> + return;
> +
> + pthread_setspecific(_gmonkey, &_gmondummy);
> +
> + pthread_mutex_lock(&_gmonlock);
> + SLIST_REMOVE(&_gmoninuse, p, gmonparam, next);
> + SLIST_INSERT_HEAD(&_gmonfree, p, next);
> + pthread_mutex_unlock(&_gmonlock);
> +
> + pthread_setspecific(_gmonkey, NULL);
> +}
> +
> +struct gmonparam *
> +_gmon_alloc(void)
> +{
> + void *addr;
> + struct gmonparam *p;
> +
> + 

Re: Read/Write whole fusebufs

2021-07-09 Thread Martin Pieuchot
On 08/06/21(Tue) 23:32, Helg wrote:
> Hello tech@
> 
> Due to the challenges of having a large diff reviewed I've had another
> think about how I can break up the FUSE changes so that they are smaller
> and easier to review.
> 
> This is the first of these diffs.
> 
> The current design uses a fixed size fusebuf that consists of a header
> and a union of structs that are used for different VFS operations. In
> addition, there may be data of variable size associated with an
> operation. e.g. the buffer passed to write(2). see fb_setup(9).
> 
> If there is additional data to be exchanged between libfuse and the
> kernel then libfuse uses an ioctl on the device to read or write this
> variable sized data after the fusebuf has been read or written.  This is
> not how the fuse protocol works on Linux.  Instead, the fusebuf is read
> or written in a single read(2) or write(2).  This change is the first
> step in setting the OpenBSD implementation up for improved compatibility
> in the fuse kernel interface.
> 
> The fusebuf struct is shared between the kernel and libfuse but its
> layout differs slightly between the two. The kernel has knowledge of the
> size of data that it is sending or receiving (e.g. read, write, readdir,
> link, lookup) and so can malloc the exact amount of memory required.
> libfuse must read the entire fusebuf but doesn't know its size in
> advance so must have a buffer large enough to cater for the worst case
> scenario. Since libfuse now uses a fixed size fusebuf, it no longer
> needs to free the variable memory previously allocated for the data.
> 
> stsp@ has been kind enough to provide initial feedback. Is it now ready
> for an official OK?

Please do not use MIN() provided by <sys/param.h> in userland.  The
consensus is to define MINIMUM locally.  If you need <sys/param.h>
please annotate why.  You can grep for examples in src/bin...
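
For reference, the usual local definition is a one-liner:

    #define MINIMUM(a, b)    (((a) < (b)) ? (a) : (b))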

With that fixed the diff is ok mpi@.

Note that your change is an ABI break; that said, the fuse(4) device
isn't standard and I doubt anything uses it outside of the base libfuse.
So I don't think any special care is needed.  However I don't know if it
is worth mentioning it somehow so that people don't end up with a
mismatch of userland/kernel.


> Index: lib/libfuse/fuse.c
> ===
> RCS file: /cvs/src/lib/libfuse/fuse.c,v
> retrieving revision 1.51
> diff -u -p -r1.51 fuse.c
> --- lib/libfuse/fuse.c28 Jun 2019 13:32:42 -  1.51
> +++ lib/libfuse/fuse.c8 Jun 2021 14:15:29 -
> @@ -154,9 +154,9 @@ fuse_loop(struct fuse *fuse)
>  {
>   struct fusebuf fbuf;
>   struct fuse_context ctx;
> - struct fb_ioctl_xch ioexch;
>   struct kevent event[5];
>   struct kevent ev;
> + ssize_t fbuf_size;
>   ssize_t n;
>   int ret;
>  
> @@ -201,29 +201,15 @@ fuse_loop(struct fuse *fuse)
>   strsignal(signum));
>   }
>   } else if (ret > 0) {
> - n = read(fuse->fc->fd, , sizeof(fbuf));
> - if (n != sizeof(fbuf)) {
> + n = read(fuse->fc->fd, , FUSEBUFSIZE);
> + fbuf_size = sizeof(fbuf.fb_hdr) + sizeof(fbuf.FD) +
> + fbuf.fb_len;
> + if (n != fbuf_size) {
>   fprintf(stderr, "%s: bad fusebuf read\n",
>   __func__);
>   return (-1);
>   }
>  
> - /* check if there is data something present */
> - if (fbuf.fb_len) {
> - fbuf.fb_dat = malloc(fbuf.fb_len);
> - if (fbuf.fb_dat == NULL)
> - return (-1);
> - ioexch.fbxch_uuid = fbuf.fb_uuid;
> - ioexch.fbxch_len = fbuf.fb_len;
> - ioexch.fbxch_data = fbuf.fb_dat;
> -
> - if (ioctl(fuse->fc->fd, FIOCGETFBDAT,
> - ) == -1) {
> - free(fbuf.fb_dat);
> - return (-1);
> - }
> - }
> -
>   ctx.fuse = fuse;
>   ctx.uid = fbuf.fb_uid;
>   ctx.gid = fbuf.fb_gid;
> @@ -238,26 +224,13 @@ fuse_loop(struct fuse *fuse)
>   return (-1);
>   }
>  
> - n = write(fuse->fc->fd, , sizeof(fbuf));
> - if (fbuf.fb_len) {
> - if (fbuf.fb_dat == NULL) {
> - fprintf(stderr, "%s: fb_dat is Null\n",
> - __func__);
> - return (-1);
> - }
> - 

Re: more MAKEDEV cleanup

2021-07-09 Thread Martin Pieuchot
On 05/04/21(Mon) 09:25, Miod Vallat wrote:
> The following diff attempts to clean up a few loose ends in the current
> MAKEDEV files:
> 
> - remove no-longer applicable device definitions (MSCP and SMD disks,
>   this kind of thing).
> - makes sure all platforms use the same `ramdisk' target for
>   installation media devices, rather than a mix of `ramd' and `ramdisk'.
> - moves as many `ramdisk' devices to MI land (bio, diskmap, random,
>   etc).
> - reduces the number of block devices in `ramdisk' targets to only one
>   per device, since the installer script will invoke MAKEDEV by itself
>   for the devices it needs to use.
> - sort device names in `all' and `ramdisk' MI lists to make maintainence
>   easier. This causes some ordering change in the `all' target in the
>   generated MAKEDEVs.

Looks good to me.

> Index: MAKEDEV.common
> ===
> RCS file: /OpenBSD/src/etc/MAKEDEV.common,v
> retrieving revision 1.113
> diff -u -p -r1.113 MAKEDEV.common
> --- MAKEDEV.common12 Feb 2021 10:26:33 -  1.113
> +++ MAKEDEV.common5 Apr 2021 09:18:49 -
> @@ -114,7 +114,7 @@ dnl make a 'disktgt' macro that automati
>  dnl disktgt(rd, {-rd-})
>  dnl
>  dnl  target(all,rd,0)
> -dnl  target(ramd,rd,0)
> +dnl  target(ramdisk,rd,0)
>  dnl  disk_q(rd)
>  dnl  __devitem(rd, {-rd*-}, {-rd-})dnl
>  dnl
> @@ -122,62 +122,60 @@ dnl  Note: not all devices are generated
>  dnlits own extra list.
>  dnl
>  divert(1)dnl
> +target(all, acpi)dnl
> +target(all, apm)dnl
> +target(all, bio)dnl
> +target(all, bpf)dnl
> +twrget(all, com, tty0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, a, b)dnl
> +twrget(all, czs, cua, a, b, c, d)dnl
> +target(all, diskmap)dnl
> +target(all, dt)dnl
>  twrget(all, fdesc, fd)dnl
> -target(all, st, 0, 1)dnl
> -target(all, std)dnl
> -target(all, ra, 0, 1, 2, 3)dnl
> -target(all, rx, 0, 1)dnl
> -target(all, wd, 0, 1, 2, 3)dnl
> -target(all, xd, 0, 1, 2, 3)dnl
> +target(all, fuse)dnl
> +target(all, hotplug)dnl
> +target(all, joy, 0, 1)dnl
> +target(all, kcov)dnl
> +target(all, kstat)dnl
> +target(all, local)dnl
> +target(all, lpt, 0, 1, 2)dnl
> +twrget(all, lpt, lpa, 0, 1, 2)dnl
> +target(all, par, 0)dnl
> +target(all, pci, 0, 1, 2, 3)dnl
>  target(all, pctr)dnl
>  target(all, pctr0)dnl
>  target(all, pf)dnl
> -target(all, apm)dnl
> -target(all, acpi)dnl
> +target(all, pppac)dnl
> +target(all, pppx)dnl
> +target(all, ptm)dnl
> +target(all, pty, 0)dnl
> +target(all, pvbus, 0, 1)dnl
> +target(all, radio, 0)dnl
> +target(all, rmidi, 0, 1, 2, 3, 4, 5, 6, 7)dnl
> +twrget(all, rnd, random)dnl
> +twrget(all, speak, speaker)dnl
> +target(all, st, 0, 1)dnl
> +target(all, std)dnl
> +target(all, switch, 0, 1, 2, 3)dnl
> +target(all, tap, 0, 1, 2, 3)dnl
>  twrget(all, tth, ttyh, 0, 1)dnl
>  target(all, ttyA, 0, 1)dnl
> -twrget(all, mac_tty0, tty0, 0, 1)dnl
> -twrget(all, tzs, tty, a, b, c, d)dnl
> -twrget(all, czs, cua, a, b, c, d)dnl
>  target(all, ttyc, 0, 1, 2, 3, 4, 5, 6, 7)dnl
> -twrget(all, com, tty0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, a, b)dnl
> -twrget(all, mmcl, mmclock)dnl
> -target(all, lpt, 0, 1, 2)dnl
> -twrget(all, lpt, lpa, 0, 1, 2)dnl
> -target(all, joy, 0, 1)dnl
> -twrget(all, rnd, random)dnl
> -target(all, uk, 0)dnl
> -twrget(all, vi, video, 0, 1)dnl
> -twrget(all, speak, speaker)dnl
> -target(all, asc, 0)dnl
> -target(all, radio, 0)dnl
> +target(all, tun, 0, 1, 2, 3)dnl
>  target(all, tuner, 0)dnl
> -target(all, rmidi, 0, 1, 2, 3, 4, 5, 6, 7)dnl
> +twrget(all, tzs, tty, a, b, c, d)dnl
>  target(all, uall)dnl
> -target(all, pci, 0, 1, 2, 3)dnl
> -twrget(all, wsmouse, wscons)dnl
> -target(all, par, 0)dnl
> -target(all, apci, 0)dnl
> -target(all, local)dnl
> -target(all, ptm)dnl
> -target(all, hotplug)dnl
> -target(all, pppx)dnl
> -target(all, pppac)dnl
> -target(all, fuse)dnl
> +target(all, uk, 0)dnl
> +twrget(all, vi, video, 0, 1)dnl
>  target(all, vmm)dnl
> -target(all, pvbus, 0, 1)dnl
> -target(all, bpf)dnl
> -target(all, kcov)dnl
> -target(all, dt)dnl
> -target(all, kstat)dnl
> +target(all, vnd, 0, 1, 2, 3)dnl
> +target(all, vscsi, 0)dnl
> +target(all, wd, 0, 1, 2, 3)dnl
> +twrget(all, wsmouse, wscons)dnl
>  dnl
>  _mkdev(all, {-all-}, {-dnl
>  show_target(all)dnl
>  -})dnl
>  dnl
> -dnl XXX some arches use ramd, others ramdisk - needs to be fixed eventually
> -__devitem(ramdisk, ramdisk, Ramdisk kernel devices,nothing)dnl
> -dnl
>  target(usb, usb, 0, 1, 2, 3, 4, 5, 6, 7)dnl
>  target(usb, uhid, 0, 1, 2, 3, 4, 5, 6, 7)dnl
>  twrget(usb, fido, fido)dnl
> @@ -208,26 +206,26 @@ __devitem(ch, {-ch*-}, SCSI media change
>  _mcdev(ch, ch*, ch, {-major_ch_c-}, 660, operator)dnl
>  __devitem(uk, uk*, Unknown SCSI devices)dnl
>  _mcdev(uk, uk*, uk, {-major_uk_c-}, 640, operator)dnl
> -dnl XXX see ramdisk above
> -__devitem(ramd, ramdisk, Ramdisk kernel devices,nothing)dnl
>  dnl
> -_mkdev(ramd, ramdisk, {-dnl
> -show_target(ramd)dnl
> +__devitem(ramdisk, ramdisk, Ramdisk kernel devices,nothing)dnl
> +_mkdev(ramdisk, ramdisk, {-dnl
> 

Re: netlock ktrace nfs

2021-07-04 Thread Martin Pieuchot
On 02/07/21(Fri) 15:01, Alexander Bluhm wrote:
> On Fri, Jul 02, 2021 at 01:05:39PM +0200, Martin Pieuchot wrote:
> > Looks good to me.  Grabbing solock() after calling pledge_socket() in
> > sys_connect(), like it is already done in sys_bind(), means it is ok
> > to read this field w/o lock.  Is it true?
> 
> I guess it is good enough.  If MP rules are followed stictly, every
> access without lock or memory barrier is problematic.  But here the
> SS_DNS flag is set during socket creation.  Also racing against
> pledge does not look like something we must be aware of.

Does good enough mean it is OK to read so_state w/o a serialization
mechanism in the poll & kqueue handlers?  If so, that would simplify a
lot of the work to reduce the contention on the NET_LOCK() in these code
paths.



Re: systat(1) counter overflow

2021-07-02 Thread Martin Pieuchot
On 01/07/21(Thu) 13:53, Anindya Mukherjee wrote:
> Hi,
> 
> I noticed that if I leave the system running for more than about a month, some
> of the counters in the uvm view of systat(1) overflow and become negative. 
> This
> is because the members of struct uvmexp in sys/uvm/uvmexp.h are ints. The
> kernel's internal counters are of course uint64_t so they don't overflow. It
> only happens during the uvm_sysctl(9) call which casts the numbers to 
> integers.
> The function is uvmexp_read.
> 
> In the attached diff I took the path of least resistance and promoted some of
> the counters to unsigned int. Ideally I would have liked to use int64_t or 
> even
> uint64_t, but I hit an issue in some of the architecture dependent code. An
> example is:
> /usr/src/sys/arch/alpha/alpha/trap.c:536 atomic_add_int(, 1);
> In other places the ++ operator is used to increment the counters and the 64 
> bit
> types can be used.
> 
> I am not completely sure this is the best way to proceed, but even if this 
> diff
> is horrifying, I'd appreciate some feedback and advice, thanks!

I wonder if we shouldn't use uint64_t for those and embrace the ABI
break; that would at least simplify the kernel side and remove a
truncation.
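
With 64-bit fields the read side would become a plain copy, e.g.
(sketch of what uvmexp_read() could then look like):

    /* no more truncating casts */
    uexp->faults = counters[faults];
    uexp->pageins = counters[pageins];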

Do you have an idea of the different consumers of the "struct uvmexp"
apart from systat(1)?  What is the impact in userland & ports of such
change?

> Index: sys/uvm/uvm_meter.c
> ===
> RCS file: /cvs/src/sys/uvm/uvm_meter.c,v
> retrieving revision 1.42
> diff -u -p -r1.42 uvm_meter.c
> --- sys/uvm/uvm_meter.c   28 Dec 2020 14:01:23 -  1.42
> +++ sys/uvm/uvm_meter.c   28 Jun 2021 05:24:56 -
> @@ -329,7 +329,7 @@ uvmexp_read(struct uvmexp *uexp)
>   counters_read(uvmexp_counters, counters, exp_ncounters);
>  
>   /* stat counters */
> - uexp->faults = (int)counters[faults];
> + uexp->faults = (unsigned int)counters[faults];
>   uexp->pageins = (int)counters[pageins];
>  
>   /* fault subcounters */
> @@ -379,10 +379,10 @@ uvmexp_print(int (*pr)(const char *, ...
>   (*pr)("  freemin=%d, free-target=%d, inactive-target=%d, "
>   "wired-max=%d\n", uexp.freemin, uexp.freetarg, uexp.inactarg,
>   uexp.wiredmax);
> - (*pr)("  faults=%d, traps=%d, intrs=%d, ctxswitch=%d fpuswitch=%d\n",
> + (*pr)("  faults=%u, traps=%u, intrs=%u, ctxswitch=%u fpuswitch=%d\n",
>   uexp.faults, uexp.traps, uexp.intrs, uexp.swtch,
>   uexp.fpswtch);
> - (*pr)("  softint=%d, syscalls=%d, kmapent=%d\n",
> + (*pr)("  softint=%u, syscalls=%u, kmapent=%d\n",
>   uexp.softs, uexp.syscalls, uexp.kmapent);
>  
>   (*pr)("  fault counts:\n");
> Index: sys/uvm/uvmexp.h
> ===
> RCS file: /cvs/src/sys/uvm/uvmexp.h,v
> retrieving revision 1.9
> diff -u -p -r1.9 uvmexp.h
> --- sys/uvm/uvmexp.h  4 Mar 2021 09:00:03 -   1.9
> +++ sys/uvm/uvmexp.h  28 Jun 2021 05:24:56 -
> @@ -90,12 +90,12 @@ struct uvmexp {
>   int unused06;   /* formerly nfreeanon */
>  
>   /* stat counters */
> - int faults; /* page fault count */
> - int traps;  /* trap count */
> - int intrs;  /* interrupt count */
> - int swtch;  /* context switch count */
> - int softs;  /* software interrupt count */
> - int syscalls;   /* system calls */
> + unsigned int faults;/* page fault count */
> + unsigned int traps; /* trap count */
> + unsigned int intrs; /* interrupt count */
> + unsigned int swtch; /* context switch count */
> + unsigned int softs; /* software interrupt count */
> + unsigned int syscalls;  /* system calls */
>   int pageins;/* pagein operation count */
>   /* pageouts are in pdpageouts below */
>   int unused07;   /* formerly obsolete_swapins */
> Index: usr.bin/systat/uvm.c
> ===
> RCS file: /cvs/src/usr.bin/systat/uvm.c,v
> retrieving revision 1.5
> diff -u -p -r1.5 uvm.c
> --- usr.bin/systat/uvm.c  28 Jun 2019 13:35:04 -  1.5
> +++ usr.bin/systat/uvm.c  28 Jun 2021 05:24:57 -
> @@ -37,22 +37,23 @@ void print_uvm(void);
>  int  read_uvm(void);
>  int  select_uvm(void);
>  
> -void print_uvmexp_field(field_def *, field_def *, int *, int *, const char 
> *);
> +void print_uvmexp_field(field_def *, field_def *, unsigned int *,
> +unsigned int *, const char *);
>  void print_uvmexp_line(int);
>  
>  struct uvmexp uvmexp;
>  struct uvmexp last_uvmexp;
>  
>  struct uvmline {
> - int *v1;
> - int *ov1;
> - char*n1;
> - int *v2;
> - int *ov2;
> - char*n2;
> - int *v3;
> - int *ov3;
> - char*n3;
> + unsigned int

Re: netlock ktrace nfs

2021-07-02 Thread Martin Pieuchot
On 01/07/21(Thu) 21:27, Alexander Bluhm wrote:
> Hi,
> 
> Writing ktrace files to NFS must no be done while holding the net
> lock.  accept(2) panics, connect(2) dead locks.  Additionally copy
> in or out must not hold the net lock as it may be a mmapped file
> on NFS.
> 
> - Simplify dns_portcheck(), it does not modify namelen anymore.
> - In doaccept() release the socket lock before calling copyaddrout().
> - Rearrange the checks in sys_connect() like they are in sys_bind().

Looks good to me.  Grabbing solock() after calling pledge_socket() in
sys_connect(), like it is already done in sys_bind(), means it is ok
to read this field w/o lock.  Is it true?

ok mpi@ for this diff because it makes things coherent and fixes a bug;
then we can figure out and document the truth about `so_state'.

> Index: kern/uipc_syscalls.c
> ===
> RCS file: /data/mirror/openbsd/cvs/src/sys/kern/uipc_syscalls.c,v
> retrieving revision 1.192
> diff -u -p -r1.192 uipc_syscalls.c
> --- kern/uipc_syscalls.c  2 Jun 2021 11:30:23 -   1.192
> +++ kern/uipc_syscalls.c  1 Jul 2021 18:34:21 -
> @@ -124,20 +124,20 @@ isdnssocket(struct socket *so)
>  
>  /* For SS_DNS sockets, only allow port DNS (port 53) */
>  static int
> -dns_portcheck(struct proc *p, struct socket *so, void *nam, u_int *namelen)
> +dns_portcheck(struct proc *p, struct socket *so, void *nam, size_t namelen)
>  {
>   int error = EINVAL;
>  
>   switch (so->so_proto->pr_domain->dom_family) {
>   case AF_INET:
> - if (*namelen < sizeof(struct sockaddr_in))
> + if (namelen < sizeof(struct sockaddr_in))
>   break;
>   if (((struct sockaddr_in *)nam)->sin_port == htons(53))
>   error = 0;
>   break;
>  #ifdef INET6
>   case AF_INET6:
> - if (*namelen < sizeof(struct sockaddr_in6))
> + if (namelen < sizeof(struct sockaddr_in6))
>   break;
>   if (((struct sockaddr_in6 *)nam)->sin6_port == htons(53))
>   error = 0;
> @@ -315,18 +315,17 @@ doaccept(struct proc *p, int sock, struc
>   fp->f_ops = &socketops;
>   fp->f_data = so;
>   error = soaccept(so, nam);
> +out:
> + sounlock(head, s);
>   if (!error && name != NULL)
>   error = copyaddrout(p, nam, name, namelen, anamelen);
> -out:
>   if (!error) {
> - sounlock(head, s);
>   fdplock(fdp);
>   fdinsert(fdp, tmpfd, cloexec, fp);
>   fdpunlock(fdp);
>   FRELE(fp, p);
>   *retval = tmpfd;
>   } else {
> - sounlock(head, s);
>   fdplock(fdp);
>   fdremove(fdp, tmpfd);
>   fdpunlock(fdp);
> @@ -348,44 +347,40 @@ sys_connect(struct proc *p, void *v, reg
>   } */ *uap = v;
>   struct file *fp;
>   struct socket *so;
> - struct mbuf *nam = NULL;
> + struct mbuf *nam;
>   int error, s, interrupted = 0;
>  
>   if ((error = getsock(p, SCARG(uap, s), &fp)) != 0)
>   return (error);
>   so = fp->f_data;
> - s = solock(so);
> - if (so->so_state & SS_ISCONNECTING) {
> - error = EALREADY;
> + error = pledge_socket(p, so->so_proto->pr_domain->dom_family,
> + so->so_state);
> + if (error)
>   goto out;
> - }
>   error = sockargs(&nam, SCARG(uap, name), SCARG(uap, namelen),
>   MT_SONAME);
>   if (error)
>   goto out;
> - error = pledge_socket(p, so->so_proto->pr_domain->dom_family,
> - so->so_state);
> - if (error)
> - goto out;
>  #ifdef KTRACE
>   if (KTRPOINT(p, KTR_STRUCT))
>   ktrsockaddr(p, mtod(nam, caddr_t), SCARG(uap, namelen));
>  #endif
> -
> + s = solock(so);
>   if (isdnssocket(so)) {
> - u_int namelen = nam->m_len;
> - error = dns_portcheck(p, so, mtod(nam, void *), &namelen);
> + error = dns_portcheck(p, so, mtod(nam, void *), nam->m_len);
>   if (error)
> - goto out;
> - nam->m_len = namelen;
> + goto unlock;
> + }
> + if (so->so_state & SS_ISCONNECTING) {
> + error = EALREADY;
> + goto unlock;
>   }
> -
>   error = soconnect(so, nam);
>   if (error)
>   goto bad;
>   if ((fp->f_flag & FNONBLOCK) && (so->so_state & SS_ISCONNECTING)) {
>   error = EINPROGRESS;
> - goto out;
> + goto unlock;
>   }
>   while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
>   error = sosleep_nsec(so, &so->so_timeo, PSOCK | PCATCH,
> @@ -403,10 +398,11 @@ sys_connect(struct proc *p, void *v, reg
>  bad:
>   if (!interrupted)
>   so->so_state &= ~SS_ISCONNECTING;
> -out:
> +unlock:
>   sounlock(so, s);
> - FRELE(fp, p);
>   m_freem(nam);

Re: crypto kernel lock

2021-06-30 Thread Martin Pieuchot
On 21/06/21(Mon) 23:45, Alexander Bluhm wrote:
> On Thu, Jun 17, 2021 at 03:19:11PM +0200, Alexander Bluhm wrote:
> > On Thu, Jun 17, 2021 at 10:09:47AM +0200, Martin Pieuchot wrote:
> > > Could you annotate which field is being protected by the KERNEL_LOCK()?
> > 
> > No.  I do not want to invest into fine grained crypto locking.  I
> > need a stable test machine.
> 
> Now my machine is stable again, I can do some annotations.
> 
> - remove unused variable cryptodesc_pool
> - document global variables in crypto.c
> - assert kernel lock where needed
> - remove dead code from crypto_get_driverid()
> - move crypto_init() prototype into header file

Diff is ok mpi@.

The annotations are not really coherent with the rest of the tree but this
can be improved in tree, especially if we start using a mutex to replace
the various splvm/splx dances and protect the cc_* fields.
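
For instance the splvm()/splx() pairs could eventually become something
like this (a sketch, `crypto_mtx' is a hypothetical name):

    struct mutex crypto_mtx = MUTEX_INITIALIZER(IPL_VM);

    mtx_enter(&crypto_mtx);
    /* ... manipulate cc_sessions & friends ... */
    mtx_leave(&crypto_mtx);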

> 
> ok?
> 
> bluhm
> 
> Index: crypto/crypto.c
> ===
> RCS file: /data/mirror/openbsd/cvs/src/sys/crypto/crypto.c,v
> retrieving revision 1.82
> diff -u -p -r1.82 crypto.c
> --- crypto/crypto.c   30 Mar 2020 17:48:39 -  1.82
> +++ crypto/crypto.c   21 Jun 2021 21:00:19 -
> @@ -27,16 +27,23 @@
>  
>  #include 
>  
> -void crypto_init(void);
> -
> -struct cryptocap *crypto_drivers = NULL;
> -int crypto_drivers_num = 0;
> -
> -struct pool cryptop_pool;
> -struct pool cryptodesc_pool;
> +/*
> + * Locks used to protect struct members in this file:
> + *   A   allocated during driver attach, no hotplug, no detach
> + *   I   initialized by main()

This doesn't tell whether a lock is needed or not.  Other data
structures use:

"I   immutable after creation"

Is that what you meant?  That the variables aren't marked as 'const'
because they are initialized once and never change after that?

> + *   K   modified with kernel lock

Can we use the same wording as other files:

"K   kernel lock"

> + */
>  
> -struct taskq *crypto_taskq;
> -struct taskq *crypto_taskq_mpsafe;
> +struct cryptocap *crypto_drivers;/* [A] array allocated by driver

Isn't it possible to have a USB driver calling crypto_get_driverid()?
If that's the case, what's protecting the structure is the KERNEL_LOCK().

If we want to simplify things and prevent hotplugging then maybe we
should assert for `cold'?
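
i.e. something like (sketch):

    /* refuse drivers registering after autoconf instead of relying
       on the kernel lock for hotplug */
    KASSERT(cold);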

> +[K] driver data and session count */

Until now we've been annotating the fields in the data structure
definitions; you're referring to `cc_sessions', `cc_newsession', etc.,
right?

> +int crypto_drivers_num = 0;  /* [A] attached drivers array size */
> +
> +struct pool cryptop_pool;/* [I] set of crypto descriptors */

Pools are protected by their own lock; I'm not sure we should start
annotating them.

> +
> +struct taskq *crypto_taskq;  /* [I] run crypto_invoke() and callback
> +with kernel lock */
> +struct taskq *crypto_taskq_mpsafe;   /* [I] run crypto_invoke()
> +without kernel lock */
>  
>  /*
>   * Create a new session.
> @@ -52,6 +59,8 @@ crypto_newsession(u_int64_t *sid, struct
>   if (crypto_drivers == NULL)
>   return EINVAL;
>  
> + KERNEL_ASSERT_LOCKED();
> +
>   s = splvm();
>  
>   /*
> @@ -186,6 +195,8 @@ crypto_freesession(u_int64_t sid)
>   if (hid >= crypto_drivers_num)
>   return ENOENT;
>  
> + KERNEL_ASSERT_LOCKED();
> +
>   s = splvm();
>  
>   if (crypto_drivers[hid].cc_sessions)
> @@ -215,6 +226,9 @@ crypto_get_driverid(u_int8_t flags)
>  {
>   struct cryptocap *newdrv;
>   int i, s;
> +
> + /* called from attach routines */
> + KERNEL_ASSERT_LOCKED();
>   
>   s = splvm();
>  
> @@ -241,39 +255,33 @@ crypto_get_driverid(u_int8_t flags)
>   }
>  
>   /* Out of entries, allocate some more. */
> - if (i == crypto_drivers_num) {
> - if (crypto_drivers_num >= CRYPTO_DRIVERS_MAX) {
> - splx(s);
> - return -1;
> - }
> -
> - newdrv = mallocarray(crypto_drivers_num,
> - 2 * sizeof(struct cryptocap), M_CRYPTO_DATA, M_NOWAIT);
> - if (newdrv == NULL) {
> - splx(s);
> - return -1;
> - }
> + if (crypto_drivers_num >= CRYPTO_DRIVERS_MAX) {
> + splx(s);
> + return -1;
> + }
>  
> - memcpy(newdrv, crypto_drivers,
> - crypto_drivers_num *

Re: uao references & uao_swap_off() cleanup

2021-06-28 Thread Martin Pieuchot
On 23/06/21(Wed) 23:03, Jonathan Matthew wrote:
> On Wed, Jun 23, 2021 at 09:37:10AM +0200, Martin Pieuchot wrote:
> > On 16/06/21(Wed) 11:26, Martin Pieuchot wrote:
> > > Diff below does two things:
> > > 
> > > - Use atomic operations for incrementing/decrementing references of
> > >   anonymous objects.  This allows us to manipulate them without holding
> > >   the KERNEL_LOCK().
> > > 
> > > - Rewrite the loop from uao_swap_off() to only keep a reference to the
> > >   next item in the list.  This is imported from NetBSD and is necessary
> > >   to introduce locking around uao_pagein().
> > > 
> > > ok?
> > 
> > Anyone?
> 
> uao_reference_locked() and uao_detach_locked() are prototyped in
> uvm_extern.h, so they should be removed here too.

Thanks, I'll do that.
 
> It doesn't look like uao_detach() is safe to call without the
> kernel lock; it calls uao_dropswap() for each page, which calls
> uao_set_swslot(), which includes a KERNEL_ASSERT_LOCKED().
> Should we keep the KERNEL_ASSERT_LOCKED() in uao_detach()?

I prefer to keep the KERNEL_ASSERT_LOCKED() where it is needed and not
spread it to all the callers.  My current plan is to trade those asserts
for assertions on the vmobjlock, so I don't want to add new ones.



sparc64: enable dt(4) in GENERIC

2021-06-23 Thread Martin Pieuchot
Similar to what has been done on x86 & arm64, ok?

Index: conf/GENERIC
===
RCS file: /cvs/src/sys/arch/sparc64/conf/GENERIC,v
retrieving revision 1.316
diff -u -p -r1.316 GENERIC
--- conf/GENERIC4 Feb 2021 16:25:39 -   1.316
+++ conf/GENERIC23 Jun 2021 07:39:53 -
@@ -556,4 +556,5 @@ owtemp* at onewire? # Temperature
 owctr* at onewire? # Counter device
 
 pseudo-device  hotplug 1   # devices hot plugging
+pseudo-device  dt
 pseudo-device  wsmux   2   # mouse & keyboard multiplexor



new kqueue-based select(2) implementation

2021-06-23 Thread Martin Pieuchot
Our previous attempt [0] to replace the current select(2) implementation
has been reverted due to non-acceptable latency increase on sockets [1].

This performance regression has been analysed and partially addressed
thanks to bluhm@ and visa@.  The cost of allocating/freeing 'knote'
descriptors has been mitigated by using a pool cache and by using lazy
removal of items.
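
The pool cache part boils down to a one-liner at pool init time
(sketch; the actual diff may differ):

    pool_init(&knote_pool, sizeof(struct knote), 0, IPL_MPFLOOR,
        PR_WAITOK, "knotepl", NULL);
    pool_cache_init(&knote_pool);   /* per-CPU caches for alloc/free */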

The next bottleneck is due to contention on the NET_LOCK() intensified
by the fact that kqueue hooks might be checked multiple times per
syscall.  We are aware of this and interested in improving the
situation; however, I believe we should get this diff in first.  Keeping
it out of the tree is starting to be painful.

So I'm asking for tests.  Darren, I'd greatly appreciate it if you could
check whether the ssh regression suite works well with this.

Thanks,
Martin

[0] https://marc.info/?l=openbsd-tech=160675386103259=2
[1] https://marc.info/?l=openbsd-bugs=161003823423816=2

Index: kern/sys_generic.c
===
RCS file: /cvs/src/sys/kern/sys_generic.c,v
retrieving revision 1.135
diff -u -p -r1.135 sys_generic.c
--- kern/sys_generic.c  8 Jan 2021 09:29:04 -   1.135
+++ kern/sys_generic.c  21 Jun 2021 07:58:07 -
@@ -55,6 +55,7 @@
 #include 
 #include 
 #include 
+#include 
 #ifdef KTRACE
 #include 
 #endif
@@ -66,8 +67,21 @@
 
 #include 
 
-int selscan(struct proc *, fd_set *, fd_set *, int, int, register_t *);
-void pollscan(struct proc *, struct pollfd *, u_int, register_t *);
+/*
+ * Debug values:
+ *  1 - print implementation errors, things that should not happen.
+ *  2 - print ppoll(2) information, somewhat verbose
+ *  3 - print pselect(2) and ppoll(2) information, very verbose
+ */
+int kqpoll_debug = 0;
+#define DPRINTFN(v, x...) if (kqpoll_debug > v) {  \
+   printf("%s(%d): ", curproc->p_p->ps_comm, curproc->p_tid);  \
+   printf(x);  \
+}
+
+int pselregister(struct proc *, fd_set *[], int, int *);
+int pselcollect(struct proc *, struct kevent *, fd_set *[], int *);
+
 int pollout(struct pollfd *, struct pollfd *, u_int);
 int dopselect(struct proc *, int, fd_set *, fd_set *, fd_set *,
 struct timespec *, const sigset_t *, register_t *);
@@ -584,11 +598,10 @@ int
 dopselect(struct proc *p, int nd, fd_set *in, fd_set *ou, fd_set *ex,
 struct timespec *timeout, const sigset_t *sigmask, register_t *retval)
 {
+   struct kqueue_scan_state scan;
fd_mask bits[6];
fd_set *pibits[3], *pobits[3];
-   struct timespec elapsed, start, stop;
-   uint64_t nsecs;
-   int s, ncoll, error = 0;
+   int error, ncollected = 0, nevents = 0;
u_int ni;
 
if (nd < 0)
@@ -618,6 +631,8 @@ dopselect(struct proc *p, int nd, fd_set
pobits[2] = (fd_set *)[5];
}
 
+   kqpoll_init();
+
 #definegetbits(name, x) \
if (name && (error = copyin(name, pibits[x], ni))) \
goto done;
@@ -636,43 +651,61 @@ dopselect(struct proc *p, int nd, fd_set
if (sigmask)
dosigsuspend(p, *sigmask &~ sigcantmask);
 
-retry:
-   ncoll = nselcoll;
-   atomic_setbits_int(&p->p_flag, P_SELECT);
-   error = selscan(p, pibits[0], pobits[0], nd, ni, retval);
-   if (error || *retval)
+   /* Register kqueue events */
+   error = pselregister(p, pibits, nd, &nevents);
+   if (error != 0)
goto done;
-   if (timeout == NULL || timespecisset(timeout)) {
-   if (timeout != NULL) {
-   getnanouptime(&start);
-   nsecs = MIN(TIMESPEC_TO_NSEC(timeout), MAXTSLP);
-   } else
-   nsecs = INFSLP;
-   s = splhigh();
-   if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
-   splx(s);
-   goto retry;
-   }
-   atomic_clearbits_int(&p->p_flag, P_SELECT);
-   error = tsleep_nsec(&selwait, PSOCK | PCATCH, "select", nsecs);
-   splx(s);
+
+   /*
+* The poll/select family of syscalls has been designed to
+* block when file descriptors are not available, even if
+* there's nothing to wait for.
+*/
+   if (nevents == 0) {
+   uint64_t nsecs = INFSLP;
+
if (timeout != NULL) {
-   getnanouptime(&stop);
-   timespecsub(&stop, &start, &elapsed);
-   timespecsub(timeout, &elapsed, timeout);
-   if (timeout->tv_sec < 0)
-   timespecclear(timeout);
+   if (!timespecisset(timeout))
+   goto done;
+   nsecs = MAX(1, MIN(TIMESPEC_TO_NSEC(timeout), MAXTSLP));
}
-   if (error == 0 || error == EWOULDBLOCK)
-   goto retry;
+   error = 

Re: uao references & uao_swap_off() cleanup

2021-06-23 Thread Martin Pieuchot
On 16/06/21(Wed) 11:26, Martin Pieuchot wrote:
> Diff below does two things:
> 
> - Use atomic operations for incrementing/decrementing references of
>   anonymous objects.  This allows us to manipulate them without holding
>   the KERNEL_LOCK().
> 
> - Rewrite the loop from uao_swap_off() to only keep a reference to the
>   next item in the list.  This is imported from NetBSD and is necessary
>   to introduce locking around uao_pagein().
> 
> ok?

Anyone?

> 
> Index: uvm/uvm_aobj.c
> ===
> RCS file: /cvs/src/sys/uvm/uvm_aobj.c,v
> retrieving revision 1.98
> diff -u -p -r1.98 uvm_aobj.c
> --- uvm/uvm_aobj.c16 Jun 2021 09:02:21 -  1.98
> +++ uvm/uvm_aobj.c16 Jun 2021 09:20:26 -
> @@ -779,19 +779,11 @@ uao_init(void)
>  void
>  uao_reference(struct uvm_object *uobj)
>  {
> - KERNEL_ASSERT_LOCKED();
> - uao_reference_locked(uobj);
> -}
> -
> -void
> -uao_reference_locked(struct uvm_object *uobj)
> -{
> -
>   /* Kernel object is persistent. */
>   if (UVM_OBJ_IS_KERN_OBJECT(uobj))
>   return;
>  
> - uobj->uo_refs++;
> + atomic_inc_int(>uo_refs);
>  }
>  
>  
> @@ -801,34 +793,19 @@ uao_reference_locked(struct uvm_object *
>  void
>  uao_detach(struct uvm_object *uobj)
>  {
> - KERNEL_ASSERT_LOCKED();
> - uao_detach_locked(uobj);
> -}
> -
> -
> -/*
> - * uao_detach_locked: drop a reference to an aobj
> - *
> - * => aobj may freed upon return.
> - */
> -void
> -uao_detach_locked(struct uvm_object *uobj)
> -{
>   struct uvm_aobj *aobj = (struct uvm_aobj *)uobj;
>   struct vm_page *pg;
>  
>   /*
>* Detaching from kernel_object is a NOP.
>*/
> - if (UVM_OBJ_IS_KERN_OBJECT(uobj)) {
> + if (UVM_OBJ_IS_KERN_OBJECT(uobj))
>   return;
> - }
>  
>   /*
>* Drop the reference.  If it was the last one, destroy the object.
>*/
> - uobj->uo_refs--;
> - if (uobj->uo_refs) {
> + if (atomic_dec_int_nv(>uo_refs) > 0) {
>   return;
>   }
>  
> @@ -1265,68 +1242,54 @@ uao_dropswap(struct uvm_object *uobj, in
>  boolean_t
>  uao_swap_off(int startslot, int endslot)
>  {
> - struct uvm_aobj *aobj, *nextaobj, *prevaobj = NULL;
> + struct uvm_aobj *aobj;
>  
>   /*
> -  * Walk the list of all anonymous UVM objects.
> +  * Walk the list of all anonymous UVM objects.  Grab the first.
>*/
>   mtx_enter(_list_lock);
> + if ((aobj = LIST_FIRST(_list)) == NULL) {
> + mtx_leave(_list_lock);
> + return FALSE;
> + }
> + uao_reference(>u_obj);
>  
> - for (aobj = LIST_FIRST(_list);
> -  aobj != NULL;
> -  aobj = nextaobj) {
> + do {
> + struct uvm_aobj *nextaobj;
>   boolean_t rv;
>  
>   /*
> -  * add a ref to the aobj so it doesn't disappear
> -  * while we're working.
> -  */
> - uao_reference_locked(>u_obj);
> -
> - /*
> -  * now it's safe to unlock the uao list.
> -  * note that lock interleaving is alright with IPL_NONE mutexes.
> +  * Prefetch the next object and immediately hold a reference
> +  * on it, so neither the current nor the next entry could
> +  * disappear while we are iterating.
>*/
> - mtx_leave(_list_lock);
> -
> - if (prevaobj) {
> - uao_detach_locked(>u_obj);
> - prevaobj = NULL;
> + if ((nextaobj = LIST_NEXT(aobj, u_list)) != NULL) {
> + uao_reference(>u_obj);
>   }
> + mtx_leave(_list_lock);
>  
>   /*
> -  * page in any pages in the swslot range.
> -  * if there's an error, abort and return the error.
> +  * Page in all pages in the swap slot range.
>*/
>   rv = uao_pagein(aobj, startslot, endslot);
> +
> + /* Drop the reference of the current object. */
> + uao_detach(>u_obj);
>   if (rv) {
> - uao_detach_locked(>u_obj);
> + if (nextaobj) {
> + uao_detach(>u_obj);
> + }
>   return rv;
>   }
>  
> - /*
> -  * we're done with this aobj.
> -  * relock the lis

Re: ipsec crypto kernel lock

2021-06-17 Thread Martin Pieuchot
On 16/06/21(Wed) 22:05, Alexander Bluhm wrote:
> Hi,
> 
> I have seen a kernel crash with while forwarding and encrypting
> much traffic through OpenBSD IPsec gateways running iked.
> 
> kernel: protection fault trap, code=0
> Stopped at  aes_ctr_crypt+0x1e: addb$0x1,0x2e3(%rdi)
> 
> ddb{2}> trace
> aes_ctr_crypt(16367ed4be021a53,8000246e1db0) at aes_ctr_crypt+0x1e
> swcr_authenc(fd8132a21b08) at swcr_authenc+0x5c3
> swcr_process(fd8132a21b08) at swcr_process+0x1e8
> crypto_invoke(fd8132a21b08) at crypto_invoke+0xde
> taskq_thread(80200500) at taskq_thread+0x81
> end trace frame: 0x0, count: -5
> 
> *64926  109760  0  0  7 0x14200crypto
> 
> swcr_authenc() passes swe->sw_kschedule to aes_ctr_crypt(), swe has
> been freed and contains deaf006cdeaf4152, which looks like some
> sort of poison.  I suspect a use after free.
> 
> The swe value comes from the swcr_sessions global pointers.  Its
> content looks sane in ddb.  Noone touches it in swcr_authenc().  So
> I guess that an other CPU changes the global structures while
> swcr_authenc() is working with it.
> 
> The crypto thread is protected by kernel lock, both network stack
> and pfkey use net lock.  The kernel lock has been recently removed
> from pfkey.
> 
> I think the required lock for the crypto framework is the kernel
> lock.  If crypto_ functions are called, IPsec must grab the kernel
> lock.  pfkey accesses crypto only via tdb_ functions, so this diff
> also covers that case.

It's not clear to me which field the KERNEL_LOCK() is protecting.  Is it
the access to `swcr_sessions'?  Is it a reference?  If so, grabbing/releasing
the lock might not be enough to fix the use-after-free.

Could you annotate which field is being protected by the KERNEL_LOCK()? 

> Index: netinet/ip_ah.c
> ===
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_ah.c,v
> retrieving revision 1.146
> diff -u -p -r1.146 ip_ah.c
> --- netinet/ip_ah.c   25 Feb 2021 02:48:21 -  1.146
> +++ netinet/ip_ah.c   16 Jun 2021 17:29:37 -
> @@ -98,6 +98,7 @@ ah_init(struct tdb *tdbp, struct xformsw
>  {
>   struct auth_hash *thash = NULL;
>   struct cryptoini cria, crin;
> + int error;
>  
>   /* Authentication operation. */
>   switch (ii->ii_authalg) {
> @@ -162,7 +163,10 @@ ah_init(struct tdb *tdbp, struct xformsw
>   cria.cri_next = 
>   }
>  
> - return crypto_newsession(>tdb_cryptoid, , 0);
> + KERNEL_LOCK();
> + error = crypto_newsession(>tdb_cryptoid, , 0);
> + KERNEL_UNLOCK();
> + return error;
>  }
>  
>  /*
> @@ -171,7 +175,7 @@ ah_init(struct tdb *tdbp, struct xformsw
>  int
>  ah_zeroize(struct tdb *tdbp)
>  {
> - int err;
> + int error;
>  
>   if (tdbp->tdb_amxkey) {
>   explicit_bzero(tdbp->tdb_amxkey, tdbp->tdb_amxkeylen);
> @@ -179,9 +183,11 @@ ah_zeroize(struct tdb *tdbp)
>   tdbp->tdb_amxkey = NULL;
>   }
>  
> - err = crypto_freesession(tdbp->tdb_cryptoid);
> + KERNEL_LOCK();
> + error = crypto_freesession(tdbp->tdb_cryptoid);
> + KERNEL_UNLOCK();
>   tdbp->tdb_cryptoid = 0;
> - return err;
> + return error;
>  }
>  
>  /*
> @@ -626,7 +632,9 @@ ah_input(struct mbuf *m, struct tdb *tdb
>   }
>  
>   /* Get crypto descriptors. */
> + KERNEL_LOCK();
>   crp = crypto_getreq(1);
> + KERNEL_UNLOCK();
>   if (crp == NULL) {
>   DPRINTF(("%s: failed to acquire crypto descriptors\n",
>   __func__));
> @@ -696,11 +704,16 @@ ah_input(struct mbuf *m, struct tdb *tdb
>   tc->tc_rdomain = tdb->tdb_rdomain;
>   memcpy(>tc_dst, >tdb_dst, sizeof(union sockaddr_union));
>  
> - return crypto_dispatch(crp);
> + KERNEL_LOCK();
> + error = crypto_dispatch(crp);
> + KERNEL_UNLOCK();
> + return error;
>  
>   drop:
>   m_freem(m);
> + KERNEL_LOCK();
>   crypto_freereq(crp);
> + KERNEL_UNLOCK();
>   free(tc, M_XDATA, 0);
>   return error;
>  }
> @@ -1047,7 +1060,9 @@ ah_output(struct mbuf *m, struct tdb *td
>  #endif
>  
>   /* Get crypto descriptors. */
> + KERNEL_LOCK();
>   crp = crypto_getreq(1);
> + KERNEL_UNLOCK();
>   if (crp == NULL) {
>   DPRINTF(("%s: failed to acquire crypto descriptors\n",
>   __func__));
> @@ -1144,11 +1159,16 @@ ah_output(struct mbuf *m, struct tdb *td
>   tc->tc_rdomain = tdb->tdb_rdomain;
>   memcpy(>tc_dst, >tdb_dst, sizeof(union sockaddr_union));
>  
> - return crypto_dispatch(crp);
> + KERNEL_LOCK();
> + error = crypto_dispatch(crp);
> + KERNEL_UNLOCK();
> + return error;
>  
>   drop:
>   m_freem(m);
> + KERNEL_LOCK();
>   crypto_freereq(crp);
> + KERNEL_UNLOCK();
>   free(tc, M_XDATA, 0);
>   return error;
>  }
> Index: netinet/ip_esp.c
> 

uao references & uao_swap_off() cleanup

2021-06-16 Thread Martin Pieuchot
Diff below does two things:

- Use atomic operations for incrementing/decrementing references of
  anonymous objects.  This allows us to manipulate them without holding
  the KERNEL_LOCK().

- Rewrite the loop from uao_swap_off() to only keep a reference to the
  next item in the list.  This is imported from NetBSD and is necessary
  to introduce locking around uao_pagein().

ok?
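
To make the change easier to review, here is roughly what the resulting
loop looks like with the diff applied (a simplified, untested sketch;
the diff below is the authoritative version):

boolean_t
uao_swap_off(int startslot, int endslot)
{
        struct uvm_aobj *aobj;

        /* Walk the list of all anonymous UVM objects.  Grab the first. */
        mtx_enter(&uao_list_lock);
        if ((aobj = LIST_FIRST(&uao_list)) == NULL) {
                mtx_leave(&uao_list_lock);
                return FALSE;
        }
        uao_reference(&aobj->u_obj);

        do {
                struct uvm_aobj *nextaobj;
                boolean_t rv;

                /*
                 * Take a reference on the next object before dropping
                 * the list lock, so neither the current nor the next
                 * entry can disappear while we are not holding it.
                 */
                if ((nextaobj = LIST_NEXT(aobj, u_list)) != NULL)
                        uao_reference(&nextaobj->u_obj);
                mtx_leave(&uao_list_lock);

                /* Page in all pages in the swap slot range. */
                rv = uao_pagein(aobj, startslot, endslot);

                /* Drop the reference of the current object. */
                uao_detach(&aobj->u_obj);
                if (rv) {
                        if (nextaobj)
                                uao_detach(&nextaobj->u_obj);
                        return rv;
                }

                aobj = nextaobj;
                mtx_enter(&uao_list_lock);
        } while (aobj);

        /* Done with the traversal, drop the list lock. */
        mtx_leave(&uao_list_lock);
        return FALSE;
}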

Index: uvm/uvm_aobj.c
===
RCS file: /cvs/src/sys/uvm/uvm_aobj.c,v
retrieving revision 1.98
diff -u -p -r1.98 uvm_aobj.c
--- uvm/uvm_aobj.c  16 Jun 2021 09:02:21 -  1.98
+++ uvm/uvm_aobj.c  16 Jun 2021 09:20:26 -
@@ -779,19 +779,11 @@ uao_init(void)
 void
 uao_reference(struct uvm_object *uobj)
 {
-   KERNEL_ASSERT_LOCKED();
-   uao_reference_locked(uobj);
-}
-
-void
-uao_reference_locked(struct uvm_object *uobj)
-{
-
/* Kernel object is persistent. */
if (UVM_OBJ_IS_KERN_OBJECT(uobj))
return;
 
-   uobj->uo_refs++;
+   atomic_inc_int(>uo_refs);
 }
 
 
@@ -801,34 +793,19 @@ uao_reference_locked(struct uvm_object *
 void
 uao_detach(struct uvm_object *uobj)
 {
-   KERNEL_ASSERT_LOCKED();
-   uao_detach_locked(uobj);
-}
-
-
-/*
- * uao_detach_locked: drop a reference to an aobj
- *
- * => aobj may freed upon return.
- */
-void
-uao_detach_locked(struct uvm_object *uobj)
-{
struct uvm_aobj *aobj = (struct uvm_aobj *)uobj;
struct vm_page *pg;
 
/*
 * Detaching from kernel_object is a NOP.
 */
-   if (UVM_OBJ_IS_KERN_OBJECT(uobj)) {
+   if (UVM_OBJ_IS_KERN_OBJECT(uobj))
return;
-   }
 
/*
 * Drop the reference.  If it was the last one, destroy the object.
 */
-   uobj->uo_refs--;
-   if (uobj->uo_refs) {
+   if (atomic_dec_int_nv(>uo_refs) > 0) {
return;
}
 
@@ -1265,68 +1242,54 @@ uao_dropswap(struct uvm_object *uobj, in
 boolean_t
 uao_swap_off(int startslot, int endslot)
 {
-   struct uvm_aobj *aobj, *nextaobj, *prevaobj = NULL;
+   struct uvm_aobj *aobj;
 
/*
-* Walk the list of all anonymous UVM objects.
+* Walk the list of all anonymous UVM objects.  Grab the first.
 */
mtx_enter(_list_lock);
+   if ((aobj = LIST_FIRST(_list)) == NULL) {
+   mtx_leave(_list_lock);
+   return FALSE;
+   }
+   uao_reference(>u_obj);
 
-   for (aobj = LIST_FIRST(_list);
-aobj != NULL;
-aobj = nextaobj) {
+   do {
+   struct uvm_aobj *nextaobj;
boolean_t rv;
 
/*
-* add a ref to the aobj so it doesn't disappear
-* while we're working.
-*/
-   uao_reference_locked(>u_obj);
-
-   /*
-* now it's safe to unlock the uao list.
-* note that lock interleaving is alright with IPL_NONE mutexes.
+* Prefetch the next object and immediately hold a reference
+* on it, so neither the current nor the next entry could
+* disappear while we are iterating.
 */
-   mtx_leave(_list_lock);
-
-   if (prevaobj) {
-   uao_detach_locked(>u_obj);
-   prevaobj = NULL;
+   if ((nextaobj = LIST_NEXT(aobj, u_list)) != NULL) {
+   uao_reference(>u_obj);
}
+   mtx_leave(_list_lock);
 
/*
-* page in any pages in the swslot range.
-* if there's an error, abort and return the error.
+* Page in all pages in the swap slot range.
 */
rv = uao_pagein(aobj, startslot, endslot);
+
+   /* Drop the reference of the current object. */
+   uao_detach(>u_obj);
if (rv) {
-   uao_detach_locked(>u_obj);
+   if (nextaobj) {
+   uao_detach(>u_obj);
+   }
return rv;
}
 
-   /*
-* we're done with this aobj.
-* relock the list and drop our ref on the aobj.
-*/
+   aobj = nextaobj;
mtx_enter(_list_lock);
-   nextaobj = LIST_NEXT(aobj, u_list);
-   /*
-* prevaobj means that we have an object that we need
-* to drop a reference for. We can't just drop it now with
-* the list locked since that could cause lock recursion in
-* the case where we reduce the refcount to 0. It will be
-* released the next time we drop the list lock.
-*/
-   prevaobj = aobj;
-   }
+   } while (aobj);
 
/*
 * done with 

Re: ifnewlladdr spl

2021-06-16 Thread Martin Pieuchot
On 16/06/21(Wed) 14:26, David Gwynne wrote:
> 
> 
> > On 16 Jun 2021, at 00:39, Martin Pieuchot  wrote:
> > 
> > On 15/06/21(Tue) 22:52, David Gwynne wrote:
> >> On Mon, Jun 14, 2021 at 10:07:58AM +0200, Martin Pieuchot wrote:
> >>> On 10/06/21(Thu) 19:17, Alexander Bluhm wrote:
> >> [...] 
> >>>> The in6_ functions need netlock.  And driver SIOCSIFFLAGS ioctl
> >>>> must not have splnet().
> >>> 
> >>> Why not?  This is new since the introduction of intr_barrier() or this
> >>> is an old issue?
> >>> 
> >>>> Is reducing splnet() the correct aproach?
> >> 
> >> yes.
> >> 
> >>> I doubt it is possible to answer this question without defining who owns
> >>> `if_flags' and how can it be read/written to.
> >> 
> >> NET_LOCK is what "owns" updates to if_flags.
> > 
> > Why does reducing splnet() is the correct approach?  It isn't clear to
> > me.  What's splnet() protecting then?
> 
> splnet() and all the other splraise() variants only raise the IPL on the 
> current CPU. Unless you have some other lock to coordinate with other CPUs 
> (eg KERNEL_LOCK) it doesn't really prevent other code running. ixl in 
> particular has mpsafe interrupts, so unless your ioctl code is running on the 
> same CPU that ixl is interrupting, it's not helping.
> 
> splnet() with KERNEL_LOCK provides backward compat for with legacy drivers. 
> The reason it doesn't really help with the network stack is because the stack 
> runs from nettq under NET_LOCK without KERNEL_LOCK, it's no longer a softint 
> at an IPL lower than net.

The diff discussed in this thread reduces the scope of the splnet/splx()
dance to only surround the modification of `if_flags'.   How is this
related to what you said?  Is it because `if_flags' is read in interrupt
handlers and it isn't modified atomically?  Does that imply that every
modification of `if_flags' should be done at IPL_NET?  Does that mean
some love is needed to ensure reading `if_flags' is coherent?

Does that change also imply that it is safe to issue a SIOCSIFFLAGS on
a legacy driver without blocking interrupts?  If the IPL needs to be
raised, this is left to the driver, right?

Was the current splnet/splx() dance an easy way to block packet reception
between multiple SIOCSIFFLAGS ioctls?  This might have been a way to not
receive packets on a DOWN interface.  This is no longer the case on MP
kernels as there's a window between the UP and DOWN ioctls.  Do we care?
Is this down/up/down dance fine for legacy and modern drivers?

> >>>> RCS file: /data/mirror/openbsd/cvs/src/sys/net/if.c,v
> >>>> retrieving revision 1.641
> >>>> diff -u -p -r1.641 if.c
> >>>> --- net/if.c 25 May 2021 22:45:09 -  1.641
> >>>> +++ net/if.c 10 Jun 2021 14:32:12 -
> >>>> @@ -3109,6 +3109,8 @@ ifnewlladdr(struct ifnet *ifp)
> >>>>  short up;
> >>>>  int s;
> >>>> 
> >>>> +NET_ASSERT_LOCKED();
> >>>> +
> >>>>  s = splnet();
> >>>>  up = ifp->if_flags & IFF_UP;
> >>>> 
> >>>> @@ -3116,11 +3118,14 @@ ifnewlladdr(struct ifnet *ifp)
> >>>>  /* go down for a moment... */
> >>>>  ifp->if_flags &= ~IFF_UP;
> >>>>  ifrq.ifr_flags = ifp->if_flags;
> >>>> +splx(s);
> >>>>  (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t));
> >>>> +s = splnet();
> >>>>  }
> >>>> 
> >>>>  ifp->if_flags |= IFF_UP;
> >>>>  ifrq.ifr_flags = ifp->if_flags;
> >>>> +splx(s);
> >>>>  (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t));
> >>>> 
> >>>> #ifdef INET6
> >>>> @@ -3139,11 +3144,12 @@ ifnewlladdr(struct ifnet *ifp)
> >>>> #endif
> >>>>  if (!up) {
> >>>>  /* go back down */
> >>>> +s = splnet();
> >>>>  ifp->if_flags &= ~IFF_UP;
> >>>>  ifrq.ifr_flags = ifp->if_flags;
> >>>> +splx(s);
> >>>>  (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t));
> >>>>  }
> >>>> -splx(s);
> >>>> }
> >>>> 
> >>>> void
> >>>> 
> >>> 
> >> 
> 



Re: ifnewlladdr spl

2021-06-15 Thread Martin Pieuchot
On 15/06/21(Tue) 22:52, David Gwynne wrote:
> On Mon, Jun 14, 2021 at 10:07:58AM +0200, Martin Pieuchot wrote:
> > On 10/06/21(Thu) 19:17, Alexander Bluhm wrote:
> [...] 
> > > The in6_ functions need netlock.  And driver SIOCSIFFLAGS ioctl
> > > must not have splnet().
> > 
> > Why not?  This is new since the introduction of intr_barrier() or this
> > is an old issue?
> > 
> > > Is reducing splnet() the correct aproach?
> 
> yes.
> 
> > I doubt it is possible to answer this question without defining who owns
> > `if_flags' and how can it be read/written to.
> 
> NET_LOCK is what "owns" updates to if_flags.

Why is reducing splnet() the correct approach?  It isn't clear to
me.  What's splnet() protecting then?

> > > Index: net/if.c
> > > ===
> > > RCS file: /data/mirror/openbsd/cvs/src/sys/net/if.c,v
> > > retrieving revision 1.641
> > > diff -u -p -r1.641 if.c
> > > --- net/if.c  25 May 2021 22:45:09 -  1.641
> > > +++ net/if.c  10 Jun 2021 14:32:12 -
> > > @@ -3109,6 +3109,8 @@ ifnewlladdr(struct ifnet *ifp)
> > >   short up;
> > >   int s;
> > >  
> > > + NET_ASSERT_LOCKED();
> > > +
> > >   s = splnet();
> > >   up = ifp->if_flags & IFF_UP;
> > >  
> > > @@ -3116,11 +3118,14 @@ ifnewlladdr(struct ifnet *ifp)
> > >   /* go down for a moment... */
> > >   ifp->if_flags &= ~IFF_UP;
> > >   ifrq.ifr_flags = ifp->if_flags;
> > > + splx(s);
> > >   (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t));
> > > + s = splnet();
> > >   }
> > >  
> > >   ifp->if_flags |= IFF_UP;
> > >   ifrq.ifr_flags = ifp->if_flags;
> > > + splx(s);
> > >   (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t));
> > >  
> > >  #ifdef INET6
> > > @@ -3139,11 +3144,12 @@ ifnewlladdr(struct ifnet *ifp)
> > >  #endif
> > >   if (!up) {
> > >   /* go back down */
> > > + s = splnet();
> > >   ifp->if_flags &= ~IFF_UP;
> > >   ifrq.ifr_flags = ifp->if_flags;
> > > + splx(s);
> > >   (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t));
> > >   }
> > > - splx(s);
> > >  }
> > >  
> > >  void
> > > 
> > 
> 



Re: kq_lock is required when updating kn_status

2021-06-14 Thread Martin Pieuchot
On 14/06/21(Mon) 13:45, Visa Hankala wrote:
> When a knote's kn_status is updated, it is necessary to lock the kqueue
> that owns the knote, to ensure proper serialization. filt_proc() has
> a mistake in this, and the following diff fixes it.

The fix is here to ensure `kn_status' cannot be written by two different
threads at the same time, right?

> proc_filtops is MP-unsafe and all its callbacks run with the kernel
> locked. The kernel lock should provide sufficient memory synchronization
> for filt_procdetach() to check condition (kn->kn_status & KN_DETACHED)
> without kq_lock; the value of kn_status seen by filt_procdetach() should
> be at least as recent as seen by the latest filt_proc(NOTE_EXIT) call.

I understand that protecting the read in filt_procdetach() is not
strictly necessary, but isn't it too clever?  I haven't seen any other
piece of code that accesses `kn_status' w/o holding `kq_lock'.  Unless you
have pending plans for this, I'd suggest we also grab the lock there.
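
Something along these lines is what I have in mind (untested sketch;
only the KN_DETACHED check changes, the rest of filt_procdetach() stays
as it is):

void
filt_procdetach(struct knote *kn)
{
        struct kqueue *kq = kn->kn_kq;
        int status;

        KERNEL_ASSERT_LOCKED();

        /* Read `kn_status' under kq_lock like everywhere else. */
        mtx_enter(&kq->kq_lock);
        status = kn->kn_status;
        mtx_leave(&kq->kq_lock);

        if (status & KN_DETACHED)
                return;

        /* ... remove the knote from the process klist, as before ... */
}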

> Also, the splitting of the splhigh() section in filt_proc() should be
> harmless. The KN_DETACHED flag is only checked by filt_procdetach()
> which runs in process context.
> 
> OK?
> 
> Index: kern/kern_event.c
> ===
> RCS file: src/sys/kern/kern_event.c,v
> retrieving revision 1.166
> diff -u -p -r1.166 kern_event.c
> --- kern/kern_event.c 11 Jun 2021 04:29:54 -  1.166
> +++ kern/kern_event.c 14 Jun 2021 13:37:55 -
> @@ -331,6 +331,8 @@ filt_procdetach(struct knote *kn)
>   struct process *pr = kn->kn_ptr.p_process;
>   int s;
>  
> + KERNEL_ASSERT_LOCKED();
> +
>   if (kn->kn_status & KN_DETACHED)
>   return;
>  
> @@ -342,6 +344,7 @@ filt_procdetach(struct knote *kn)
>  int
>  filt_proc(struct knote *kn, long hint)
>  {
> + struct kqueue *kq = kn->kn_kq;
>   u_int event;
>  
>   /*
> @@ -363,8 +366,11 @@ filt_proc(struct knote *kn, long hint)
>   struct process *pr = kn->kn_ptr.p_process;
>   int s;
>  
> - s = splhigh();
> + mtx_enter(>kq_lock);
>   kn->kn_status |= KN_DETACHED;
> + mtx_leave(>kq_lock);
> +
> + s = splhigh();
>   kn->kn_flags |= (EV_EOF | EV_ONESHOT);
>   kn->kn_data = W_EXITCODE(pr->ps_xexit, pr->ps_xsig);
>   klist_remove_locked(>ps_klist, kn);
> @@ -391,7 +397,7 @@ filt_proc(struct knote *kn, long hint)
>   kev.fflags = kn->kn_sfflags;
>   kev.data = kn->kn_id;   /* parent */
>   kev.udata = kn->kn_udata;   /* preserve udata */
> - error = kqueue_register(kn->kn_kq, , NULL);
> + error = kqueue_register(kq, , NULL);
>   if (error)
>   kn->kn_fflags |= NOTE_TRACKERR;
>   }
> Index: sys/event.h
> ===
> RCS file: src/sys/sys/event.h,v
> retrieving revision 1.55
> diff -u -p -r1.55 event.h
> --- sys/event.h   2 Jun 2021 13:56:28 -   1.55
> +++ sys/event.h   14 Jun 2021 13:37:55 -
> @@ -228,6 +228,7 @@ struct filterops {
>   * Locking:
>   *   I   immutable after creation
>   *   o   object lock
> + *   q   kn_kq->kq_lock
>   */
>  struct knote {
>   SLIST_ENTRY(knote)  kn_link;/* for fd */
> @@ -235,7 +236,7 @@ struct knote {
>   TAILQ_ENTRY(knote)  kn_tqe;
>   struct  kqueue *kn_kq;  /* [I] which queue we are on */
>   struct  kevent kn_kevent;
> - int kn_status;
> + int kn_status;  /* [q] */
>   int kn_sfflags; /* [o] saved filter flags */
>   __int64_t   kn_sdata;   /* [o] saved data field */
>   union {
> 



Introduce UVM_OBJ_IS_AOBJ()

2021-06-14 Thread Martin Pieuchot
The diff below introduces a new macro to generalize the test currently
present in uvm_km_pgremove().  It also uses it in new places to reduce
the differences with NetBSD.

This helps me shrink upcoming vmobjlock diff.

ok?
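
For reference, the new macro itself just mirrors the existing
UVM_OBJ_IS_VNODE() test and compares the pager operations (sketch, the
uvm_object.h hunk is cut short in the diff below):

#define UVM_OBJ_IS_AOBJ(uobj)                                           \
        ((uobj)->pgops == &aobj_pager)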

Index: uvm/uvm_aobj.c
===
RCS file: /cvs/src/sys/uvm/uvm_aobj.c,v
retrieving revision 1.96
diff -u -p -r1.96 uvm_aobj.c
--- uvm/uvm_aobj.c  20 May 2021 08:03:35 -  1.96
+++ uvm/uvm_aobj.c  14 Jun 2021 09:39:45 -
@@ -143,7 +143,7 @@ struct pool uvm_aobj_pool;
 
 static struct uao_swhash_elt   *uao_find_swhash_elt(struct uvm_aobj *, int,
 boolean_t);
-static int  uao_find_swslot(struct uvm_aobj *, int);
+static int  uao_find_swslot(struct uvm_object *, int);
 static boolean_tuao_flush(struct uvm_object *, voff_t,
 voff_t, int);
 static void uao_free(struct uvm_aobj *);
@@ -242,8 +242,11 @@ uao_find_swhash_elt(struct uvm_aobj *aob
  * uao_find_swslot: find the swap slot number for an aobj/pageidx
  */
 inline static int
-uao_find_swslot(struct uvm_aobj *aobj, int pageidx)
+uao_find_swslot(struct uvm_object *uobj, int pageidx)
 {
+   struct uvm_aobj *aobj = (struct uvm_aobj *)uobj;
+
+   KASSERT(UVM_OBJ_IS_AOBJ(uobj));
 
/*
 * if noswap flag is set, then we never return a slot
@@ -284,6 +287,7 @@ uao_set_swslot(struct uvm_object *uobj, 
int oldslot;
 
KERNEL_ASSERT_LOCKED();
+   KASSERT(UVM_OBJ_IS_AOBJ(uobj));
 
/*
 * if noswap flag is set, then we can't set a slot
@@ -353,6 +357,7 @@ uao_free(struct uvm_aobj *aobj)
 {
struct uvm_object *uobj = >u_obj;
 
+   KASSERT(UVM_OBJ_IS_AOBJ(uobj));
uao_dropswap_range(uobj, 0, 0);
 
if (UAO_USES_SWHASH(aobj)) {
@@ -881,6 +886,7 @@ uao_flush(struct uvm_object *uobj, voff_
struct vm_page *pp;
voff_t curoff;
 
+   KASSERT(UVM_OBJ_IS_AOBJ(uobj));
KERNEL_ASSERT_LOCKED();
 
if (flags & PGO_ALLPAGES) {
@@ -1007,6 +1013,7 @@ uao_get(struct uvm_object *uobj, voff_t 
int lcv, gotpages, maxpages, swslot, rv, pageidx;
boolean_t done;
 
+   KASSERT(UVM_OBJ_IS_AOBJ(uobj));
KERNEL_ASSERT_LOCKED();
 
/*
@@ -1036,7 +1043,7 @@ uao_get(struct uvm_object *uobj, voff_t 
 * if page is new, attempt to allocate the page,
 * zero-fill'd.
 */
-   if (ptmp == NULL && uao_find_swslot(aobj,
+   if (ptmp == NULL && uao_find_swslot(uobj,
current_offset >> PAGE_SHIFT) == 0) {
ptmp = uvm_pagealloc(uobj, current_offset,
NULL, UVM_PGA_ZERO);
@@ -1175,7 +1182,7 @@ uao_get(struct uvm_object *uobj, voff_t 
 * we have a "fake/busy/clean" page that we just allocated.  
 * do the needed "i/o", either reading from swap or zeroing.
 */
-   swslot = uao_find_swslot(aobj, pageidx);
+   swslot = uao_find_swslot(uobj, pageidx);
 
/* just zero the page if there's nothing in swap.  */
if (swslot == 0) {
@@ -1241,6 +1248,8 @@ uao_dropswap(struct uvm_object *uobj, in
 {
int slot;
 
+   KASSERT(UVM_OBJ_IS_AOBJ(uobj));
+
slot = uao_set_swslot(uobj, pageidx, 0);
if (slot) {
uvm_swap_free(slot, 1);
@@ -1456,6 +1465,7 @@ uao_dropswap_range(struct uvm_object *uo
struct uvm_aobj *aobj = (struct uvm_aobj *)uobj;
int swpgonlydelta = 0;
 
+   KASSERT(UVM_OBJ_IS_AOBJ(uobj));
/* KASSERT(mutex_owned(uobj->vmobjlock)); */
 
if (end == 0) {
Index: uvm/uvm_km.c
===
RCS file: /cvs/src/sys/uvm/uvm_km.c,v
retrieving revision 1.144
diff -u -p -r1.144 uvm_km.c
--- uvm/uvm_km.c16 May 2021 15:10:20 -  1.144
+++ uvm/uvm_km.c14 Jun 2021 09:40:39 -
@@ -246,7 +246,7 @@ uvm_km_pgremove(struct uvm_object *uobj,
int slot;
int swpgonlydelta = 0;
 
-   KASSERT(uobj->pgops == _pager);
+   KASSERT(UVM_OBJ_IS_AOBJ(uobj));
 
for (curoff = start ; curoff < end ; curoff += PAGE_SIZE) {
pp = uvm_pagelookup(uobj, curoff);
Index: uvm/uvm_object.h
===
RCS file: /cvs/src/sys/uvm/uvm_object.h,v
retrieving revision 1.24
diff -u -p -r1.24 uvm_object.h
--- uvm/uvm_object.h21 Oct 2020 09:08:14 -  1.24
+++ uvm/uvm_object.h14 Jun 2021 09:34:34 -
@@ -82,12 +82,15 @@ RBT_PROTOTYPE(uvm_objtree, vm_page, objt
 #defineUVM_OBJ_IS_VNODE(uobj)  
\
((uobj)->pgops == _vnodeops)
 
-#define UVM_OBJ_IS_DEVICE(uobj)

Reaper & amaps

2021-06-14 Thread Martin Pieuchot
Now that operations on amaps are serialized using a per-map rwlock
the KERNEL_LOCK() shouldn't be necessary to call amap_unref().  The
diff below allows the reaper to do this operation before grabbing it.

I haven't seen any relevant contention on the reaper in my profiling,
so I don't expect any visible change from this diff.  However it
reflects the current state of locking in UVM and helps me shrink
my diff.

ok?
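
With the diff applied the function ends up looking roughly like this
(simplified sketch, the second pass is elided):

void
uvm_unmap_detach(struct uvm_map_deadq *deadq, int flags)
{
        struct vm_map_entry *entry, *tmp;

        /* First pass: entries that do not need the KERNEL_LOCK(). */
        TAILQ_FOREACH_SAFE(entry, deadq, dfree.deadq, tmp) {
                /* Skip entries for which we have to grab the kernel lock. */
                if (UVM_ET_ISSUBMAP(entry) || UVM_ET_ISOBJ(entry))
                        continue;

                /* Drop reference to amap, if we've got one. */
                if (entry->aref.ar_amap)
                        amap_unref(entry->aref.ar_amap,
                            entry->aref.ar_pageoff,
                            atop(entry->end - entry->start),
                            flags & AMAP_REFALL);

                TAILQ_REMOVE(deadq, entry, dfree.deadq);
                uvm_mapent_free(entry);
        }

        /*
         * Second pass, with the KERNEL_LOCK() held: entries backed by
         * a uvm_object or a submap, unchanged by this diff.
         */
        /* ... */
}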

Index: uvm/uvm_map.c
===
RCS file: /cvs/src/sys/uvm/uvm_map.c,v
retrieving revision 1.275
diff -u -p -r1.275 uvm_map.c
--- uvm/uvm_map.c   22 May 2021 08:38:29 -  1.275
+++ uvm/uvm_map.c   14 Jun 2021 09:32:04 -
@@ -1571,10 +1571,16 @@ uvm_unmap_detach(struct uvm_map_deadq *d
 
TAILQ_FOREACH_SAFE(entry, deadq, dfree.deadq, tmp) {
/* Skip entries for which we have to grab the kernel lock. */
-   if (entry->aref.ar_amap || UVM_ET_ISSUBMAP(entry) ||
-   UVM_ET_ISOBJ(entry))
+   if (UVM_ET_ISSUBMAP(entry) || UVM_ET_ISOBJ(entry))
continue;
 
+   /* Drop reference to amap, if we've got one. */
+   if (entry->aref.ar_amap)
+   amap_unref(entry->aref.ar_amap,
+   entry->aref.ar_pageoff,
+   atop(entry->end - entry->start),
+   flags & AMAP_REFALL);
+
TAILQ_REMOVE(deadq, entry, dfree.deadq);
uvm_mapent_free(entry);
}
@@ -1586,12 +1592,6 @@ uvm_unmap_detach(struct uvm_map_deadq *d
while ((entry = TAILQ_FIRST(deadq)) != NULL) {
if (waitok)
uvm_pause();
-   /* Drop reference to amap, if we've got one. */
-   if (entry->aref.ar_amap)
-   amap_unref(entry->aref.ar_amap,
-   entry->aref.ar_pageoff,
-   atop(entry->end - entry->start),
-   flags & AMAP_REFALL);
 
/* Drop reference to our backing object, if we've got one. */
if (UVM_ET_ISSUBMAP(entry)) {



Re: ifnewlladdr spl

2021-06-14 Thread Martin Pieuchot
On 10/06/21(Thu) 19:17, Alexander Bluhm wrote:
> Hi,
> 
> I have seen this crash trace on a 6.6 based system, but I think the
> bug exists still in -current.  It happened when an ixl(4) interface
> was removed from trunk(4).
> 
> uvm_fault(0xfd8739dc6558, 0x0, 0, 1) -> e
> fatal page fault in supervisor mode
> trap type 6 code 0 rip 81012a86 cs 8 rflags 10202 cr2 0 cpl 7 rsp 
> 80002e4bd170
> gsbase 0x816d6ff0  kgsbase 0x0
> panic: trap type 6, code=0, pc=81012a86
> Starting stack trace...
> panic() at panic+0x113
> kerntrap(80bf3000) at kerntrap+0xdc
> alltraps_kern_meltdown(6,0,4,0,80bf3000,0) at 
> alltraps_kern_meltdown+0x7b
> ixl_intr(80bf3000) at ixl_intr+0x3e6
> intr_handler(816d6ff0,80b57200) at intr_handler+0x5b
> Xintr_ioapic_edge30_untramp(4,814d3a00,4,18041969,816d6ff0,d) 
> at Xintr_ioapic_edge30_untramp+0x19f
> Xspllower(8178fb58,816d6ff0,8139d743,80bffd00,8178fb40,10)
>  at Xspllower+0xc
> softintr_dispatch(0) at softintr_dispatch+0xc5
> Xsoftclock(80bf3048,0,8139d743,80bf3048,81207800,814e862f)
>  at Xsoftclock+0x1f
> ifnewlladdr(81624e10) at ifnewlladdr+0xf8
> trunk_port_destroy(80002e4bd6e0) at trunk_port_destroy+0x2fd
> trunk_ioctl(fd87387594c0,81207800,8048698e) at trunk_ioctl+0x6a6
> ifioctl(fd87623df448,80002e46e2d8,48,80002e4bd7d0) at 
> ifioctl+0x2d6
> sys_ioctl(360,80002e46e2d8,36) at sys_ioctl+0x3cd
> syscall(0) at syscall+0x3d1
> Xsyscall(6,36,1,36,7f7da720,7f7daa21) at Xsyscall+0x128
> end of kernel
> end trace frame: 0x7f7da780, count: 241
> End of stack trace.
> syncing disks...
> 
> ifnewlladdr() is interrupted by ixl transmit interrupt.  There it
> crashes in ixl_txeof as txr is NULL.  The code in -current if_ixl.c
> has changed, so it might not happen anymore.  But I think the bug
> is in ifnewlladdr().

Hard to say.

> ifnewlladdr() sets splnet() and configures the interface up and
> down.  The ixl_down() code has some interrupt barriers which cannot
> work while interrupts are blocked by splnet().  So interrupts fire
> at splx() when the driver does not expect them.

If intr_barrier() or ixl_down() needs a certain IPL level to work
properly, then something has been overlooked.  Should we add an assert?

> Combining interrupt barriers with spl protection looks like a bad
> idea.
> 
> Is there anything that lowers spl in all cases during intr_barrier(),
> ifq_barrier() or timeout_del_barrier()?
> 
> How should spls work together with barriers?
> 
> The integrity of ifnewlladdr() state should be guaranteed by netlock.
>
> Changing if_flags needs splnet() as they are used by all drivers.

This isn't clear to me.  splnet() used to be needed, but nowadays this
seems questionable depending on the driver.

> The in6_ functions need netlock.  And driver SIOCSIFFLAGS ioctl
> must not have splnet().

Why not?  Is this new since the introduction of intr_barrier(), or is
this an old issue?

> Is reducing splnet() the correct aproach?

I doubt it is possible to answer this question without defining who owns
`if_flags' and how it can be read/written.

I'd question whether splnet() is needed at all here.  Why is it here in the
first place?  I'd guess to prevent the interrupt handler from running while
the SIOCSIFFLAGS ioctl is being executed...  Your diff suggests something
else...

> Index: net/if.c
> ===
> RCS file: /data/mirror/openbsd/cvs/src/sys/net/if.c,v
> retrieving revision 1.641
> diff -u -p -r1.641 if.c
> --- net/if.c  25 May 2021 22:45:09 -  1.641
> +++ net/if.c  10 Jun 2021 14:32:12 -
> @@ -3109,6 +3109,8 @@ ifnewlladdr(struct ifnet *ifp)
>   short up;
>   int s;
>  
> + NET_ASSERT_LOCKED();
> +
>   s = splnet();
>   up = ifp->if_flags & IFF_UP;
>  
> @@ -3116,11 +3118,14 @@ ifnewlladdr(struct ifnet *ifp)
>   /* go down for a moment... */
>   ifp->if_flags &= ~IFF_UP;
>   ifrq.ifr_flags = ifp->if_flags;
> + splx(s);
>   (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t));
> + s = splnet();
>   }
>  
>   ifp->if_flags |= IFF_UP;
>   ifrq.ifr_flags = ifp->if_flags;
> + splx(s);
>   (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t));
>  
>  #ifdef INET6
> @@ -3139,11 +3144,12 @@ ifnewlladdr(struct ifnet *ifp)
>  #endif
>   if (!up) {
>   /* go back down */
> + s = splnet();
>   ifp->if_flags &= ~IFF_UP;
>   ifrq.ifr_flags = ifp->if_flags;
> + splx(s);
>   (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t));
>   }
> - splx(s);
>  }
>  
>  void
> 



Kill SS_ASYNC

2021-06-01 Thread Martin Pieuchot
The socket flag SS_ASYNC is redundant with the socket buffer flag
SB_ASYNC.  Both are set & unset via FIOASYNC.  So the diff below gets
rid of SS_ASYNC.

Checking states on socket buffers will help reduce the scope of the
NET_LOCK() when we'll introduce a socket buffer lock.

ok?
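
For context, this is how the flags end up being set from userland
(illustrative example only, not part of the diff): a process asks for
SIGIO delivery with FIOASYNC, which is what toggles SB_ASYNC on both
socket buffers (and, until now, SS_ASYNC as well).

#include <sys/ioctl.h>

#include <fcntl.h>
#include <signal.h>
#include <unistd.h>

static void
sigio_handler(int sig)
{
        /* Data or buffer space became available on the socket. */
        (void)sig;
}

static int
enable_async(int s)
{
        int on = 1;

        signal(SIGIO, sigio_handler);
        /* Direct SIGIO to this process. */
        if (fcntl(s, F_SETOWN, getpid()) == -1)
                return -1;
        /* Sets SB_ASYNC on so_rcv and so_snd (and used to set SS_ASYNC). */
        return ioctl(s, FIOASYNC, &on);
}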

Index: kern/sys_socket.c
===
RCS file: /cvs/src/sys/kern/sys_socket.c,v
retrieving revision 1.45
diff -u -p -r1.45 sys_socket.c
--- kern/sys_socket.c   22 Feb 2020 11:58:29 -  1.45
+++ kern/sys_socket.c   29 May 2021 07:53:49 -
@@ -96,11 +96,9 @@ soo_ioctl(struct file *fp, u_long cmd, c
case FIOASYNC:
s = solock(so);
if (*(int *)data) {
-   so->so_state |= SS_ASYNC;
so->so_rcv.sb_flags |= SB_ASYNC;
so->so_snd.sb_flags |= SB_ASYNC;
} else {
-   so->so_state &= ~SS_ASYNC;
so->so_rcv.sb_flags &= ~SB_ASYNC;
so->so_snd.sb_flags &= ~SB_ASYNC;
}
Index: kern/uipc_socket2.c
===
RCS file: /cvs/src/sys/kern/uipc_socket2.c,v
retrieving revision 1.110
diff -u -p -r1.110 uipc_socket2.c
--- kern/uipc_socket2.c 26 May 2021 08:28:34 -  1.110
+++ kern/uipc_socket2.c 29 May 2021 07:53:41 -
@@ -409,7 +409,7 @@ sbunlock(struct socket *so, struct sockb
 /*
  * Wakeup processes waiting on a socket buffer.
  * Do asynchronous notification via SIGIO
- * if the socket has the SS_ASYNC flag set.
+ * if the socket buffer has the SB_ASYNC flag set.
  */
 void
 sowakeup(struct socket *so, struct sockbuf *sb)
@@ -421,7 +421,7 @@ sowakeup(struct socket *so, struct sockb
sb->sb_flags &= ~SB_WAIT;
wakeup(>sb_cc);
}
-   if (so->so_state & SS_ASYNC)
+   if (sb->sb_flags & SB_ASYNC)
pgsigio(>so_sigio, SIGIO, 0);
selwakeup(>sb_sel);
 }
Index: sys/socketvar.h
===
RCS file: /cvs/src/sys/sys/socketvar.h,v
retrieving revision 1.97
diff -u -p -r1.97 socketvar.h
--- sys/socketvar.h 21 May 2021 10:59:02 -  1.97
+++ sys/socketvar.h 29 May 2021 07:54:27 -
@@ -147,7 +147,6 @@ struct socket {
 #defineSS_ISDISCONNECTED   0x800   /* socket disconnected from 
peer */
 
 #defineSS_PRIV 0x080   /* privileged for broadcast, 
raw... */
-#defineSS_ASYNC0x200   /* async i/o notify */
 #defineSS_CONNECTOUT   0x1000  /* connect, not accept, at this 
end */
 #defineSS_ISSENDING0x2000  /* hint for lower layer */
 #defineSS_DNS  0x4000  /* created using SOCK_DNS 
socket(2) */



Re: Add f_modify and f_process callbacks to socket filterops

2021-05-25 Thread Martin Pieuchot
On 20/05/21(Thu) 14:16, Visa Hankala wrote:
> On Thu, May 20, 2021 at 11:35:32AM +0200, Martin Pieuchot wrote:
> > On 18/05/21(Tue) 14:22, Visa Hankala wrote:
> > > This diff adds f_modify and f_process callbacks to socket event filters.
> > > As a result, socket events are handled using the non-legacy paths in
> > > filter_modify() and filter_process() of kern_event.c This a step toward
> > > MP-safety. However, everything still runs under the kernel lock.
> > > 
> > > The change has three intended effects:
> > > 
> > > * Socket events are handled without raising the system priority level.
> > >   This makes the activity observable with btrace(8).
> > > 
> > > * kqueue itself no longer calls f_event of socket filterops, which
> > >   allows replacing the conditional, NOTE_SUBMIT-based locking with
> > >   a fixed call pattern.
> > 
> > I love this.
> > 
> > > * The state of a socket event is now always rechecked before delivery
> > >   to user. Before, the recheck was skipped if the event was registered
> > >   with EV_ONESHOT.
> > 
> > To me this sounds sane.  I can't think of a way to rely on the current
> > behavior.  However if there's an easy way to split these changes in two
> > commits, I'd prefer to stay on the safe side.
> 
> Below is an updated diff that preserves the current EV_ONESHOT
> behaviour. I have just adapted a part of the compatibility logic
> from function filter_process().
> 
> When f_process is given a non-NULL kev argument, it is known that
> the callback is invoked from kqueue_scan(). If kev is NULL,
> kqueue_register() is checking if the knote should be activated and
> there is no intent to deliver the event right now.

ok mpi@

> Index: kern/uipc_socket.c
> ===
> RCS file: src/sys/kern/uipc_socket.c,v
> retrieving revision 1.261
> diff -u -p -r1.261 uipc_socket.c
> --- kern/uipc_socket.c13 May 2021 19:43:11 -  1.261
> +++ kern/uipc_socket.c20 May 2021 14:01:18 -
> @@ -70,15 +70,26 @@ void  sorflush(struct socket *);
>  
>  void filt_sordetach(struct knote *kn);
>  int  filt_soread(struct knote *kn, long hint);
> +int  filt_soreadmodify(struct kevent *kev, struct knote *kn);
> +int  filt_soreadprocess(struct knote *kn, struct kevent *kev);
> +int  filt_soread_common(struct knote *kn, struct socket *so);
>  void filt_sowdetach(struct knote *kn);
>  int  filt_sowrite(struct knote *kn, long hint);
> +int  filt_sowritemodify(struct kevent *kev, struct knote *kn);
> +int  filt_sowriteprocess(struct knote *kn, struct kevent *kev);
> +int  filt_sowrite_common(struct knote *kn, struct socket *so);
>  int  filt_solisten(struct knote *kn, long hint);
> +int  filt_solistenmodify(struct kevent *kev, struct knote *kn);
> +int  filt_solistenprocess(struct knote *kn, struct kevent *kev);
> +int  filt_solisten_common(struct knote *kn, struct socket *so);
>  
>  const struct filterops solisten_filtops = {
>   .f_flags= FILTEROP_ISFD,
>   .f_attach   = NULL,
>   .f_detach   = filt_sordetach,
>   .f_event= filt_solisten,
> + .f_modify   = filt_solistenmodify,
> + .f_process  = filt_solistenprocess,
>  };
>  
>  const struct filterops soread_filtops = {
> @@ -86,6 +97,8 @@ const struct filterops soread_filtops = 
>   .f_attach   = NULL,
>   .f_detach   = filt_sordetach,
>   .f_event= filt_soread,
> + .f_modify   = filt_soreadmodify,
> + .f_process  = filt_soreadprocess,
>  };
>  
>  const struct filterops sowrite_filtops = {
> @@ -93,6 +106,8 @@ const struct filterops sowrite_filtops =
>   .f_attach   = NULL,
>   .f_detach   = filt_sowdetach,
>   .f_event= filt_sowrite,
> + .f_modify   = filt_sowritemodify,
> + .f_process  = filt_sowriteprocess,
>  };
>  
>  const struct filterops soexcept_filtops = {
> @@ -100,6 +115,8 @@ const struct filterops soexcept_filtops 
>   .f_attach   = NULL,
>   .f_detach   = filt_sordetach,
>   .f_event= filt_soread,
> + .f_modify   = filt_soreadmodify,
> + .f_process  = filt_soreadprocess,
>  };
>  
>  #ifndef SOMINCONN
> @@ -2056,13 +2073,12 @@ filt_sordetach(struct knote *kn)
>  }
>  
>  int
> -filt_soread(struct knote *kn, long hint)
> +filt_soread_common(struct knote *kn, struct socket *so)
>  {
> - struct socket *so = kn->kn_fp->f_data;
> - int s, rv = 0;
> + int rv = 0;
> +
> + soassertlocked(so);
>  
> - if ((hint

Re: xhci early enumeration

2021-05-21 Thread Martin Pieuchot
On 21/05/21(Fri) 10:48, Patrick Wildt wrote:
> Am Wed, May 19, 2021 at 07:15:50AM + schrieb Christian Ludwig:
> > The usb(4) driver allows to enumerate the bus early during boot by
> > setting its driver flags to 0x1 in UKC. This mechanism can enable a USB
> > console keyboard early during autoconf(9), which can come in handy at
> > times. This needs USB polling mode to work, which is a bit broken. Here
> > is my attempt to fix it for xhci(4) controllers.
> > 
> > According to the xHCI specification section 4.2 "Host Controller
> > Initalization", the host controller must be fully initialized before
> > descending into device enumeration. Then xhci(4) sends command TRBs to
> > open new pipes during enumeration. They wait for completion using
> > tsleep(). This is bad when in polling mode at boot. And finally, the
> > behavior should be the same on resume as it is at boot. Therefore also
> > enumerate USB devices during resume when the flag is set.
> > 
> > I am specifically looking for tests on xhci controllers with usb(4)
> > flags set to 1 in UKC.
> > 
> > So long,
> > 
> > 
> >  - Christian
> > 
> > 
> > diff --git a/sys/arch/armv7/marvell/mvxhci.c 
> > b/sys/arch/armv7/marvell/mvxhci.c
> > index 38a636fd123..2137f68b816 100644
> > --- a/sys/arch/armv7/marvell/mvxhci.c
> > +++ b/sys/arch/armv7/marvell/mvxhci.c
> > @@ -155,12 +155,12 @@ mvxhci_attach(struct device *parent, struct device 
> > *self, void *aux)
> > goto disestablish_ret;
> > }
> >  
> > -   /* Attach usb device. */
> > -   config_found(self, >sc.sc_bus, usbctlprint);
> > -
> > /* Now that the stack is ready, config' the HC and enable interrupts. */
> > xhci_config(>sc);
> >  
> > +   /* Attach usb device. */
> > +   config_found(self, >sc.sc_bus, usbctlprint);
> > +
> > return;
> >  
> >  disestablish_ret:
> > diff --git a/sys/dev/acpi/xhci_acpi.c b/sys/dev/acpi/xhci_acpi.c
> > index 95e69cee896..d762f69a00e 100644
> > --- a/sys/dev/acpi/xhci_acpi.c
> > +++ b/sys/dev/acpi/xhci_acpi.c
> > @@ -112,12 +112,12 @@ xhci_acpi_attach(struct device *parent, struct device 
> > *self, void *aux)
> > goto disestablish_ret;
> > }
> >  
> > -   /* Attach usb device. */
> > -   config_found(self, >sc.sc_bus, usbctlprint);
> > -
> > /* Now that the stack is ready, config' the HC and enable interrupts. */
> > xhci_config(>sc);
> >  
> > +   /* Attach usb device. */
> > +   config_found(self, >sc.sc_bus, usbctlprint);
> > +
> > return;
> >  
> >  disestablish_ret:
> > diff --git a/sys/dev/fdt/xhci_fdt.c b/sys/dev/fdt/xhci_fdt.c
> > index 38c976a6b24..84e00bdadc5 100644
> > --- a/sys/dev/fdt/xhci_fdt.c
> > +++ b/sys/dev/fdt/xhci_fdt.c
> > @@ -116,12 +116,12 @@ xhci_fdt_attach(struct device *parent, struct device 
> > *self, void *aux)
> > goto disestablish_ret;
> > }
> >  
> > -   /* Attach usb device. */
> > -   config_found(self, >sc.sc_bus, usbctlprint);
> > -
> > /* Now that the stack is ready, config' the HC and enable interrupts. */
> > xhci_config(>sc);

> >  
> > +   /* Attach usb device. */
> > +   config_found(self, >sc.sc_bus, usbctlprint);
> > +
> > return;
> >  
> >  disestablish_ret:
> > diff --git a/sys/dev/pci/xhci_pci.c b/sys/dev/pci/xhci_pci.c
> > index fa3271b0d30..0b46083b705 100644
> > --- a/sys/dev/pci/xhci_pci.c
> > +++ b/sys/dev/pci/xhci_pci.c
> > @@ -195,12 +195,12 @@ xhci_pci_attach(struct device *parent, struct device 
> > *self, void *aux)
> > if (PCI_VENDOR(psc->sc_id) == PCI_VENDOR_INTEL)
> > xhci_pci_port_route(psc);
> >  
> > -   /* Attach usb device. */
> > -   config_found(self, >sc.sc_bus, usbctlprint);
> > -
> > /* Now that the stack is ready, config' the HC and enable interrupts. */
> > xhci_config(>sc);
> >  
> > +   /* Attach usb device. */
> > +   config_found(self, >sc.sc_bus, usbctlprint);
> > +
> > return;
> >  
> >  disestablish_ret:
> 
> The interesting thing is that xhci_config() used to be part of
> xhci_init() and was explicitly taken out from it to fix a panic
> that showed up when enumeration happened afterwards.
> 
> https://github.com/openbsd/src/commit/48155c88d2b90737b892a715e56d81bc73254308
> 
> Is it possible that this works in polling mode, but not without?
> 
> While I agree that moving xhci_config() before enumeration creates
> consistency with the others, this change was done deliberately and
> we should find out why.
> 
> mpi, do you still happen to have the logs or the machine for that
> particular issue?

I don't.  If I recall correctly, the problem was about the interrupt
handler calling usb_schedsoftintr() before usb_attach() had time to
execute softintr_establish().

> > diff --git a/sys/dev/usb/usb.c b/sys/dev/usb/usb.c
> > index b8943882d0a..f9aff94bfee 100644
> > --- a/sys/dev/usb/usb.c
> > +++ b/sys/dev/usb/usb.c
> > @@ -911,8 +911,19 @@ usb_activate(struct device *self, int act)
> >  * hub transfers do not need to sleep.
> >  */
> >   

Re: Add f_modify and f_process callbacks to socket filterops

2021-05-20 Thread Martin Pieuchot
On 18/05/21(Tue) 14:22, Visa Hankala wrote:
> This diff adds f_modify and f_process callbacks to socket event filters.
> As a result, socket events are handled using the non-legacy paths in
> filter_modify() and filter_process() of kern_event.c This a step toward
> MP-safety. However, everything still runs under the kernel lock.
> 
> The change has three intended effects:
> 
> * Socket events are handled without raising the system priority level.
>   This makes the activity observable with btrace(8).
> 
> * kqueue itself no longer calls f_event of socket filterops, which
>   allows replacing the conditional, NOTE_SUBMIT-based locking with
>   a fixed call pattern.

I love this.

> * The state of a socket event is now always rechecked before delivery
>   to user. Before, the recheck was skipped if the event was registered
>   with EV_ONESHOT.

To me this sounds sane.  I can't think of a way to rely on the current
behavior.  However if there's an easy way to split these changes in two
commits, I'd prefer to stay on the safe side.

> However, the change of behaviour with EV_ONESHOT is questionable.
> When an activated event is being processed, the code will acquire the
> socket lock anyway. Skipping the state check would only be a minor
> optimization. In addition, I think the behaviour becomes more
> consistent as now a delivered EV_ONESHOT event really was active at
> the time of delivery.
> 
> Consider the following program. It creates a socket pair, writes a byte
> to the socket, registers an EV_ONESHOT event, and reads the byte from
> the socket. Next it checks how kevent(2) behaves.
> 
> #include 
> #include 
> #include 
> #include 
> #include 
> #include 
> #include 
> #include 
> 
> int
> main(void)
> {
>   struct kevent kev[1];
>   struct timespec ts = {};
>   int fds[2], flags, kq, n;
>   char b;
> 
>   if (socketpair(AF_UNIX, SOCK_STREAM, 0, fds) == -1)
>   err(1, "socketpair");
>   flags = fcntl(fds[0], F_GETFL, 0);
>   fcntl(fds[0], F_SETFL, flags | O_NONBLOCK);
> 
>   printf("write 1\n");
>   write(fds[1], "x", 1);
> 
>   kq = kqueue();
>   if (kq == -1)
>   err(1, "kqueue");
>   EV_SET(&kev[0], fds[0], EVFILT_READ, EV_ADD | EV_ONESHOT, 0, 0, NULL);
>   if (kevent(kq, kev, 1, NULL, 0, NULL) == -1)
>   err(1, "kevent");
> 
>   n = read(fds[0], &b, 1);
>   printf("read %d\n", n);
>   n = read(fds[0], &b, 1);
>   printf("read %d\n", n);
> 
>   n = kevent(kq, NULL, 0, kev, 1, &ts);
>   printf("kevent %d\n", n);
>   n = read(fds[0], &b, 1);
>   printf("read %d\n", n);
> 
>   n = kevent(kq, NULL, 0, kev, 1, &ts);
>   printf("kevent %d\n", n);
> 
>   printf("write 1\n");
>   write(fds[1], "x", 1);
> 
>   n = kevent(kq, NULL, 0, kev, 1, &ts);
>   printf("kevent %d\n", n);
>   n = read(fds[0], &b, 1);
>   printf("read %d\n", n);
> 
>   return 0;
> }
> 
> With an unpatched kernel, the EV_ONESHOT event gets activated by the
> pending byte when the event is registered. The event remains active
> until delivery, and the delivery happens even though it is clear that
> reading from the socket will fail. The program prints:
> 
> write 1
> read 1
> read -1
> kevent 1
> read -1
> kevent 0
> write 1
> kevent 0
> read 1
> 
> With the patch applied, the event gets delivered only if the socket
> has bytes pending.
> 
> write 1
> read 1
> read -1
> kevent 0
> read -1
> kevent 0
> write 1
> kevent 1
> read 1
> 
> So, is this EV_ONESHOT change reasonable, or should the implementation
> stick with the old way? FreeBSD appears to follow the old way. MacOS
> might perform differently, though I am not sure about that.
> 
> It is not essential to change EV_ONESHOT, however.
> 
> Feedback and tests are welcome.
> 
> Index: kern/uipc_socket.c
> ===
> RCS file: src/sys/kern/uipc_socket.c,v
> retrieving revision 1.261
> diff -u -p -r1.261 uipc_socket.c
> --- kern/uipc_socket.c13 May 2021 19:43:11 -  1.261
> +++ kern/uipc_socket.c18 May 2021 12:56:24 -
> @@ -70,15 +70,26 @@ void  sorflush(struct socket *);
>  
>  void filt_sordetach(struct knote *kn);
>  int  filt_soread(struct knote *kn, long hint);
> +int  filt_soreadmodify(struct kevent *kev, struct knote *kn);
> +int  filt_soreadprocess(struct knote *kn, struct kevent *kev);
> +int  filt_soread_common(struct knote *kn, struct socket *so);
>  void filt_sowdetach(struct knote *kn);
>  int  filt_sowrite(struct knote *kn, long hint);
> +int  filt_sowritemodify(struct kevent *kev, struct knote *kn);
> +int  filt_sowriteprocess(struct knote *kn, struct kevent *kev);
> +int  filt_sowrite_common(struct knote *kn, struct socket *so);
>  int  filt_solisten(struct knote *kn, long hint);
> +int  filt_solistenmodify(struct kevent *kev, struct knote *kn);
> +int  filt_solistenprocess(struct knote *kn, struct kevent *kev);
> +int  filt_solisten_common(struct 

Re: Use atomic op for UVM map refcount

2021-05-20 Thread Martin Pieuchot
On 19/05/21(Wed) 16:17, Mark Kettenis wrote:
> > Date: Tue, 18 May 2021 13:24:42 +0200
> > From: Martin Pieuchot 
> > 
> > On 18/05/21(Tue) 12:07, Mark Kettenis wrote:
> > > > Date: Tue, 18 May 2021 12:02:19 +0200
> > > > From: Martin Pieuchot 
> > > > 
> > > > This allows us to not rely on the KERNEL_LOCK() to check reference
> > > > counts.
> > > > 
> > > > Also reduces differences with NetBSD and shrink my upcoming `vmobjlock'
> > > > diff.
> > > > 
> > > > ok?
> > > 
> > > Shouldn't we make ref_count volatile in that case?
> > 
> > I don't know,  I couldn't find any evidence about where to use "volatile"
> > in the kernel.
> > 
> > My understanding is that using "volatile" tells the compiler to not
> > "cache" the value of such field in a register because it can change at
> > any time.  Is it so?
> 
> Right.  So if you want the access to be atomic, it needs to be
> "uncached" and therefore you need to use volatile.  Now the atomic
> APIs explicitly cast their pointer arguments to volatile, so if you
> exclusively through those APIs you don't strictly need the variable
> itself to be declared volatile.  But I think it still is a good idea
> to declare them as such.

Thanks for the explanation.  Do you suggest we use the "volatile"
keyword as a hint and/or to avoid surprises?  If we agree on this
I'll look at similar uses of atomic operations to unify them.

> > There's only a couple of 'volatile' usages in sys/sys.  These annotations
> > do not explicitly indicate which piece of code requires it.  Maybe it would
> > be clearer to use a cast or a macro where necessary.  This might help us
> > understand why and where "volatile" is needed.
> 
> There are the READ_ONCE() and WRITE_ONCE() macros.  I'm not a big fan
> of those (since they add clutter) but they do take care of dependency
> ordering issues that exist in the alpha memory model.  Must admit that
> I only vaguely understand that issue, but I think it involves ordered
> access to two atomic variables which doesn't seem to be the case.

These macros are used in places where declaring the field as "volatile"
could also work, no?  We can look at __mp_lock and SMR implementations.
So could we agree on one way to do things?

Visa, David, why did you pick READ_ONCE() in SMR and veb(4)?  Anything
we overlooked regarding the use of "volatile"?
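
To make the trade-off concrete, this is the kind of access pattern
being discussed (illustration only, not real kernel code; the function
and its loops are made up):

/*
 * Without "volatile" or READ_ONCE() the compiler may load `ref_count'
 * once and keep it in a register, so a concurrent atomic_dec_int_nv()
 * done by another CPU might never be observed by this loop.
 */
static void
example_ref_count_reads(struct vm_map *map)
{
        /* Risky: the load may be hoisted out of the loop. */
        while (map->ref_count > 1)
                continue;

        /* READ_ONCE() forces a fresh load on every iteration. */
        while (READ_ONCE(map->ref_count) > 1)
                continue;

        /*
         * The atomic_*() ops cast their argument to volatile
         * internally, so a plain field declaration is enough as long
         * as every access goes through them.
         */
        if (atomic_dec_int_nv(&map->ref_count) > 0)
                return;
        /* Last reference dropped: safe to tear the map down. */
}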



Re: move copyout() in DIOCGETSTATES outside of NET_LOCK() and state_lcok

2021-05-20 Thread Martin Pieuchot
On 20/05/21(Thu) 03:23, Alexandr Nedvedicky wrote:
> Hrvoje gave a try to experimental diff, which trades rw-locks in pf(4)
> for mutexes [1]. Hrvoje soon discovered machine panics, when doing 'pfctl -ss'
> The callstack looks as follows:
>
> [...]
> specific to experimental diff [1]. However this made me thinking, that
> it's not a good idea to do copyout() while holding NET_LOCK() and state_lock.

malloc(9) and copyout(9) are kind of ok while using the NET_LOCK() but
if a deadlock occurs while a global rwlock is held, debugging becomes
harder.

As long as the `state_lock' and PF_LOCK() are mutexes all allocations
and copyin/copyout(9) must be done without holding them.

> Diff below moves copyout() at line 1784 outside of protection of both locks.
> The approach I took is relatively straightforward:
> 
> let DIOCGETSTATES to allocate hold_states array, which will keep
> references to states.
> 
> grab locks and take references, keep those references in hold
> array.
> 
> drop locks, export states and do copyout, while walking
> array of references.
> 
> drop references, release hold_states array.
> 
> does it make sense? If we agree that this approach makes sense

In my opinion it does.  The other approach would be to (ab)use the
NET_LOCK() to serialize updates, like bluhm@'s diff does.  Both
approaches have pros and cons.

> I'll commit this diff and revisit other such places we currently
> have in pfioctl().
> 
> thanks and
> regards
> sashan
> 
> [1] https://marc.info/?l=openbsd-tech=162138181106887=2
> 
> 8<---8<---8<--8<
> diff --git a/sys/net/pf_ioctl.c b/sys/net/pf_ioctl.c
> index ae7bb008351..0d4ac97a92c 100644
> --- a/sys/net/pf_ioctl.c
> +++ b/sys/net/pf_ioctl.c
> @@ -1762,43 +1762,58 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int 
> flags, struct proc *p)
>   struct pf_state *state;
>   struct pfsync_state *p, *pstore;
>   u_int32_tnr = 0;
> + struct pf_state **hold_states;
> + u_int32_thold_len, i;
>  
>   if (ps->ps_len == 0) {
>   nr = pf_status.states;
>   ps->ps_len = sizeof(struct pfsync_state) * nr;
>   break;
> + } else {
> + hold_len = ps->ps_len / sizeof(struct pfsync_state);
> + hold_len = MIN(hold_len, pf_status.states);
>   }
>  
>   pstore = malloc(sizeof(*pstore), M_TEMP, M_WAITOK);
> + hold_states = mallocarray(hold_len + 1,
> + sizeof(struct pf_state *), M_TEMP, M_WAITOK | M_ZERO);
>  
>   p = ps->ps_states;
>  
> + i = 0;
>   NET_LOCK();
>   PF_STATE_ENTER_READ();
> - state = TAILQ_FIRST(_list);
> - while (state) {
> + TAILQ_FOREACH(state, _list, entry_list) {
> + hold_states[i++] = pf_state_ref(state);
> + if (i >= hold_len)
> + break;
> + }
> + PF_STATE_EXIT_READ();
> + NET_UNLOCK();
> +
> + i = 0;
> + while ((state = hold_states[i++]) != NULL) {
>   if (state->timeout != PFTM_UNLINKED) {
> - if ((nr+1) * sizeof(*p) > ps->ps_len)
> - break;
>   pf_state_export(pstore, state);
>   error = copyout(pstore, p, sizeof(*p));
> - if (error) {
> - free(pstore, M_TEMP, sizeof(*pstore));
> - PF_STATE_EXIT_READ();
> - NET_UNLOCK();
> - goto fail;
> - }
> + if (error)
> + break;
>   p++;
>   nr++;
>   }
> - state = TAILQ_NEXT(state, entry_list);
> + pf_state_unref(state);
>   }
> - PF_STATE_EXIT_READ();
> - NET_UNLOCK();
>  
> - ps->ps_len = sizeof(struct pfsync_state) * nr;
> + if (error) {
> + pf_state_unref(state);
> + while ((state = hold_states[i++]) != NULL)
> + pf_state_unref(state);
> + } else
> + ps->ps_len = sizeof(struct pfsync_state) * nr;
>  
>   free(pstore, M_TEMP, sizeof(*pstore));
> + free(hold_states, M_TEMP,
> + sizeof(struct pf_state *) * (hold_len + 1));
>   break;
>   }
>  
> 



Re: Use atomic op for UVM map refcount

2021-05-18 Thread Martin Pieuchot
On 18/05/21(Tue) 12:07, Mark Kettenis wrote:
> > Date: Tue, 18 May 2021 12:02:19 +0200
> > From: Martin Pieuchot 
> > 
> > This allows us to not rely on the KERNEL_LOCK() to check reference
> > counts.
> > 
> > Also reduces differences with NetBSD and shrink my upcoming `vmobjlock'
> > diff.
> > 
> > ok?
> 
> Shouldn't we make ref_count volatile in that case?

I don't know,  I couldn't find any evidence about where to use "volatile"
in the kernel.

My understanding is that using "volatile" tells the compiler to not
"cache" the value of such field in a register because it can change at
any time.  Is it so?

If that's correct, we should look at any piece of code reading such a field
multiple times without using atomic operations, right?

In this case `ref_count' is used once for sanity checks in UVM_MAP_REQ_WRITE()
and after calling atomic_dec_int_nv() in uvm_map_deallocate().  So, I don't
see "volatile" necessary here.  Did I miss anything?

There's only a couple of 'volatile' usages in sys/sys.  These annotations
do not explicitly indicate which piece of code requires it.  Maybe it would
be clearer to use a cast or a macro where necessary.  This might help us
understand why and where "volatile" is needed.

> > Index: uvm/uvm_map.c
> > ===
> > RCS file: /cvs/src/sys/uvm/uvm_map.c,v
> > retrieving revision 1.274
> > diff -u -p -r1.274 uvm_map.c
> > --- uvm/uvm_map.c   26 Mar 2021 13:40:05 -  1.274
> > +++ uvm/uvm_map.c   18 May 2021 09:36:55 -
> > @@ -491,12 +491,13 @@ uvm_mapent_addr_remove(struct vm_map *ma
> >  /*
> >   * uvm_map_reference: add reference to a map
> >   *
> > - * XXX check map reference counter lock
> > + * => map need not be locked
> >   */
> > -#define uvm_map_reference(_map)
> > \
> > -   do {\
> > -   map->ref_count++;   \
> > -   } while (0)
> > +void
> > +uvm_map_reference(struct vm_map *map)
> > +{
> > +   atomic_inc_int(>ref_count);
> > +}
> >  
> >  /*
> >   * Calculate the dused delta.
> > @@ -4292,7 +4293,7 @@ uvm_map_deallocate(vm_map_t map)
> > int c;
> > struct uvm_map_deadq dead;
> >  
> > -   c = --map->ref_count;
> > +   c = atomic_dec_int_nv(>ref_count);
> > if (c > 0) {
> > return;
> > }
> > Index: uvm/uvm_map.h
> > ===
> > RCS file: /cvs/src/sys/uvm/uvm_map.h,v
> > retrieving revision 1.69
> > diff -u -p -r1.69 uvm_map.h
> > --- uvm/uvm_map.h   12 Mar 2021 14:15:49 -  1.69
> > +++ uvm/uvm_map.h   18 May 2021 09:36:36 -
> > @@ -259,6 +259,7 @@ RBT_PROTOTYPE(uvm_map_addr, vm_map_entry
> >   * read_locks and write_locks are used in lock debugging code.
> >   *
> >   *  Locks used to protect struct members in this file:
> > + * a   atomic operations
> >   * I   immutable after creation or exec(2)
> >   * v   `vm_map_lock' (this map `lock' or `mtx')
> >   */
> > @@ -272,7 +273,7 @@ struct vm_map {
> > struct uvm_map_addr addr;   /* [v] Entry tree, by addr */
> >  
> > vsize_t size;   /* virtual size */
> > -   int ref_count;  /* Reference count */
> > +   int ref_count;  /* [a] Reference count */
> > int flags;  /* flags */
> > struct mutexflags_lock; /* flags lock */
> > unsigned inttimestamp;  /* Version number */
> > 
> > 



Use atomic op for UVM map refcount

2021-05-18 Thread Martin Pieuchot
This allows us to not rely on the KERNEL_LOCK() to check reference
counts.

Also reduces differences with NetBSD and shrinks my upcoming `vmobjlock'
diff.

ok?

Index: uvm/uvm_map.c
===
RCS file: /cvs/src/sys/uvm/uvm_map.c,v
retrieving revision 1.274
diff -u -p -r1.274 uvm_map.c
--- uvm/uvm_map.c   26 Mar 2021 13:40:05 -  1.274
+++ uvm/uvm_map.c   18 May 2021 09:36:55 -
@@ -491,12 +491,13 @@ uvm_mapent_addr_remove(struct vm_map *ma
 /*
  * uvm_map_reference: add reference to a map
  *
- * XXX check map reference counter lock
+ * => map need not be locked
  */
-#define uvm_map_reference(_map)
\
-   do {\
-   map->ref_count++;   \
-   } while (0)
+void
+uvm_map_reference(struct vm_map *map)
+{
+   atomic_inc_int(>ref_count);
+}
 
 /*
  * Calculate the dused delta.
@@ -4292,7 +4293,7 @@ uvm_map_deallocate(vm_map_t map)
int c;
struct uvm_map_deadq dead;
 
-   c = --map->ref_count;
+   c = atomic_dec_int_nv(>ref_count);
if (c > 0) {
return;
}
Index: uvm/uvm_map.h
===
RCS file: /cvs/src/sys/uvm/uvm_map.h,v
retrieving revision 1.69
diff -u -p -r1.69 uvm_map.h
--- uvm/uvm_map.h   12 Mar 2021 14:15:49 -  1.69
+++ uvm/uvm_map.h   18 May 2021 09:36:36 -
@@ -259,6 +259,7 @@ RBT_PROTOTYPE(uvm_map_addr, vm_map_entry
  * read_locks and write_locks are used in lock debugging code.
  *
  *  Locks used to protect struct members in this file:
+ * a   atomic operations
  * I   immutable after creation or exec(2)
  * v   `vm_map_lock' (this map `lock' or `mtx')
  */
@@ -272,7 +273,7 @@ struct vm_map {
struct uvm_map_addr addr;   /* [v] Entry tree, by addr */
 
vsize_t size;   /* virtual size */
-   int ref_count;  /* Reference count */
+   int ref_count;  /* [a] Reference count */
int flags;  /* flags */
struct mutexflags_lock; /* flags lock */
unsigned inttimestamp;  /* Version number */



Re: [External] : Re: parallel forwarding vs. bridges

2021-05-17 Thread Martin Pieuchot
On 17/05/21(Mon) 19:52, Alexandr Nedvedicky wrote:
> [...] 
> I don't mind to trade pf_lock and pf_state_lock for mutexes, however
> I see such step is kind of 'NO-OP'. We do have sufficient measure
> currently, which is: keep NET_LOCK() as is. May be I'm not seeing
> your idea/plan behind changing pf's rw-locks to mutexes. If you feel
> there is a benefit to go that way, then let's do it, but I'd like
> to understand where we will be going/what to expect.

I've no idea or plan.  I'm just pointing out that using rwlocks, for the
moment, adds extra work.  If it's easy to use mutexes then you might want
to start with that.  The whole network processing path assumes it runs
without sleeping.  I've no idea what can happen once this assumption is
broken.

I'm well aware that using a single big pf lock is not the best for
performance, but maybe it's easier to do baby steps.
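
To illustrate the trade-off -- a minimal sketch only, the lock names are
placeholders and this is not a proposal:

	/* mutex: spins, never sleeps, safe in the non-sleeping net path */
	struct mutex pf_mtx = MUTEX_INITIALIZER(IPL_SOFTNET);

	mtx_enter(&pf_mtx);
	/* ... state lookup/update ... */
	mtx_leave(&pf_mtx);

	/*
	 * rwlock: rw_enter() may sleep, which the forwarding path, as
	 * it is today, does not expect.
	 */
	struct rwlock pf_rwl = RWLOCK_INITIALIZER("pflk");

	rw_enter_write(&pf_rwl);
	/* ... */
	rw_exit_write(&pf_rwl);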

That said, I don't want to stop or discourage anyone.  If you're
confident enough that rwlocks are the way to go, then please, go
ahead.

Cheers,
Martin



Re: parallel forwarding vs. bridges

2021-05-17 Thread Martin Pieuchot
On 17/05/21(Mon) 16:24, Alexandr Nedvedicky wrote:
> Hrvoje,
> 
> managed to trigger diagnostic panics with diff [1] sent by bluhm@
> some time ago. The panic Hrvoje sees comes from ether_input() here:
> 
>  414 
>  415 /*
>  416  * Third phase: bridge processing.
>  417  *
>  418  * Give the packet to a bridge interface, ie, bridge(4),
>  419  * switch(4), or tpmr(4), if it is configured. A bridge
>  420  * may take the packet and forward it to another port, or it
>  421  * may return it here to ether_input() to support local
>  422  * delivery to this port.
>  423  */
>  424 
>  425 ac = (struct arpcom *)ifp;
>  426 
>  427 smr_read_enter();
>  428 eb = SMR_PTR_GET(>ac_brport);
>  429 if (eb != NULL) {
>  430 m = (*eb->eb_input)(ifp, m, dst, eb->eb_port);
>  431 if (m == NULL) {
>  432 smr_read_leave();
>  433 return;
>  434 }
>  435 }
>  436 smr_read_leave();
>  437 
> 
> in the current tree ether_input() is protected by the NET_LOCK(), which is
> grabbed by the caller as a writer. bluhm's diff changes the NET_LOCK() to a
> read lock, so ether_input() can run concurrently. Switching the NET_LOCK() to
> an r-lock has implications for the smr read section above. The thing is that
> the call to eb->eb_input() can sleep now. This is something that needs to be
> avoided within an smr section.

Is the new sleeping point introduced by the fact that the PF_LOCK() is an
rwlock?  Did you consider using a mutex, at least for the time being,
in order to not run into such issues?
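
For reference, the constraint we are discussing, as a sketch (same shape
as the ether_input() excerpt above; eb_input() stands for whatever the
bridge port handler ends up doing):

	smr_read_enter();
	eb = SMR_PTR_GET(&ac->ac_brport);
	if (eb != NULL) {
		/*
		 * Nothing in here may sleep: taking an rwlock that can
		 * block, e.g. a sleeping PF_LOCK(), would break the
		 * SMR read section.
		 */
		m = (*eb->eb_input)(ifp, m, dst, eb->eb_port);
	}
	smr_read_leave();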



uao_dropswap_range()

2021-05-17 Thread Martin Pieuchot
The diff below makes use of uao_dropswap_range() in uao_free() instead of
duplicating its logic.  This function has been imported from NetBSD along
with TMPFS.  I'd like to use it to reduce the difference with their tree
and shrink my upcoming `vmobjlock' diff.

ok?

Index: uvm/uvm_aobj.c
===
RCS file: /cvs/src/sys/uvm/uvm_aobj.c,v
retrieving revision 1.95
diff -u -p -r1.95 uvm_aobj.c
--- uvm/uvm_aobj.c  22 Apr 2021 11:54:32 -  1.95
+++ uvm/uvm_aobj.c  11 May 2021 11:26:15 -
@@ -351,58 +351,16 @@ uao_set_swslot(struct uvm_object *uobj, 
 static void
 uao_free(struct uvm_aobj *aobj)
 {
+   struct uvm_object *uobj = >u_obj;
 
-   if (UAO_USES_SWHASH(aobj)) {
-   int i, hashbuckets = aobj->u_swhashmask + 1;
+   uao_dropswap_range(uobj, 0, 0);
 
+   if (UAO_USES_SWHASH(aobj)) {
/*
-* free the swslots from each hash bucket,
-* then the hash bucket, and finally the hash table itself.
+* free the hash table itself.
 */
-   for (i = 0; i < hashbuckets; i++) {
-   struct uao_swhash_elt *elt, *next;
-
-   for (elt = LIST_FIRST(>u_swhash[i]);
-elt != NULL;
-elt = next) {
-   int j;
-
-   for (j = 0; j < UAO_SWHASH_CLUSTER_SIZE; j++) {
-   int slot = elt->slots[j];
-
-   if (slot == 0) {
-   continue;
-   }
-   uvm_swap_free(slot, 1);
-   /*
-* this page is no longer
-* only in swap.
-*/
-   atomic_dec_int();
-   }
-
-   next = LIST_NEXT(elt, list);
-   pool_put(_swhash_elt_pool, elt);
-   }
-   }
-
hashfree(aobj->u_swhash, UAO_SWHASH_BUCKETS(aobj->u_pages), 
M_UVMAOBJ);
} else {
-   int i;
-
-   /*
-* free the array
-*/
-   for (i = 0; i < aobj->u_pages; i++) {
-   int slot = aobj->u_swslots[i];
-
-   if (slot) {
-   uvm_swap_free(slot, 1);
-
-   /* this page is no longer only in swap. */
-   atomic_dec_int();
-   }
-   }
free(aobj->u_swslots, M_UVMAOBJ, aobj->u_pages * sizeof(int));
}
 
@@ -1487,9 +1445,6 @@ uao_pagein_page(struct uvm_aobj *aobj, i
 }
 
 /*
- * XXX pedro: Once we are comfortable enough with this function, we can adapt
- * uao_free() to use it.
- *
  * uao_dropswap_range: drop swapslots in the range.
  *
  * => aobj must be locked and is returned locked.



Re: running network stack forwarding in parallel

2021-05-17 Thread Martin Pieuchot
On 16/05/21(Sun) 15:56, Vitaliy Makkoveev wrote:
> 
> 
> > On 14 May 2021, at 14:43, Martin Pieuchot  wrote:
> > 
> > On 13/05/21(Thu) 14:50, Vitaliy Makkoveev wrote:
> >> On Thu, May 13, 2021 at 01:15:05PM +0200, Hrvoje Popovski wrote:
> >>> On 13.5.2021. 1:25, Vitaliy Makkoveev wrote:
> >>>> It seems this lock order issue is not parallel diff specific.
> >>> 
> >>> 
> >>> 
> >>> Yes,  you are right ... it seemed familiar but i couldn't reproduce it
> >>> on lapc trunk or without this diff so i thought that parallel diff is
> >>> one to blame ..
> >>> 
> >>> 
> >>> sorry for noise ..
> >>> 
> >> 
> >> Timeout thread and interface destroy thread are both serialized by
> >> kernel lock so it's hard to catch this issue. So your report is
> >> useful :)
> > 
> > The use of the NET_LOCK() in *clone_destroy() is problematic.  tpmr(4)
> > has a similar problem as reported by Hrvoje in a different thread.  I
> > don't know what it is serializing, hopefully David can tell us more.
> > 
> 
> It serializes detach hook and clone_detach. Detach hooks are executed
> with netlock held. Unfortunately this problem is much complicated,
> and we can’t just introduce new lock to solve it because this will
> introduce lock order issue.

We're talking about different uses of the NET_LOCK().  if_detach() and
if_deactivate() internally grab the NET_LOCK() for the reason you
mentioned.

I'm asking what in aggr_down() and aggr_p_dtor(), or respectively
tpmr_down() and tpmr_p_dtor(), requires the NET_LOCK() and whether this
could be done differently.



Re: running network stack forwarding in parallel

2021-05-14 Thread Martin Pieuchot
On 13/05/21(Thu) 14:50, Vitaliy Makkoveev wrote:
> On Thu, May 13, 2021 at 01:15:05PM +0200, Hrvoje Popovski wrote:
> > On 13.5.2021. 1:25, Vitaliy Makkoveev wrote:
> > > It seems this lock order issue is not parallel diff specific.
> > 
> > 
> > 
> > Yes,  you are right ... it seemed familiar but i couldn't reproduce it
> > on lapc trunk or without this diff so i thought that parallel diff is
> > one to blame ..
> > 
> > 
> > sorry for noise ..
> >
> 
> Timeout thread and interface destroy thread are both serialized by
> kernel lock so it's hard to catch this issue. So your report is
> useful :)

The use of the NET_LOCK() in *clone_destroy() is problematic.  tpmr(4)
has a similar problem as reported by Hrvoje in a different thread.  I
don't know what it is serializing, hopefully David can tell us more.



Re: ld.so: NULL dereference on corrupt library

2021-05-05 Thread Martin Pieuchot
On 04/05/21(Tue) 21:41, Klemens Nanni wrote:
> On Thu, Apr 15, 2021 at 03:05:45PM +0200, Mark Kettenis wrote:
> > > > [...] 
> > > > Hence, only access buckets in _dl_find_symbol_obj() if there are any;
> > > > this fixes the crash and in fact allows me to play the song even when
> > > > preloading the currupted library, i.e.
> > > > 
> > > > $ LD_PRELOAD=./libvorbisenc.so.3.1 ogg123 song62.ogg
> > > > 
> > > > now also works with patched ld.so installed -- I'd expected ld.so,
> > > > libvorbis or ogg123 to crash on some other place...
> > > > 
> > > > I'm not sure what to make of this, I also don't know enough about ld.so
> > > > to judge this diff in context, it does however fix an obvious error.
> > > > FWIW, regress/libexec/ld.so runs fine with this diff.
> > > 
> > > I'm not sure if silently ignoring the corruption is the best way to go.
> > 
> > It certainly isn't.  If corruption is detected, the process should
> > terminate immediately.
> 
> I totally agree, mention of regress passing shouldn't imply that I want
> that diff in, but rather show that it didn't have unexpected drawbacks.
> 
> > > Do you know why `nbuckets' and `buckets_elf' aren't initialized for this
> > > object?  Do you know if _dl_finalize_object() has been call for it?
> 
> Yes, _dl_finalize_object() is always called for it.
> 
> I compared my corrupted shared library with an intact copy from ports
> and it showed that the corrupted one was simply zeroed out at some point
> (not truncated) until the end, e.g. readelf(1)'s `-h' or `-l' report
> "Error: no .dynamic section in the dynamic segment".
> 
> So this isn't a case of some badly linked library or one that has a few
> bits flipped, it's simply a partial one... seems like bad luck?
> 
> > > > Is this a code path that can happen with intact objects?
> > > > Given that the file is obviously corrupted but programs using it still
> > > > (partially) work, should a warning be printed in this case?
> > > 
> > > Indicating that the library is corrupted might indeed be better than
> > > crashing.  However it isn't clear to me where such check should happen.
> 
> I've done just that now as there's nothing else to do.  It is an obscure
> case that I cannot explain without corruption, so very unlikely to
> happen, but now it did...
> 
> > > > Index: resolve.c
> > > > ===
> > > > RCS file: /cvs/src/libexec/ld.so/resolve.c,v
> > > > retrieving revision 1.94
> > > > diff -u -p -r1.94 resolve.c
> > > > --- resolve.c   4 Oct 2019 17:42:16 -   1.94
> > > > +++ resolve.c   14 Apr 2021 15:56:14 -
> > > > @@ -608,7 +608,7 @@ _dl_find_symbol_obj(elf_object_t *obj, s
> > > > return r > 0;
> > > > }
> > > > } while ((*hashval++ & 1U) == 0);
> > > > -   } else {
> > > > +   } else if (obj->nbuckets > 0) {
> > > > Elf_Word si;
> > > >  
> > > > for (si = obj->buckets_elf[sl->sl_elf_hash % 
> > > > obj->nbuckets];
> > > > 
> > > 
> > > 
> 
> readelf(1) detects this corruption as missing (or empty/zeroed out, as
> code reading showed); we could probably do that as well, but it'd be
> less trivial and a generalisation of my issue.
> 
> So the new diff simply bails out if there is no symbol hash table, which
> is equally relevant for both ELF and GNU hashes:
> 
>   $ LD_PRELOAD=./bad.so ./ogg123 ~/song62.ogg
>   ld.so: ogg123.test: ./bad.so: no buckets
>   killed
>   $
> 
> Feedback? Objections? OK?

I still don't understand what the corruption is, and the check below
doesn't explain that either.  So if I'm developing a library and I see
such a message, it doesn't give me any more info than the previous core
dump.

What is corrupted?  The header?  Is a section missing?  Is an offset
incorrect?  Is there a mismatch between DT_GNU_HASH and DT_HASH?
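
For instance, a check that at least names the missing table would tell
the developer a bit more.  A hedged sketch on top of the diff below,
reusing only fields that already appear in it:

	if (obj->nbuckets == 0) {
		if (obj->status & STAT_GNU_HASH)
			_dl_die("%s: corrupt or empty DT_GNU_HASH",
			    obj->load_name);
		else
			_dl_die("%s: missing or corrupt DT_HASH",
			    obj->load_name);
	}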

> Index: resolve.c
> ===
> RCS file: /cvs/src/libexec/ld.so/resolve.c,v
> retrieving revision 1.94
> diff -u -p -r1.94 resolve.c
> --- resolve.c 4 Oct 2019 17:42:16 -   1.94
> +++ resolve.c 29 Apr 2021 22:07:46 -
> @@ -573,6 +573,9 @@ _dl_find_symbol_obj(elf_object_t *obj, s
>  {
>   const Elf_Sym   *symt = obj->dyn.symtab;
>  
> + if (obj->nbuckets == 0)
> + _dl_die("%s: no buckets", obj->load_name);
> +
>   if (obj->status & STAT_GNU_HASH) {
>   uint32_t hash = sl->sl_gnu_hash;
>   Elf_Addr bloom_word;



Re: timeout_del_barrier(9): remove kernel lock

2021-05-04 Thread Martin Pieuchot
On 04/05/21(Tue) 01:10, Scott Cheloha wrote:
> [...] 
> I want to run softclock() without the kernel lock.  The way to go, I
> think, is to first push the kernel lock down into timeout_run(), and
> then to remove the kernel lock from each timeout, one by one.

Grabbing and releasing the KERNEL_LOCK() on a per-timeout basis creates
more latency than running all timeouts in a batch after having grabbed
the KERNEL_LOCK().  I doubt this is the best way forward.
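
Roughly, the two shapes being compared (hypothetical fragment only;
next_expired() is just a stand-in for the dispatch loop):

	/* per-timeout locking: lock traffic on every callback */
	while ((to = next_expired()) != NULL) {
		KERNEL_LOCK();
		timeout_run(to);
		KERNEL_UNLOCK();
	}

	/* batched: grab the lock once, run everything that expired */
	KERNEL_LOCK();
	while ((to = next_expired()) != NULL)
		timeout_run(to);
	KERNEL_UNLOCK();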

> Before we can push the kernel lock down into timeout_run() we need to
> remove the kernel lock from timeout_del_barrier(9).

Seems worth it on its own.

> The kernel lock is used in timeout_del_barrier(9) to determine whether
> the given timeout has stopped running.  Because the softclock() runs
> with the kernel lock we currently assume that once the calling thread
> has taken the kernel lock any onging softclock() must have returned
> and relinquished the lock, so the timeout in question has returned.

So you want to stop using the KERNEL_LOCK() to do the serialization?  
 
> The simplest replacement I can think of is a volatile pointer to the
> running timeout that we set before leaving the timeout_mutex and clear
> after reentering the same during timeout_run().

Sounds like a condition variable protected by this mutex.  Interesting
that cond_wait(9) doesn't work with a mutex. 

> So in the non-TIMEOUT_PROC case the timeout_del_barrier(9) caller just
> spins until the timeout function returns and the timeout_running
> pointer is changed.  Not every caller can sleep during
> timeout_del_barrier(9).  I think spinning is the simplest thing that
> will definitely work here.

This keeps the current semantics, indeed.
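
For clarity, my reading of the mechanism you describe -- a rough sketch
only, not your diff, and simplified (flags handling omitted):

	volatile struct timeout *timeout_running;

	void
	timeout_run(struct timeout *to)
	{
		void (*fn)(void *) = to->to_func;
		void *arg = to->to_arg;

		MUTEX_ASSERT_LOCKED(&timeout_mutex);
		timeout_running = to;
		mtx_leave(&timeout_mutex);
		fn(arg);
		mtx_enter(&timeout_mutex);
		timeout_running = NULL;
	}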

> -void
> -timeout_barrier(struct timeout *to)
> +int
> +timeout_del_barrier(struct timeout *to)
>  {
> + struct timeout barrier;
> + struct cond c = COND_INITIALIZER();
>   int needsproc = ISSET(to->to_flags, TIMEOUT_PROC);
>  
>   timeout_sync_order(needsproc);
>  
> - if (!needsproc) {
> - KERNEL_LOCK();
> - splx(splsoftclock());
> - KERNEL_UNLOCK();
> - } else {
> - struct cond c = COND_INITIALIZER();
> - struct timeout barrier;
> + mtx_enter(_mutex);
> +
> + if (timeout_del_locked(to)) {
> + mtx_leave(_mutex);
> + return 1;
> + }
>  
> + if (needsproc) {
>   timeout_set_proc(, timeout_proc_barrier, );
>   barrier.to_process = curproc->p_p;
> -
> - mtx_enter(_mutex);
>   SET(barrier.to_flags, TIMEOUT_ONQUEUE);
>   CIRCQ_INSERT_TAIL(_proc, _list);
>   mtx_leave(_mutex);
> -
>   wakeup_one(_proc);
> -
>   cond_wait(, "tmobar");
> + } else {
> + mtx_leave(_mutex);
> + /* XXX Is this in the right spot? */
> + splx(splsoftclock());
> + while (timeout_running == to)
> + CPU_BUSY_CYCLE();

Won't splx() execute the soft interrupt if there's any pending?
Shouldn't the barrier be before?  Could you add `spc->spc_spinning++'
around the spinning loop?  What happens if two threads call
timeout_del_barrier(9) with the same argument at the same time?  Is
that possible and/or supported?
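
For the `spc_spinning' suggestion, something along these lines is what I
mean (sketch only, assuming the usual per-CPU scheduler state):

	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;

	spc->spc_spinning++;
	while (timeout_running == to)
		CPU_BUSY_CYCLE();
	spc->spc_spinning--;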



Re: Stop/unstop process & xsig

2021-04-24 Thread Martin Pieuchot
On 24/04/21(Sat) 12:49, Mark Kettenis wrote:
> > Date: Sat, 24 Apr 2021 12:23:17 +0200
> > From: Martin Pieuchot 
> > 
> > On 20/03/21(Sat) 13:25, Martin Pieuchot wrote:
> > > Diff below refactors routines to stop/unstop processes and save the signal
> > > number which will/can be transmitted it in wait4(2).  It does the 
> > > following:
> > > 
> > > - Move the "hack" involving P_SINTR to avoid grabbing the SCHED_LOCK()
> > >   recursively inside proc_stop().
> > > 
> > > - Introduce proc_unstop(), the symmetric routine to proc_stop().
> > > 
> > > - Manipulate `ps_xsig' only in proc_stop/unstop().
> > > 
> > > Ok?
> > 
> > Anyone?
> 
> This is not ok...
> 
> > 
> > > Index: kern/kern_sig.c
> > > ===
> > > RCS file: /cvs/src/sys/kern/kern_sig.c,v
> > > retrieving revision 1.278
> > > diff -u -p -r1.278 kern_sig.c
> > > --- kern/kern_sig.c   12 Mar 2021 10:13:28 -  1.278
> > > +++ kern/kern_sig.c   20 Mar 2021 12:16:51 -
> > > @@ -124,7 +124,7 @@ const int sigprop[NSIG + 1] = {
> > >  
> > >  void setsigvec(struct proc *, int, struct sigaction *);
> > >  
> > > -void proc_stop(struct proc *p, int);
> > > +int proc_stop(struct proc *p, int, int);
> > >  void proc_stop_sweep(void *);
> > >  void *proc_stop_si;
> > >  
> > > @@ -1061,8 +1061,7 @@ ptsignal(struct proc *p, int signum, enu
> > >   if (pr->ps_flags & PS_PPWAIT)
> > >   goto out;
> > >   atomic_clearbits_int(siglist, mask);
> > > - pr->ps_xsig = signum;
> > > - proc_stop(p, 0);
> > > + proc_stop(p, signum, 0);
> > >   goto out;
> > >   }
> > >   /*
> > > @@ -1170,17 +1169,12 @@ out:
> > >   *
> > >   *   while (signum = cursig(curproc))
> > >   *   postsig(signum);
> > > - *
> > > - * Assumes that if the P_SINTR flag is set, we're holding both the
> > > - * kernel and scheduler locks.
> > >   */
> > >  int
> > >  cursig(struct proc *p)
> > >  {
> > >   struct process *pr = p->p_p;
> > >   int sigpending, signum, mask, prop;
> > > - int dolock = (p->p_flag & P_SINTR) == 0;
> > > - int s;
> > >  
> > >   KERNEL_ASSERT_LOCKED();
> > >  
> > > @@ -1217,31 +1211,22 @@ cursig(struct proc *p)
> > >*/
> > >   if (((pr->ps_flags & (PS_TRACED | PS_PPWAIT)) == PS_TRACED) &&
> > >   signum != SIGKILL) {
> > > - pr->ps_xsig = signum;
> > >  
> > >   single_thread_set(p, SINGLE_SUSPEND, 0);
> > > -
> > > - if (dolock)
> > > - SCHED_LOCK(s);
> > > - proc_stop(p, 1);
> > > - if (dolock)
> > > - SCHED_UNLOCK(s);
> > > -
> > > + signum = proc_stop(p, signum, 1);
> 
> At this point the process will sleep since proc_stop() calls
> mi_switch().  At that point the debugger may clear or change the value
> of ps_xsig.

Indeed, for that exact reason `ps_xsig' is read at the end of
proc_stop() once the thread has returned from mi_switch().  Are you
saying this is not enough?

It is similar to moving the assignment below to just before the test,
no?  My understanding is that this is safe because all this code is
currently executed under a lock common to all the threads.
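
In other words, the shape I'm describing is roughly this (simplified
sketch; the stop/continue and scheduler-locking details are omitted):

	int
	proc_stop(struct proc *p, int signum, int sw)
	{
		struct process *pr = p->p_p;

		pr->ps_xsig = signum;
		/* ... mark the process stopped, notify the parent ... */
		if (sw)
			mi_switch();	/* sleep until continued */

		/* re-read: the debugger may have changed it while stopped */
		return pr->ps_xsig;
	}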

> 
> > >   single_thread_clear(p, 0);
> > >  
> > >   /*
> > >* If we are no longer being traced, or the parent
> > >* didn't give us a signal, look for more signals.
> > >*/
> > > - if ((pr->ps_flags & PS_TRACED) == 0 ||
> > > - pr->ps_xsig == 0)
> > > + if ((pr->ps_flags & PS_TRACED) == 0 || signum == 0)
> > >   continue;
> 
> So this change is wrong.
> 
> > >  
> > >   /*
> > >* If the new signal is being masked, look for other
> > >* signals.
> > >*/
> > >

Re: uvm_km_kmemalloc{,_pla}()

2021-04-24 Thread Martin Pieuchot
On 24/04/21(Sat) 12:25, Mark Kettenis wrote:
> > Date: Sat, 24 Apr 2021 12:02:22 +0200
> > From: Martin Pieuchot 
> > 
> > Diff below merge the two allocators into one and remove unused
> > alignment/boundary arguments.  This is a small cleanup that helps
> > me keep track of the remaining allocators.
> > 
> > ok?
> 
> Not sure.  Is uvm_km_kmemalloc() going to be replaced in the future?

I don't know yet.

> At the very least, you need to adjust the man page.

Sure.

> > Index: arch/arm/arm/pmap7.c
> > ===
> > RCS file: /cvs/src/sys/arch/arm/arm/pmap7.c,v
> > retrieving revision 1.61
> > diff -u -p -r1.61 pmap7.c
> > --- arch/arm/arm/pmap7.c25 Mar 2021 04:12:00 -  1.61
> > +++ arch/arm/arm/pmap7.c24 Apr 2021 09:53:11 -
> > @@ -2435,8 +2435,9 @@ pmap_bootstrap_pv_page_alloc(struct pool
> > return (rv);
> > }
> >  
> > -   new_page = uvm_km_kmemalloc(kernel_map, NULL, PAGE_SIZE,
> > -   (flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT);
> > +   new_page = uvm_km_kmemalloc(kernel_map, NULL, PAGE_SIZE, 0,
> > +   (flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT, no_constraint.ucr_low,
> > +   no_constraint.ucr_high, 0);
> >  
> > last_bootstrap_page = new_page;
> > return ((void *)new_page);
> > Index: dev/pci/envy.c
> > ===
> > RCS file: /cvs/src/sys/dev/pci/envy.c,v
> > retrieving revision 1.81
> > diff -u -p -r1.81 envy.c
> > --- dev/pci/envy.c  5 Jan 2020 01:07:58 -   1.81
> > +++ dev/pci/envy.c  24 Apr 2021 09:53:12 -
> > @@ -1834,9 +1834,9 @@ envy_allocm(void *self, int dir, size_t 
> >  #define ENVY_ALIGN 4
> >  #define ENVY_MAXADDR   ((1 << 28) - 1)
> >  
> > -   buf->addr = (caddr_t)uvm_km_kmemalloc_pla(kernel_map,
> > +   buf->addr = (caddr_t)uvm_km_kmemalloc(kernel_map,
> > uvm.kernel_object, buf->size, 0, UVM_KMF_NOWAIT, 0,
> > -   (paddr_t)ENVY_MAXADDR, 0, 0, 1);
> > +   (paddr_t)ENVY_MAXADDR, 1);
> > if (buf->addr == NULL) {
> > DPRINTF("%s: unable to alloc dma segment\n", DEVNAME(sc));
> > goto err_ret;
> > Index: dev/pci/if_bce.c
> > ===
> > RCS file: /cvs/src/sys/dev/pci/if_bce.c,v
> > retrieving revision 1.53
> > diff -u -p -r1.53 if_bce.c
> > --- dev/pci/if_bce.c10 Jul 2020 13:22:20 -  1.53
> > +++ dev/pci/if_bce.c24 Apr 2021 09:53:12 -
> > @@ -253,9 +253,9 @@ bce_attach(struct device *parent, struct
> > bce_reset(sc);
> >  
> > /* Create the data DMA region and maps. */
> > -   if ((sc->bce_data = (caddr_t)uvm_km_kmemalloc_pla(kernel_map,
> > +   if ((sc->bce_data = (caddr_t)uvm_km_kmemalloc(kernel_map,
> > uvm.kernel_object, (BCE_NTXDESC + BCE_NRXDESC) * MCLBYTES, 0,
> > -   UVM_KMF_NOWAIT, 0, (paddr_t)(0x4000 - 1), 0, 0, 1)) == NULL) {
> > +   UVM_KMF_NOWAIT, 0, (paddr_t)(0x4000 - 1), 1)) == NULL) {
> > printf(": unable to alloc space for ring");
> > return;
> > }
> > Index: kern/kern_malloc.c
> > ===
> > RCS file: /cvs/src/sys/kern/kern_malloc.c,v
> > retrieving revision 1.145
> > diff -u -p -r1.145 kern_malloc.c
> > --- kern/kern_malloc.c  21 Apr 2021 10:02:05 -  1.145
> > +++ kern/kern_malloc.c  24 Apr 2021 09:53:12 -
> > @@ -228,12 +228,12 @@ malloc(size_t size, int type, int flags)
> > mtx_leave(_mtx);
> > npg = atop(round_page(allocsize));
> > s = splvm();
> > -   va = (caddr_t)uvm_km_kmemalloc_pla(kmem_map, NULL,
> > +   va = (caddr_t)uvm_km_kmemalloc(kmem_map, NULL,
> > (vsize_t)ptoa(npg), 0,
> > ((flags & M_NOWAIT) ? UVM_KMF_NOWAIT : 0) |
> > ((flags & M_CANFAIL) ? UVM_KMF_CANFAIL : 0),
> > no_constraint.ucr_low, no_constraint.ucr_high,
> > -   0, 0, 0);
> > +   0);
> > splx(s);
> > if (va == NULL) {
> > /*
> > Index: uvm/uvm_extern.h
> > ===
> > RCS file: /cvs/src/sys/uvm/uvm_extern.h,v
> > retrieving revision 1.157
> > diff -u -p -r1.157 uvm_extern.h
> > 

Re: Stop/unstop process & xsig

2021-04-24 Thread Martin Pieuchot
On 20/03/21(Sat) 13:25, Martin Pieuchot wrote:
> Diff below refactors routines to stop/unstop processes and save the signal
> number which will/can be transmitted it in wait4(2).  It does the following:
> 
> - Move the "hack" involving P_SINTR to avoid grabbing the SCHED_LOCK()
>   recursively inside proc_stop().
> 
> - Introduce proc_unstop(), the symmetric routine to proc_stop().
> 
> - Manipulate `ps_xsig' only in proc_stop/unstop().
> 
> Ok?

Anyone?

> Index: kern/kern_sig.c
> ===
> RCS file: /cvs/src/sys/kern/kern_sig.c,v
> retrieving revision 1.278
> diff -u -p -r1.278 kern_sig.c
> --- kern/kern_sig.c   12 Mar 2021 10:13:28 -  1.278
> +++ kern/kern_sig.c   20 Mar 2021 12:16:51 -
> @@ -124,7 +124,7 @@ const int sigprop[NSIG + 1] = {
>  
>  void setsigvec(struct proc *, int, struct sigaction *);
>  
> -void proc_stop(struct proc *p, int);
> +int proc_stop(struct proc *p, int, int);
>  void proc_stop_sweep(void *);
>  void *proc_stop_si;
>  
> @@ -1061,8 +1061,7 @@ ptsignal(struct proc *p, int signum, enu
>   if (pr->ps_flags & PS_PPWAIT)
>   goto out;
>   atomic_clearbits_int(siglist, mask);
> - pr->ps_xsig = signum;
> - proc_stop(p, 0);
> + proc_stop(p, signum, 0);
>   goto out;
>   }
>   /*
> @@ -1170,17 +1169,12 @@ out:
>   *
>   *   while (signum = cursig(curproc))
>   *   postsig(signum);
> - *
> - * Assumes that if the P_SINTR flag is set, we're holding both the
> - * kernel and scheduler locks.
>   */
>  int
>  cursig(struct proc *p)
>  {
>   struct process *pr = p->p_p;
>   int sigpending, signum, mask, prop;
> - int dolock = (p->p_flag & P_SINTR) == 0;
> - int s;
>  
>   KERNEL_ASSERT_LOCKED();
>  
> @@ -1217,31 +1211,22 @@ cursig(struct proc *p)
>*/
>   if (((pr->ps_flags & (PS_TRACED | PS_PPWAIT)) == PS_TRACED) &&
>   signum != SIGKILL) {
> - pr->ps_xsig = signum;
>  
>   single_thread_set(p, SINGLE_SUSPEND, 0);
> -
> - if (dolock)
> - SCHED_LOCK(s);
> - proc_stop(p, 1);
> - if (dolock)
> - SCHED_UNLOCK(s);
> -
> + signum = proc_stop(p, signum, 1);
>   single_thread_clear(p, 0);
>  
>   /*
>* If we are no longer being traced, or the parent
>* didn't give us a signal, look for more signals.
>*/
> - if ((pr->ps_flags & PS_TRACED) == 0 ||
> - pr->ps_xsig == 0)
> + if ((pr->ps_flags & PS_TRACED) == 0 || signum == 0)
>   continue;
>  
>   /*
>* If the new signal is being masked, look for other
>* signals.
>*/
> - signum = pr->ps_xsig;
>   mask = sigmask(signum);
>   if ((p->p_sigmask & mask) != 0)
>   continue;
> @@ -1286,12 +1271,7 @@ cursig(struct proc *p)
>   (pr->ps_pgrp->pg_jobc == 0 &&
>   prop & SA_TTYSTOP))
>   break;  /* == ignore */
> - pr->ps_xsig = signum;
> - if (dolock)
> - SCHED_LOCK(s);
> - proc_stop(p, 1);
> - if (dolock)
> - SCHED_UNLOCK(s);
> + proc_stop(p, signum, 1);
>   break;
>   } else if (prop & SA_IGNORE) {
>   /*
> @@ -1331,15 +1311,21 @@ keep:
>   * Put the argument process into the stopped state and notify the parent
>   * via wakeup.  Signals are handled elsewhere.  The process must not be
>   * on the run queue.
> + *
> + * Assumes that if the P_SINTR flag is set, we're holding the scheduler
> + * lock.
>   */
> -void
> -proc_stop(struct proc *p, int sw)
> +int
> +proc_stop(struct proc *p, int signum, int sw)
>  {
>   struct process *pr = p->p_p;
> + int 

uvm_km_kmemalloc{,_pla}()

2021-04-24 Thread Martin Pieuchot
The diff below merges the two allocators into one and removes the unused
alignment/boundary arguments.  This is a small cleanup that helps me keep
track of the remaining allocators.

ok?

Index: arch/arm/arm/pmap7.c
===
RCS file: /cvs/src/sys/arch/arm/arm/pmap7.c,v
retrieving revision 1.61
diff -u -p -r1.61 pmap7.c
--- arch/arm/arm/pmap7.c25 Mar 2021 04:12:00 -  1.61
+++ arch/arm/arm/pmap7.c24 Apr 2021 09:53:11 -
@@ -2435,8 +2435,9 @@ pmap_bootstrap_pv_page_alloc(struct pool
return (rv);
}
 
-   new_page = uvm_km_kmemalloc(kernel_map, NULL, PAGE_SIZE,
-   (flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT);
+   new_page = uvm_km_kmemalloc(kernel_map, NULL, PAGE_SIZE, 0,
+   (flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT, no_constraint.ucr_low,
+   no_constraint.ucr_high, 0);
 
last_bootstrap_page = new_page;
return ((void *)new_page);
Index: dev/pci/envy.c
===
RCS file: /cvs/src/sys/dev/pci/envy.c,v
retrieving revision 1.81
diff -u -p -r1.81 envy.c
--- dev/pci/envy.c  5 Jan 2020 01:07:58 -   1.81
+++ dev/pci/envy.c  24 Apr 2021 09:53:12 -
@@ -1834,9 +1834,9 @@ envy_allocm(void *self, int dir, size_t 
 #define ENVY_ALIGN 4
 #define ENVY_MAXADDR   ((1 << 28) - 1)
 
-   buf->addr = (caddr_t)uvm_km_kmemalloc_pla(kernel_map,
+   buf->addr = (caddr_t)uvm_km_kmemalloc(kernel_map,
uvm.kernel_object, buf->size, 0, UVM_KMF_NOWAIT, 0,
-   (paddr_t)ENVY_MAXADDR, 0, 0, 1);
+   (paddr_t)ENVY_MAXADDR, 1);
if (buf->addr == NULL) {
DPRINTF("%s: unable to alloc dma segment\n", DEVNAME(sc));
goto err_ret;
Index: dev/pci/if_bce.c
===
RCS file: /cvs/src/sys/dev/pci/if_bce.c,v
retrieving revision 1.53
diff -u -p -r1.53 if_bce.c
--- dev/pci/if_bce.c10 Jul 2020 13:22:20 -  1.53
+++ dev/pci/if_bce.c24 Apr 2021 09:53:12 -
@@ -253,9 +253,9 @@ bce_attach(struct device *parent, struct
bce_reset(sc);
 
/* Create the data DMA region and maps. */
-   if ((sc->bce_data = (caddr_t)uvm_km_kmemalloc_pla(kernel_map,
+   if ((sc->bce_data = (caddr_t)uvm_km_kmemalloc(kernel_map,
uvm.kernel_object, (BCE_NTXDESC + BCE_NRXDESC) * MCLBYTES, 0,
-   UVM_KMF_NOWAIT, 0, (paddr_t)(0x4000 - 1), 0, 0, 1)) == NULL) {
+   UVM_KMF_NOWAIT, 0, (paddr_t)(0x4000 - 1), 1)) == NULL) {
printf(": unable to alloc space for ring");
return;
}
Index: kern/kern_malloc.c
===
RCS file: /cvs/src/sys/kern/kern_malloc.c,v
retrieving revision 1.145
diff -u -p -r1.145 kern_malloc.c
--- kern/kern_malloc.c  21 Apr 2021 10:02:05 -  1.145
+++ kern/kern_malloc.c  24 Apr 2021 09:53:12 -
@@ -228,12 +228,12 @@ malloc(size_t size, int type, int flags)
mtx_leave(_mtx);
npg = atop(round_page(allocsize));
s = splvm();
-   va = (caddr_t)uvm_km_kmemalloc_pla(kmem_map, NULL,
+   va = (caddr_t)uvm_km_kmemalloc(kmem_map, NULL,
(vsize_t)ptoa(npg), 0,
((flags & M_NOWAIT) ? UVM_KMF_NOWAIT : 0) |
((flags & M_CANFAIL) ? UVM_KMF_CANFAIL : 0),
no_constraint.ucr_low, no_constraint.ucr_high,
-   0, 0, 0);
+   0);
splx(s);
if (va == NULL) {
/*
Index: uvm/uvm_extern.h
===
RCS file: /cvs/src/sys/uvm/uvm_extern.h,v
retrieving revision 1.157
diff -u -p -r1.157 uvm_extern.h
--- uvm/uvm_extern.h12 Mar 2021 14:15:49 -  1.157
+++ uvm/uvm_extern.h24 Apr 2021 09:53:12 -
@@ -297,11 +297,9 @@ intuvm_io(vm_map_t, struct uio *, 
int
 vaddr_tuvm_km_alloc1(vm_map_t, vsize_t, vsize_t, 
boolean_t);
 void   uvm_km_free(vm_map_t, vaddr_t, vsize_t);
 void   uvm_km_free_wakeup(vm_map_t, vaddr_t, vsize_t);
-vaddr_tuvm_km_kmemalloc_pla(struct vm_map *,
+vaddr_tuvm_km_kmemalloc(struct vm_map *,
struct uvm_object *, vsize_t, vsize_t, int,
-   paddr_t, paddr_t, paddr_t, paddr_t, int);
-#define uvm_km_kmemalloc(map, obj, sz, flags)  \
-   uvm_km_kmemalloc_pla(map, obj, sz, 0, flags, 0, (paddr_t)-1, 0, 0, 0)
+   paddr_t, paddr_t, int);
 vaddr_tuvm_km_valloc(vm_map_t, vsize_t);
 vaddr_tuvm_km_valloc_try(vm_map_t, vsize_t);
 vaddr_tuvm_km_valloc_wait(vm_map_t, 

km_alloc(9) for i386 pmap

2021-04-23 Thread Martin Pieuchot
The diff below converts the last uses of uvm_km_alloc(9) and
uvm_km_zalloc(9) to km_alloc(9).

One of the allocations below uses `kp_pageable' instead of `kp_zero'
because the mapping for `pm_pdir_intel' is lost when PAE is enabled
and needs to be re-established when a fault happens.  This is consistent
with what currently happens with uvm_km_zalloc().  Thanks to hshoexer@
for the analysis.

Fixing this is left as an exercise for the reader.  My current concern
is getting rid of the old allocators.

ok?

Index: arch/i386/i386/pmap.c
===
RCS file: /cvs/src/sys/arch/i386/i386/pmap.c,v
retrieving revision 1.211
diff -u -p -r1.211 pmap.c
--- arch/i386/i386/pmap.c   11 Mar 2021 11:16:57 -  1.211
+++ arch/i386/i386/pmap.c   23 Apr 2021 17:36:57 -
@@ -1365,7 +1365,7 @@ void
 pmap_pinit_pd_86(struct pmap *pmap)
 {
/* allocate PDP */
-   pmap->pm_pdir = uvm_km_alloc(kernel_map, NBPG);
+   pmap->pm_pdir = (vaddr_t)km_alloc(NBPG, _any, _dirty, _waitok);
if (pmap->pm_pdir == 0)
panic("pmap_pinit_pd_86: kernel_map out of virtual space!");
pmap_extract(pmap_kernel(), (vaddr_t)pmap->pm_pdir,
@@ -1397,7 +1397,8 @@ pmap_pinit_pd_86(struct pmap *pmap)
 * execution, one that lacks all kernel mappings.
 */
if (cpu_meltdown) {
-   pmap->pm_pdir_intel = uvm_km_zalloc(kernel_map, NBPG);
+   pmap->pm_pdir_intel = (vaddr_t)km_alloc(NBPG, _any, _zero,
+   _waitok);
if (pmap->pm_pdir_intel == 0)
panic("%s: kernel_map out of virtual space!", __func__);
 
@@ -1449,11 +1450,12 @@ pmap_destroy(struct pmap *pmap)
uvm_pagefree(pg);
}
 
-   uvm_km_free(kernel_map, pmap->pm_pdir, pmap->pm_pdirsize);
+   km_free((void *)pmap->pm_pdir, pmap->pm_pdirsize, _any, _dirty);
pmap->pm_pdir = 0;
 
if (pmap->pm_pdir_intel) {
-   uvm_km_free(kernel_map, pmap->pm_pdir_intel, pmap->pm_pdirsize);
+   km_free((void *)pmap->pm_pdir_intel, pmap->pm_pdirsize,
+   _any, _dirty);
pmap->pm_pdir_intel = 0;
}
 
@@ -2522,8 +2524,9 @@ pmap_enter_special_86(vaddr_t va, paddr_
__func__, va);
 
if (!pmap->pm_pdir_intel) {
-   if ((pmap->pm_pdir_intel = uvm_km_zalloc(kernel_map, NBPG))
-   == 0)
+   pmap->pm_pdir_intel = (vaddr_t)km_alloc(NBPG, _any, _zero,
+   _waitok);
+   if (pmap->pm_pdir_intel == 0)
panic("%s: kernel_map out of virtual space!", __func__);
if (!pmap_extract(pmap, pmap->pm_pdir_intel,
>pm_pdirpa_intel))
Index: arch/i386/i386/pmapae.c
===
RCS file: /cvs/src/sys/arch/i386/i386/pmapae.c,v
retrieving revision 1.60
diff -u -p -r1.60 pmapae.c
--- arch/i386/i386/pmapae.c 23 Sep 2020 15:13:26 -  1.60
+++ arch/i386/i386/pmapae.c 23 Apr 2021 17:59:05 -
@@ -738,7 +738,7 @@ pmap_bootstrap_pae(void)
(uint32_t)VM_PAGE_TO_PHYS(ptppg));
}
}
-   uvm_km_free(kernel_map, (vaddr_t)pd, NBPG);
+   km_free(pd, NBPG, _any, _dirty);
DPRINTF("%s: freeing PDP 0x%x\n", __func__, (uint32_t)pd);
}
 
@@ -944,7 +944,8 @@ pmap_pinit_pd_pae(struct pmap *pmap)
paddr_t pdidx[4];
 
/* allocate PDP */
-   pmap->pm_pdir = uvm_km_alloc(kernel_map, 4 * NBPG);
+   pmap->pm_pdir = (vaddr_t)km_alloc(4 * NBPG, _any, _dirty,
+   _waitok);
if (pmap->pm_pdir == 0)
panic("pmap_pinit_pd_pae: kernel_map out of virtual space!");
/* page index is in the pmap! */
@@ -997,7 +998,8 @@ pmap_pinit_pd_pae(struct pmap *pmap)
if (cpu_meltdown) {
int i;
 
-   if ((va = uvm_km_zalloc(kernel_map, 4 * NBPG)) == 0)
+   va = (vaddr_t)km_alloc(4 * NBPG, _any, _zero, _nowait);
+   if (va == 0)
panic("%s: kernel_map out of virtual space!", __func__);
if (!pmap_extract(pmap_kernel(),
(vaddr_t)>pm_pdidx_intel, >pm_pdirpa_intel))
@@ -1936,7 +1938,20 @@ pmap_enter_special_pae(vaddr_t va, paddr
__func__, va);
 
if (!pmap->pm_pdir_intel) {
-   if ((vapd = uvm_km_zalloc(kernel_map, 4 * NBPG)) == 0)
+#if notyet
+   /*
+* XXX mapping is established via pmap_kenter() and lost
+* after enabling PAE.
+*/
+   vapd = (vaddr_t)km_alloc(4 * NBPG, _any, _zero,
+   _waitok);
+#else
+   vapd = (vaddr_t)km_alloc(4 * NBPG, _any, _pageable,
+   _waitok);
+   if (vapd != 0)
+   

Re: dt(4) ifdef sysctl

2021-04-22 Thread Martin Pieuchot
On 22/04/21(Thu) 20:19, Alexander Bluhm wrote:
> Hi,
> 
> sysctl witnesswatch gives an error message if the feature is not
> compiled into the kernel.  I think dt(4) allowdt should do the same.
> 
> sysctl: kern.allowdt: value is not available
> 
> This removes a bit of unused code from ramdisk kernel.
> The variable allowdt should be in the device, not in sysctl source.
> We don't need #ifdef for extern and prototypes, without it code is
> more readable.
> Put the unneeded sysctl code into an #if NDT > 0.
> 
> ok?

ok mpi@

> By the way, can we enable dt(4) in GENERIC?  I use it quite often
> and it is handy to have it available.  Misuse is prevented by the
> securelevel sysctl.  Any downside?

I don't see any reason to not do it.

> Index: dev/dt/dt_dev.c
> ===
> RCS file: /data/mirror/openbsd/cvs/src/sys/dev/dt/dt_dev.c,v
> retrieving revision 1.12
> diff -u -p -r1.12 dt_dev.c
> --- dev/dt/dt_dev.c   26 Mar 2021 21:17:10 -  1.12
> +++ dev/dt/dt_dev.c   22 Apr 2021 17:38:22 -
> @@ -109,6 +109,8 @@ SIMPLEQ_HEAD(, dt_probe)  dt_probe_list;  
>  struct rwlockdt_lock = RWLOCK_INITIALIZER("dtlk");
>  volatile uint32_tdt_tracing = 0; /* [K] # of processes tracing */
>  
> +int allowdt;
> +
>  void dtattach(struct device *, struct device *, void *);
>  int  dtopen(dev_t, int, int, struct proc *);
>  int  dtclose(dev_t, int, int, struct proc *);
> @@ -145,7 +147,6 @@ dtopen(dev_t dev, int flags, int mode, s
>  {
>   struct dt_softc *sc;
>   int unit = minor(dev);
> - extern int allowdt;
>  
>   if (!allowdt)
>   return EPERM;
> Index: kern/kern_sysctl.c
> ===
> RCS file: /data/mirror/openbsd/cvs/src/sys/kern/kern_sysctl.c,v
> retrieving revision 1.389
> diff -u -p -r1.389 kern_sysctl.c
> --- kern/kern_sysctl.c8 Feb 2021 10:51:02 -   1.389
> +++ kern/kern_sysctl.c22 Apr 2021 17:38:22 -
> @@ -114,24 +114,21 @@
>  #endif
>  
>  #include "audio.h"
> -#include "video.h"
> +#include "dt.h"
>  #include "pf.h"
> +#include "video.h"
>  
>  extern struct forkstat forkstat;
>  extern struct nchstats nchstats;
>  extern int nselcoll, fscale;
>  extern struct disklist_head disklist;
>  extern fixpt_t ccpu;
> -extern  long numvnodes;
> -#if NAUDIO > 0
> +extern long numvnodes;
> +extern int allowdt;
>  extern int audio_record_enable;
> -#endif
> -#if NVIDEO > 0
>  extern int video_record_enable;
> -#endif
>  
>  int allowkmem;
> -int allowdt;
>  
>  int sysctl_diskinit(int, struct proc *);
>  int sysctl_proc_args(int *, u_int, void *, size_t *, struct proc *);
> @@ -142,12 +139,8 @@ int sysctl_proc_vmmap(int *, u_int, void
>  int sysctl_intrcnt(int *, u_int, void *, size_t *);
>  int sysctl_sensors(int *, u_int, void *, size_t *, void *, size_t);
>  int sysctl_cptime2(int *, u_int, void *, size_t *, void *, size_t);
> -#if NAUDIO > 0
>  int sysctl_audio(int *, u_int, void *, size_t *, void *, size_t);
> -#endif
> -#if NVIDEO > 0
>  int sysctl_video(int *, u_int, void *, size_t *, void *, size_t);
> -#endif
>  int sysctl_cpustats(int *, u_int, void *, size_t *, void *, size_t);
>  int sysctl_utc_offset(void *, size_t *, void *, size_t);
>  
> @@ -479,10 +472,12 @@ kern_sysctl(int *name, u_int namelen, vo
>   return (EPERM);
>   securelevel = level;
>   return (0);
> +#if NDT > 0
>   case KERN_ALLOWDT:
>   if (securelevel > 0)
>   return (sysctl_rdint(oldp, oldlenp, newp, allowdt));
>   return (sysctl_int(oldp, oldlenp, newp, newlen,  ));
> +#endif
>   case KERN_ALLOWKMEM:
>   if (securelevel > 0)
>   return (sysctl_rdint(oldp, oldlenp, newp, allowkmem));
> 


