Switch select(2) to kqueue-based implementation

2020-12-10 Thread Martin Pieuchot
All the previous kqueue refactoring has been committed; here's a final diff
to modify the internal implementation of {p,}select(2) to query kqfilter
handlers instead of poll ones.

{p,}poll(2) are left untouched to ease the transition.

Here's what I said in the original mail back in May [0]:

> The main argument for this proposal is to reduce the amount of code
> executed to notify userland when an event occur.  The outcome of this
> diff is that a single notification subsystem needs to be taken out of
> the KERNEL_LOCK().  This simplifies a lot existing locking tentacles.
> 
> Using kqueue internally means collision is avoided and there's no need
> to query handlers for fds that aren't ready.  This comes at the cost of
> allocating descriptors.  A space vs time trade-off.  Note that this cost
> can be diminished by doing lazy removal of event descriptors to be able
> to re-use them.

The logic is as follows:

- With this change every thread uses a "private" kqueue, usable only by
  the kernel, to register events for select(2) and later poll(2).

- Events specified via FD_SET(2) are converted to their kqueue equivalents
  (a sketch of the conversion follows this list).  ktrace(1) now also
  outputs the converted events to ease debugging.

- kqueue_scan() might be called multiple times per syscall, just like with
  the last version of kevent(2). 

- At the end of every {p,}select(2) syscall the private kqueue is purged.
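
To illustrate the conversion, here is a minimal, hypothetical sketch; it
is not part of the diff below, it assumes the read and write sets map to
EVFILT_READ and EVFILT_WRITE, and it omits the except set:

#include <sys/types.h>
#include <sys/event.h>

/*
 * Hypothetical helper: turn one bit of an fd_set into the equivalent
 * kqueue event.  `which' selects the input set (0 = read, 1 = write);
 * the real pselregister() in the diff walks the bit vectors and
 * registers the result on the per-thread kqueue p->p_kq.
 */
void
select_bit_to_kevent(struct kevent *kevp, int fd, int which, void *udata)
{
	static const int sel_filter[] = { EVFILT_READ, EVFILT_WRITE };

	EV_SET(kevp, fd, sel_filter[which], EV_ADD|EV_ENABLE, 0, 0, udata);
}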

Tests, comments and oks welcome!

[0] https://marc.info/?l=openbsd-tech&m=158979921322191&w=2

Index: kern/sys_generic.c
===
RCS file: /cvs/src/sys/kern/sys_generic.c,v
retrieving revision 1.132
diff -u -p -r1.132 sys_generic.c
--- kern/sys_generic.c  2 Oct 2020 15:45:22 -   1.132
+++ kern/sys_generic.c  9 Dec 2020 19:06:23 -
@@ -55,6 +55,7 @@
 #include 
 #include 
 #include 
+#include 
 #ifdef KTRACE
 #include 
 #endif
@@ -66,8 +67,21 @@
 
 #include 
 
-int selscan(struct proc *, fd_set *, fd_set *, int, int, register_t *);
-void pollscan(struct proc *, struct pollfd *, u_int, register_t *);
+/*
+ * Debug values:
+ *  1 - print implementation errors, things that should not happen.
+ *  2 - print ppoll(2) information, somewhat verbose
+ *  3 - print pselect(2) and ppoll(2) information, very verbose
+ */
+int kqpoll_debug = 0;
+#define DPRINTFN(v, x...) if (kqpoll_debug > v) do {   \
+printf("%s(%d): ", curproc->p_p->ps_comm, curproc->p_tid); \
+printf(x); \
+} while (0)
+
+int pselregister(struct proc *, fd_set *, int, int, int *);
+int pselcollect(struct proc *, struct kevent *, fd_set *[]);
+
 int pollout(struct pollfd *, struct pollfd *, u_int);
 int dopselect(struct proc *, int, fd_set *, fd_set *, fd_set *,
 struct timespec *, const sigset_t *, register_t *);
@@ -584,11 +598,10 @@ int
 dopselect(struct proc *p, int nd, fd_set *in, fd_set *ou, fd_set *ex,
 struct timespec *timeout, const sigset_t *sigmask, register_t *retval)
 {
+   struct kqueue_scan_state scan;
fd_mask bits[6];
fd_set *pibits[3], *pobits[3];
-   struct timespec elapsed, start, stop;
-   uint64_t nsecs;
-   int s, ncoll, error = 0;
+   int error, nevents = 0;
u_int ni;
 
if (nd < 0)
@@ -618,6 +631,8 @@ dopselect(struct proc *p, int nd, fd_set
pobits[2] = (fd_set *)&bits[5];
}
 
+   kqpoll_init();
+
 #definegetbits(name, x) \
if (name && (error = copyin(name, pibits[x], ni))) \
goto done;
@@ -636,43 +651,57 @@ dopselect(struct proc *p, int nd, fd_set
if (sigmask)
dosigsuspend(p, *sigmask &~ sigcantmask);
 
-retry:
-   ncoll = nselcoll;
-   atomic_setbits_int(&p->p_flag, P_SELECT);
-   error = selscan(p, pibits[0], pobits[0], nd, ni, retval);
-   if (error || *retval)
+   /* Register kqueue events */
+   if ((error = pselregister(p, pibits[0], nd, ni, &nevents) != 0))
goto done;
-   if (timeout == NULL || timespecisset(timeout)) {
-   if (timeout != NULL) {
-   getnanouptime(&start);
-   nsecs = MIN(TIMESPEC_TO_NSEC(timeout), MAXTSLP);
-   } else
-   nsecs = INFSLP;
-   s = splhigh();
-   if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
-   splx(s);
-   goto retry;
-   }
-   atomic_clearbits_int(&p->p_flag, P_SELECT);
-   error = tsleep_nsec(&selwait, PSOCK | PCATCH, "select", nsecs);
-   splx(s);
-   if (timeout != NULL) {
-   getnanouptime(&stop);
-   timespecsub(&stop, &start, &elapsed);
-   timespecsub(timeout, &elapsed, timeout);
-   if (timeout->tv_sec < 0)
-   timespecclear(timeout);
-   }
-   if (error == 0 |

Re: uvm_fault: entering swap code

2020-12-10 Thread Martin Pieuchot
On 08/12/20(Tue) 22:55, Jonathan Matthew wrote:
> On Mon, Dec 07, 2020 at 03:15:50PM -0300, Martin Pieuchot wrote:
> > Getting a page from the fault handler might require poking at some
> > swap-related states.
> > 
> > These are not in the hot-path of the fault handler so for the moment
> > just assert that the KERNEL_LOCK() is held or grab it if the function
> > might be called from an future unlocked path.
> > 
> > ok?
> 
> Could you add 'K' to the list of locks in the comment above struct uvmexp too?

Updated diff below.

> I went looking for other uses of swpgonly and saw that it's used under
> uvm_map_teardown -> uvm_unmap_kill_entry -> uvm_km_pgremove,
> and uvm_map_teardown ensures that the kernel lock is not held.
> Not related to this diff exactly, but is this something we need to fix?

I suppose the problem can only occur if a kernel thread is exiting,
since this code is only executed for the kernel pmap.  Anyway, I added
an assertion.

Index: uvm/uvm_km.c
===
RCS file: /cvs/src/sys/uvm/uvm_km.c,v
retrieving revision 1.137
diff -u -p -r1.137 uvm_km.c
--- uvm/uvm_km.c23 May 2020 06:15:09 -  1.137
+++ uvm/uvm_km.c10 Dec 2020 13:33:49 -
@@ -243,6 +243,7 @@ uvm_km_pgremove(struct uvm_object *uobj,
voff_t curoff;
int slot;
 
+   KERNEL_ASSERT_LOCKED();
KASSERT(uobj->pgops == &aobj_pager);
 
for (curoff = start ; curoff < end ; curoff += PAGE_SIZE) {
Index: uvm/uvm_swap.c
===
RCS file: /cvs/src/sys/uvm/uvm_swap.c,v
retrieving revision 1.147
diff -u -p -r1.147 uvm_swap.c
--- uvm/uvm_swap.c  29 Sep 2020 11:47:41 -  1.147
+++ uvm/uvm_swap.c  10 Dec 2020 13:30:30 -
@@ -1403,7 +1403,7 @@ uvm_swap_alloc(int *nslots, boolean_t le
/*
 * lock data lock, convert slots into blocks, and enter loop
 */
-
+   KERNEL_ASSERT_LOCKED();
 ReTry: /* XXXMRG */
LIST_FOREACH(spp, &swap_priority, spi_swappri) {
TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
@@ -1449,8 +1449,10 @@ uvm_swapisfull(void)
 {
int result;
 
+   KERNEL_LOCK();
KASSERT(uvmexp.swpgonly <= uvmexp.swpages);
result = (uvmexp.swpgonly == uvmexp.swpages);
+   KERNEL_UNLOCK();
 
return result;
 }
@@ -1465,6 +1467,7 @@ uvm_swap_markbad(int startslot, int nslo
 {
struct swapdev *sdp;
 
+   KERNEL_LOCK();
sdp = swapdrum_getsdp(startslot);
if (sdp != NULL) {
/*
@@ -1475,6 +1478,7 @@ uvm_swap_markbad(int startslot, int nslo
 */
sdp->swd_npgbad += nslots;
}
+   KERNEL_UNLOCK();
 }
 
 /*
@@ -1501,7 +1505,7 @@ uvm_swap_free(int startslot, int nslots)
 * in the extent, and return.   must hold pri lock to do
 * lookup and access the extent.
 */
-
+   KERNEL_LOCK();
sdp = swapdrum_getsdp(startslot);
KASSERT(uvmexp.nswapdev >= 1);
KASSERT(sdp != NULL);
@@ -1533,6 +1537,7 @@ uvm_swap_free(int startslot, int nslots)
}
}
 #endif /* UVM_SWAP_ENCRYPT */
+   KERNEL_UNLOCK();
 }
 
 /*
@@ -1567,6 +1572,7 @@ uvm_swap_get(struct vm_page *page, int s
return VM_PAGER_ERROR;
}
 
+   KERNEL_LOCK();
/* this page is (about to be) no longer only in swap. */
uvmexp.swpgonly--;
 
@@ -1577,7 +1583,7 @@ uvm_swap_get(struct vm_page *page, int s
/* oops, the read failed so it really is still only in swap. */
uvmexp.swpgonly++;
}
-
+   KERNEL_UNLOCK();
return (result);
 }
 
@@ -1599,6 +1605,8 @@ uvm_swap_io(struct vm_page **pps, int st
struct swapdev *sdp;
int encrypt = 0;
 #endif
+
+   KERNEL_ASSERT_LOCKED();
 
write = (flags & B_READ) == 0;
async = (flags & B_ASYNC) != 0;
Index: uvm/uvmexp.h
===
RCS file: /cvs/src/sys/uvm/uvmexp.h,v
retrieving revision 1.6
diff -u -p -r1.6 uvmexp.h
--- uvm/uvmexp.h1 Dec 2020 13:56:22 -   1.6
+++ uvm/uvmexp.h10 Dec 2020 13:31:01 -
@@ -42,6 +42,7 @@
  *
  *  Locks used to protect struct members in this file:
  * I   immutable after creation
+ * K   kernel lock
  * F   uvm_lock_fpageq
  */
 struct uvmexp {
@@ -79,9 +80,9 @@ struct uvmexp {
 
/* swap */
int nswapdev;   /* number of configured swap devices in system */
-   int swpages;/* number of PAGE_SIZE'ed swap pages */
+   int swpages;/* [K] number of PAGE_SIZE'ed swap pages */
int swpginuse;  /* number of swap pages in use */
-   int swpgonly;   /* number of swap pages in use, not also in RAM */
+   int

Re: delays in sensors thread

2020-12-10 Thread Martin Pieuchot
On 10/12/20(Thu) 18:47, Mark Kettenis wrote:
> [...] 
> Sensor drivers that are "good citizens" should probably continue to
> use the sensor thread.  Butfor things like asmc(4) that handle a
> largish bundle of sensors, a separate thread would be fine.  And I
> suspect that thread could run unlocked.

Using a separate thread won't help as long as it is KERNEL_LOCK()'d.

What allows the rest of the kernel to make progress in this case is the
fact that tsleep(9) releases the lock.

If one is going to do some work to make sure asmc_update() is safe to
execute w/o the KERNEL_LOCK(), then it would be easier to start by
releasing the lock in this function.
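
A minimal sketch of that first step, assuming asmc_update() is the
callback registered via sensor_task_register() and that its body is
otherwise safe to run unlocked (asmc_refresh_sensors() is a hypothetical
stand-in for the actual sensor reads):

#include <sys/param.h>
#include <sys/systm.h>

struct asmc_softc;
void asmc_refresh_sensors(struct asmc_softc *);	/* hypothetical helper */

/*
 * Sketch: the sensor task runs with the KERNEL_LOCK() held, so drop it
 * around the slow SMC polling and re-take it before returning.
 */
void
asmc_update(void *arg)
{
	struct asmc_softc *sc = arg;

	KERNEL_UNLOCK();
	asmc_refresh_sensors(sc);
	KERNEL_LOCK();
}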



Re: delays in sensors thread

2020-12-10 Thread Martin Pieuchot
On 10/12/20(Thu) 21:40, Alexandre Ratchov wrote:
> On Thu, Dec 10, 2020 at 05:27:16PM +0100, Marcus Glocker wrote:
> > Hi All,
> > 
> > I recently started to play around with uvideo(4) and uaudio(4) on my
> > amd64 iMacs.  There I quickly noticed regular freezes when streaming
> > USB video or audio.  On some of those machines it was very frequent,
> > like every few seconds the video or audio stream did freeze for ~1s,
> > then resume, while the rest of the system did continue to operate fine.
> > 
> > First I found that when running the machine with an SP kernel, the issue
> > disappears.
> 
> On SP kernels, interrupts are still working while the CPU is spinning
> in kernel mode (as long as current IPL permits it). That's why audio
> works better.
> 
> > Secondly some debugging hours, and quite some e-mail
> > exchanges with mpi@ later, I found that the freeze is getting triggered
> > by the asmc(4) driver, specifically by the sensor_task_register()
> > update function.  My first intention was to change
> > sensor_task_register() to call taskq_create() with the TASKQ_MPSAFE
> > flag for a test, to remove the kernel lock, which also resolved the
> > freezing with an MP kernel. [1]
> > 
> > In the end I found that the asmc(4) sensor update code is calling a
> > busy loop in asmc_wait(), where the delay call is spending ~50ms in
> > average.  Doing that during the KERNEL_LOCK() is resulting in
> > noticeable USB ISOC transfer delays.  Obviously replacing the delay(9)
> > with tsleep_nsec(9) in asmc(4) did fix the issue as well. [2]
> > 
> > I'm not sure if just applying diff [2] to the driver is the right
> > approach finally or if we need to take a more generic path to address
> > this problem.  Any feedback, help, comments appreciated.
> > 
> 
> Would asmc(4) work if we sleep for very long (tsleep_nsec() has no
> upper bound)? Spinning during 50ms in kernel mode doesn't look right,
> so using tsleep() looks as a step forward as long as sleeping doesn't
> break asmc(4).

Another way to look at the issue is:  Can this delay be reduced?

Considering that asmc_wait() is called twice per asmc_write() and once
per asmc_read(), it is called at least 14 times in asmc_command() and up
to 20 times when reading fan information.  At the reported ~50ms per
call, that already adds up to the better part of a second spent under
the KERNEL_LOCK().

If any of the operations times out the whole command is restarted,
possibly twice.

So it would be interesting to know whether a particular query fails and
is restarted.  In that case, is it fixable, and/or why do we need to
restart the command 3 times?

Or is the loop in asmc_wait() taking steps that are too big?  Or too
small, which would imply that some commands fail?

Improving/fixing the delays might be easier than re-structuring the
driver and could fix both issues raised in this thread: holding the
KERNEL_LOCK() for too long and delaying other sensors. 
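
For reference, a minimal sketch of the tsleep_nsec(9) variant from the
quoted mail ([2]); asmc_status(), the 500-iteration bound and the 100us
step are assumptions standing in for the real register read and loop
granularity:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/errno.h>

struct asmc_softc;
uint8_t asmc_status(struct asmc_softc *);	/* hypothetical register read */

/*
 * Sketch: sleep between status polls instead of spinning with delay(9),
 * so the KERNEL_LOCK() is released while waiting.
 */
int
asmc_wait(struct asmc_softc *sc, uint8_t mask, uint8_t val)
{
	int i;

	for (i = 0; i < 500; i++) {
		if ((asmc_status(sc) & mask) == val)
			return (0);
		tsleep_nsec(sc, PWAIT, "asmcw", USEC_TO_NSEC(100));
	}
	return (ETIMEDOUT);
}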



Re: Switch select(2) to kqueue-based implementation

2020-12-11 Thread Martin Pieuchot
On 10/12/20(Thu) 09:59, Martin Pieuchot wrote:
> All previous kqueue refactoring have been committed, here's a final diff
> to modify the internal implementation of {p,}select(2) to query kqfilter
> handlers instead of poll ones.
> 
> {p,}poll(2) are left untouched to ease the transition.
> 
> Here's what I said in the original mail back in May [0]:
> 
> > The main argument for this proposal is to reduce the amount of code
> > executed to notify userland when an event occur.  The outcome of this
> > diff is that a single notification subsystem needs to be taken out of
> > the KERNEL_LOCK().  This simplifies a lot existing locking tentacles.
> > 
> > Using kqueue internally means collision is avoided and there's no need
> > to query handlers for fds that aren't ready.  This comes at the cost of
> > allocating descriptors.  A space vs time trade-off.  Note that this cost
> > can be diminished by doing lazy removal of event descriptors to be able
> > to re-use them.
> 
> The logic is as follow:
> 
> - With this change every thread use a "private" kqueue, usable only by
>   the kernel, to register events for select(2) and later poll(2).
> 
> - Events specified via FD_SET(2) are converted to their kqueue equivalent.
>   ktrace(1) now also outputs converted events to ease debugging.
> 
> - kqueue_scan() might be called multiple times per syscall, just like with
>   the last version of kevent(2). 
> 
> - At the end of every {p,}select(2) syscall the private kqueue is purged.
> 
> [0] https://marc.info/?l=openbsd-tech&m=158979921322191&w=2

Updated diff that adds a FALLTHROUGH lost in refactoring and does not
block if the timeout is cleared and no events are requested, as pointed
out by cheloha@:

Index: kern/sys_generic.c
===
RCS file: /cvs/src/sys/kern/sys_generic.c,v
retrieving revision 1.132
diff -u -p -r1.132 sys_generic.c
--- kern/sys_generic.c  2 Oct 2020 15:45:22 -   1.132
+++ kern/sys_generic.c  11 Dec 2020 12:28:10 -
@@ -55,6 +55,7 @@
 #include 
 #include 
 #include 
+#include 
 #ifdef KTRACE
 #include 
 #endif
@@ -66,8 +67,21 @@
 
 #include 
 
-int selscan(struct proc *, fd_set *, fd_set *, int, int, register_t *);
-void pollscan(struct proc *, struct pollfd *, u_int, register_t *);
+/*
+ * Debug values:
+ *  1 - print implementation errors, things that should not happen.
+ *  2 - print ppoll(2) information, somewhat verbose
+ *  3 - print pselect(2) and ppoll(2) information, very verbose
+ */
+int kqpoll_debug = 0;
+#define DPRINTFN(v, x...) if (kqpoll_debug > v) do {   \
+printf("%s(%d): ", curproc->p_p->ps_comm, curproc->p_tid); \
+printf(x); \
+} while (0)
+
+int pselregister(struct proc *, fd_set *, int, int, int *);
+int pselcollect(struct proc *, struct kevent *, fd_set *[]);
+
 int pollout(struct pollfd *, struct pollfd *, u_int);
 int dopselect(struct proc *, int, fd_set *, fd_set *, fd_set *,
 struct timespec *, const sigset_t *, register_t *);
@@ -584,11 +598,10 @@ int
 dopselect(struct proc *p, int nd, fd_set *in, fd_set *ou, fd_set *ex,
 struct timespec *timeout, const sigset_t *sigmask, register_t *retval)
 {
+   struct kqueue_scan_state scan;
fd_mask bits[6];
fd_set *pibits[3], *pobits[3];
-   struct timespec elapsed, start, stop;
-   uint64_t nsecs;
-   int s, ncoll, error = 0;
+   int error, nevents = 0;
u_int ni;
 
if (nd < 0)
@@ -618,6 +631,8 @@ dopselect(struct proc *p, int nd, fd_set
pobits[2] = (fd_set *)&bits[5];
}
 
+   kqpoll_init();
+
 #definegetbits(name, x) \
if (name && (error = copyin(name, pibits[x], ni))) \
goto done;
@@ -636,43 +651,59 @@ dopselect(struct proc *p, int nd, fd_set
if (sigmask)
dosigsuspend(p, *sigmask &~ sigcantmask);
 
-retry:
-   ncoll = nselcoll;
-   atomic_setbits_int(&p->p_flag, P_SELECT);
-   error = selscan(p, pibits[0], pobits[0], nd, ni, retval);
-   if (error || *retval)
+   /* Register kqueue events */
+   if ((error = pselregister(p, pibits[0], nd, ni, &nevents) != 0))
goto done;
-   if (timeout == NULL || timespecisset(timeout)) {
-   if (timeout != NULL) {
-   getnanouptime(&start);
-   nsecs = MIN(TIMESPEC_TO_NSEC(timeout), MAXTSLP);
-   } else
-   nsecs = INFSLP;
-   s = splhigh();
-   if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
-   splx(s);
-   goto retry;
-   }
-   atomic_clearbits_int(&p->p_flag, P_SEL

Re: pool(9): remove ticks (attempt 2)

2020-12-11 Thread Martin Pieuchot
On 11/12/20(Fri) 12:52, Mark Kettenis wrote:
> > Date: Thu, 10 Dec 2020 16:13:22 -0600
> > From: Scott Cheloha 
> > 
> > Hi,
> > 
> > We looked at removing the ticks from subr_pool.c a while back but it
> > got shelved.  That may or may not have been my fault.  I don't
> > remember.
> > 
> > Anyway, I would normally suggest switching to getuptime(9) here, but
> > getuptime(9) counts in seconds and we're working with a 1 second
> > timeout in this code (pool_wait_free) so that's too coarse an
> > interface for this job.
> > 
> > The next best thing I could come up with was introducing a coarse
> > sub-second interface for use in this file, "getnsecuptime()", which
> > calls getnanouptime(9) and converts the result to a 64-bit count of
> > nanoseconds.  This is relatively fast (we don't read the underlying
> > timecounter hardware) and causes a minimal amount of code change (we
> > can use it inline because it returns an integral value).
> > 
> > >From there the changes are simple:
> > 
> > - Renames: ph_tick -> ph_timestamp, pr_cache_tick -> pr_cache_timestamp
> > 
> > - Call getnsecuptime(9) wherever we read 'ticks'.
> > 
> > - Change pool_wait_gc and pool_wait_free to counts of nanoseconds.
> >   They could be macros, e.g.
> > 
> > #define POOL_WAIT_GC80ULL
> > 
> >   but I'll leave that for a second diff to keep things simple.
> > 
> > This compiles and I haven't changed any logic so I assume it isn't
> > broken.
> > 
> > We could move getnsecuptime() into kern_tc.c but it isn't used
> > anywhere else yet so I'm hesitant to do so.
> > 
> > Thoughts?
> 
> Specifying the timeouts in nanoseconds isn't particularly useful I'd
> say.  But I see we can't use SEC_TO_NSEC here because of the overflow
> check...

I'm not sure I understand; can't we do:

pool_wait_free = SEC_TO_NSEC(1);
pool_wait_gc = SEC_TO_NSEC(8);

or are you pointing at something else?
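
(For what it's worth, since SEC_TO_NSEC() is a function with a run-time
overflow check rather than a constant expression, the assignments above
would have to happen in an init path.  A minimal sketch, with
pool_wait_init() as a hypothetical hook:)

#include <sys/types.h>
#include <sys/time.h>

uint64_t pool_wait_free;	/* nanoseconds */
uint64_t pool_wait_gc;		/* nanoseconds */

/*
 * Hypothetical init hook: SEC_TO_NSEC() cannot be used as a static
 * initializer, so set the values once at boot instead.
 */
void
pool_wait_init(void)
{
	pool_wait_free = SEC_TO_NSEC(1);
	pool_wait_gc = SEC_TO_NSEC(8);
}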

One comment below

> > Index: kern/subr_pool.c
> > ===
> > RCS file: /cvs/src/sys/kern/subr_pool.c,v
> > retrieving revision 1.230
> > diff -u -p -r1.230 subr_pool.c
> > --- kern/subr_pool.c24 Jan 2020 06:31:17 -  1.230
> > +++ kern/subr_pool.c10 Dec 2020 22:08:33 -
> > @@ -41,6 +41,7 @@
> >  #include 
> >  #include 
> >  #include 
> > +#include 
> >  #include 
> >  #include 
> >  
> > @@ -148,7 +149,7 @@ struct pool_page_header {
> > caddr_t ph_page;/* this page's address */
> > caddr_t ph_colored; /* page's colored address */
> > unsigned long   ph_magic;
> > -   int ph_tick;
> > +   uint64_tph_timestamp;   /* uptime when last modified */
> >  };
> >  #define POOL_MAGICBIT (1 << 3) /* keep away from perturbed low bits */
> >  #define POOL_PHPOISON(ph) ISSET((ph)->ph_magic, POOL_MAGICBIT)
> > @@ -266,8 +267,18 @@ void   pool_gc_sched(void *);
> >  struct timeout pool_gc_tick = TIMEOUT_INITIALIZER(pool_gc_sched, NULL);
> >  void   pool_gc_pages(void *);
> >  struct task pool_gc_task = TASK_INITIALIZER(pool_gc_pages, NULL);
> > -int pool_wait_free = 1;
> > -int pool_wait_gc = 8;
> > +uint64_t pool_wait_free = 10ULL;   /* nanoseconds */
> > +uint64_t pool_wait_gc = 80ULL; /* nanoseconds */
> > +
> > +/* XXX where do I put this? */
> > +uint64_t
> > +getnsecuptime(void)
> > +{
> > +   struct timespec now;
> > +
> > +   getnanouptime(&now);
> > +   return TIMESPEC_TO_NSEC(&now);
> > +}
> >  
> >  RBT_PROTOTYPE(phtree, pool_page_header, ph_node, phtree_compare);
> >  
> > @@ -797,7 +808,7 @@ pool_put(struct pool *pp, void *v)
> > /* is it time to free a page? */
> > if (pp->pr_nidle > pp->pr_maxpages &&
> > (ph = TAILQ_FIRST(&pp->pr_emptypages)) != NULL &&
> > -   (ticks - ph->ph_tick) > (hz * pool_wait_free)) {
> > +   getnsecuptime() - ph->ph_timestamp > pool_wait_free) {
> > freeph = ph;
> > pool_p_remove(pp, freeph);
> > }
> > @@ -864,7 +875,7 @@ pool_do_put(struct pool *pp, void *v)
> >  */
> > pp->pr_nidle++;
> >  
> > -   ph->ph_tick = ticks;
> > +   ph->ph_timestamp = getnsecuptime();
> > TAILQ_REMOVE(&pp->pr_partpages, ph, ph_entry);
> > TAILQ_INSERT_TAIL(&pp->pr_emptypages, ph, ph_entry);
> > pool_update_curpage(pp);
> > @@ -1566,7 +1577,7 @@ pool_gc_pages(void *null)
> > /* is it time to free a page? */
> > if (pp->pr_nidle > pp->pr_minpages &&
> > (ph = TAILQ_FIRST(&pp->pr_emptypages)) != NULL &&
> > -   (ticks - ph->ph_tick) > (hz * pool_wait_gc)) {
> > +   getnsecuptime() - ph->ph_timestamp > pool_wait_gc) {
> > freeph = ph;
> > pool_p_remove(pp, freeph);
> > } else
> > @@ -1726,7 +1737,7 @@ pool_cache_init(struct pool *pp)
> > arc4random_buf(pp->pr_cache_m

Re: rw_lock_held() & friends

2020-12-14 Thread Martin Pieuchot
On 08/12/20(Tue) 14:39, Visa Hankala wrote:
> On Mon, Dec 07, 2020 at 03:25:00PM -0300, Martin Pieuchot wrote:
> > Simple diff below to add the rw_lock_held() as well as a read & write
> > version.
> > 
> > This allows us to reduce the difference with NetBSD in UVM by adding the
> > following checks:
> > 
> > KASSERT(rw_write_held(amap->am_lock));
> > 
> > ok?
> > 
> > Index: sys/rwlock.h
> > ===
> > RCS file: /cvs/src/sys/sys/rwlock.h,v
> > retrieving revision 1.26
> > diff -u -p -r1.26 rwlock.h
> > --- sys/rwlock.h16 Jul 2019 01:40:49 -  1.26
> > +++ sys/rwlock.h7 Dec 2020 18:22:03 -
> > @@ -168,6 +168,11 @@ intrw_enter(struct rwlock *, int);
> >  void   rw_exit(struct rwlock *);
> >  intrw_status(struct rwlock *);
> >  
> > +#definerw_read_held(rwl)   (rw_status(rwl) == RW_READ)
> > +#definerw_write_held(rwl)  (rw_status(rwl) == RW_WRITE)
> > +#definerw_lock_held(rwl)   (rw_write_held(rwl) || 
> > rw_read_held(rwl))
> 
> I think rw_lock_held() should invoke rw_status() only once. This would
> reduce the overhead. It could even be a proper C function.

As below?

Index: sys/rwlock.h
===
RCS file: /cvs/src/sys/sys/rwlock.h,v
retrieving revision 1.26
diff -u -p -r1.26 rwlock.h
--- sys/rwlock.h16 Jul 2019 01:40:49 -  1.26
+++ sys/rwlock.h14 Dec 2020 13:35:41 -
@@ -168,6 +168,29 @@ intrw_enter(struct rwlock *, int);
 void   rw_exit(struct rwlock *);
 intrw_status(struct rwlock *);
 
+static inline int
+rw_read_held(struct rwlock *rwl)
+{
+   return (rw_status(rwl) == RW_READ);
+}
+
+static inline int
+rw_write_held(struct rwlock *rwl)
+{
+   return (rw_status(rwl) == RW_WRITE);
+}
+
+static inline int
+rw_lock_held(struct rwlock *rwl)
+{
+   int status;
+
+   status = rw_status(rwl);
+
+   return (status == RW_READ || status == RW_WRITE);
+}
+
+
 void   _rrw_init_flags(struct rrwlock *, const char *, int,
const struct lock_type *);
 intrrw_enter(struct rrwlock *, int);
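
For reference, a minimal usage sketch of the helpers above, along the
lines of the KASSERT from the quoted motivation (the caller names here
are illustrative, not existing functions):

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/rwlock.h>

/* Readers may hold the lock either way, writers must hold it exclusively. */
void
example_read_path(struct rwlock *lock)
{
	KASSERT(rw_lock_held(lock));
}

void
example_write_path(struct rwlock *lock)
{
	KASSERT(rw_write_held(lock));
}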



Re: Switch select(2) to kqueue-based implementation

2020-12-15 Thread Martin Pieuchot
On 12/12/20(Sat) 11:29, Visa Hankala wrote:
> On Fri, Dec 11, 2020 at 09:35:59AM -0300, Martin Pieuchot wrote:
> > On 10/12/20(Thu) 09:59, Martin Pieuchot wrote:
> > > All previous kqueue refactoring have been committed, here's a final diff
> > > to modify the internal implementation of {p,}select(2) to query kqfilter
> > > handlers instead of poll ones.
> > > 
> > > {p,}poll(2) are left untouched to ease the transition.
> > > 
> > > Here's what I said in the original mail back in May [0]:
> > > 
> > > > The main argument for this proposal is to reduce the amount of code
> > > > executed to notify userland when an event occur.  The outcome of this
> > > > diff is that a single notification subsystem needs to be taken out of
> > > > the KERNEL_LOCK().  This simplifies a lot existing locking tentacles.
> > > > 
> > > > Using kqueue internally means collision is avoided and there's no need
> > > > to query handlers for fds that aren't ready.  This comes at the cost of
> > > > allocating descriptors.  A space vs time trade-off.  Note that this cost
> > > > can be diminished by doing lazy removal of event descriptors to be able
> > > > to re-use them.
> > > 
> > > The logic is as follow:
> > > 
> > > - With this change every thread use a "private" kqueue, usable only by
> > >   the kernel, to register events for select(2) and later poll(2).
> > > 
> > > - Events specified via FD_SET(2) are converted to their kqueue equivalent.
> > >   ktrace(1) now also outputs converted events to ease debugging.
> > > 
> > > - kqueue_scan() might be called multiple times per syscall, just like with
> > >   the last version of kevent(2). 
> > > 
> > > - At the end of every {p,}select(2) syscall the private kqueue is purged.
> > > 
> > > [0] https://marc.info/?l=openbsd-tech&m=158979921322191&w=2
> > 
> > Updated diff that adds a FALLTHOUGHT lost in refactoring and do not
> > block in if the timeout is cleared and no events are requested, pointed 
> > by cheloha@:

Updated diff below.

> > +/*
> > + * Convert fd_set into kqueue events and register them on the
> > + * per-thread queue.
> > + */
> >  int
> > -selscan(struct proc *p, fd_set *ibits, fd_set *obits, int nfd, int ni,
> > -register_t *retval)
> > +pselregister(struct proc *p, fd_set *ibits, int nfd, int ni, int 
> > *nregistered)
> 
> I think pselregister() should take the bitvector similarly to
> pselcollect(). Then the function could skip some pointer arithmetics
> as well.
> 
> int pselregister(struct proc *p, fd_set *pibits[3], int nfd, int 
> *nregistered);

I don't mind, but I did not include it in the diff; this seems like extra
refactoring to me and I'm trying to change as few lines as possible.

> > +/*
> > + * Convert given kqueue event into corresponding select(2) bit.
> > + */
> > +int
> > +pselcollect(struct proc *p, struct kevent *kevp, fd_set *pobits[3])
> > +{
> > +#ifdef DIAGNOSTIC
> > +   /* Filter out and lazily delete spurious events */
> > +   if ((unsigned long)kevp->udata != p->p_kq_serial) {
> > +   DPRINTFN(0, "select fd %u mismatched serial %lu\n",
> > +   (int)kevp->ident, p->p_kq_serial);
> > +   kevp->flags = EV_DISABLE|EV_DELETE;
> > +   kqueue_register(p->p_kq, kevp, p);
> > +   return (0);
> > +   }
> > +#endif
> 
> Shouldn't skipping of spurious events be taken into account in the
> main scan loop? Otherwise they may cause artifacts, such as premature
> timeout expiry when a wakeup is caused solely by a spurious event, or
> return of an incomplete result when spurious events eat capacity from
> nevents.

This sounds like a later improvement to me and I'd like to discuss it
separately.


Index: kern/sys_generic.c
===
RCS file: /cvs/src/sys/kern/sys_generic.c,v
retrieving revision 1.132
diff -u -p -r1.132 sys_generic.c
--- kern/sys_generic.c  2 Oct 2020 15:45:22 -   1.132
+++ kern/sys_generic.c  15 Dec 2020 10:36:28 -
@@ -55,6 +55,7 @@
 #include 
 #include 
 #include 
+#include 
 #ifdef KTRACE
 #include 
 #endif
@@ -66,8 +67,21 @@
 
 #include 
 
-int selscan(struct proc *, fd_set *, fd_set *, int, int, register_t *);
-void pollscan(struct proc *, struct pollfd *, u_int, register_t *);
+/*
+ * Debug values:
+ *  1 - print implementation errors, things that should not happen.
+ *  2 - print ppoll(2

Re: sdmmc(4): sdmmc_io_function_enable(): don't sleep on lbolt

2020-12-15 Thread Martin Pieuchot
On 11/12/20(Fri) 19:07, Scott Cheloha wrote:
> I'd like to remove lbolt from the kernel.  I think having it in the
> kernel complicates otherwise simple code.

Decoupling code is IMHO a good thing.  I like this move.

> We can start with sdmmc(4).
> 
> The goal in sdmmc_io_function_enable() is calling sdmmc_io_function_ready()
> up to six times and sleep 1 second between each attempt.  Here's rewritten
> code that does with without lbolt.

Ok with me.

> Index: sdmmc_io.c
> ===
> RCS file: /cvs/src/sys/dev/sdmmc/sdmmc_io.c,v
> retrieving revision 1.41
> diff -u -p -r1.41 sdmmc_io.c
> --- sdmmc_io.c31 Dec 2019 10:05:33 -  1.41
> +++ sdmmc_io.c12 Dec 2020 01:04:59 -
> @@ -231,8 +231,8 @@ sdmmc_io_function_enable(struct sdmmc_fu
>  {
>   struct sdmmc_softc *sc = sf->sc;
>   struct sdmmc_function *sf0 = sc->sc_fn0;
> + int chan, retry = 5;
>   u_int8_t rv;
> - int retry = 5;
>  
>   rw_assert_wrlock(&sc->sc_lock);
>  
> @@ -244,7 +244,7 @@ sdmmc_io_function_enable(struct sdmmc_fu
>   sdmmc_io_write_1(sf0, SD_IO_CCCR_FN_ENABLE, rv);
>  
>   while (!sdmmc_io_function_ready(sf) && retry-- > 0)
> - tsleep_nsec(&lbolt, PPAUSE, "pause", INFSLP);
> + tsleep_nsec(&chan, PPAUSE, "pause", SEC_TO_NSEC(1));
>   return (retry >= 0) ? 0 : ETIMEDOUT;
>  }
>  
> 



Re: i386: apm(4): apm_thread(): sleep without lbolt

2020-12-15 Thread Martin Pieuchot
On 11/12/20(Fri) 19:17, Scott Cheloha wrote:
> Here's another sleep that doesn't need lbolt.
> 
> The idea here is to call apm_periodic_check() once a second.
> We can do that without lbolt.
> 
> Is there some other address that would be more appropriate for this
> thread to sleep on?  It doesn't look like any apm(4) code calls
> wakeup(9) on lbolt so I've just replaced with with a local channel.

Not sure we want to grow the stack just for that.  Any member of `sc',
or even `sc' itself if that doesn't conflict, could be used as the wait
channel.
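
i.e. something along these lines (sketch only, reusing the loop from the
diff quoted below with `sc' as the channel and omitting the includes):

void
apm_thread(void *v)
{
	struct apm_softc *sc = v;

	for (;;) {
		rw_enter_write(&sc->sc_lock);
		(void) apm_periodic_check(sc);
		rw_exit_write(&sc->sc_lock);
		/* nothing is expected to wakeup(9) on `sc' here */
		tsleep_nsec(sc, PWAIT, "apmev", SEC_TO_NSEC(1));
	}
}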

> ok?
> 
> Index: apm.c
> ===
> RCS file: /cvs/src/sys/arch/i386/i386/apm.c,v
> retrieving revision 1.125
> diff -u -p -r1.125 apm.c
> --- apm.c 24 Jun 2020 22:03:40 -  1.125
> +++ apm.c 12 Dec 2020 01:17:38 -
> @@ -50,6 +50,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  #include 
>  #include 
> @@ -904,12 +905,13 @@ void
>  apm_thread(void *v)
>  {
>   struct apm_softc *sc = v;
> + int chan;
>  
>   for (;;) {
>   rw_enter_write(&sc->sc_lock);
>   (void) apm_periodic_check(sc);
>   rw_exit_write(&sc->sc_lock);
> - tsleep_nsec(&lbolt, PWAIT, "apmev", INFSLP);
> + tsleep_nsec(&chan, PWAIT, "apmev", SEC_TO_NSEC(1));
>   }
>  }
>  
> 



Re: Lock operations for knote lists

2020-12-15 Thread Martin Pieuchot
On 11/12/20(Fri) 17:37, Visa Hankala wrote:
> This patch extends struct klist with a callback descriptor and
> an argument. The main purpose of this is to let the kqueue subsystem
> assert when a klist should be locked, and operate the klist lock
> in klist_invalidate().

Lovely!

> Access to a knote list of a kqueue-monitored object has to be
> serialized somehow. Because the object often has a lock for protecting
> its state, and because the object often acquires this lock at the latest
> in its f_event callback functions, I would like to use the same lock
> also for the knote lists. Uses of NOTE_SUBMIT already show a pattern
> arising.
> 
> There could be an embedded lock in klist. However, such a lock would be
> redundant in many cases. The code could not rely on a single lock type
> (mutex, rwlock, something else) because the needs of monitored objects
> vary. In addition, an embedded lock would introduce new lock order
> constraints. Note that this patch does not rule out use of dedicated
> klist locks.

Indeed, I'm currently dealing with exactly this kind of differing lock
type issue in UVM :/

> The patch introduces a way to associate lock operations with a klist.
> The caller can provide a custom implementation, or use a ready-made
> interface with a mutex or rwlock.
> 
> For compatibility with old code, the new code falls back to using the
> kernel lock if no specific klist initialization has been done. The
> existing code already relies on implicit initialization of klist.
> 
> Unfortunately, the size of struct klist will grow threefold.

The growth is unavoidable; it could have been the size of a lock...

> As the patch gives the code the ability to operate the klist lock,
> the klist API could provide variants of insert and remove actions that
> handle locking internally, for convenience. However, that I would leave
> for another patch because I would prefer to rename the current
> klist_insert() to klist_insert_locked(), and klist_remove() to
> klist_remove_locked().
> 
> The patch additionally provides three examples of usage: audio, pipes,
> and sockets. Each of these examples is logically a separate changeset.

One question below.

> Please test and review.

This is in the middle of a build on sparc64, so far so good.

> Index: kern/kern_event.c
> ===
> RCS file: src/sys/kern/kern_event.c,v
> retrieving revision 1.147
> diff -u -p -r1.147 kern_event.c
> --- kern/kern_event.c 9 Dec 2020 18:58:19 -   1.147
> +++ kern/kern_event.c 11 Dec 2020 17:05:09 -
> @@ -1539,9 +1576,14 @@ klist_invalidate(struct klist *list)
>   NET_ASSERT_UNLOCKED();
>  
>   s = splhigh();
> + ls = klist_lock(list);

Isn't splhigh() redundant with klist_lock() now?  If a subsystem
provides its own lock/unlock routine shouldn't it ensure that the
necessary SPL is used?  Or is this protecting something else?  Or is
it just paranoia and we should try to remove it in a later step?

>   while ((kn = SLIST_FIRST(&list->kl_list)) != NULL) {
> - if (!knote_acquire(kn))
> + if (!knote_acquire(kn, list, ls)) {
> + /* knote_acquire() has unlocked list. */
> + ls = klist_lock(list);
>   continue;
> + }
> + klist_unlock(list, ls);
>   splx(s);
>   kn->kn_fop->f_detach(kn);
>   if (kn->kn_fop->f_flags & FILTEROP_ISFD) {



SIGSEGV in _rthread_tls_destructors()

2020-12-15 Thread Martin Pieuchot
When the first thread of multimedia/mpv exits after having played a video
with the "gpu" (default) output, the program receives a SIGSEGV when it
tries to execute one of its destructors:

void
_rthread_tls_destructors(pthread_t thread)
{
[...]
for (i = 0; i < PTHREAD_DESTRUCTOR_ITERATIONS; i++) {
for (rs = thread->local_storage; rs; rs = rs->next) {
if (!rs->data)
continue;
if (rkeys[rs->keyid].destructor) {
void (*destructor)(void *) =
rkeys[rs->keyid].destructor;
void *data = rs->data;
rs->data = NULL;
_spinunlock(&rkeyslock);
destructor(data);   <-- HERE
_spinlock(&rkeyslock);
}
}
}
[...]
}

This doesn't happen with other outputs and I haven't checked/don't know
which piece of code in the "gpu" output calls pthread_key_create().

Full backtrace below.

$ mpv *.mp4
 (+) Video --vid=1 (*) (h264 640x352 30.288fps)
 (+) Audio --aid=1 (*) (aac 2ch 48000Hz)   
libEGL warning: DRI3: Screen seems not DRI3 capable
AO: [sdl] 48000Hz stereo 2ch s32
VO: [gpu] 640x352 yuv420p   
AV: 00:01:38 / 00:01:38 (100%) A-V:  0.000 


Exiting... (End of file)
pthread_mutex_destroy on mutex with waiters!
Segmentation fault (core dumped)
mpi@oliva $ egdb /usr/local/bin/mpv *.core  
GNU gdb (GDB) 7.12.1
Copyright (C) 2017 Free Software Foundation, Inc.   
License GPLv3+: GNU GPL version 3 or later   
This is free software: you are free to change and redistribute it.  
There is NO WARRANTY, to the extent permitted by law.  Type "show copying"
and "show warranty" for details.
This GDB was configured as "x86_64-unknown-openbsd6.8".
Type "show configuration" for configuration details.
For bug reporting instructions, please see: 
. 
Find the GDB manual and other documentation resources online at:
.
For help, type "help".  
Type "apropos word" to search for commands related to "word"...
Reading symbols from /usr/local/bin/mpv...Reading symbols from /usr/local/bin/.debug/mpv.dbg...done.
done.   
[New process 566589]   
[New process 131102]
[New process 552326]
[New process 162644]
Core was generated by `mpv'.
Program terminated with signal SIGSEGV, Segmentation fault. 
#0  0x09b1220dfa70 in ?? () 
[Current thread is 1 (process 566589)]   
(gdb) thr apply all bt  

Thread 4 (process 162644):  
#0  futex () at /tmp/-:3
#1  0x09b140ca37b5 in _twait (p=0x9b1417da640, val=1, clockid=0, abs=0x0)   
at /usr/src/lib/libc/thread/synch.h:34   
#2  _rthread_cond_timedwait (cond=0x9b1417da640, mutexp=0x9b0adcb19c8, abs=0x0)
at /usr/src/lib/libc/thread/rthread_cond.c:106
#3  0x09aea3812d5c in worker_thread (arg=0x9b0adcb19c0) 
at ../mpv-0.32.0/misc/thread_pool.c:80   
#4  0x09b19f5a60c1 in _rthread_start (v=) 
at /usr/src/lib/librthread/rthread.c:96   
#5  0x09b140c9d7f8 in __tfork_thread ()
at /usr/src/lib/libc/arch/amd64/sys/tfork_thread.S:77
#6  0x in ?? ()

Thread 3 (process 552326):
#0  _thread_sys_poll () at /tmp/-:3
#1  0x09b140c659fe in _libc_poll_cancel (fds=0x9b183f0a630, nfds=2, 
timeout=-1) at /usr/src/lib/libc/sys/w_poll.c:27
#2  0x09aea38

Re: SIGSEGV in _rthread_tls_destructors()

2020-12-15 Thread Martin Pieuchot
On 15/12/20(Tue) 16:30, Mark Kettenis wrote:
> > Date: Tue, 15 Dec 2020 12:15:30 -0300
> > From: Martin Pieuchot 
> > 
> > When the first thread of multimedia/mpv exits after having played a video
> > with the "gpu" (default) output, the programs receives a SIGSEV when it
> > tries to execute one of its destructor:
> > 
> > void
> > _rthread_tls_destructors(pthread_t thread)
> > {
> > [...]
> > for (i = 0; i < PTHREAD_DESTRUCTOR_ITERATIONS; i++) {
> > for (rs = thread->local_storage; rs; rs = rs->next) {
> > if (!rs->data)
> > continue;
> > if (rkeys[rs->keyid].destructor) {
> > void (*destructor)(void *) =
> > rkeys[rs->keyid].destructor;
> > void *data = rs->data;
> > rs->data = NULL;
> > _spinunlock(&rkeyslock);
> > destructor(data);   <-- HERE
> > _spinlock(&rkeyslock);
> > }
> > }
> > }
> > [...]
> > }
> > 
> > This doesn't happen with other outputs and I haven't checked/don't know
> > which piece of code in the "gpu" output calls pthread_key_create().
> > 
> > Full backtrace below.
> > 
> > $ mpv *.mp4
> >  (+) Video --vid=1 (*) (h264 640x352 30.288fps)
> >  (+) Audio --aid=1 (*) (aac 2ch 48000Hz)   
> > libEGL warning: DRI3: Screen seems not DRI3 capable
> > AO: [sdl] 48000Hz stereo 2ch s32
> > 
> > VO: [gpu] 640x352 yuv420p   
> > 
> > AV: 00:01:38 / 00:01:38 (100%) A-V:  0.000 
> > 
> > 
> > 
> > 
> > Exiting... (End of file)
> > 
> > pthread_mutex_destroy on mutex with waiters!
> 
> > Segmentation fault (core dumped)
> 
> POSIX says:
> 
>   "Attempting to destroy a locked mutex, or a mutex that another
>   thread is attempting to lock, or a mutex that is being used in a
>   pthread_cond_timedwait() or pthread_cond_wait() call by another
>   thread, results in undefined behavior."

It is not clear if this is related.  If it is, we don't know what changed
to make it happen.  Also, why doesn't this SIGSEGV happen with other
outputs, or when playing music, even though the pthread_mutex_destroy
message is printed?



Re: Switch select(2) to kqueue-based implementation

2020-12-16 Thread Martin Pieuchot
On 15/12/20(Tue) 17:23, Visa Hankala wrote:
> On Tue, Dec 15, 2020 at 07:46:01AM -0300, Martin Pieuchot wrote:
> > @@ -636,43 +651,59 @@ dopselect(struct proc *p, int nd, fd_set
> > if (sigmask)
> > dosigsuspend(p, *sigmask &~ sigcantmask);
> >  
> > -retry:
> > -   ncoll = nselcoll;
> > -   atomic_setbits_int(&p->p_flag, P_SELECT);
> > -   error = selscan(p, pibits[0], pobits[0], nd, ni, retval);
> > -   if (error || *retval)
> > +   /* Register kqueue events */
> > +   if ((error = pselregister(p, pibits[0], nd, ni, &nevents) != 0))
> > goto done;
> 
> The above has parentheses wrong and returns 1 (EPERM) as error.
> The lines should read:
> 
>   if ((error = pselregister(p, pibits[0], nd, ni, &nevents)) != 0)
>   goto done;

Thanks, fixed.
 
> In addition to the above, I noticed that select(2) behaves differently
> than before when a file descriptor that is being monitored is closed by
> another thread. The old implementation returns EBADF. The new code keeps
> on waiting on the underlying object.
> 
> The diff below makes kqueue clear kqpoll's fd event registration on
> fd close. However, it does not make select(2) return an error, the fd
> just will not cause a wakeup any longer. I think I have an idea on how
> to correct that but I need to consider it some more.

Are you saying that knote_fdclose() can't clean up the knotes on the
per-thread kqueue?

If so, should we replace the call to kqueue_free() in kqpoll_exit() with
a KQRELE()?  The original intent was to not put the per-thread kqueue on
the fdp list.  As you just pointed out, this is necessary.  So I don't
see any need for kqueue_free().  We could even assert that the refcount
is 1.

Diff below does that, feel free to commit it.

Index: kern/kern_event.c
===
RCS file: /cvs/src/sys/kern/kern_event.c,v
retrieving revision 1.148
diff -u -p -r1.148 kern_event.c
--- kern/kern_event.c   15 Dec 2020 04:48:18 -  1.148
+++ kern/kern_event.c   16 Dec 2020 11:19:01 -
@@ -168,12 +168,11 @@ KQREF(struct kqueue *kq)
 void
 KQRELE(struct kqueue *kq)
 {
-   struct filedesc *fdp;
+   struct filedesc *fdp = kq->kq_fdp;
 
if (atomic_dec_int_nv(&kq->kq_refs) > 0)
return;
 
-   fdp = kq->kq_fdp;
if (rw_status(&fdp->fd_lock) == RW_WRITE) {
LIST_REMOVE(kq, kq_next);
} else {
@@ -182,12 +181,6 @@ KQRELE(struct kqueue *kq)
fdpunlock(fdp);
}
 
-   kqueue_free(kq);
-}
-
-void
-kqueue_free(struct kqueue *kq)
-{
free(kq->kq_knlist, M_KEVENT, kq->kq_knlistsize *
sizeof(struct knlist));
hashfree(kq->kq_knhash, KN_HASHSIZE, M_KEVENT);
@@ -509,12 +502,17 @@ void
 kqpoll_init(void)
 {
struct proc *p = curproc;
+   struct filedesc *fdp;
 
if (p->p_kq != NULL)
return;
 
p->p_kq = kqueue_alloc(p->p_fd);
p->p_kq_serial = arc4random();
+   fdp = p->p_fd;
+   fdplock(fdp);
+   LIST_INSERT_HEAD(&fdp->fd_kqlist, p->p_kq, kq_next);
+   fdpunlock(fdp);
 }
 
 void
@@ -526,7 +524,8 @@ kqpoll_exit(void)
return;
 
kqueue_terminate(p, p->p_kq);
-   kqueue_free(p->p_kq);
+   KASSERT(p->p_kq->kq_refs == 1);
+   KQRELE(p->p_kq);
p->p_kq = NULL;
 }
 



Re: sdmmc(4): sdmmc_io_function_enable(): don't sleep on lbolt

2020-12-16 Thread Martin Pieuchot
On 16/12/20(Wed) 12:50, Scott Cheloha wrote:
> On Tue, Dec 15, 2020 at 01:47:24PM +0100, Mark Kettenis wrote:
> > > Date: Tue, 15 Dec 2020 13:32:22 +0100
> > > From: Claudio Jeker 
> > > 
> > > On Fri, Dec 11, 2020 at 07:07:56PM -0600, Scott Cheloha wrote:
> > > > Hi,
> > > > 
> > > > I'd like to remove lbolt from the kernel.  I think having it in the
> > > > kernel complicates otherwise simple code.
> > > > 
> > > > We can start with sdmmc(4).
> > > > 
> > > > The goal in sdmmc_io_function_enable() is calling 
> > > > sdmmc_io_function_ready()
> > > > up to six times and sleep 1 second between each attempt.  Here's 
> > > > rewritten
> > > > code that does with without lbolt.
> > > > 
> > > > ok?
> > > > 
> > > > Index: sdmmc_io.c
> > > > ===
> > > > RCS file: /cvs/src/sys/dev/sdmmc/sdmmc_io.c,v
> > > > retrieving revision 1.41
> > > > diff -u -p -r1.41 sdmmc_io.c
> > > > --- sdmmc_io.c  31 Dec 2019 10:05:33 -  1.41
> > > > +++ sdmmc_io.c  12 Dec 2020 01:04:59 -
> > > > @@ -231,8 +231,8 @@ sdmmc_io_function_enable(struct sdmmc_fu
> > > >  {
> > > > struct sdmmc_softc *sc = sf->sc;
> > > > struct sdmmc_function *sf0 = sc->sc_fn0;
> > > > +   int chan, retry = 5;
> > > > u_int8_t rv;
> > > > -   int retry = 5;
> > > >  
> > > > rw_assert_wrlock(&sc->sc_lock);
> > > >  
> > > > @@ -244,7 +244,7 @@ sdmmc_io_function_enable(struct sdmmc_fu
> > > > sdmmc_io_write_1(sf0, SD_IO_CCCR_FN_ENABLE, rv);
> > > >  
> > > > while (!sdmmc_io_function_ready(sf) && retry-- > 0)
> > > > -   tsleep_nsec(&lbolt, PPAUSE, "pause", INFSLP);
> > > > +   tsleep_nsec(&chan, PPAUSE, "pause", SEC_TO_NSEC(1));
> > > > return (retry >= 0) ? 0 : ETIMEDOUT;
> > > >  }
> > > >  
> > > 
> > > Why not use &retry as wait channel instead of adding a new variable
> > > chan? Result is the same. Would it make sense to allow NULL as wait
> > > channel to make the tsleep not wakeable. At least that could be used in a
> > > few places where timeouts are implemented with tsleep and would make the
> > > intent more obvious.
> > 
> > Or have an appropriately named global variable?  Something like "int 
> > nowake"?
> 
> Something like the attached patch?
> 
> I think the idea of a "dead channel" communicates the intent.  Nobody
> broadcasts wakeups on the dead channel.  If you don't want to receive
> wakeup broadcasts you sleep on the dead channel.  Hence, "deadchan".

Why did we choose to use a variable over NULL?  Any technical reason?

I'm wondering whether the locality of the variable might matter in a
distant future.  Did you dig a bit deeper into the FreeBSD solution?
Why did they choose a per-CPU value?

> Index: kern/kern_synch.c
> ===
> RCS file: /cvs/src/sys/kern/kern_synch.c,v
> retrieving revision 1.172
> diff -u -p -r1.172 kern_synch.c
> --- kern/kern_synch.c 7 Dec 2020 16:55:29 -   1.172
> +++ kern/kern_synch.c 16 Dec 2020 18:50:12 -
> @@ -87,6 +87,12 @@ sleep_queue_init(void)
>   TAILQ_INIT(&slpque[i]);
>  }
>  
> +/*
> + * Threads that do not want to receive wakeup(9) broadcasts should
> + * sleep on deadchan.
> + */
> +static int __deadchan;
> +int *deadchan = &__deadchan;
>  
>  /*
>   * During autoconfiguration or after a panic, a sleep will simply
> Index: sys/systm.h
> ===
> RCS file: /cvs/src/sys/sys/systm.h,v
> retrieving revision 1.148
> diff -u -p -r1.148 systm.h
> --- sys/systm.h   26 Aug 2020 03:29:07 -  1.148
> +++ sys/systm.h   16 Dec 2020 18:50:12 -
> @@ -107,6 +107,8 @@ extern struct vnode *rootvp;  /* vnode eq
>  extern dev_t swapdev;/* swapping device */
>  extern struct vnode *swapdev_vp;/* vnode equivalent to above */
>  
> +extern int *deadchan;/* dead wakeup(9) channel */
> +
>  struct proc;
>  struct process;
>  #define curproc curcpu()->ci_curproc
> Index: dev/sdmmc/sdmmc_io.c
> ===
> RCS file: /cvs/src/sys/dev/sdmmc/sdmmc_io.c,v
> retrieving revision 1.41
> diff -u -p -r1.41 sdmmc_io.c
> --- dev/sdmmc/sdmmc_io.c  31 Dec 2019 10:05:33 -  1.41
> +++ dev/sdmmc/sdmmc_io.c  16 Dec 2020 18:50:12 -
> @@ -244,7 +244,7 @@ sdmmc_io_function_enable(struct sdmmc_fu
>   sdmmc_io_write_1(sf0, SD_IO_CCCR_FN_ENABLE, rv);
>  
>   while (!sdmmc_io_function_ready(sf) && retry-- > 0)
> - tsleep_nsec(&lbolt, PPAUSE, "pause", INFSLP);
> + tsleep_nsec(deadchan, PPAUSE, "pause", SEC_TO_NSEC(1));
>   return (retry >= 0) ? 0 : ETIMEDOUT;
>  }
>  
> 



Re: sdmmc(4): sdmmc_io_function_enable(): don't sleep on lbolt

2020-12-16 Thread Martin Pieuchot
On 16/12/20(Wed) 23:23, Claudio Jeker wrote:
> On Wed, Dec 16, 2020 at 04:50:42PM -0300, Martin Pieuchot wrote:
> > [...] 
> > Why did we choose to use a variable over NULL?  Any technical reason?
> 
> The sleep subsytem requires a non-NULL value for ident. Changing this
> seems not trivial.

I'd say this is an implementation detail; nothing prevents us from using
a "private" ident value if NULL is passed to tsleep(9) :)

> > I'm wondering it the locality of the variable might not matter in a
> > distant future.  Did you dig a bit deeper about the FreeBSD solution?
> > Why did they choose a per-CPU value?
> 
> Currently all sleep channels are hashed into IIRC 128 buckets. If all
> timeouts use the same sleep channel then this queue may get overcrowded.
> I guess only instrumentation and measurements will tell us how bad the
> sleep queue is hashed.

So using a global as the sleep channel is not optimal?  Would it be better
to use an address on the stack?  If so, we could make sleep_setup() accept
NULL and use 'sls' for example.



Re: WITNESS panic: acquiring blockable sleep lock with spinlock or critical section held (rwlock) kmmaplk

2020-12-17 Thread Martin Pieuchot
On 16/12/20(Wed) 22:49, Greg Steuck wrote:
> I just hit this while booting an i386-current in vmd. The source tree is
> synced to "Remove the assertion in uvm_km_pgremove()."
> 
> I enabled WITNESS on top of GENERIC. Naturally, GENERIC-Dec15 snap works.
> 
> Anybody else see this so I know it's worth a bisect?
> [...]

I can reproduce it.  Diff below fixes it.  This is the beginning of a
rabbit hole... thanks!

> witness: lock_object uninitialized: 0xd0f3c828
> Starting stack trace...
> witness_checkorder(0,d6bb011c,d1155e6c,d02e10e4,90) at witness_checkorder+0x8a
> witness_checkorder(d0f3c828,9,0) at witness_checkorder+0x8a
> mtx_enter(d0f3c81c) at mtx_enter+0x27
> pmap_extract_pae(d8bb0d80,f5605000,d8bb0da0) at pmap_extract_pae+0x53
> pmap_pinit_pd_pae(d8bb0d80) at pmap_pinit_pd_pae+0x268
> pmap_create(1,1000,f6fe5e86,d8bbfd54,d0f5ba18) at pmap_create+0xa8
> uvmspace_fork(d0f5b5fc,d8bb3e34,d0f5b5fc,1,d1155f70) at uvmspace_fork+0x56
> process_new(d8bb3e34,d0f5b5fc,1) at process_new+0xeb
> fork1(d0eb7b14,1,d04eb560,0,0,d1155f90) at fork1+0x1ba
> panic: acquiring blockable sleep lock with spinlock or critical section held 
> (rwlock) kmmaplk

pmap_kernel()'s mutexes aren't initialized.  Diff below initializes them.

Index: arch/i386/i386/pmap.c
===
RCS file: /cvs/src/sys/arch/i386/i386/pmap.c,v
retrieving revision 1.209
diff -u -p -r1.209 pmap.c
--- arch/i386/i386/pmap.c   24 Sep 2020 11:36:50 -  1.209
+++ arch/i386/i386/pmap.c   17 Dec 2020 21:47:11 -
@@ -961,6 +961,8 @@ pmap_bootstrap(vaddr_t kva_start)
 */
 
kpm = pmap_kernel();
+   mtx_init(&kpm->pm_mtx, IPL_VM);
+   mtx_init(&kpm->pm_apte_mtx, IPL_VM);
uvm_objinit(&kpm->pm_obj, NULL, 1);
bzero(&kpm->pm_list, sizeof(kpm->pm_list));  /* pm_list not used */
kpm->pm_pdir = (vaddr_t)(proc0.p_addr->u_pcb.pcb_cr3 + KERNBASE);



Re: WITNESS panic: acquiring blockable sleep lock with spinlock or critical section held (rwlock) kmmaplk

2020-12-17 Thread Martin Pieuchot
On 17/12/20(Thu) 23:16, Mark Kettenis wrote:
> > Date: Thu, 17 Dec 2020 18:56:52 -0300
> > From: Martin Pieuchot 
> > 
> > On 16/12/20(Wed) 22:49, Greg Steuck wrote:
> > > I just hit this while booting an i386-current in vmd. The source tree is
> > > synced to "Remove the assertion in uvm_km_pgremove()."
> > > 
> > > I enabled WITNESS on top of GENERIC. Naturally, GENERIC-Dec15 snap works.
> > > 
> > > Anybody else see this so I know it's worth a bisect?
> > > [...]
> > 
> > I can reproduce it.  Diff below fixes it.  This is the beginning of a
> > rabbit hole... thanks!
> > 
> > > witness: lock_object uninitialized: 0xd0f3c828
> > > Starting stack trace...
> > > witness_checkorder(0,d6bb011c,d1155e6c,d02e10e4,90) at 
> > > witness_checkorder+0x8a
> > > witness_checkorder(d0f3c828,9,0) at witness_checkorder+0x8a
> > > mtx_enter(d0f3c81c) at mtx_enter+0x27
> > > pmap_extract_pae(d8bb0d80,f5605000,d8bb0da0) at pmap_extract_pae+0x53
> > > pmap_pinit_pd_pae(d8bb0d80) at pmap_pinit_pd_pae+0x268
> > > pmap_create(1,1000,f6fe5e86,d8bbfd54,d0f5ba18) at pmap_create+0xa8
> > > uvmspace_fork(d0f5b5fc,d8bb3e34,d0f5b5fc,1,d1155f70) at uvmspace_fork+0x56
> > > process_new(d8bb3e34,d0f5b5fc,1) at process_new+0xeb
> > > fork1(d0eb7b14,1,d04eb560,0,0,d1155f90) at fork1+0x1ba
> > > panic: acquiring blockable sleep lock with spinlock or critical section 
> > > held (rwlock) kmmaplk
> > 
> > pmap_kernel()'s mutexes aren't initialized.  Diff below does that.
> 
> Well, that is somewhat intentional.  Those mutexes should never be
> used for the kernel pmap.  The kernel pmap is always there and is
> updated atomically.
> 
> So how did we end up trying to grab one of these mutexs?

pmap_map_ptes() (both versions of it) grabs the current pmap's
`pm_apte_mtx', which ends up being the kernel one in this case.

> > Index: arch/i386/i386/pmap.c
> > ===
> > RCS file: /cvs/src/sys/arch/i386/i386/pmap.c,v
> > retrieving revision 1.209
> > diff -u -p -r1.209 pmap.c
> > --- arch/i386/i386/pmap.c   24 Sep 2020 11:36:50 -  1.209
> > +++ arch/i386/i386/pmap.c   17 Dec 2020 21:47:11 -
> > @@ -961,6 +961,8 @@ pmap_bootstrap(vaddr_t kva_start)
> >  */
> >  
> > kpm = pmap_kernel();
> > +   mtx_init(&kpm->pm_mtx, IPL_VM);
> > +   mtx_init(&kpm->pm_apte_mtx, IPL_VM);
> > uvm_objinit(&kpm->pm_obj, NULL, 1);
> > bzero(&kpm->pm_list, sizeof(kpm->pm_list));  /* pm_list not used */
> > kpm->pm_pdir = (vaddr_t)(proc0.p_addr->u_pcb.pcb_cr3 + KERNBASE);
> > 
> > 



Re: WITNESS panic: acquiring blockable sleep lock with spinlock or critical section held (rwlock) kmmaplk

2020-12-19 Thread Martin Pieuchot
On 18/12/20(Fri) 08:04, Todd C. Miller wrote:
> On Fri, 18 Dec 2020 13:34:39 +0100, Mark Kettenis wrote:
> 
> > Anyway, your analysis is right.  When a kernel thread wants to use
> > pmap_extract(9) on a userland pmap, it needs to lock pm_apte_mtx to
> > prevent another thread from simultaniously activating a userland pmap
> > too.  So indeed, pm_apte_mtx needs to be properly initialized for the
> > kernel pmap.
> >
> > However, pm_mtx should never be used for the kernel pmap.  If we don't
> > initialize the lock, witness will help us catching this condition, so
> > maybe we shouldn't...
> 
> I think a comment is warranted if we don't want to initialize the
> lock to prevent someone from fixing this in the future ;-)

A solution based on a comment and an option that is not enabled by
default seems very fragile to me.  I came up with the idea of poisoning
the ipl of the mutex.  What do you think?

Index: arch/i386/i386/machdep.c
===
RCS file: /cvs/src/sys/arch/i386/i386/machdep.c,v
retrieving revision 1.641
diff -u -p -r1.641 machdep.c
--- arch/i386/i386/machdep.c8 Nov 2020 20:37:23 -   1.641
+++ arch/i386/i386/machdep.c19 Dec 2020 20:57:03 -
@@ -3996,6 +3996,8 @@ splraise(int ncpl)
 {
int ocpl;
 
+   KASSERT(ncpl >= IPL_NONE);
+
_SPLRAISE(ocpl, ncpl);
return (ocpl);
 }
Index: arch/i386/i386/pmap.c
===
RCS file: /cvs/src/sys/arch/i386/i386/pmap.c,v
retrieving revision 1.209
diff -u -p -r1.209 pmap.c
--- arch/i386/i386/pmap.c   24 Sep 2020 11:36:50 -  1.209
+++ arch/i386/i386/pmap.c   19 Dec 2020 20:58:48 -
@@ -961,6 +961,8 @@ pmap_bootstrap(vaddr_t kva_start)
 */
 
kpm = pmap_kernel();
+   mtx_init(&kpm->pm_mtx, -1); /* must not be used */
+   mtx_init(&kpm->pm_apte_mtx, IPL_VM);
uvm_objinit(&kpm->pm_obj, NULL, 1);
bzero(&kpm->pm_list, sizeof(kpm->pm_list));  /* pm_list not used */
kpm->pm_pdir = (vaddr_t)(proc0.p_addr->u_pcb.pcb_cr3 + KERNBASE);



Re: WITNESS panic: acquiring blockable sleep lock with spinlock or critical section held (rwlock) kmmaplk

2020-12-21 Thread Martin Pieuchot
On 20/12/20(Sun) 20:55, Mark Kettenis wrote:
> > Date: Sat, 19 Dec 2020 18:07:41 -0300
> > From: Martin Pieuchot 
> > 
> > On 18/12/20(Fri) 08:04, Todd C. Miller wrote:
> > > On Fri, 18 Dec 2020 13:34:39 +0100, Mark Kettenis wrote:
> > > 
> > > > Anyway, your analysis is right.  When a kernel thread wants to use
> > > > pmap_extract(9) on a userland pmap, it needs to lock pm_apte_mtx to
> > > > prevent another thread from simultaniously activating a userland pmap
> > > > too.  So indeed, pm_apte_mtx needs to be properly initialized for the
> > > > kernel pmap.
> > > >
> > > > However, pm_mtx should never be used for the kernel pmap.  If we don't
> > > > initialize the lock, witness will help us catching this condition, so
> > > > maybe we shouldn't...
> > > 
> > > I think a comment is warranted if we don't want to initialize the
> > > lock to prevent someone from fixing this in the future ;-)
> > 
> > A solution based on a comment and a non-enabled by option seems very
> > fragile to me.  I came up with the idea of poisoning the ipl of the
> > mutex.  What do you think?
> 
> Not sure if it makes sense to do this only for i386.  And is
> splraise() the right place for the check?  Instead of mtx_enter()?

Well, calling splraise() with a negative value would also be a bug, no?

Sure, nothing prevents us from adding this check on more architectures.  Diff
below does it for amd64 and sparc64 as well.  Could you do it for the ones
you can test?

Index: arch/amd64/amd64/intr.c
===
RCS file: /cvs/src/sys/arch/amd64/amd64/intr.c,v
retrieving revision 1.54
diff -u -p -r1.54 intr.c
--- arch/amd64/amd64/intr.c 17 Jun 2020 06:14:52 -  1.54
+++ arch/amd64/amd64/intr.c 21 Dec 2020 11:59:31 -
@@ -696,6 +696,8 @@ splraise(int nlevel)
int olevel;
struct cpu_info *ci = curcpu();
 
+   KASSERT(nlevel >= IPL_NONE);
+
olevel = ci->ci_ilevel;
ci->ci_ilevel = MAX(ci->ci_ilevel, nlevel);
return (olevel);
Index: arch/i386/i386/machdep.c
===
RCS file: /cvs/src/sys/arch/i386/i386/machdep.c,v
retrieving revision 1.641
diff -u -p -r1.641 machdep.c
--- arch/i386/i386/machdep.c8 Nov 2020 20:37:23 -   1.641
+++ arch/i386/i386/machdep.c21 Dec 2020 11:58:12 -
@@ -3996,6 +3996,8 @@ splraise(int ncpl)
 {
int ocpl;
 
+   KASSERT(ncpl >= IPL_NONE);
+
_SPLRAISE(ocpl, ncpl);
return (ocpl);
 }
Index: arch/i386/i386/pmap.c
===
RCS file: /cvs/src/sys/arch/i386/i386/pmap.c,v
retrieving revision 1.209
diff -u -p -r1.209 pmap.c
--- arch/i386/i386/pmap.c   24 Sep 2020 11:36:50 -  1.209
+++ arch/i386/i386/pmap.c   19 Dec 2020 20:58:48 -
@@ -961,6 +961,8 @@ pmap_bootstrap(vaddr_t kva_start)
 */
 
kpm = pmap_kernel();
+   mtx_init(&kpm->pm_mtx, -1); /* must not be used */
+   mtx_init(&kpm->pm_apte_mtx, IPL_VM);
uvm_objinit(&kpm->pm_obj, NULL, 1);
bzero(&kpm->pm_list, sizeof(kpm->pm_list));  /* pm_list not used */
kpm->pm_pdir = (vaddr_t)(proc0.p_addr->u_pcb.pcb_cr3 + KERNBASE);
Index: arch/sparc64/sparc64/intr.c
===
RCS file: /cvs/src/sys/arch/sparc64/sparc64/intr.c,v
retrieving revision 1.61
diff -u -p -r1.61 intr.c
--- arch/sparc64/sparc64/intr.c 24 Jun 2020 22:03:40 -  1.61
+++ arch/sparc64/sparc64/intr.c 21 Dec 2020 12:00:37 -
@@ -322,6 +322,7 @@ intr_establish(int level, struct intrhan
 int
 splraise(int ipl)
 {
+   KASSERT(ipl >= IPL_NONE);
return (_splraise(ipl));
 }
 



uvmexp & per-CPU counters

2020-12-21 Thread Martin Pieuchot
During a page fault multiple counters are updated.  They fall into two
categories, "fault counters" and "global statistics", both of which are
currently represented by int-sized fields inside a global: `uvmexp'.

Diff below makes use of the per-CPU counters_inc(9) API to make sure no
update is lost with an unlocked fault handler.  I only converted the
fields touched by uvm_fault() to have a working solution and start a
discussion.

- Should we keep a single enum for all fields inside `uvmexp' or do we
  want to separate "statistics counters" which are mostly used in sys/arch
  from "fault counters" which are only used in uvm/uvm_fault.c?

- The counter_add(9) API deals with uint64_t and currently uvmexp uses
  int.  Should we truncate or change the size of uvmexp fields or do
  something else?

Comments?
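
For context, the percpu counters(9) pattern used here looks roughly like the
sketch below.  The enum layout and the `exp_ncounters' terminator are
illustrative; the real names are the ones visible in the diff.

    /* one enumerator per converted uvmexp field */
    enum uvm_exp_counters {
            flt_anget,
            flt_pgwait,
            flt_noram,
            /* ... */
            exp_ncounters
    };

    extern struct cpumem *uvmexp_counters;

    /* uvm_init_percpu(): allocate the per-CPU backing store */
    uvmexp_counters = counters_alloc(exp_ncounters);

    /* fault path: lockless per-CPU increment */
    counters_inc(uvmexp_counters, flt_anget);

    /* readers (sysctl, ddb's "show uvmexp") fold everything into totals */
    uint64_t totals[exp_ncounters];
    counters_read(uvmexp_counters, totals, exp_ncounters);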

Index: kern/init_main.c
===
RCS file: /cvs/src/sys/kern/init_main.c,v
retrieving revision 1.302
diff -u -p -r1.302 init_main.c
--- kern/init_main.c7 Dec 2020 16:55:28 -   1.302
+++ kern/init_main.c21 Dec 2020 19:37:13 -
@@ -432,6 +432,7 @@ main(void *framep)
 #endif
 
mbcpuinit();/* enable per cpu mbuf data */
+   uvm_init_percpu();
 
/* init exec and emul */
init_exec();
Index: uvm/uvm_extern.h
===
RCS file: /cvs/src/sys/uvm/uvm_extern.h,v
retrieving revision 1.155
diff -u -p -r1.155 uvm_extern.h
--- uvm/uvm_extern.h1 Dec 2020 13:56:22 -   1.155
+++ uvm/uvm_extern.h21 Dec 2020 19:37:13 -
@@ -289,6 +289,7 @@ voiduvm_vsunlock_device(struct proc 
*
void *);
 void   uvm_pause(void);
 void   uvm_init(void); 
+void   uvm_init_percpu(void);
 intuvm_io(vm_map_t, struct uio *, int);
 
 #defineUVM_IO_FIXPROT  0x01
Index: uvm/uvm_fault.c
===
RCS file: /cvs/src/sys/uvm/uvm_fault.c,v
retrieving revision 1.109
diff -u -p -r1.109 uvm_fault.c
--- uvm/uvm_fault.c 8 Dec 2020 12:26:31 -   1.109
+++ uvm/uvm_fault.c 21 Dec 2020 19:37:13 -
@@ -35,6 +35,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -271,7 +272,7 @@ uvmfault_anonget(struct uvm_faultinfo *u
int result;
 
result = 0; /* XXX shut up gcc */
-   uvmexp.fltanget++;
+   counters_inc(uvmexp_counters, flt_anget);
 /* bump rusage counters */
if (anon->an_page)
curproc->p_ru.ru_minflt++;
@@ -295,7 +296,7 @@ uvmfault_anonget(struct uvm_faultinfo *u
if ((pg->pg_flags & (PG_BUSY|PG_RELEASED)) == 0)
return (VM_PAGER_OK);
atomic_setbits_int(&pg->pg_flags, PG_WANTED);
-   uvmexp.fltpgwait++;
+   counters_inc(uvmexp_counters, flt_pgwait);
 
/*
 * the last unlock must be an atomic unlock+wait on
@@ -310,7 +311,7 @@ uvmfault_anonget(struct uvm_faultinfo *u
 
if (pg == NULL) {   /* out of RAM.  */
uvmfault_unlockall(ufi, amap, NULL);
-   uvmexp.fltnoram++;
+   counters_inc(uvmexp_counters, flt_noram);
uvm_wait("flt_noram1");
/* ready to relock and try again */
} else {
@@ -325,7 +326,7 @@ uvmfault_anonget(struct uvm_faultinfo *u
 * it is ok to read an_swslot here because
 * we hold PG_BUSY on the page.
 */
-   uvmexp.pageins++;
+   counters_inc(uvmexp_counters, pageins);
result = uvm_swap_get(pg, anon->an_swslot,
PGO_SYNCIO);
 
@@ -369,7 +370,7 @@ uvmfault_anonget(struct uvm_faultinfo *u
uvm_anfree(anon);   /* frees page for us */
if (locked)
uvmfault_unlockall(ufi, amap, NULL);
-   uvmexp.fltpgrele++;
+   counters_inc(uvmexp_counters, flt_pgrele);
return (VM_PAGER_REFAULT);  /* refault! */
}
 
@@ -426,7 +427,7 @@ uvmfault_anonget(struct uvm_faultinfo *u
}
 
/* try it again! */
-   uvmexp.fltanretry++;
+   counters_inc(uvmexp_counters, flt_anretry);
continue;
 
} /* while (1) */
@@ -547,7 +548,7 @@ uvm_fault_check(struct uvm_faultinfo *uf
/* need to clear */

Re: Force knote state update in klist_invalidate()

2020-12-21 Thread Martin Pieuchot
On 21/12/20(Mon) 16:45, Visa Hankala wrote:
> There is a slight inconsistency in klist_invalidate(). If the knote is
> already in the event queue and has flag EV_ONESHOT, kqueue_scan() will
> not invoke the newly set f_event. In this case, the kevent(2) system
> call will return the knote's original event state that no longer
> reflects the state that is reachable through the file descriptor
> (the caller of klist_invalidate() has already revoked access to the
> file or device).

I don't understand the problem.  Why should filt_dead() be called?  Is
it a race between two threads?  Would you mind giving a scenario or a
code example?

> I think a proper fix is to invoke f_event manually to force the state
> update.
> 
> OK?
> 
> Index: kern/kern_event.c
> ===
> RCS file: src/sys/kern/kern_event.c,v
> retrieving revision 1.153
> diff -u -p -r1.153 kern_event.c
> --- kern/kern_event.c 20 Dec 2020 12:54:05 -  1.153
> +++ kern/kern_event.c 21 Dec 2020 16:19:30 -
> @@ -1618,6 +1618,7 @@ klist_invalidate(struct klist *list)
>   kn->kn_fop->f_detach(kn);
>   if (kn->kn_fop->f_flags & FILTEROP_ISFD) {
>   kn->kn_fop = &dead_filtops;
> + kn->kn_fop->f_event(kn, 0);
>   knote_activate(kn);
>   s = splhigh();
>   knote_release(kn);
> 



Re: kqueue_scan() should not return EWOULDBLOCK

2020-12-23 Thread Martin Pieuchot
On 23/12/20(Wed) 07:18, Visa Hankala wrote:
> This fixes a recent regression in kqueue_scan() where the function can
> mistakenly return EWOULDBLOCK.
> 
> Currently, kqueue_scan() does one more scan attempt after a timeout.
> Usually, this gives no new events and the function bails out through
> the following code. Note that it clears `error'.
> 
> if (kq->kq_count == 0) {
> /*
>  * Successive loops are only necessary if there are more
>  * ready events to gather, so they don't need to block.
>  */
> if ((tsp != NULL && !timespecisset(tsp)) ||
> scan->kqs_nevent != 0) {
> splx(s);
> error = 0;
> goto done;
> }
> 
> However, there can be a last-minute event activation, in which case the
> function processes the event queue. Unfortunately, the error variable
> preserves its value EWOULDBLOCK/EAGAIN that gets returned to the caller.
> kevent(2), or select(2) or poll(2), is not supposed to return this error.
> 
> The issue emerged in r1.146 of kern_event.c when the final copyout() was
> moved outside kqueue_scan(). The copyout()'s return value used to
> override the EWOULDBLOCK.
> 
> The following patch fixes the regression by clearing `error' at the
> start of each scan round. The clearing could be done conditionally after
> kqueue_sleep(). However, that does not seem as robust.

The value could be cleared before "goto retry"; that would be more
robust, no?

Ok mpi@ either way.
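
To be explicit, the alternative I have in mind is to reset the value at the
spot that loops back, roughly (a sketch, assuming the retry is taken after
both a wakeup and a timeout):

    if (error == 0 || error == EWOULDBLOCK) {
            error = 0;
            goto retry;
    }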

> Index: kern/kern_event.c
> ===
> RCS file: src/sys/kern/kern_event.c,v
> retrieving revision 1.153
> diff -u -p -r1.153 kern_event.c
> --- kern/kern_event.c 20 Dec 2020 12:54:05 -  1.153
> +++ kern/kern_event.c 23 Dec 2020 07:10:24 -
> @@ -977,6 +977,8 @@ kqueue_scan(struct kqueue_scan_state *sc
>  retry:
>   KASSERT(nkev == 0);
>  
> + error = 0;
> +
>   if (kq->kq_state & KQ_DYING) {
>   error = EBADF;
>   goto done;
> 



Re: uvmexp & per-CPU counters

2020-12-23 Thread Martin Pieuchot
On 22/12/20(Tue) 23:43, Mark Kettenis wrote:
> > Date: Mon, 21 Dec 2020 16:46:32 -0300
> > From: Martin Pieuchot 
> > 
> > During a page fault multiples counters are updated.  They fall into two
> > categories "fault counters" and "global statistics" both of which are
> > currently represented by int-sized fields inside a global: `uvmexp'.
> > 
> > Diff below makes use of the per-CPU counters_inc(9) API to make sure no
> > update is lost with an unlocked fault handler.  I only converted the
> > fields touched by uvm_fault() to have a working solution and start a
> > discussion.
> > 
> > - Should we keep a single enum for all fields inside `uvmexp' or do we
> >   want to separate "statistics counters" which are mostly used sys/arch
> >   from "fault counters" which are only used in uvm/uvm_fault.c?
> > 
> > - The counter_add(9) API deals with uint64_t and currently uvmexp uses
> >   int.  Should we truncate or change the size of uvmexp fields or do
> >   something else?
> > 
> > Comments?
> 
> I think this breaks "show uvmexp" in ddb.

Updated diff below fixes that.  Any comment on the issues raised above?

> You fear that using atomic operations for these counters would lead to
> too much bus contention on systems with a large number of CPUs?

I don't know.  I don't see the point of using atomic operations for
"real" counters that are not used to make any decision.  Atomic
operations have high costs, and bus contention might be one of them.

Index: kern/init_main.c
===
RCS file: /cvs/src/sys/kern/init_main.c,v
retrieving revision 1.302
diff -u -p -r1.302 init_main.c
--- kern/init_main.c7 Dec 2020 16:55:28 -   1.302
+++ kern/init_main.c21 Dec 2020 19:37:13 -
@@ -432,6 +432,7 @@ main(void *framep)
 #endif
 
mbcpuinit();/* enable per cpu mbuf data */
+   uvm_init_percpu();
 
/* init exec and emul */
init_exec();
Index: uvm/uvm_extern.h
===
RCS file: /cvs/src/sys/uvm/uvm_extern.h,v
retrieving revision 1.155
diff -u -p -r1.155 uvm_extern.h
--- uvm/uvm_extern.h1 Dec 2020 13:56:22 -   1.155
+++ uvm/uvm_extern.h21 Dec 2020 19:37:13 -
@@ -289,6 +289,7 @@ voiduvm_vsunlock_device(struct proc 
*
void *);
 void   uvm_pause(void);
 void   uvm_init(void); 
+void   uvm_init_percpu(void);
 intuvm_io(vm_map_t, struct uio *, int);
 
 #defineUVM_IO_FIXPROT  0x01
Index: uvm/uvm_fault.c
===
RCS file: /cvs/src/sys/uvm/uvm_fault.c,v
retrieving revision 1.109
diff -u -p -r1.109 uvm_fault.c
--- uvm/uvm_fault.c 8 Dec 2020 12:26:31 -   1.109
+++ uvm/uvm_fault.c 21 Dec 2020 19:37:13 -
@@ -35,6 +35,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -271,7 +272,7 @@ uvmfault_anonget(struct uvm_faultinfo *u
int result;
 
result = 0; /* XXX shut up gcc */
-   uvmexp.fltanget++;
+   counters_inc(uvmexp_counters, flt_anget);
 /* bump rusage counters */
if (anon->an_page)
curproc->p_ru.ru_minflt++;
@@ -295,7 +296,7 @@ uvmfault_anonget(struct uvm_faultinfo *u
if ((pg->pg_flags & (PG_BUSY|PG_RELEASED)) == 0)
return (VM_PAGER_OK);
atomic_setbits_int(&pg->pg_flags, PG_WANTED);
-   uvmexp.fltpgwait++;
+   counters_inc(uvmexp_counters, flt_pgwait);
 
/*
 * the last unlock must be an atomic unlock+wait on
@@ -310,7 +311,7 @@ uvmfault_anonget(struct uvm_faultinfo *u
 
if (pg == NULL) {   /* out of RAM.  */
uvmfault_unlockall(ufi, amap, NULL);
-   uvmexp.fltnoram++;
+   counters_inc(uvmexp_counters, flt_noram);
uvm_wait("flt_noram1");
/* ready to relock and try again */
} else {
@@ -325,7 +326,7 @@ uvmfault_anonget(struct uvm_faultinfo *u
 * it is ok to read an_swslot here because
 * we hold PG_BUSY on the page.
 */
-   uvmexp.pageins++;
+   counters_inc(uvmexp_counters, pageins);
result = uvm_swap_get(pg, ano

Re: Rename SIMPLEQ_ to STAILQ_, diff 1/7

2020-12-26 Thread Martin Pieuchot
On 26/12/20(Sat) 18:23, Mark Kettenis wrote:
> [...]
> NetBSD and Solaris both provide SIMPLEQ_* and STAILQ_*.  I'm not sure
> removing one in favour of the other is helpful.

It would be helpful to provide both sets of macros for some time to ease
the transition/conversion.

Then we can decide if we want to get rid of SIMPLEQ_* or not.
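
Providing both could be as simple as a compatibility layer on top of the
existing macros, roughly along these lines (a sketch, not the actual diff):

    /* STAILQ_* as thin aliases of SIMPLEQ_* during the transition */
    #define STAILQ_HEAD(name, type)         SIMPLEQ_HEAD(name, type)
    #define STAILQ_ENTRY(type)              SIMPLEQ_ENTRY(type)
    #define STAILQ_INIT(head)               SIMPLEQ_INIT(head)
    #define STAILQ_FIRST(head)              SIMPLEQ_FIRST(head)
    #define STAILQ_INSERT_TAIL(head, elm, field) \
            SIMPLEQ_INSERT_TAIL(head, elm, field)
    /* ... and so on for the rest of the API */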



Re: i386 pmap diff

2020-12-28 Thread Martin Pieuchot
On 23/12/20(Wed) 18:24, Mark Kettenis wrote:
> Diff below switches the i386 pmap to use the modern km_alloc(9)
> functions and uses IPL_VM for the pmap pool, following the example of
> amd64.

Diff below is the one I sent you last year.  It has an "#if notyet"
around the allocation that generates the following fault:

panic: uvm_fault(0xd0e39af8, 0xf1dfc000, 0, 1) -> e
Stopped at  db_enter+0x4:   popl%ebp
TIDPIDUID PRFLAGS PFLAGS  CPU  COMMAND
* 0  0  0 0x1  0x2000K swapper
db_enter(d0e53909,d10c5df4,0,f1dfc000,d0ecca7c) at db_enter+0x4
panic(d0c38a96,d0e39af8,f1dfc000,1,e) at panic+0xd3
kpageflttrap(d10c5e60,f1dfc000,f1dfc000,,d0f78b00) at kpageflttrap+0x14d
trap(d10c5e60) at trap+0x26a
calltrap(8,10006,d1d91cc0,f1ee2000,d083107c) at calltrap+0xc
docopyf(d1d91cc0) at docopyf+0x5
pmap_create(1,1000,61c1cc4d,d1da2ea4,d0f7af34) at pmap_create+0xa8
uvmspace_fork(d0f7ab0c,d1d94ca0,d0f7ab0c,1,d10c5f70) at uvmspace_fork+0x56
process_new(d1d94ca0,d0f7ab0c,1) at process_new+0xeb
fork1(d0ecca7c,1,d08c8d40,0,0,d10c5f90) at fork1+0x1ba

> Don't have easy access to an i386 machine right now, so this has only
> been compile tested.

This can be reproduced in vmm(4) in case you'd like to debug it.

Index: arch/i386/i386/pmap.c
===
RCS file: /cvs/src/sys/arch/i386/i386/pmap.c,v
retrieving revision 1.210
diff -u -p -r1.210 pmap.c
--- arch/i386/i386/pmap.c   28 Dec 2020 14:02:08 -  1.210
+++ arch/i386/i386/pmap.c   28 Dec 2020 14:17:45 -
@@ -1365,7 +1365,7 @@ void
 pmap_pinit_pd_86(struct pmap *pmap)
 {
/* allocate PDP */
-   pmap->pm_pdir = uvm_km_alloc(kernel_map, NBPG);
+   pmap->pm_pdir = (vaddr_t)km_alloc(NBPG, &kv_any, &kp_dirty, &kd_waitok);
if (pmap->pm_pdir == 0)
panic("pmap_pinit_pd_86: kernel_map out of virtual space!");
pmap_extract(pmap_kernel(), (vaddr_t)pmap->pm_pdir,
@@ -1397,7 +1397,8 @@ pmap_pinit_pd_86(struct pmap *pmap)
 * execution, one that lacks all kernel mappings.
 */
if (cpu_meltdown) {
-   pmap->pm_pdir_intel = uvm_km_zalloc(kernel_map, NBPG);
+   pmap->pm_pdir_intel = (vaddr_t)km_alloc(NBPG, &kv_any, &kp_zero,
+   &kd_waitok);
if (pmap->pm_pdir_intel == 0)
panic("%s: kernel_map out of virtual space!", __func__);
 
@@ -1449,11 +1450,12 @@ pmap_destroy(struct pmap *pmap)
uvm_pagefree(pg);
}
 
-   uvm_km_free(kernel_map, pmap->pm_pdir, pmap->pm_pdirsize);
+   km_free((void *)pmap->pm_pdir, pmap->pm_pdirsize, &kv_any, &kp_dirty);
pmap->pm_pdir = 0;
 
if (pmap->pm_pdir_intel) {
-   uvm_km_free(kernel_map, pmap->pm_pdir_intel, pmap->pm_pdirsize);
+   km_free((void *)pmap->pm_pdir_intel, pmap->pm_pdirsize,
+   &kv_any, &kp_dirty);
pmap->pm_pdir_intel = 0;
}
 
@@ -2522,8 +2524,9 @@ pmap_enter_special_86(vaddr_t va, paddr_
__func__, va);
 
if (!pmap->pm_pdir_intel) {
-   if ((pmap->pm_pdir_intel = uvm_km_zalloc(kernel_map, NBPG))
-   == 0)
+   pmap->pm_pdir_intel = (vaddr_t)km_alloc(NBPG, &kv_any, &kp_zero,
+   &kd_waitok);
+   if (pmap->pm_pdir_intel == 0)
panic("%s: kernel_map out of virtual space!", __func__);
if (!pmap_extract(pmap, pmap->pm_pdir_intel,
&pmap->pm_pdirpa_intel))
Index: arch/i386/i386/pmapae.c
===
RCS file: /cvs/src/sys/arch/i386/i386/pmapae.c,v
retrieving revision 1.60
diff -u -p -r1.60 pmapae.c
--- arch/i386/i386/pmapae.c 23 Sep 2020 15:13:26 -  1.60
+++ arch/i386/i386/pmapae.c 28 Dec 2020 14:17:45 -
@@ -738,7 +738,7 @@ pmap_bootstrap_pae(void)
(uint32_t)VM_PAGE_TO_PHYS(ptppg));
}
}
-   uvm_km_free(kernel_map, (vaddr_t)pd, NBPG);
+   km_free(pd, NBPG, &kv_any, &kp_dirty);
DPRINTF("%s: freeing PDP 0x%x\n", __func__, (uint32_t)pd);
}
 
@@ -944,7 +944,8 @@ pmap_pinit_pd_pae(struct pmap *pmap)
paddr_t pdidx[4];
 
/* allocate PDP */
-   pmap->pm_pdir = uvm_km_alloc(kernel_map, 4 * NBPG);
+   pmap->pm_pdir = (vaddr_t)km_alloc(4 * NBPG, &kv_any, &kp_dirty,
+   &kd_waitok);
if (pmap->pm_pdir == 0)
panic("pmap_pinit_pd_pae: kernel_map out of virtual space!");
/* page index is in the pmap! */
@@ -997,7 +998,8 @@ pmap_pinit_pd_pae(struct pmap *pmap)
if (cpu_meltdown) {
int i;
 
-   if ((va = uvm_km_zalloc(kernel_map, 4 * NBPG)) == 0)
+   va = (vaddr_t)km_alloc(4 * NBPG, &kv_any, &kp_zero, &kd_waitok);
+   if (va == 

Re: sleep_setup/finish simplification

2020-12-28 Thread Martin Pieuchot
On 08/12/20(Tue) 10:06, Martin Pieuchot wrote:
> Diff below aims to simplify the API to put a thread on a sleep queue and
> reduce it to the following:
> 
>   sleep_setup();
>   /* check condition or release lock */
>   sleep_finish();
> 
> It is motivated by my work to sleep the SCHED_LOCK() but might as well
> prevent/fix some bugs.
> 
> The tricky part of the current implementation is that sleep_setup_signal()
> can already park/stop the current thread resulting in a context change.
> Should any custom accounting / lock check happen before that?  At least
> two lock primitives do so currently:  drm's schedule_timeout() and
> rwlock's rw_enter().
> 
> As a result of this diff various states can be removed and sleep_finish()
> contains the following magic:
> 
>   1. check for signal/parking
>   2. context switch or remove from sleep queue
>   3. check for signal/parking
> 
> Note that sleep_finish() could be simplified even further but I left
> that for later to ease the review.
> 
> Comments?  Oks?

Anyone?
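
To summarize the change, the before/after for a typical interruptible sleep
with a timeout looks roughly like this (a sketch based on the chunks below;
`ident', `prio' and `timo' stand for the usual arguments):

    /* before */
    sleep_setup(&sls, ident, prio, "wmesg");
    sleep_setup_timeout(&sls, timo);
    sleep_setup_signal(&sls);
    sleep_finish(&sls, condition);
    error = sleep_finish_signal(&sls);

    /* after */
    sleep_setup(&sls, ident, prio, "wmesg", timo);
    error = sleep_finish(&sls, condition);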

> Index: dev/dt/dt_dev.c
> ===
> RCS file: /cvs/src/sys/dev/dt/dt_dev.c,v
> retrieving revision 1.10
> diff -u -p -r1.10 dt_dev.c
> --- dev/dt/dt_dev.c   28 Sep 2020 13:16:58 -  1.10
> +++ dev/dt/dt_dev.c   7 Dec 2020 17:19:15 -
> @@ -225,10 +225,8 @@ dtread(dev_t dev, struct uio *uio, int f
>   return (EMSGSIZE);
>  
>   while (!sc->ds_evtcnt) {
> - sleep_setup(&sls, sc, PWAIT | PCATCH, "dtread");
> - sleep_setup_signal(&sls);
> - sleep_finish(&sls, !sc->ds_evtcnt);
> - error = sleep_finish_signal(&sls);
> + sleep_setup(&sls, sc, PWAIT | PCATCH, "dtread", 0);
> + error = sleep_finish(&sls, !sc->ds_evtcnt);
>   if (error == EINTR || error == ERESTART)
>   break;
>   }
> Index: dev/pci/drm/drm_linux.c
> ===
> RCS file: /cvs/src/sys/dev/pci/drm/drm_linux.c,v
> retrieving revision 1.70
> diff -u -p -r1.70 drm_linux.c
> --- dev/pci/drm/drm_linux.c   14 Nov 2020 23:08:47 -  1.70
> +++ dev/pci/drm/drm_linux.c   7 Dec 2020 17:19:15 -
> @@ -110,26 +110,23 @@ schedule_timeout(long timeout)
>  {
>   struct sleep_state sls;
>   long deadline;
> - int wait, spl;
> + int wait, spl, timo = 0;
>  
>   MUTEX_ASSERT_LOCKED(&sch_mtx);
>   KASSERT(!cold);
>  
> - sleep_setup(&sls, sch_ident, sch_priority, "schto");
>   if (timeout != MAX_SCHEDULE_TIMEOUT)
> - sleep_setup_timeout(&sls, timeout);
> + timo = timeout;
> + sleep_setup(&sls, sch_ident, sch_priority, "schto", timo);
>  
>   wait = (sch_proc == curproc && timeout > 0);
>  
>   spl = MUTEX_OLDIPL(&sch_mtx);
>   MUTEX_OLDIPL(&sch_mtx) = splsched();
>   mtx_leave(&sch_mtx);
> -
> - sleep_setup_signal(&sls);
> -
>   if (timeout != MAX_SCHEDULE_TIMEOUT)
>   deadline = ticks + timeout;
> - sleep_finish_all(&sls, wait);
> + sleep_finish(&sls, wait);
>   if (timeout != MAX_SCHEDULE_TIMEOUT)
>   timeout = deadline - ticks;
>  
> Index: dev/pci/if_myx.c
> ===
> RCS file: /cvs/src/sys/dev/pci/if_myx.c,v
> retrieving revision 1.112
> diff -u -p -r1.112 if_myx.c
> --- dev/pci/if_myx.c  27 Nov 2020 00:13:15 -  1.112
> +++ dev/pci/if_myx.c  7 Dec 2020 17:19:15 -
> @@ -1396,7 +1396,7 @@ myx_down(struct myx_softc *sc)
>   (void)myx_cmd(sc, MYXCMD_SET_IFDOWN, &mc, NULL);
>  
>   while (sc->sc_state != MYX_S_OFF) {
> - sleep_setup(&sls, sts, PWAIT, "myxdown");
> + sleep_setup(&sls, sts, PWAIT, "myxdown", 0);
>   membar_consumer();
>   sleep_finish(&sls, sc->sc_state != MYX_S_OFF);
>   }
> Index: kern/kern_rwlock.c
> ===
> RCS file: /cvs/src/sys/kern/kern_rwlock.c,v
> retrieving revision 1.45
> diff -u -p -r1.45 kern_rwlock.c
> --- kern/kern_rwlock.c2 Mar 2020 17:07:49 -   1.45
> +++ kern/kern_rwlock.c7 Dec 2020 17:19:15 -
> @@ -278,15 +278,13 @@ retry:
>   prio = op->wait_prio;
>   if (flags & RW_INTR)
>   prio |= PCATCH;
> - sleep_setup(&sls, rwl, prio, rwl->rwl_

uvm_fault: amap & anon locking

2020-12-30 Thread Martin Pieuchot
Diff below adds some locking to UVM's amap & anon data structures that
should be enough to get the upper part of the fault handler out of the
KERNEL_LOCK().

This diff doesn't unlock the fault handler, I'd suggest to do this in a
later step on an arch by arch basis.

This is a port of what exists in NetBSD.  A rwlock is attached to every
amap and is shared with all its anon.  The same lock will be used by
multiple amaps if they have anons in common.  This diff includes the new
rw_obj_* API required to have reference-counted rwlocks.

Other than that a global rwlock is used to protect the list of amaps, and
many pools have been converted to use a rwlock internally so as not to
create lock ordering problems when allocations are made while holding a
rwlock.

The style of the diff is sometimes questionable.  This is done to reduce
differences with NetBSD sources in order to help port more locking goo.

This has been extensively tested as part of the unlocking diff I sent to
many developers.  However, I'd appreciate it if you could test again because
this diff doesn't include WITNESS and does not unlock the fault handler.
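
To give an idea of how the reference-counted lock is meant to be shared, here
is a sketch of the intended usage.  rw_obj_alloc() and rw_obj_hold() are the
entry points added below (rw_obj_alloc() being the macro front-end of
_rw_obj_alloc_flags()); the release side and the `an_lock' member are assumed
from the rest of the series:

    struct rwlock *lock;

    rw_obj_alloc(&lock, "amaplk");  /* refcount 1, owned by the amap */
    anon->an_lock = lock;
    rw_obj_hold(lock);              /* each anon sharing it takes a ref */

    rw_enter(lock, RW_WRITE);       /* locks the amap and all its anons */
    /* ... fault handling ... */
    rw_exit(lock);

    rw_obj_free(lock);              /* drop a ref, freed when it reaches 0 */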

Index: kern/init_main.c
===
RCS file: /cvs/src/sys/kern/init_main.c,v
retrieving revision 1.303
diff -u -p -r1.303 init_main.c
--- kern/init_main.c28 Dec 2020 14:01:23 -  1.303
+++ kern/init_main.c29 Dec 2020 14:13:52 -
@@ -232,6 +232,7 @@ main(void *framep)
KERNEL_LOCK_INIT();
SCHED_LOCK_INIT();
 
+   rw_obj_init();
uvm_init();
disk_init();/* must come before autoconfiguration */
tty_init(); /* initialise tty's */
Index: kern/kern_rwlock.c
===
RCS file: /cvs/src/sys/kern/kern_rwlock.c,v
retrieving revision 1.45
diff -u -p -r1.45 kern_rwlock.c
--- kern/kern_rwlock.c  2 Mar 2020 17:07:49 -   1.45
+++ kern/kern_rwlock.c  30 Dec 2020 14:03:00 -
@@ -19,6 +19,7 @@
 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -487,4 +488,124 @@ int
 rrw_status(struct rrwlock *rrwl)
 {
return (rw_status(&rrwl->rrwl_lock));
+}
+
+/*-
+ * Copyright (c) 2008 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Andrew Doran.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *notice, this list of conditions and the following disclaimer in the
+ *documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#defineRWLOCK_OBJ_MAGIC0x5aa3c85d
+struct rwlock_obj {
+   struct rwlock   ro_lock;
+   u_int   ro_magic;
+   u_int   ro_refcnt;
+};
+
+
+struct pool rwlock_obj_pool;
+
+/*
+ * rw_obj_init:
+ *
+ * Initialize the mutex object store.
+ */
+void
+rw_obj_init(void)
+{
+   pool_init(&rwlock_obj_pool, sizeof(struct rwlock_obj), 0, IPL_NONE,
+   PR_WAITOK | PR_RWLOCK, "rwobjpl", NULL);
+}
+
+/*
+ * rw_obj_alloc:
+ *
+ * Allocate a single lock object.
+ */
+void
+_rw_obj_alloc_flags(struct rwlock **lock, const char *name, int flags,
+struct lock_type *type)
+{
+   struct rwlock_obj *mo;
+
+   mo = pool_get(&rwlock_obj_pool, PR_WAITOK);
+   mo->ro_magic = RWLOCK_OBJ_MAGIC;
+   _rw_init_flags(&mo->ro_lock, name, flags, type);
+   mo->ro_refcnt = 1;
+
+   *lock = &mo->ro_lock;
+}
+
+/*
+ * rw_obj_hold:
+ *
+ * Add a single reference to a lock object.  A reference to the object
+ * must already be held, and must be held across this call.
+ */
+
+void
+rw_obj_hold(struct rwlock *lock)
+{
+   struct rwlock_obj *mo = (struct rwlock_obj *)lock;
+
+   KASSERTMSG(mo->ro_magic == RWLOCK_OBJ_MAGIC,
+   "%s: lock %p: mo->ro_magic (%#x) != RWLOCK_OBJ_MAGIC (%#x)",

Re: uvm_fault: amap & anon locking

2021-01-11 Thread Martin Pieuchot
On 31/12/20(Thu) 22:35, Mark Kettenis wrote:
> > Date: Wed, 30 Dec 2020 11:19:41 -0300
> > From: Martin Pieuchot 
> > 
> > Diff below adds some locking to UVM's amap & anon data structures that
> > should be enough to get the upper part of the fault handler out of the
> > KERNEL_LOCK().
> > 
> > This diff doesn't unlock the fault handler, I'd suggest to do this in a
> > later step on an arch by arch basis.
> > 
> > This is a port of what exists in NetBSD.  A rwlock is attached to every
> > amap and is shared with all its anon.  The same lock will be used by
> > multiple amaps if they have anons in common.  This diff includes the new
> > rw_obj_* API required to have reference-counted rwlocks.
> > 
> > Other than that a global rwlock is used to protect the list of amap and
> > many pool have been converted to use a rwlock internally to not create
> > lock ordering problem when allocations are made while holding a rwlock.
> 
> Can you explain what those lock odering problems are?  To me it seems
> that a pool mutex should always be taken last, and that the pool
> system doesn't need to enter the amap or anon code.

This is the same problem prevented by IPL_MPFLOOR but with a different
approach.  Taking an interrupt whose handler grabs the KERNEL_LOCK() while
holding a mutex can lead to a deadlock if this mutex can be grabbed with
the KERNEL_LOCK() held.
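
A concrete interleaving of the deadlock being prevented (a sketch):

    CPU0 (process context)              CPU1 (process context)
    mtx_enter(&m)                       KERNEL_LOCK()
    <interrupt arrives on CPU0>         mtx_enter(&m)  -> spins, m is held
      handler: KERNEL_LOCK()
      -> spins, held by CPU1

Neither side can make progress: CPU0 cannot return from the interrupt to
release `m', and CPU1 never gets `m', so it never releases the KERNEL_LOCK().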

Using PR_RWLOCK was the approach introduced in guenther@'s diff to
unlock mmap(2).  The way I understand it is that this code should only be
used in process contexts, which is why PR_RWLOCK was added.

This leads me to question whether PR_RWLOCK is worth the complexity.  Since
then its use has spread across the tree.

> Removing the check in pool_get that PR_WAITOK is set for pools created
> with PR_RWLOCK is a bit problematic from my perspective.  At the very
> least we should adjust the man page.

This is necessary if we want to use PR_RWLOCK, since uvm_analloc() must not
sleep in order to be able to handle out-of-RAM conditions in the fault
handler.

The alternative would be to use IPL_MPFLOOR when allocating anon & amap;
this has the advantage of documenting that this IPL change is only
"temporary".

Diff below does that.  If we agree on this approach I'd like to start
collecting oks, for parts or for the whole.


Index: kern/init_main.c
===
RCS file: /cvs/src/sys/kern/init_main.c,v
retrieving revision 1.304
diff -u -p -r1.304 init_main.c
--- kern/init_main.c1 Jan 2021 07:00:33 -   1.304
+++ kern/init_main.c11 Jan 2021 11:59:38 -
@@ -232,6 +232,7 @@ main(void *framep)
KERNEL_LOCK_INIT();
SCHED_LOCK_INIT();
 
+   rw_obj_init();
uvm_init();
disk_init();/* must come before autoconfiguration */
tty_init(); /* initialise tty's */
Index: kern/kern_rwlock.c
===
RCS file: /cvs/src/sys/kern/kern_rwlock.c,v
retrieving revision 1.45
diff -u -p -r1.45 kern_rwlock.c
--- kern/kern_rwlock.c  2 Mar 2020 17:07:49 -   1.45
+++ kern/kern_rwlock.c  11 Jan 2021 13:59:07 -
@@ -19,6 +19,7 @@
 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -487,4 +488,124 @@ int
 rrw_status(struct rrwlock *rrwl)
 {
return (rw_status(&rrwl->rrwl_lock));
+}
+
+/*-
+ * Copyright (c) 2008 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Andrew Doran.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *notice, this list of conditions and the following disclaimer in the
+ *documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARI

uvm_fault: access_type fixup for wired mapping

2021-01-12 Thread Martin Pieuchot
Diff below moves `access_type' to the context structure passed down to
the various routines and fixes a regression introduced in a previous
refactoring.

`access_type' is overwritten for wired mapping and the value of
`enter_prot' is used instead.

ok?

Index: uvm/uvm_fault.c
===
RCS file: /cvs/src/sys/uvm/uvm_fault.c,v
retrieving revision 1.111
diff -u -p -r1.111 uvm_fault.c
--- uvm/uvm_fault.c 2 Jan 2021 02:39:59 -   1.111
+++ uvm/uvm_fault.c 12 Jan 2021 12:36:38 -
@@ -477,6 +477,7 @@ struct uvm_faultctx {
 * read-only after that.
 */
vm_prot_t enter_prot;
+   vm_prot_t access_type;
vaddr_t startva;
int npages;
int centeridx;
@@ -486,7 +487,7 @@ struct uvm_faultctx {
 };
 
 intuvm_fault_lower(struct uvm_faultinfo *, struct uvm_faultctx *,
-   struct vm_page **, vm_fault_t, vm_prot_t);
+   struct vm_page **, vm_fault_t);
 
 /*
  * uvm_fault_check: check prot, handle needs-copy, etc.
@@ -505,7 +506,7 @@ int uvm_fault_lower(struct uvm_faultinfo
  */
 int
 uvm_fault_check(struct uvm_faultinfo *ufi, struct uvm_faultctx *flt,
-struct vm_anon ***ranons, vm_prot_t access_type)
+struct vm_anon ***ranons)
 {
struct vm_amap *amap;
struct uvm_object *uobj;
@@ -523,7 +524,7 @@ uvm_fault_check(struct uvm_faultinfo *uf
 #endif
 
/* check protection */
-   if ((ufi->entry->protection & access_type) != access_type) {
+   if ((ufi->entry->protection & flt->access_type) != flt->access_type) {
uvmfault_unlockmaps(ufi, FALSE);
return (EACCES);
}
@@ -539,11 +540,11 @@ uvm_fault_check(struct uvm_faultinfo *uf
flt->pa_flags = UVM_ET_ISWC(ufi->entry) ? PMAP_WC : 0;
flt->wired = VM_MAPENT_ISWIRED(ufi->entry) || (flt->narrow == TRUE);
if (flt->wired)
-   access_type = flt->enter_prot; /* full access for wired */
+   flt->access_type = flt->enter_prot; /* full access for wired */
 
/* handle "needs_copy" case. */
if (UVM_ET_ISNEEDSCOPY(ufi->entry)) {
-   if ((access_type & PROT_WRITE) ||
+   if ((flt->access_type & PROT_WRITE) ||
(ufi->entry->object.uvm_obj == NULL)) {
/* need to clear */
uvmfault_unlockmaps(ufi, FALSE);
@@ -648,7 +649,7 @@ uvm_fault_check(struct uvm_faultinfo *uf
  */
 int
 uvm_fault_upper(struct uvm_faultinfo *ufi, struct uvm_faultctx *flt,
-   struct vm_anon **anons, vm_fault_t fault_type, vm_prot_t access_type)
+   struct vm_anon **anons, vm_fault_t fault_type)
 {
struct vm_amap *amap = ufi->entry->aref.ar_amap;
struct vm_anon *oanon, *anon = anons[flt->centeridx];
@@ -699,7 +700,7 @@ uvm_fault_upper(struct uvm_faultinfo *uf
 * if we are out of anon VM we wait for RAM to become available.
 */
 
-   if ((access_type & PROT_WRITE) != 0 && anon->an_ref > 1) {
+   if ((flt->access_type & PROT_WRITE) != 0 && anon->an_ref > 1) {
counters_inc(uvmexp_counters, flt_acow);
oanon = anon;   /* oanon = old */
anon = uvm_analloc();
@@ -761,7 +762,7 @@ uvm_fault_upper(struct uvm_faultinfo *uf
 */
if (pmap_enter(ufi->orig_map->pmap, ufi->orig_rvaddr,
VM_PAGE_TO_PHYS(pg) | flt->pa_flags, flt->enter_prot,
-   access_type | PMAP_CANFAIL | (flt->wired ? PMAP_WIRED : 0)) != 0) {
+   flt->access_type | PMAP_CANFAIL | (flt->wired ? PMAP_WIRED : 0)) != 0) {
/*
 * No need to undo what we did; we can simply think of
 * this as the pmap throwing away the mapping information.
@@ -922,6 +923,7 @@ uvm_fault(vm_map_t orig_map, vaddr_t vad
 * pages on wire */
else
flt.narrow = FALSE; /* normal fault */
+   flt.access_type = access_type;
 
 
/*
@@ -930,7 +932,7 @@ uvm_fault(vm_map_t orig_map, vaddr_t vad
while (error == ERESTART) {
anons = anons_store;
 
-   error = uvm_fault_check(&ufi, &flt, &anons, access_type);
+   error = uvm_fault_check(&ufi, &flt, &anons);
if (error != 0)
continue;
 
@@ -938,13 +940,11 @@ uvm_fault(vm_map_t orig_map, vaddr_t vad
shadowed = uvm_fault_upper_lookup(&ufi, &flt, anons, pages);
if (shadowed == TRUE) {
/* case 1: fault on an anon in our amap */
-   error = uvm_fault_upper(&ufi, &flt, anons, fault_type,
-   access_type);
+   error = uvm_fault_upper(&ufi, &flt, anons, fault_type);
} else {
/* case 2: fault on backing object or zero fill */
KERNEL_LOCK();
-   error = uvm_fault_low

Re: Cache parent's pid as `ps_ppid' and use it instead of `ps_pptr->ps_pid'.

2021-01-13 Thread Martin Pieuchot
On 02/01/21(Sat) 21:54, Vitaliy Makkoveev wrote:
> This allows us to unlock getppid(2). Also NetBSD, DragonflyBSD and OSX
> do the same.

Seems the way to go, two comments below.

> Index: kern/exec_elf.c
> ===
> RCS file: /cvs/src/sys/kern/exec_elf.c,v
> retrieving revision 1.156
> diff -u -p -r1.156 exec_elf.c
> --- kern/exec_elf.c   7 Dec 2020 16:55:28 -   1.156
> +++ kern/exec_elf.c   2 Jan 2021 15:47:46 -
> @@ -1257,7 +1257,7 @@ coredump_notes_elf(struct proc *p, void 
>   cpi.cpi_sigcatch = pr->ps_sigacts->ps_sigcatch;
>  
>   cpi.cpi_pid = pr->ps_pid;
> - cpi.cpi_ppid = pr->ps_pptr->ps_pid;
> + cpi.cpi_ppid = pr->ps_ppid;
>   cpi.cpi_pgrp = pr->ps_pgid;
>   if (pr->ps_session->s_leader)
>   cpi.cpi_sid = pr->ps_session->s_leader->ps_pid;
> Index: kern/kern_exit.c
> ===
> RCS file: /cvs/src/sys/kern/kern_exit.c,v
> retrieving revision 1.193
> diff -u -p -r1.193 kern_exit.c
> --- kern/kern_exit.c  9 Dec 2020 18:58:19 -   1.193
> +++ kern/kern_exit.c  2 Jan 2021 15:47:46 -
> @@ -694,6 +694,7 @@ process_reparent(struct process *child, 
>   }
>  
>   child->ps_pptr = parent;
> + child->ps_ppid = parent->ps_pid;

Should the parent pid be changed when a process is re-parented while
being traced?  It seems that both Free and Net only change it when a
process is re-parented to process 1 (init).

> Index: kern/kern_fork.c
> ===
> RCS file: /cvs/src/sys/kern/kern_fork.c,v
> retrieving revision 1.230
> diff -u -p -r1.230 kern_fork.c
> --- kern/kern_fork.c  7 Dec 2020 16:55:28 -   1.230
> +++ kern/kern_fork.c  2 Jan 2021 15:47:46 -
> @@ -231,6 +231,7 @@ process_new(struct proc *p, struct proce
>  
>   /* post-copy fixups */
>   pr->ps_pptr = parent;
> + pr->ps_ppid = parent->ps_pid;
>  
>   /* bump references to the text vnode (for sysctl) */
>   pr->ps_textvp = parent->ps_textvp;
> Index: kern/kern_prot.c
> ===
> RCS file: /cvs/src/sys/kern/kern_prot.c,v
> retrieving revision 1.76
> diff -u -p -r1.76 kern_prot.c
> --- kern/kern_prot.c  9 Jul 2019 12:23:25 -   1.76
> +++ kern/kern_prot.c  2 Jan 2021 15:47:46 -
> @@ -84,7 +84,7 @@ int
>  sys_getppid(struct proc *p, void *v, register_t *retval)
>  {
>  
> - *retval = p->p_p->ps_pptr->ps_pid;
> + *retval = p->p_p->ps_ppid;
>   return (0);
>  }
>  
> Index: kern/kern_sysctl.c
> ===
> RCS file: /cvs/src/sys/kern/kern_sysctl.c,v
> retrieving revision 1.385
> diff -u -p -r1.385 kern_sysctl.c
> --- kern/kern_sysctl.c28 Dec 2020 18:28:11 -  1.385
> +++ kern/kern_sysctl.c2 Jan 2021 15:47:46 -
> @@ -1666,7 +1666,7 @@ fill_kproc(struct process *pr, struct ki
>  
>   /* stuff that's too painful to generalize into the macros */
>   if (pr->ps_pptr)
> - ki->p_ppid = pr->ps_pptr->ps_pid;
> + ki->p_ppid = pr->ps_ppid;
>   if (s->s_leader)
>   ki->p_sid = s->s_leader->ps_pid;
>  
> Index: sys/proc.h
> ===
> RCS file: /cvs/src/sys/sys/proc.h,v
> retrieving revision 1.303
> diff -u -p -r1.303 proc.h
> --- sys/proc.h9 Dec 2020 18:58:19 -   1.303
> +++ sys/proc.h2 Jan 2021 15:47:47 -
> @@ -216,6 +216,7 @@ struct process {
>   u_int   ps_xexit;   /* Exit status for wait */
>   int ps_xsig;/* Stopping or killing signal */
>  
> + pid_t   ps_ppid;/* Cached parent pid */
>   pid_t   ps_oppid;   /* Save parent pid during ptrace. */

Can't we re-use `ps_oppid' and always cache it?  It seems that FreeBSD
is doing something like that.  Could this field be documented to make it
clear why getppid(2) can be unlocked?

>   int ps_ptmask;  /* Ptrace event mask */
>   struct  ptrace_state *ps_ptstat;/* Ptrace state */
> 



Re: ugen(4) and uhidev(4) data toggle problem

2021-01-18 Thread Martin Pieuchot
On 16/01/21(Sat) 16:10, Greg Steuck wrote:
> Marcus Glocker  writes:
> > There are a few threads going on related to problems with ugen(4) and
> > uhidev(4) devices on xhci(4).  This is related to the issue patrick@
> > already explained; while ehci(4) can save the last data toggle state,
> > xhci(4) resets it on every open/close cycle, getting out of sync with
> > the device.
> 
> Is this related to the terrible contortions we go through in
> src/lib/libfido2/src/hid_openbsd.c:terrible_ping_kludge? Same code
> is also included into firefox and our chromium to get fido(4) to work.
> 
> Security keys work no better or worse with your patch. If you believe
> there's a chance to remove terrible_ping_kludge, I'll try.

Yes please, that's the whole point of Marcus's work, to get rid of that
kludge.



Re: route sourceaddr: simplify code & get out of ART

2021-01-24 Thread Martin Pieuchot
On 23/01/21(Sat) 21:59, Vitaliy Makkoveev wrote:
> Hello.
> 
> According the code `ifaddr’ struct has `ifa_refcnt’ field. Also it seems `ifa’
> could exist while corresponding `ifp’ was destroyed. Is this true for `rt’
> case? Should `ifa_refcnt' be bumped while you return `ifa’?

What is stored is a "struct rtentry".  This data structure is properly
refcounted in this diff.  When the last reference of `rt' is freed
ifafree() is called.  So there's no need to mess with `ifa' directly.


> > On 9 Jan 2021, at 20:50, Denis Fondras  wrote:
> > 
> > This diff place the user-set source address outside of struct art_root and 
> > make
> > the code more readable (to me).
> > 
> > Based on a concept by mpi@
> > 
> > Index: net/art.h
> > ===
> > RCS file: /cvs/src/sys/net/art.h,v
> > retrieving revision 1.20
> > diff -u -p -r1.20 art.h
> > --- net/art.h   12 Nov 2020 15:25:28 -  1.20
> > +++ net/art.h   9 Jan 2021 16:04:02 -
> > @@ -42,7 +42,6 @@ struct art_root {
> > uint8_t  ar_nlvl;   /* [I] Number of levels */
> > uint8_t  ar_alen;   /* [I] Address length in bits */
> > uint8_t  ar_off;/* [I] Offset of key in bytes */
> > -   struct sockaddr *source;/* [K] optional src addr to use 
> > */
> > };
> > 
> > #define ISLEAF(e)   (((unsigned long)(e) & 1) == 0)
> > Index: net/route.c
> > ===
> > RCS file: /cvs/src/sys/net/route.c,v
> > retrieving revision 1.397
> > diff -u -p -r1.397 route.c
> > --- net/route.c 29 Oct 2020 21:15:27 -  1.397
> > +++ net/route.c 9 Jan 2021 16:04:02 -
> > @@ -1192,9 +1192,9 @@ rt_ifa_del(struct ifaddr *ifa, int flags
> > if (flags & RTF_CONNECTED)
> > prio = ifp->if_priority + RTP_CONNECTED;
> > 
> > -   rtable_clearsource(rdomain, ifa->ifa_addr);
> > error = rtrequest_delete(&info, prio, ifp, &rt, rdomain);
> > if (error == 0) {
> > +   rt_sourceclear(rt, rdomain);
> > rtm_send(rt, RTM_DELETE, 0, rdomain);
> > if (flags & RTF_LOCAL)
> > rtm_addr(RTM_DELADDR, ifa);
> > Index: net/route.h
> > ===
> > RCS file: /cvs/src/sys/net/route.h,v
> > retrieving revision 1.183
> > diff -u -p -r1.183 route.h
> > --- net/route.h 29 Oct 2020 21:15:27 -  1.183
> > +++ net/route.h 9 Jan 2021 16:04:02 -
> > @@ -478,6 +478,9 @@ int  rtrequest_delete(struct rt_addrinfo
> > int  rt_if_track(struct ifnet *);
> > int  rt_if_linkstate_change(struct rtentry *, void *, u_int);
> > int  rtdeletemsg(struct rtentry *, struct ifnet *, u_int);
> > +
> > +struct ifaddr  *rt_get_ifa(struct rtentry *, unsigned int);
> > +voidrt_sourceclear(struct rtentry *, unsigned int);
> > #endif /* _KERNEL */
> > 
> > #endif /* _NET_ROUTE_H_ */
> > Index: net/rtable.c
> > ===
> > RCS file: /cvs/src/sys/net/rtable.c,v
> > retrieving revision 1.72
> > diff -u -p -r1.72 rtable.c
> > --- net/rtable.c7 Nov 2020 09:51:40 -   1.72
> > +++ net/rtable.c9 Jan 2021 16:04:02 -
> > @@ -365,44 +365,6 @@ rtable_alloc(unsigned int rtableid, unsi
> > return (art_alloc(rtableid, alen, off));
> > }
> > 
> > -int
> > -rtable_setsource(unsigned int rtableid, int af, struct sockaddr *src)
> > -{
> > -   struct art_root *ar;
> > -
> > -   if ((ar = rtable_get(rtableid, af)) == NULL)
> > -   return (EAFNOSUPPORT);
> > -
> > -   ar->source = src;
> > -
> > -   return (0);
> > -}
> > -
> > -struct sockaddr *
> > -rtable_getsource(unsigned int rtableid, int af)
> > -{
> > -   struct art_root *ar;
> > -
> > -   ar = rtable_get(rtableid, af);
> > -   if (ar == NULL)
> > -   return (NULL);
> > -
> > -   return (ar->source);
> > -}
> > -
> > -void
> > -rtable_clearsource(unsigned int rtableid, struct sockaddr *src)
> > -{
> > -   struct sockaddr *addr;
> > -
> > -   addr = rtable_getsource(rtableid, src->sa_family);
> > -   if (addr && (addr->sa_len == src->sa_len)) {
> > -   if (memcmp(src, addr, addr->sa_len) == 0) {
> > -   rtable_setsource(rtableid, src->sa_family, NULL);
> > -   }
> > -   }
> > -}
> > -
> > struct rtentry *
> > rtable_lookup(unsigned int rtableid, struct sockaddr *dst,
> > struct sockaddr *mask, struct sockaddr *gateway, uint8_t prio)
> > Index: net/rtable.h
> > ===
> > RCS file: /cvs/src/sys/net/rtable.h,v
> > retrieving revision 1.26
> > diff -u -p -r1.26 rtable.h
> > --- net/rtable.h7 Nov 2020 09:51:40 -   1.26
> > +++ net/rtable.h9 Jan 2021 16:04:02 -
> > @@ -39,9 +39,6 @@ unsigned int   rtable_l2(unsigned int);
> > unsigned int rtable_loindex(unsigned in

Re: route sourceaddr: simplify code & get out of ART

2021-01-26 Thread Martin Pieuchot
On 23/01/21(Sat) 12:22, Denis Fondras wrote:
> On Sat, Jan 09, 2021 at 06:50:50PM +0100, Denis Fondras wrote:
> > This diff place the user-set source address outside of struct art_root and 
> > make
> > the code more readable (to me).
> > 
> > Based on a concept by mpi@

Comments below.

> > Index: net/rtsock.c
> > ===
> > RCS file: /cvs/src/sys/net/rtsock.c,v
> > retrieving revision 1.304
> > diff -u -p -r1.304 rtsock.c
> > --- net/rtsock.c7 Nov 2020 09:51:40 -   1.304
> > +++ net/rtsock.c9 Jan 2021 16:04:02 -
> > @@ -138,7 +138,8 @@ int  sysctl_iflist(int, struct walkarg 
> >  int sysctl_ifnames(struct walkarg *);
> >  int sysctl_rtable_rtstat(void *, size_t *, void *);
> >  
> > -int rt_setsource(unsigned int, struct sockaddr *);
> > +int rt_sourceset(struct rtentry *, unsigned int);
> > +struct rtentry *rt_get_rt(int, unsigned int);

I don't understand what rt_get_rt() is for; the name of the function
isn't helping either.  More on that below.

> >  /*
> >   * Locks used to protect struct members
> > @@ -170,6 +171,14 @@ struct rtptable {
> >  struct pool rtpcb_pool;
> >  struct rtptable rtptable;
> >  
> > +struct rt_srcaddr {
> > +   LIST_ENTRY(rt_srcaddr)   rts_next;
> > +   unsigned int rts_rtableid;
> > +   struct rtentry  *rts_rt;
> > +};
> > +
> > +LIST_HEAD(, rt_srcaddr)srcaddr_h = LIST_HEAD_INITIALIZER(srcaddr_h);
> > +

Could you document which lock is protecting those fields?  Can you
assert that such a lock is held when accessing them?  Could you also
document what this data structure is for?

> >  /*
> >   * These flags and timeout are used for indicating to userland (via a
> >   * RTM_DESYNC msg) when the route socket has overflowed and messages
> > @@ -664,10 +673,7 @@ rtm_report(struct rtentry *rt, u_char ty
> > ifp = if_get(rt->rt_ifidx);
> > if (ifp != NULL) {
> > info.rti_info[RTAX_IFP] = sdltosa(ifp->if_sadl);
> > -   info.rti_info[RTAX_IFA] =
> > -   rtable_getsource(tableid, 
> > info.rti_info[RTAX_DST]->sa_family);
> > -   if (info.rti_info[RTAX_IFA] == NULL)
> > -   info.rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr;
> > +   info.rti_info[RTAX_IFA] = rt_get_ifa(rt, tableid)->ifa_addr;
> > if (ifp->if_flags & IFF_POINTOPOINT)

With the introduction of rt_get_ifa() is there any place left in the
network stack where `rt->rt_ifa' is accessed directly?  Is there a
reason?  Should we explain when using the function is necessary in a
comment on top of it?

> > info.rti_info[RTAX_BRD] = rt->rt_ifa->ifa_dstaddr;
> > }
> > @@ -860,10 +866,28 @@ route_output(struct mbuf *m, struct sock
> > if (info.rti_info[RTAX_IFA] == NULL) {
> > error = EINVAL;
> > goto fail;
> > +   } else if ((info.rti_info[RTAX_IFA]->sa_family == AF_INET6 &&
> > +   IN6_IS_ADDR_UNSPECIFIED(&((struct sockaddr_in6 *)
> > +   info.rti_info[RTAX_IFA])->sin6_addr)) ||
> > +   (info.rti_info[RTAX_IFA]->sa_family == AF_INET &&
> > +   ((struct sockaddr_in *)
> > +   info.rti_info[RTAX_IFA])->sin_addr.s_addr == 0)) {

Do I understand correctly that the default route is used as a magic
value to clear any preferred source routing entry?  Isn't it possible to
retrieve this route via rtalloc(9) instead of rt_get_rt()?

Why do we need a `rt' at all?  If it's because rt_sourceclear() expects
one, can we change this expectation?

> > +   
> > rt_sourceclear(rt_get_rt(info.rti_info[RTAX_IFA]->sa_family,
> > +   tableid), tableid);
> > +   rtfree(rt);
> > +   rt = NULL;
> > +   } else {
> > +   rt = rtalloc(info.rti_info[RTAX_IFA], 0, tableid);
> > +   if (rt == NULL || !ISSET(rt->rt_flags, RTF_LOCAL)) {
> > +   error = EINVAL;
> > +   goto fail;
> > +   }
> > +   NET_LOCK();
> > +   error = rt_sourceset(rt, tableid);
> > +   NET_UNLOCK();

Could you push the NET_LOCK() down and do the allocation before grabbing
it?  We should refrain from calling malloc(9) with such a lock held.

Maybe the NET_LOCK() is not the lock we want here, but this can be
changed later.

> > +   if (error != 0)
> > +   goto fail;
> > }
> > -   if ((error =
> > -   rt_setsource(tableid, info.rti_info[RTAX_IFA])) != 0)
> > -   goto fail;
> > } else {
> > error = rtm_output(rtm, &rt, &info, prio, tableid);
> > if (!error) {
> > @@ -873,9 +897,9 @@ route_output(struct mbuf *m, struct sock
> > rtm = rtm_report

Re: sleep_setup/finish simplification

2021-02-01 Thread Martin Pieuchot
On 08/12/20(Tue) 10:06, Martin Pieuchot wrote:
> Diff below aims to simplify the API to put a thread on a sleep queue and
> reduce it to the following:
> 
>   sleep_setup();
>   /* check condition or release lock */
>   sleep_finish();
> 
> It is motivated by my work to sleep the SCHED_LOCK() but might as well
> prevent/fix some bugs.
> 
> The tricky part of the current implementation is that sleep_setup_signal()
> can already park/stop the current thread resulting in a context change.
> Should any custom accounting / lock check happen before that?  At least
> two lock primitives do so currently:  drm's schedule_timeout() and
> rwlock's rw_enter().
> 
> As a result of this diff various states can be removed and sleep_finish()
> contains the following magic:
> 
>   1. check for signal/parking
>   2. context switch or remove from sleep queue
>   3. check for signal/parking
> 
> Note that sleep_finish() could be simplified even further but I left
> that for later to ease the review.

Updated diff on top of recent changes from claudio@, still ok?

Index: dev/dt/dt_dev.c
===
RCS file: /cvs/src/sys/dev/dt/dt_dev.c,v
retrieving revision 1.10
diff -u -p -r1.10 dt_dev.c
--- dev/dt/dt_dev.c 28 Sep 2020 13:16:58 -  1.10
+++ dev/dt/dt_dev.c 26 Jan 2021 17:20:11 -
@@ -225,10 +225,8 @@ dtread(dev_t dev, struct uio *uio, int f
return (EMSGSIZE);
 
while (!sc->ds_evtcnt) {
-   sleep_setup(&sls, sc, PWAIT | PCATCH, "dtread");
-   sleep_setup_signal(&sls);
-   sleep_finish(&sls, !sc->ds_evtcnt);
-   error = sleep_finish_signal(&sls);
+   sleep_setup(&sls, sc, PWAIT | PCATCH, "dtread", 0);
+   error = sleep_finish(&sls, !sc->ds_evtcnt);
if (error == EINTR || error == ERESTART)
break;
}
Index: dev/pci/if_myx.c
===
RCS file: /cvs/src/sys/dev/pci/if_myx.c,v
retrieving revision 1.114
diff -u -p -r1.114 if_myx.c
--- dev/pci/if_myx.c17 Jan 2021 02:52:21 -  1.114
+++ dev/pci/if_myx.c26 Jan 2021 17:20:11 -
@@ -1397,7 +1397,7 @@ myx_down(struct myx_softc *sc)
(void)myx_cmd(sc, MYXCMD_SET_IFDOWN, &mc, NULL);
 
while (sc->sc_state != MYX_S_OFF) {
-   sleep_setup(&sls, sts, PWAIT, "myxdown");
+   sleep_setup(&sls, sts, PWAIT, "myxdown", 0);
membar_consumer();
sleep_finish(&sls, sc->sc_state != MYX_S_OFF);
}
Index: dev/pci/drm/drm_linux.c
===
RCS file: /cvs/src/sys/dev/pci/drm/drm_linux.c,v
retrieving revision 1.76
diff -u -p -r1.76 drm_linux.c
--- dev/pci/drm/drm_linux.c 13 Jan 2021 01:04:49 -  1.76
+++ dev/pci/drm/drm_linux.c 26 Jan 2021 17:22:50 -
@@ -110,14 +110,14 @@ schedule_timeout(long timeout)
 {
struct sleep_state sls;
unsigned long deadline;
-   int wait, spl;
+   int wait, spl, timo = 0;
 
MUTEX_ASSERT_LOCKED(&sch_mtx);
KASSERT(!cold);
 
-   sleep_setup(&sls, sch_ident, sch_priority, "schto");
if (timeout != MAX_SCHEDULE_TIMEOUT)
-   sleep_setup_timeout(&sls, timeout);
+   timo = timeout;
+   sleep_setup(&sls, sch_ident, sch_priority, "schto", timo);
 
wait = (sch_proc == curproc && timeout > 0);
 
@@ -125,11 +125,9 @@ schedule_timeout(long timeout)
MUTEX_OLDIPL(&sch_mtx) = splsched();
mtx_leave(&sch_mtx);
 
-   sleep_setup_signal(&sls);
-
if (timeout != MAX_SCHEDULE_TIMEOUT)
deadline = jiffies + timeout;
-   sleep_finish_all(&sls, wait);
+   sleep_finish(&sls, wait);
if (timeout != MAX_SCHEDULE_TIMEOUT)
timeout = deadline - jiffies;
 
Index: kern/kern_rwlock.c
===
RCS file: /cvs/src/sys/kern/kern_rwlock.c,v
retrieving revision 1.46
diff -u -p -r1.46 kern_rwlock.c
--- kern/kern_rwlock.c  11 Jan 2021 18:49:38 -  1.46
+++ kern/kern_rwlock.c  26 Jan 2021 17:20:11 -
@@ -279,15 +279,13 @@ retry:
prio = op->wait_prio;
if (flags & RW_INTR)
prio |= PCATCH;
-   sleep_setup(&sls, rwl, prio, rwl->rwl_name);
-   if (flags & RW_INTR)
-   sleep_setup_signal(&sls);
+   sleep_setup(&sls, rwl, prio, rwl->rwl_name, 0);
 
do_sleep = !rw_cas(&rwl->rwl_owner, o, set);
 
-   sleep_finish(&

Re: sleep_setup/finish simplification

2021-02-03 Thread Martin Pieuchot
On 02/02/21(Tue) 10:45, Claudio Jeker wrote:
> On Mon, Feb 01, 2021 at 04:25:47PM +0100, Martin Pieuchot wrote:
> > On 08/12/20(Tue) 10:06, Martin Pieuchot wrote:
> > > Diff below aims to simplify the API to put a thread on a sleep queue and
> > > reduce it to the following:
> > > 
> > >   sleep_setup();
> > >   /* check condition or release lock */
> > >   sleep_finish();
> > > 
> > > It is motivated by my work to sleep the SCHED_LOCK() but might as well
> > > prevent/fix some bugs.
> > > 
> > > The tricky part of the current implementation is that sleep_setup_signal()
> > > can already park/stop the current thread resulting in a context change.
> > > Should any custom accounting / lock check happen before that?  At least
> > > two lock primitives do so currently:  drm's schedule_timeout() and
> > > rwlock's rw_enter().
> > > 
> > > As a result of this diff various states can be removed and sleep_finish()
> > > contains the following magic:
> > > 
> > >   1. check for signal/parking
> > >   2. context switch or remove from sleep queue
> > >   3. check for signal/parking
> > > 
> > > Note that sleep_finish() could be simplified even further but I left
> > > that for later to ease the review.
> > 
> > Updated diff on top of recent changes from claudio@, still ok?
> 
> Found the bug. The timeout for rwsleep() got lost. See below.

Thanks, updated diff addressing your points.

Index: dev/dt/dt_dev.c
===
RCS file: /cvs/src/sys/dev/dt/dt_dev.c,v
retrieving revision 1.10
diff -u -p -r1.10 dt_dev.c
--- dev/dt/dt_dev.c 28 Sep 2020 13:16:58 -  1.10
+++ dev/dt/dt_dev.c 3 Feb 2021 08:38:54 -
@@ -225,10 +225,8 @@ dtread(dev_t dev, struct uio *uio, int f
return (EMSGSIZE);
 
while (!sc->ds_evtcnt) {
-   sleep_setup(&sls, sc, PWAIT | PCATCH, "dtread");
-   sleep_setup_signal(&sls);
-   sleep_finish(&sls, !sc->ds_evtcnt);
-   error = sleep_finish_signal(&sls);
+   sleep_setup(&sls, sc, PWAIT | PCATCH, "dtread", 0);
+   error = sleep_finish(&sls, !sc->ds_evtcnt);
if (error == EINTR || error == ERESTART)
break;
}
Index: dev/pci/if_myx.c
===
RCS file: /cvs/src/sys/dev/pci/if_myx.c,v
retrieving revision 1.114
diff -u -p -r1.114 if_myx.c
--- dev/pci/if_myx.c17 Jan 2021 02:52:21 -  1.114
+++ dev/pci/if_myx.c3 Feb 2021 08:38:54 -
@@ -1397,7 +1397,7 @@ myx_down(struct myx_softc *sc)
(void)myx_cmd(sc, MYXCMD_SET_IFDOWN, &mc, NULL);
 
while (sc->sc_state != MYX_S_OFF) {
-   sleep_setup(&sls, sts, PWAIT, "myxdown");
+   sleep_setup(&sls, sts, PWAIT, "myxdown", 0);
membar_consumer();
sleep_finish(&sls, sc->sc_state != MYX_S_OFF);
}
Index: dev/pci/drm/drm_linux.c
===
RCS file: /cvs/src/sys/dev/pci/drm/drm_linux.c,v
retrieving revision 1.76
diff -u -p -r1.76 drm_linux.c
--- dev/pci/drm/drm_linux.c 13 Jan 2021 01:04:49 -  1.76
+++ dev/pci/drm/drm_linux.c 3 Feb 2021 08:38:54 -
@@ -110,14 +110,14 @@ schedule_timeout(long timeout)
 {
struct sleep_state sls;
unsigned long deadline;
-   int wait, spl;
+   int wait, spl, timo = 0;
 
MUTEX_ASSERT_LOCKED(&sch_mtx);
KASSERT(!cold);
 
-   sleep_setup(&sls, sch_ident, sch_priority, "schto");
if (timeout != MAX_SCHEDULE_TIMEOUT)
-   sleep_setup_timeout(&sls, timeout);
+   timo = timeout;
+   sleep_setup(&sls, sch_ident, sch_priority, "schto", timo);
 
wait = (sch_proc == curproc && timeout > 0);
 
@@ -125,11 +125,9 @@ schedule_timeout(long timeout)
MUTEX_OLDIPL(&sch_mtx) = splsched();
mtx_leave(&sch_mtx);
 
-   sleep_setup_signal(&sls);
-
if (timeout != MAX_SCHEDULE_TIMEOUT)
deadline = jiffies + timeout;
-   sleep_finish_all(&sls, wait);
+   sleep_finish(&sls, wait);
if (timeout != MAX_SCHEDULE_TIMEOUT)
timeout = deadline - jiffies;
 
Index: kern/kern_rwlock.c
===
RCS file: /cvs/src/sys/kern/kern_rwlock.c,v
retrieving revision 1.46
diff -u -p -r1.46 kern_rwlock.c
--- kern/kern_rwlock.c  11 Jan 2021 18:49:38 -  1.46
+++ kern/kern_rwlock.c  3 Feb 2021 08:38:5

Re: video(4) multiple opens

2021-02-10 Thread Martin Pieuchot
On 09/02/21(Tue) 20:35, Marcus Glocker wrote:
> jca@ has recently committed a change to video(4) to allow the same
> process to do multiple opens on the same video device to satisfy
> certain applications, and start to go in to the V4L2 "1.1.4 Multiple
> Opens" specification direction as described here:
> 
> https://www.kernel.org/doc/html/v5.10/userspace-api/media/v4l/open.html#f1
> 
> As well he recently sent me some locking code to prevent concurrent
> access to certain video(4) functions.  On that I think it makes more
> sense to introduce the locking code together with the next step, which
> is to allow different processes to open the same video device.
> 
> Therefore I have added on top of jca@s locking code the code to define
> a device owner, and based on that distinguish between which process is
> allowed to call certain video(4) functions.  Basically the process
> starting with the buffer memory allocation or/and starting the video
> stream becomes the device owner.  Other processes can do things like
> calling VIDIOC_G_CTRL or VIDIOC_S_CTRL ioctls.  In this diff certainly
> more ioctls can be moved up to the "shared" part, but I only started
> with the video controls for now.
> 
> Also video(1) requires a small change to make the read(2) method
> signal that the stream needs to be stopped on close.  I checked that
> other applications do implement this behavior as well.  Otherwise
> if you have multiple file handles open on a video device, and the
> read(2) process exists before another process, we won't notice that
> the stream needs to be stopped, since only the last file handle
> close will call the videoclose() function.
> 
> I would appreciate some regression testing and feedback to make a
> start on implementing this specification.

Which fields is the new lock protecting?  Why isn't the KERNEL_LOCK()
enough?  Is it because of sleeping points?  Could you annotate them?

> Index: sys/dev/video.c
> ===
> RCS file: /cvs/src/sys/dev/video.c,v
> retrieving revision 1.48
> diff -u -p -u -p -r1.48 video.c
> --- sys/dev/video.c   31 Jan 2021 19:32:01 -  1.48
> +++ sys/dev/video.c   7 Feb 2021 15:33:52 -
> @@ -48,6 +48,8 @@ struct video_softc {
>   struct video_hw_if  *hw_if; /* hardware interface */
>   char sc_dying;  /* device detached */
>   struct process   *sc_owner; /* owner process */
> + struct rwlocksc_lock;   /* device lock */
> + uint8_t  sc_open;   /* device opened */
>  
>   int  sc_fsize;
>   uint8_t *sc_fbuffer;
> @@ -122,6 +124,7 @@ videoopen(dev_t dev, int flags, int fmt,
>  {
>   int unit;
>   struct video_softc *sc;
> + int r = 0;
>  
>   unit = VIDEOUNIT(dev);
>   if (unit >= video_cd.cd_ndevs ||
> @@ -129,22 +132,27 @@ videoopen(dev_t dev, int flags, int fmt,
>sc->hw_if == NULL)
>   return (ENXIO);
>  
> - if (sc->sc_owner != NULL) {
> - if (sc->sc_owner == p->p_p)
> - return (0);
> - else
> - return (EBUSY);
> - } else
> - sc->sc_owner = p->p_p;
> + rw_enter_write(&sc->sc_lock);
> +
> + if (sc->sc_open) {
> + DPRINTF(("%s: device already open\n", __func__));
> + rw_exit_write(&sc->sc_lock);
> + return (r);
> + } else {
> + sc->sc_open = 1;
> + DPRINTF(("%s: set device to open\n", __func__));
> + }
>  
>   sc->sc_vidmode = VIDMODE_NONE;
>   sc->sc_frames_ready = 0;
>  
>   if (sc->hw_if->open != NULL)
> - return (sc->hw_if->open(sc->hw_hdl, flags, &sc->sc_fsize,
> - sc->sc_fbuffer, video_intr, sc));
> - else
> - return (0);
> + r = sc->hw_if->open(sc->hw_hdl, flags, &sc->sc_fsize,
> + sc->sc_fbuffer, video_intr, sc);
> +
> + rw_exit_write(&sc->sc_lock);
> +
> + return (r);
>  }
>  
>  int
> @@ -155,11 +163,23 @@ videoclose(dev_t dev, int flags, int fmt
>  
>   sc = video_cd.cd_devs[VIDEOUNIT(dev)];
>  
> + rw_enter_write(&sc->sc_lock);
> +
>   if (sc->hw_if->close != NULL)
>   r = sc->hw_if->close(sc->hw_hdl);
>  
> + if (p != NULL) {
> + sc->sc_open = 0;
> + DPRINTF(("%s: last close\n", __func__));
> + }
> + DPRINTF(("%s: stream close\n", __func__));
> +
> + sc->sc_vidmode = VIDMODE_NONE;
> + sc->sc_frames_ready = 0;
>   sc->sc_owner = NULL;
>  
> + rw_exit_write(&sc->sc_lock);
> +
>   return (r);
>  }
>  
> @@ -175,11 +195,28 @@ videoread(dev_t dev, struct uio *uio, in
>   (sc = video_cd.cd_devs[unit]) == NULL)
>   return (ENXIO);
>  
> - if (sc->sc_dying)
> + rw_enter_write(&sc->sc_lock);
> +
> + if (sc->sc_dying) {
> + rw_exit_write(&sc-

Move single_thread_set() out of KERNEL_LOCK()

2021-02-10 Thread Martin Pieuchot
Diff below extends the scope of the SCHED_LOCK() to no longer require
the KERNEL_LOCK() when iterating over `ps_threads'.  This is enough to
make progress without having to introduce a new mechanism.
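
As an illustration (not part of the diff), walking the list then only
needs the SCHED_LOCK():

        struct proc *q;
        int s;

        SCHED_LOCK(s);
        TAILQ_FOREACH(q, &pr->ps_threads, p_thr_link) {
                /* inspect or signal `q', no KERNEL_LOCK() required */
        }
        SCHED_UNLOCK(s);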

ok?

Index: kern/kern_exit.c
===
RCS file: /cvs/src/sys/kern/kern_exit.c,v
retrieving revision 1.195
diff -u -p -r1.195 kern_exit.c
--- kern/kern_exit.c8 Feb 2021 10:51:01 -   1.195
+++ kern/kern_exit.c8 Feb 2021 10:55:18 -
@@ -124,6 +124,7 @@ exit1(struct proc *p, int xexit, int xsi
 {
struct process *pr, *qr, *nqr;
struct rusage *rup;
+   int s;
 
atomic_setbits_int(&p->p_flag, P_WEXIT);
 
@@ -161,7 +162,9 @@ exit1(struct proc *p, int xexit, int xsi
}
 
/* unlink ourselves from the active threads */
+   SCHED_LOCK(s);
TAILQ_REMOVE(&pr->ps_threads, p, p_thr_link);
+   SCHED_UNLOCK(s);
if ((p->p_flag & P_THREAD) == 0) {
/* main thread gotta wait because it has the pid, et al */
while (pr->ps_refcnt > 1)
Index: kern/kern_fork.c
===
RCS file: /cvs/src/sys/kern/kern_fork.c,v
retrieving revision 1.232
diff -u -p -r1.232 kern_fork.c
--- kern/kern_fork.c8 Feb 2021 10:51:01 -   1.232
+++ kern/kern_fork.c8 Feb 2021 10:54:49 -
@@ -558,13 +558,13 @@ thread_fork(struct proc *curp, void *sta
 
LIST_INSERT_HEAD(&allproc, p, p_list);
LIST_INSERT_HEAD(TIDHASH(p->p_tid), p, p_hash);
-   TAILQ_INSERT_TAIL(&pr->ps_threads, p, p_thr_link);
 
+   SCHED_LOCK(s);
+   TAILQ_INSERT_TAIL(&pr->ps_threads, p, p_thr_link);
/*
 * if somebody else wants to take us to single threaded mode,
 * count ourselves in.
 */
-   SCHED_LOCK(s);
if (pr->ps_single) {
atomic_inc_int(&pr->ps_singlecount);
atomic_setbits_int(&p->p_flag, P_SUSPSINGLE);
Index: kern/kern_sig.c
===
RCS file: /cvs/src/sys/kern/kern_sig.c,v
retrieving revision 1.272
diff -u -p -r1.272 kern_sig.c
--- kern/kern_sig.c 8 Feb 2021 10:51:01 -   1.272
+++ kern/kern_sig.c 8 Feb 2021 10:56:26 -
@@ -1209,11 +1209,7 @@ issignal(struct proc *p)
signum != SIGKILL) {
pr->ps_xsig = signum;
 
-   if (dolock)
-   KERNEL_LOCK();
single_thread_set(p, SINGLE_PTRACE, 0);
-   if (dolock)
-   KERNEL_UNLOCK();
 
if (dolock)
SCHED_LOCK(s);
@@ -2009,7 +2005,6 @@ single_thread_set(struct proc *p, enum s
struct proc *q;
int error, s;
 
-   KERNEL_ASSERT_LOCKED();
KASSERT(curproc == p);
 
SCHED_LOCK(s);
Index: sys/proc.h
===
RCS file: /cvs/src/sys/sys/proc.h,v
retrieving revision 1.308
diff -u -p -r1.308 proc.h
--- sys/proc.h  8 Feb 2021 10:51:02 -   1.308
+++ sys/proc.h  8 Feb 2021 10:55:45 -
@@ -167,7 +167,7 @@ struct process {
struct  ucred *ps_ucred;/* Process owner's identity. */
 
LIST_ENTRY(process) ps_list;/* List of all processes. */
-   TAILQ_HEAD(,proc) ps_threads;   /* Threads in this process. */
+   TAILQ_HEAD(,proc) ps_threads;   /* [K|S] Threads in this process. */
 
LIST_ENTRY(process) ps_pglist;  /* List of processes in pgrp. */
struct  process *ps_pptr;   /* Pointer to parent process. */



rw_enter_diag() vs WITNESS

2021-02-15 Thread Martin Pieuchot
Diagnostic function rw_enter_diag() should be called before
WITNESS_CHECKORDER() to have proper locking/debugging information.

In the case of 'locking against myself' it is currently impossible
to know where the lock has been previously acquired.  Diff below fixes
that, ok?

Index: kern/kern_rwlock.c
===
RCS file: /cvs/src/sys/kern/kern_rwlock.c,v
retrieving revision 1.47
diff -u -p -r1.47 kern_rwlock.c
--- kern/kern_rwlock.c  8 Feb 2021 08:18:45 -   1.47
+++ kern/kern_rwlock.c  15 Feb 2021 10:32:57 -
@@ -237,7 +237,11 @@ rw_enter(struct rwlock *rwl, int flags)
int error, prio;
 #ifdef WITNESS
int lop_flags;
+#endif
+
+   rw_enter_diag(rwl, flags);
 
+#ifdef WITNESS
lop_flags = LOP_NEWORDER;
if (flags & RW_WRITE)
lop_flags |= LOP_EXCLUSIVE;
@@ -270,8 +274,6 @@ retry:
continue;
}
 #endif
-
-   rw_enter_diag(rwl, flags);
 
if (flags & RW_NOSLEEP)
return (EBUSY);



uvm_fault: Comments & style cleanup

2021-02-15 Thread Martin Pieuchot
Diff below includes non-functional changes:

- Sync comments with NetBSD including locking details.
- Remove superfluous parentheses and spaces.
- Add brackets, even if questionable, to reduce the diff with NetBSD.
- Use for (;;) instead of while (1).
- Rename a variable from 'result' to 'error'.
- Move uvm_fault() and uvm_fault_upper_lookup().
- Add a locking assert in uvm_fault_upper_lookup().

ok?

Index: uvm/uvm_fault.c
===
RCS file: /cvs/src/sys/uvm/uvm_fault.c,v
retrieving revision 1.113
diff -u -p -r1.113 uvm_fault.c
--- uvm/uvm_fault.c 19 Jan 2021 13:21:36 -  1.113
+++ uvm/uvm_fault.c 15 Feb 2021 10:44:20 -
@@ -55,11 +55,11 @@
  *read/write1 write>1  read/write   +-cow_write/zero
  * | | ||
  *  +--|--+   +--|--+ +-+   +  |  + | +-+
- * amap |  V  |   |  --->new|  || |  ^  |
+ * amap |  V  |   |  -> new |  || |  ^  |
  *  +-+   +-+ +-+   +  |  + | +--|--+
  * |||
  *  +-+   +-+   +--|--+ | +--|--+
- * uobj | d/c |   | d/c |   |  V  | +|  |
+ * uobj | d/c |   | d/c |   |  V  | ++  |
  *  +-+   +-+   +-+   +-+
  *
  * d/c = don't care
@@ -69,7 +69,7 @@
  *
  *   case [1]: upper layer fault [anon active]
  * 1A: [read] or [write with anon->an_ref == 1]
- * I/O takes place in top level anon and uobj is not touched.
+ * I/O takes place in upper level anon and uobj is not touched.
  * 1B: [write with anon->an_ref > 1]
  * new anon is alloc'd and data is copied off ["COW"]
  *
@@ -89,7 +89,7 @@
  * the code is structured as follows:
  *
  * - init the "IN" params in the ufi structure
- *   ReFault:
+ *   ReFault: (ERESTART returned to the loop in uvm_fault)
  * - do lookups [locks maps], check protection, handle needs_copy
  * - check for case 0 fault (error)
  * - establish "range" of fault
@@ -136,8 +136,8 @@
  *by multiple map entries, and figuring out what should wait could be
  *complex as well...).
  *
- * we use alternative 2 currently.   maybe alternative 3 would be useful
- * in the future.XXX keep in mind for future consideration//rechecking.
+ * we use alternative 2.  given that we are multi-threaded now we may want
+ * to reconsider the choice.
  */
 
 /*
@@ -177,7 +177,7 @@ uvmfault_anonflush(struct vm_anon **anon
int lcv;
struct vm_page *pg;
 
-   for (lcv = 0 ; lcv < n ; lcv++) {
+   for (lcv = 0; lcv < n; lcv++) {
if (anons[lcv] == NULL)
continue;
KASSERT(rw_lock_held(anons[lcv]->an_lock));
@@ -222,14 +222,14 @@ uvmfault_init(void)
 /*
  * uvmfault_amapcopy: clear "needs_copy" in a map.
  *
+ * => called with VM data structures unlocked (usually, see below)
+ * => we get a write lock on the maps and clear needs_copy for a VA
  * => if we are out of RAM we sleep (waiting for more)
  */
 static void
 uvmfault_amapcopy(struct uvm_faultinfo *ufi)
 {
-
-   /* while we haven't done the job */
-   while (1) {
+   for (;;) {
/* no mapping?  give up. */
if (uvmfault_lookup(ufi, TRUE) == FALSE)
return;
@@ -258,36 +258,46 @@ uvmfault_amapcopy(struct uvm_faultinfo *
  * uvmfault_anonget: get data in an anon into a non-busy, non-released
  * page in that anon.
  *
- * => we don't move the page on the queues [gets moved later]
- * => if we allocate a new page [we_own], it gets put on the queues.
- *either way, the result is that the page is on the queues at return time
+ * => Map, amap and thus anon should be locked by caller.
+ * => If we fail, we unlock everything and error is returned.
+ * => If we are successful, return with everything still locked.
+ * => We do not move the page on the queues [gets moved later].  If we
+ *allocate a new page [we_own], it gets put on the queues.  Either way,
+ *the result is that the page is on the queues at return time
  */
 int
 uvmfault_anonget(struct uvm_faultinfo *ufi, struct vm_amap *amap,
 struct vm_anon *anon)
 {
-   boolean_t we_own;   /* we own anon's page? */
-   boolean_t locked;   /* did we relock? */
struct vm_page *pg;
-   int result;
+   int error;
 
KASSERT(rw_lock_held(anon->an_lock));
KASSERT(anon->an_lock == amap->am_lock);
 
-   result = 0; /* XXX shut up gcc */
+   /* Increment the counters.*/
counters_inc(uvmexp_counters, flt_anget);
-/* bump rusage counters */
-   if (anon->an_page)
+   if (anon->an_page) {
curproc->p_ru.ru_minflt++;
-   else
+

Re: uvm_fault: Comments & style cleanup

2021-02-15 Thread Martin Pieuchot
On 15/02/21(Mon) 11:47, Martin Pieuchot wrote:
> Diff below includes non-functional changes:
> 
> - Sync comments with NetBSD including locking details.
> - Remove superfluous parenthesis and spaces.
> - Add brackets, even if questionable, to reduce diff with NetBSD
> - Use for (;;) instead of while(1)
> - Rename a variable from 'result' into 'error'.
> - Move uvm_fault() and uvm_fault_upper_lookup()
> - Add an locking assert in uvm_fault_upper_lookup()

Updated diff on top of recent fix, still ok?

Index: uvm/uvm_fault.c
===
RCS file: /cvs/src/sys/uvm/uvm_fault.c,v
retrieving revision 1.114
diff -u -p -r1.114 uvm_fault.c
--- uvm/uvm_fault.c 15 Feb 2021 12:12:54 -  1.114
+++ uvm/uvm_fault.c 15 Feb 2021 12:14:08 -
@@ -55,11 +55,11 @@
  *read/write1 write>1  read/write   +-cow_write/zero
  * | | ||
  *  +--|--+   +--|--+ +-+   +  |  + | +-+
- * amap |  V  |   |  --->new|  || |  ^  |
+ * amap |  V  |   |  -> new |  || |  ^  |
  *  +-+   +-+ +-+   +  |  + | +--|--+
  * |||
  *  +-+   +-+   +--|--+ | +--|--+
- * uobj | d/c |   | d/c |   |  V  | +|  |
+ * uobj | d/c |   | d/c |   |  V  | ++  |
  *  +-+   +-+   +-+   +-+
  *
  * d/c = don't care
@@ -69,7 +69,7 @@
  *
  *   case [1]: upper layer fault [anon active]
  * 1A: [read] or [write with anon->an_ref == 1]
- * I/O takes place in top level anon and uobj is not touched.
+ * I/O takes place in upper level anon and uobj is not touched.
  * 1B: [write with anon->an_ref > 1]
  * new anon is alloc'd and data is copied off ["COW"]
  *
@@ -89,7 +89,7 @@
  * the code is structured as follows:
  *
  * - init the "IN" params in the ufi structure
- *   ReFault:
+ *   ReFault: (ERESTART returned to the loop in uvm_fault)
  * - do lookups [locks maps], check protection, handle needs_copy
  * - check for case 0 fault (error)
  * - establish "range" of fault
@@ -136,8 +136,8 @@
  *by multiple map entries, and figuring out what should wait could be
  *complex as well...).
  *
- * we use alternative 2 currently.   maybe alternative 3 would be useful
- * in the future.XXX keep in mind for future consideration//rechecking.
+ * we use alternative 2.  given that we are multi-threaded now we may want
+ * to reconsider the choice.
  */
 
 /*
@@ -177,7 +177,7 @@ uvmfault_anonflush(struct vm_anon **anon
int lcv;
struct vm_page *pg;
 
-   for (lcv = 0 ; lcv < n ; lcv++) {
+   for (lcv = 0; lcv < n; lcv++) {
if (anons[lcv] == NULL)
continue;
KASSERT(rw_lock_held(anons[lcv]->an_lock));
@@ -222,14 +222,14 @@ uvmfault_init(void)
 /*
  * uvmfault_amapcopy: clear "needs_copy" in a map.
  *
+ * => called with VM data structures unlocked (usually, see below)
+ * => we get a write lock on the maps and clear needs_copy for a VA
  * => if we are out of RAM we sleep (waiting for more)
  */
 static void
 uvmfault_amapcopy(struct uvm_faultinfo *ufi)
 {
-
-   /* while we haven't done the job */
-   while (1) {
+   for (;;) {
/* no mapping?  give up. */
if (uvmfault_lookup(ufi, TRUE) == FALSE)
return;
@@ -258,36 +258,46 @@ uvmfault_amapcopy(struct uvm_faultinfo *
  * uvmfault_anonget: get data in an anon into a non-busy, non-released
  * page in that anon.
  *
- * => we don't move the page on the queues [gets moved later]
- * => if we allocate a new page [we_own], it gets put on the queues.
- *either way, the result is that the page is on the queues at return time
+ * => Map, amap and thus anon should be locked by caller.
+ * => If we fail, we unlock everything and error is returned.
+ * => If we are successful, return with everything still locked.
+ * => We do not move the page on the queues [gets moved later].  If we
+ *allocate a new page [we_own], it gets put on the queues.  Either way,
+ *the result is that the page is on the queues at return time
  */
 int
 uvmfault_anonget(struct uvm_faultinfo *ufi, struct vm_amap *amap,
 struct vm_anon *anon)
 {
-   boolean_t we_own;   /* we own anon's page? */
-   boolean_t locked;   /* did we relock? */
struct vm_page *pg;
-   int result;
+   int error;
 
KASSERT(rw_lock_held(anon->an_lock));
KASSERT(anon->an_lock == amap->am_lock);

Re: rw_enter_diag() vs WITNESS

2021-02-16 Thread Martin Pieuchot
On 15/02/21(Mon) 16:58, Visa Hankala wrote:
> On Mon, Feb 15, 2021 at 11:37:45AM +0100, Martin Pieuchot wrote:
> > Diagnostic function rw_enter_diag() should be called before
> > WITNESS_CHECKORDER() to have proper locking/debugging information.
> > 
> > In the case of 'locking against myself' it is currently impossible
> > to know where the lock has been previously acquired.  Diff below fixes
> > that, ok?
> 
> Based on this description alone, it is not clear to me what exactly
> gets solved. Doesn't the code reach rw_enter_diag() inside the loop
> when trying to recurse on the rwlock?

It does, but only after WITNESS_CHECKORDER().  With the diff, if a panic
is triggered "show all locks" will point to the previous place where the
same lock has been acquired.
Currently it points to the place triggering the panic, which doesn't
help in figuring out the problem.
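
A contrived example (not from the tree) of the recursion this helps
debug:

        struct rwlock lk;

        rw_init(&lk, "example");
        rw_enter_write(&lk);
        rw_enter_write(&lk);    /* "locking against myself" panic; with the
                                 * diff rw_enter_diag() fires before WITNESS
                                 * records this acquisition, so "show all
                                 * locks" still shows the first one */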

> Does this change have implications with (panicstr || db_active)?

Indeed, updated diff below.

Index: kern/kern_rwlock.c
===
RCS file: /cvs/src/sys/kern/kern_rwlock.c,v
retrieving revision 1.47
diff -u -p -r1.47 kern_rwlock.c
--- kern/kern_rwlock.c  8 Feb 2021 08:18:45 -   1.47
+++ kern/kern_rwlock.c  16 Feb 2021 09:04:04 -
@@ -167,6 +167,9 @@ rw_exit_write(struct rwlock *rwl)
 static void
 rw_enter_diag(struct rwlock *rwl, int flags)
 {
+   if (panicstr || db_active)
+   return;
+
switch (flags & RW_OPMASK) {
case RW_WRITE:
case RW_READ:
@@ -237,7 +240,11 @@ rw_enter(struct rwlock *rwl, int flags)
int error, prio;
 #ifdef WITNESS
int lop_flags;
+#endif
+
+   rw_enter_diag(rwl, flags);
 
+#ifdef WITNESS
lop_flags = LOP_NEWORDER;
if (flags & RW_WRITE)
lop_flags |= LOP_EXCLUSIVE;
@@ -270,8 +277,6 @@ retry:
continue;
}
 #endif
-
-   rw_enter_diag(rwl, flags);
 
if (flags & RW_NOSLEEP)
return (EBUSY);



uvm_fault_lower refactoring

2021-02-16 Thread Martin Pieuchot
Start by moving the `pgo_fault' handler outside of uvm_fault_lower().

If a page has a backing object that prefers to handle the fault itself,
the locking will be different, so keep it under KERNEL_LOCK() for the
moment and make it separate from the rest of uvm_fault_lower().
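
For reference, a pager opts into this by providing a `pgo_fault' hook in
its pager ops, along these lines (hypothetical pager, not from this diff):

        struct uvm_pagerops example_pagerops = {
                /* ... other hooks ... */
                .pgo_fault = example_fault,     /* handles its own faults */
        };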

ok?

Index: uvm/uvm_fault.c
===
RCS file: /cvs/src/sys/uvm/uvm_fault.c,v
retrieving revision 1.115
diff -u -p -r1.115 uvm_fault.c
--- uvm/uvm_fault.c 16 Feb 2021 09:10:17 -  1.115
+++ uvm/uvm_fault.c 16 Feb 2021 10:13:58 -
@@ -598,10 +598,37 @@ uvm_fault(vm_map_t orig_map, vaddr_t vad
/* case 1: fault on an anon in our amap */
error = uvm_fault_upper(&ufi, &flt, anons, fault_type);
} else {
-   /* case 2: fault on backing object or zero fill */
-   KERNEL_LOCK();
-   error = uvm_fault_lower(&ufi, &flt, pages, fault_type);
-   KERNEL_UNLOCK();
+   struct uvm_object *uobj = ufi.entry->object.uvm_obj;
+
+   /*
+* if the desired page is not shadowed by the amap and
+* we have a backing object, then we check to see if
+* the backing object would prefer to handle the fault
+* itself (rather than letting us do it with the usual
+* pgo_get hook).  the backing object signals this by
+* providing a pgo_fault routine.
+*/
+   if (uobj != NULL && uobj->pgops->pgo_fault != NULL) {
+   KERNEL_LOCK();
+   error = uobj->pgops->pgo_fault(&ufi,
+   flt.startva, pages, flt.npages,
+   flt.centeridx, fault_type, flt.access_type,
+   PGO_LOCKED);
+   KERNEL_UNLOCK();
+
+   if (error == VM_PAGER_OK)
+   error = 0;
+   else if (error == VM_PAGER_REFAULT)
+   error = ERESTART;
+   else
+   error = EACCES;
+   } else {
+   /* case 2: fault on backing obj or zero fill */
+   KERNEL_LOCK();
+   error = uvm_fault_lower(&ufi, &flt, pages,
+   fault_type);
+   KERNEL_UNLOCK();
+   }
}
}
 
@@ -1054,26 +1081,6 @@ uvm_fault_lower(struct uvm_faultinfo *uf
struct vm_anon *anon = NULL;
vaddr_t currva;
voff_t uoff;
-
-   /*
-* if the desired page is not shadowed by the amap and we have a
-* backing object, then we check to see if the backing object would
-* prefer to handle the fault itself (rather than letting us do it
-* with the usual pgo_get hook).  the backing object signals this by
-* providing a pgo_fault routine.
-*/
-   if (uobj != NULL && uobj->pgops->pgo_fault != NULL) {
-   result = uobj->pgops->pgo_fault(ufi, flt->startva, pages,
-   flt->npages, flt->centeridx, fault_type, flt->access_type,
-   PGO_LOCKED);
-
-   if (result == VM_PAGER_OK)
-   return (0); /* pgo_fault did pmap enter */
-   else if (result == VM_PAGER_REFAULT)
-   return ERESTART;/* try again! */
-   else
-   return (EACCES);
-   }
 
/*
 * now, if the desired page is not shadowed by the amap and we have



pdaemon vs anon locking

2021-02-17 Thread Martin Pieuchot
Diff below adds anon locking to the page daemon.  It will become
necessary to guarantee exclusive access to an anon as soon as the
KERNEL_LOCK() is removed from the fault handler.
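
The pattern is a non-sleeping try-lock, mirroring what the diff does in
uvmpd_scan_inactive():

        if (rw_enter(anon->an_lock, RW_WRITE|RW_NOSLEEP)) {
                /* contended, skip this page for now */
                continue;
        }
        /* ... examine or page out the anon's page ... */
        rw_exit(anon->an_lock);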

Comments?  Oks?

Index: uvm/uvm_pdaemon.c
===
RCS file: /cvs/src/sys/uvm/uvm_pdaemon.c,v
retrieving revision 1.88
diff -u -p -r1.88 uvm_pdaemon.c
--- uvm/uvm_pdaemon.c   24 Nov 2020 13:49:09 -  1.88
+++ uvm/uvm_pdaemon.c   17 Feb 2021 10:05:43 -
@@ -460,7 +460,13 @@ uvmpd_scan_inactive(struct pglist *pglst
if (p->pg_flags & PQ_ANON) {
anon = p->uanon;
KASSERT(anon != NULL);
+   if (rw_enter(anon->an_lock,
+   RW_WRITE|RW_NOSLEEP)) {
+   /* lock failed, skip this page */
+   continue;
+   }
if (p->pg_flags & PG_BUSY) {
+   rw_exit(anon->an_lock);
uvmexp.pdbusy++;
/* someone else owns page, skip it */
continue;
@@ -504,6 +510,7 @@ uvmpd_scan_inactive(struct pglist *pglst
 
/* remove from object */
anon->an_page = NULL;
+   rw_exit(anon->an_lock);
}
continue;
}
@@ -513,6 +520,9 @@ uvmpd_scan_inactive(struct pglist *pglst
 * free target when all the current pageouts complete.
 */
if (free + uvmexp.paging > uvmexp.freetarg << 2) {
+   if (anon) {
+   rw_exit(anon->an_lock);
+   }
continue;
}
 
@@ -525,6 +535,9 @@ uvmpd_scan_inactive(struct pglist *pglst
if ((p->pg_flags & PQ_SWAPBACKED) && uvm_swapisfull()) {
dirtyreacts++;
uvm_pageactivate(p);
+   if (anon) {
+   rw_exit(anon->an_lock);
+   }
continue;
}
 
@@ -591,6 +604,8 @@ uvmpd_scan_inactive(struct pglist *pglst
&p->pg_flags,
PG_BUSY);
UVM_PAGE_OWN(p, NULL);
+   if (anon)
+   rw_exit(anon->an_lock);
continue;
}
swcpages = 0;   /* cluster is empty */
@@ -622,6 +637,9 @@ uvmpd_scan_inactive(struct pglist *pglst
 */
if (swap_backed) {
if (p) {/* if we just added a page to cluster */
+   if (anon)
+   rw_exit(anon->an_lock);
+
/* cluster not full yet? */
if (swcpages < swnpages)
continue;
@@ -730,6 +748,12 @@ uvmpd_scan_inactive(struct pglist *pglst
/* relock p's object: page queues not lock yet, so
 * no need for "try" */
 
+   /* !swap_backed case: already locked... */
+   if (swap_backed) {
+   if (anon)
+   rw_enter(anon->an_lock, RW_WRITE);
+   }
+
 #ifdef DIAGNOSTIC
if (result == VM_PAGER_UNLOCK)
panic("pagedaemon: pageout returned "
@@ -754,6 +778,7 @@ uvmpd_scan_inactive(struct pglist *pglst
anon->an_page = NULL;
p->uanon = NULL;
 
+   rw_exit(anon->an_lock);
uvm_anfree(anon);   /* kills anon */
pmap_page_protect(p, PROT_NONE);
anon = NULL;
@@ -787,6 +812,8 @@ uvmpd_scan_inactive(struct pglist *pglst
 * the inactive queue can't be re-queued [note: not
 * true for active queue]).
 */
+   if (anon)
+   rw_exit(anon->an_lock);
 
if (nextpg && (nextpg->pg_flags & PQ_INACTIVE) == 0

Re: uvm_fault_lower refactoring

2021-02-22 Thread Martin Pieuchot
On 16/02/21(Tue) 11:20, Martin Pieuchot wrote:
> Start by moving `pgo_fault' handler outside of uvm_fault_lower().
> 
> If a page has a backing object that prefer to handler to fault itself
> the locking will be different, so keep it under KERNEL_LOCK() for the
> moment and make it separate from the rest of uvm_fault_lower().
> 
> ok?

Anyone?

> 
> Index: uvm/uvm_fault.c
> ===
> RCS file: /cvs/src/sys/uvm/uvm_fault.c,v
> retrieving revision 1.115
> diff -u -p -r1.115 uvm_fault.c
> --- uvm/uvm_fault.c   16 Feb 2021 09:10:17 -  1.115
> +++ uvm/uvm_fault.c   16 Feb 2021 10:13:58 -
> @@ -598,10 +598,37 @@ uvm_fault(vm_map_t orig_map, vaddr_t vad
>   /* case 1: fault on an anon in our amap */
>   error = uvm_fault_upper(&ufi, &flt, anons, fault_type);
>   } else {
> - /* case 2: fault on backing object or zero fill */
> - KERNEL_LOCK();
> - error = uvm_fault_lower(&ufi, &flt, pages, fault_type);
> - KERNEL_UNLOCK();
> + struct uvm_object *uobj = ufi.entry->object.uvm_obj;
> +
> + /*
> +  * if the desired page is not shadowed by the amap and
> +  * we have a backing object, then we check to see if
> +  * the backing object would prefer to handle the fault
> +  * itself (rather than letting us do it with the usual
> +  * pgo_get hook).  the backing object signals this by
> +  * providing a pgo_fault routine.
> +  */
> + if (uobj != NULL && uobj->pgops->pgo_fault != NULL) {
> + KERNEL_LOCK();
> + error = uobj->pgops->pgo_fault(&ufi,
> + flt.startva, pages, flt.npages,
> + flt.centeridx, fault_type, flt.access_type,
> + PGO_LOCKED);
> + KERNEL_UNLOCK();
> +
> + if (error == VM_PAGER_OK)
> + error = 0;
> + else if (error == VM_PAGER_REFAULT)
> + error = ERESTART;
> + else
> + error = EACCES;
> + } else {
> + /* case 2: fault on backing obj or zero fill */
> + KERNEL_LOCK();
> + error = uvm_fault_lower(&ufi, &flt, pages,
> + fault_type);
> + KERNEL_UNLOCK();
> + }
>   }
>   }
>  
> @@ -1054,26 +1081,6 @@ uvm_fault_lower(struct uvm_faultinfo *uf
>   struct vm_anon *anon = NULL;
>   vaddr_t currva;
>   voff_t uoff;
> -
> - /*
> -  * if the desired page is not shadowed by the amap and we have a
> -  * backing object, then we check to see if the backing object would
> -  * prefer to handle the fault itself (rather than letting us do it
> -  * with the usual pgo_get hook).  the backing object signals this by
> -  * providing a pgo_fault routine.
> -  */
> - if (uobj != NULL && uobj->pgops->pgo_fault != NULL) {
> - result = uobj->pgops->pgo_fault(ufi, flt->startva, pages,
> - flt->npages, flt->centeridx, fault_type, flt->access_type,
> - PGO_LOCKED);
> -
> - if (result == VM_PAGER_OK)
> - return (0); /* pgo_fault did pmap enter */
> - else if (result == VM_PAGER_REFAULT)
> - return ERESTART;/* try again! */
> - else
> - return (EACCES);
> - }
>  
>   /*
>* now, if the desired page is not shadowed by the amap and we have
> 



uvm_fault: uvm_fault_lower_lookup() refactoring

2021-02-22 Thread Martin Pieuchot
Similar refactoring to what has been done for the upper part of the fault
handler, ok?

Index: uvm/uvm_fault.c
===
RCS file: /cvs/src/sys/uvm/uvm_fault.c,v
retrieving revision 1.115
diff -u -p -r1.115 uvm_fault.c
--- uvm/uvm_fault.c 16 Feb 2021 09:10:17 -  1.115
+++ uvm/uvm_fault.c 22 Feb 2021 09:11:53 -
@@ -1039,6 +1039,98 @@ uvm_fault_upper(struct uvm_faultinfo *uf
 }
 
 /*
+ * uvm_fault_lower_lookup: look up on-memory uobj pages.
+ *
+ * 1. get on-memory pages.
+ * 2. if failed, give up (get only center page later).
+ * 3. if succeeded, enter h/w mapping of neighbor pages.
+ */
+
+struct vm_page *
+uvm_fault_lower_lookup(
+   struct uvm_faultinfo *ufi, const struct uvm_faultctx *flt,
+   struct vm_page **pages)
+{
+   struct uvm_object *uobj = ufi->entry->object.uvm_obj;
+   struct vm_page *uobjpage = NULL;
+   int lcv, gotpages;
+   vaddr_t currva;
+
+   counters_inc(uvmexp_counters, flt_lget);
+   gotpages = flt->npages;
+   (void) uobj->pgops->pgo_get(uobj,
+   ufi->entry->offset + (flt->startva - ufi->entry->start),
+   pages, &gotpages, flt->centeridx,
+   flt->access_type & MASK(ufi->entry), ufi->entry->advice,
+   PGO_LOCKED);
+
+   /*
+* check for pages to map, if we got any
+*/
+   if (gotpages == 0) {
+   return NULL;
+   }
+
+   currva = flt->startva;
+   for (lcv = 0; lcv < flt->npages; lcv++, currva += PAGE_SIZE) {
+   if (pages[lcv] == NULL ||
+   pages[lcv] == PGO_DONTCARE)
+   continue;
+
+   KASSERT((pages[lcv]->pg_flags & PG_RELEASED) == 0);
+
+   /*
+* if center page is resident and not
+* PG_BUSY, then pgo_get made it PG_BUSY
+* for us and gave us a handle to it.
+* remember this page as "uobjpage."
+* (for later use).
+*/
+   if (lcv == flt->centeridx) {
+   uobjpage = pages[lcv];
+   continue;
+   }
+
+   /*
+* note: calling pgo_get with locked data
+* structures returns us pages which are
+* neither busy nor released, so we don't
+* need to check for this.   we can just
+* directly enter the page (after moving it
+* to the head of the active queue [useful?]).
+*/
+
+   uvm_lock_pageq();
+   uvm_pageactivate(pages[lcv]);   /* reactivate */
+   uvm_unlock_pageq();
+   counters_inc(uvmexp_counters, flt_nomap);
+
+   /*
+* Since this page isn't the page that's
+* actually faulting, ignore pmap_enter()
+* failures; it's not critical that we
+* enter these right now.
+*/
+   (void) pmap_enter(ufi->orig_map->pmap, currva,
+   VM_PAGE_TO_PHYS(pages[lcv]) | flt->pa_flags,
+   flt->enter_prot & MASK(ufi->entry),
+   PMAP_CANFAIL |
+(flt->wired ? PMAP_WIRED : 0));
+
+   /*
+* NOTE: page can't be PG_WANTED because
+* we've held the lock the whole time
+* we've had the handle.
+*/
+   atomic_clearbits_int(&pages[lcv]->pg_flags, PG_BUSY);
+   UVM_PAGE_OWN(pages[lcv], NULL);
+   }
+   pmap_update(ufi->orig_map->pmap);
+
+   return uobjpage;
+}
+
+/*
  * uvm_fault_lower: handle lower fault.
  *
  */
@@ -1049,10 +1141,9 @@ uvm_fault_lower(struct uvm_faultinfo *uf
struct vm_amap *amap = ufi->entry->aref.ar_amap;
struct uvm_object *uobj = ufi->entry->object.uvm_obj;
boolean_t promote, locked;
-   int result, lcv, gotpages;
+   int result;
struct vm_page *uobjpage, *pg = NULL;
struct vm_anon *anon = NULL;
-   vaddr_t currva;
voff_t uoff;
 
/*
@@ -1081,82 +1172,11 @@ uvm_fault_lower(struct uvm_faultinfo *uf
 * we ask (with pgo_get) the object for resident pages that we care
 * about and attempt to map them in.  we do not let pgo_get block
 * (PGO_LOCKED).
-*
-* ("get" has the option of doing a pmap_enter for us)
 */
-   if (uobj != NULL) {
-   counters_inc(uvmexp_counters, flt_lget);
-   gotpages = flt->npages;
-   (void) uobj->pgops->pgo_get(uobj, ufi->entry->offset +
-   (flt->startva - ufi->entry->start),
-   pages, &gotpages, flt->centeridx,
-   flt->access_type & MASK(ufi->entry),
-   ufi->entry->advice, PGO_LOCKED);
-
-   /* check for pag

Re: uvm_fault_lower refactoring

2021-02-23 Thread Martin Pieuchot
On 23/02/21(Tue) 00:24, Mark Kettenis wrote:
> > Date: Mon, 22 Feb 2021 10:10:21 +0100
> > From: Martin Pieuchot 
> > 
> > On 16/02/21(Tue) 11:20, Martin Pieuchot wrote:
> > > Start by moving `pgo_fault' handler outside of uvm_fault_lower().
> > > 
> > > If a page has a backing object that prefer to handler to fault itself
> > > the locking will be different, so keep it under KERNEL_LOCK() for the
> > > moment and make it separate from the rest of uvm_fault_lower().
> > > 
> > > ok?
> > 
> > Anyone?
> 
> This diverges from NetBSD; you think that's a good idea?

Which tree are you looking at?  I'm doing this exactly to reduce
differences with NetBSD.  r1.228 of uvm_fault.c in NetBSD contains this
logic in uvm_fault_internal().

> > > Index: uvm/uvm_fault.c
> > > ===
> > > RCS file: /cvs/src/sys/uvm/uvm_fault.c,v
> > > retrieving revision 1.115
> > > diff -u -p -r1.115 uvm_fault.c
> > > --- uvm/uvm_fault.c   16 Feb 2021 09:10:17 -  1.115
> > > +++ uvm/uvm_fault.c   16 Feb 2021 10:13:58 -
> > > @@ -598,10 +598,37 @@ uvm_fault(vm_map_t orig_map, vaddr_t vad
> > >   /* case 1: fault on an anon in our amap */
> > >   error = uvm_fault_upper(&ufi, &flt, anons, fault_type);
> > >   } else {
> > > - /* case 2: fault on backing object or zero fill */
> > > - KERNEL_LOCK();
> > > - error = uvm_fault_lower(&ufi, &flt, pages, fault_type);
> > > - KERNEL_UNLOCK();
> > > + struct uvm_object *uobj = ufi.entry->object.uvm_obj;
> > > +
> > > + /*
> > > +  * if the desired page is not shadowed by the amap and
> > > +  * we have a backing object, then we check to see if
> > > +  * the backing object would prefer to handle the fault
> > > +  * itself (rather than letting us do it with the usual
> > > +  * pgo_get hook).  the backing object signals this by
> > > +  * providing a pgo_fault routine.
> > > +  */
> > > + if (uobj != NULL && uobj->pgops->pgo_fault != NULL) {
> > > + KERNEL_LOCK();
> > > + error = uobj->pgops->pgo_fault(&ufi,
> > > + flt.startva, pages, flt.npages,
> > > + flt.centeridx, fault_type, flt.access_type,
> > > + PGO_LOCKED);
> > > + KERNEL_UNLOCK();
> > > +
> > > + if (error == VM_PAGER_OK)
> > > + error = 0;
> > > + else if (error == VM_PAGER_REFAULT)
> > > + error = ERESTART;
> > > + else
> > > + error = EACCES;
> > > + } else {
> > > + /* case 2: fault on backing obj or zero fill */
> > > + KERNEL_LOCK();
> > > + error = uvm_fault_lower(&ufi, &flt, pages,
> > > + fault_type);
> > > + KERNEL_UNLOCK();
> > > + }
> > >   }
> > >   }
> > >  
> > > @@ -1054,26 +1081,6 @@ uvm_fault_lower(struct uvm_faultinfo *uf
> > >   struct vm_anon *anon = NULL;
> > >   vaddr_t currva;
> > >   voff_t uoff;
> > > -
> > > - /*
> > > -  * if the desired page is not shadowed by the amap and we have a
> > > -  * backing object, then we check to see if the backing object would
> > > -  * prefer to handle the fault itself (rather than letting us do it
> > > -  * with the usual pgo_get hook).  the backing object signals this by
> > > -  * providing a pgo_fault routine.
> > > -  */
> > > - if (uobj != NULL && uobj->pgops->pgo_fault != NULL) {
> > > - result = uobj->pgops->pgo_fault(ufi, flt->startva, pages,
> > > - flt->npages, flt->centeridx, fault_type, flt->access_type,
> > > - PGO_LOCKED);
> > > -
> > > - if (result == VM_PAGER_OK)
> > > - return (0); /* pgo_fault did pmap enter */
> > > - else if (result == VM_PAGER_REFAULT)
> > > - return ERESTART;/* try again! */
> > > - else
> > > - return (EACCES);
> > > - }
> > >  
> > >   /*
> > >* now, if the desired page is not shadowed by the amap and we have
> > > 
> > 
> > 



Different fix for vnode deadlock

2021-02-23 Thread Martin Pieuchot
Page faults on vnode-backed objects commonly end up calling VOP_READ(9)
or VOP_WRITE(9) to go through the buffer cache.  This implies grabbing
an inode lock after any UVM locking.

On the other hand changing the size of a vnode results in entering UVM,
generally via calling uvm_vnp_setsize() with a locked inode.
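
In other words the two paths take the locks in opposite order, roughly:

        page fault:    UVM locking -> VOP_READ/VOP_WRITE -> inode lock
        vnode resize:  inode lock  -> uvm_vnp_setsize()  -> UVM locking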

Syzkaller exposed a deadlock that anton@ fixed in r1.108 of uvm/uvm_vnode.c
by making the page fault path grab the inode lock earlier.  Sadly such a
change isn't compatible with the finer locking required to unlock the lower
part of the UVM fault handler.

UVM's code makes use of the PG_BUSY flag to ask other threads not to touch
a given page.  This is done to keep ownership of a page after having
released its associated lock.  This is currently hard to follow because
the locking code has been removed ;)

With the current fix, the PG_BUSY flag is set after grabbing the inode
lock, which creates a lock ordering problem because `uobj->vmlock' is
released after setting the flag.

So the diff below takes a different approach: if the thread that faulted
finds that the `inode' is contended it stops there and restarts the fault.
This has the side effect of clearing PG_BUSY on the pages and allows the
other thread to make progress.

This is enough to move forward with `uobj->vmlock' without changing the
interaction between the existing buffer cache and UVM locking (thanks!).

I couldn't trigger the deadlock with regress/sys/uvm/vnode with this
diff. 

Is the explanation clear enough?  Comments?  Oks?

Index: uvm/uvm_vnode.c
===
RCS file: /cvs/src/sys/uvm/uvm_vnode.c,v
retrieving revision 1.108
diff -u -p -r1.108 uvm_vnode.c
--- uvm/uvm_vnode.c 26 Oct 2020 19:48:19 -  1.108
+++ uvm/uvm_vnode.c 23 Feb 2021 10:46:50 -
@@ -90,9 +90,6 @@ intuvn_io(struct uvm_vnode *, vm_page
 int uvn_put(struct uvm_object *, vm_page_t *, int, boolean_t);
 voiduvn_reference(struct uvm_object *);
 
-int uvm_vnode_lock(struct uvm_vnode *);
-voiduvm_vnode_unlock(struct uvm_vnode *);
-
 /*
  * master pager structure
  */
@@ -878,16 +875,11 @@ uvn_cluster(struct uvm_object *uobj, vof
 int
 uvn_put(struct uvm_object *uobj, struct vm_page **pps, int npages, int flags)
 {
-   struct uvm_vnode *uvn = (struct uvm_vnode *)uobj;
int retval;
 
KERNEL_ASSERT_LOCKED();
 
-   retval = uvm_vnode_lock(uvn);
-   if (retval)
-   return(retval);
-   retval = uvn_io(uvn, pps, npages, flags, UIO_WRITE);
-   uvm_vnode_unlock(uvn);
+   retval = uvn_io((struct uvm_vnode*)uobj, pps, npages, flags, UIO_WRITE);
 
return(retval);
 }
@@ -905,10 +897,9 @@ int
 uvn_get(struct uvm_object *uobj, voff_t offset, struct vm_page **pps,
 int *npagesp, int centeridx, vm_prot_t access_type, int advice, int flags)
 {
-   struct uvm_vnode *uvn = (struct uvm_vnode *)uobj;
voff_t current_offset;
struct vm_page *ptmp;
-   int lcv, result, gotpages, retval;
+   int lcv, result, gotpages;
boolean_t done;
 
KERNEL_ASSERT_LOCKED();
@@ -983,18 +974,6 @@ uvn_get(struct uvm_object *uobj, voff_t 
}
 
/*
-* Before getting non-resident pages which must be populate with data
-* using I/O on the backing vnode, lock the same vnode. Such pages are
-* about to be allocated and busied (i.e. PG_BUSY) by the current
-* thread. Allocating and busying the page(s) before acquiring the
-* vnode lock could cause a deadlock with uvn_flush() which acquires the
-* vnode lock before waiting on pages to become unbusy and then flushed.
-*/
-   retval = uvm_vnode_lock(uvn);
-   if (retval)
-   return(retval);
-
-   /*
 * step 2: get non-resident or busy pages.
 * data structures are unlocked.
 *
@@ -1080,15 +1059,14 @@ uvn_get(struct uvm_object *uobj, voff_t 
 * we have a "fake/busy/clean" page that we just allocated.  do
 * I/O to fill it with valid data.
 */
-   result = uvn_io(uvn, &ptmp, 1, PGO_SYNCIO, UIO_READ);
+   result = uvn_io((struct uvm_vnode *) uobj, &ptmp, 1,
+   PGO_SYNCIO|PGO_NOWAIT, UIO_READ);
 
/*
 * I/O done.  because we used syncio the result can not be
 * PEND or AGAIN.
 */
if (result != VM_PAGER_OK) {
-   uvm_vnode_unlock(uvn);
-
if (ptmp->pg_flags & PG_WANTED)
wakeup(ptmp);
 
@@ -1119,15 +1097,12 @@ uvn_get(struct uvm_object *uobj, voff_t 
 
}
 
-   uvm_vnode_unlock(uvn);
-
return (VM_PAGER_OK);
 }
 
 /*
  * uvn_io: do I/O to a vnode
  *
- * => uvn: the backing vnode must be locked
  * => prefer map unlocked (not required)
  * => flags: PGO_SYNCIO -- use sync. I/O
  * 

Re: pdaemon vs anon locking

2021-02-24 Thread Martin Pieuchot
On 17/02/21(Wed) 11:56, Martin Pieuchot wrote:
> Diff below adds anon locking to the page daemon.  It will become
> necessary to guarantee exclusive access to an anon as soon as the
> KERNEL_LOCK() is removed from the fault handler.

Anyone?  This should have been part of the already committed anon
locking.

> Index: uvm/uvm_pdaemon.c
> ===
> RCS file: /cvs/src/sys/uvm/uvm_pdaemon.c,v
> retrieving revision 1.88
> diff -u -p -r1.88 uvm_pdaemon.c
> --- uvm/uvm_pdaemon.c 24 Nov 2020 13:49:09 -  1.88
> +++ uvm/uvm_pdaemon.c 17 Feb 2021 10:05:43 -
> @@ -460,7 +460,13 @@ uvmpd_scan_inactive(struct pglist *pglst
>   if (p->pg_flags & PQ_ANON) {
>   anon = p->uanon;
>   KASSERT(anon != NULL);
> + if (rw_enter(anon->an_lock,
> + RW_WRITE|RW_NOSLEEP)) {
> + /* lock failed, skip this page */
> + continue;
> + }
>   if (p->pg_flags & PG_BUSY) {
> + rw_exit(anon->an_lock);
>   uvmexp.pdbusy++;
>   /* someone else owns page, skip it */
>   continue;
> @@ -504,6 +510,7 @@ uvmpd_scan_inactive(struct pglist *pglst
>  
>   /* remove from object */
>   anon->an_page = NULL;
> + rw_exit(anon->an_lock);
>   }
>   continue;
>   }
> @@ -513,6 +520,9 @@ uvmpd_scan_inactive(struct pglist *pglst
>* free target when all the current pageouts complete.
>*/
>   if (free + uvmexp.paging > uvmexp.freetarg << 2) {
> + if (anon) {
> + rw_exit(anon->an_lock);
> + }
>   continue;
>   }
>  
> @@ -525,6 +535,9 @@ uvmpd_scan_inactive(struct pglist *pglst
>   if ((p->pg_flags & PQ_SWAPBACKED) && uvm_swapisfull()) {
>   dirtyreacts++;
>   uvm_pageactivate(p);
> + if (anon) {
> + rw_exit(anon->an_lock);
> + }
>   continue;
>   }
>  
> @@ -591,6 +604,8 @@ uvmpd_scan_inactive(struct pglist *pglst
>   &p->pg_flags,
>   PG_BUSY);
>   UVM_PAGE_OWN(p, NULL);
> + if (anon)
> + rw_exit(anon->an_lock);
>   continue;
>   }
>   swcpages = 0;   /* cluster is empty */
> @@ -622,6 +637,9 @@ uvmpd_scan_inactive(struct pglist *pglst
>*/
>   if (swap_backed) {
>   if (p) {/* if we just added a page to cluster */
> + if (anon)
> + rw_exit(anon->an_lock);
> +
>   /* cluster not full yet? */
>   if (swcpages < swnpages)
>   continue;
> @@ -730,6 +748,12 @@ uvmpd_scan_inactive(struct pglist *pglst
>   /* relock p's object: page queues not lock yet, so
>* no need for "try" */
>  
> + /* !swap_backed case: already locked... */
> + if (swap_backed) {
> + if (anon)
> + rw_enter(anon->an_lock, RW_WRITE);
> + }
> +
>  #ifdef DIAGNOSTIC
>   if (result == VM_PAGER_UNLOCK)
>   panic("pagedaemon: pageout returned "
> @@ -754,6 +778,7 @@ uvmpd_scan_inactive(struct pglist *pglst
>   anon->an_page = NULL;
>   p->uanon = NULL;
>  
> + rw_exit(anon->a

uvm: modify `uvmexp.swpgonly' atomically

2021-02-24 Thread Martin Pieuchot
As soon as the upper part of the page fault handler is executed w/o
KERNEL_LOCK(), uvm_anfree_list() will also be executed without it.

To avoid corrupting the value of the `uvmexp.swpgonly' counter, use atomic
operations to modify it.
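
Without atomics two CPUs can race on the read-modify-write and lose an
update, e.g.:

        /*
         * CPU0: load swpgonly == N        CPU1: load swpgonly == N
         * CPU0: store N - 1               CPU1: store N - 1
         *
         * One decrement is lost; atomic_dec_int(&uvmexp.swpgonly) makes
         * the read-modify-write indivisible.
         */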

ok?

Index: uvm/uvm_anon.c
===
RCS file: /cvs/src/sys/uvm/uvm_anon.c,v
retrieving revision 1.51
diff -u -p -r1.51 uvm_anon.c
--- uvm/uvm_anon.c  19 Jan 2021 13:21:36 -  1.51
+++ uvm/uvm_anon.c  24 Feb 2021 09:48:41 -
@@ -120,9 +120,9 @@ uvm_anfree_list(struct vm_anon *anon, st
}
} else {
if (anon->an_swslot != 0) {
-   /* this page is no longer only in swap. */
+   /* This page is no longer only in swap. */
KASSERT(uvmexp.swpgonly > 0);
-   uvmexp.swpgonly--;
+   atomic_dec_int(&uvmexp.swpgonly);
}
}
anon->an_lock = NULL;
Index: uvm/uvm_aobj.c
===
RCS file: /cvs/src/sys/uvm/uvm_aobj.c,v
retrieving revision 1.90
diff -u -p -r1.90 uvm_aobj.c
--- uvm/uvm_aobj.c  11 Jan 2021 18:51:09 -  1.90
+++ uvm/uvm_aobj.c  24 Feb 2021 09:50:39 -
@@ -381,7 +381,7 @@ uao_free(struct uvm_aobj *aobj)
 * this page is no longer
 * only in swap.
 */
-   uvmexp.swpgonly--;
+   atomic_dec_int(&uvmexp.swpgonly);
}
 
next = LIST_NEXT(elt, list);
@@ -400,7 +400,7 @@ uao_free(struct uvm_aobj *aobj)
if (slot) {
uvm_swap_free(slot, 1);
/* this page is no longer only in swap. */
-   uvmexp.swpgonly--;
+   atomic_dec_int(&uvmexp.swpgonly);
}
}
free(aobj->u_swslots, M_UVMAOBJ, aobj->u_pages * sizeof(int));
@@ -1549,6 +1549,6 @@ uao_dropswap_range(struct uvm_object *uo
 */
if (swpgonlydelta > 0) {
KASSERT(uvmexp.swpgonly >= swpgonlydelta);
-   uvmexp.swpgonly -= swpgonlydelta;
+   atomic_add_int(&uvmexp.swpgonly, -swpgonlydelta);
}
 }
Index: uvm/uvm_km.c
===
RCS file: /cvs/src/sys/uvm/uvm_km.c,v
retrieving revision 1.139
diff -u -p -r1.139 uvm_km.c
--- uvm/uvm_km.c15 Dec 2020 22:14:42 -  1.139
+++ uvm/uvm_km.c24 Feb 2021 09:52:19 -
@@ -242,6 +242,7 @@ uvm_km_pgremove(struct uvm_object *uobj,
struct vm_page *pp;
voff_t curoff;
int slot;
+   int swpgonlydelta = 0;
 
KASSERT(uobj->pgops == &aobj_pager);
 
@@ -262,8 +263,13 @@ uvm_km_pgremove(struct uvm_object *uobj,
uvm_pagefree(pp);
uvm_unlock_pageq();
} else if (slot != 0) {
-   uvmexp.swpgonly--;
+   swpgonlydelta++;
}
+   }
+
+   if (swpgonlydelta > 0) {
+   KASSERT(uvmexp.swpgonly >= swpgonlydelta);
+   atomic_add_int(&uvmexp.swpgonly, -swpgonlydelta);
}
 }
 
Index: uvm/uvm_pdaemon.c
===
RCS file: /cvs/src/sys/uvm/uvm_pdaemon.c,v
retrieving revision 1.88
diff -u -p -r1.88 uvm_pdaemon.c
--- uvm/uvm_pdaemon.c   24 Nov 2020 13:49:09 -  1.88
+++ uvm/uvm_pdaemon.c   24 Feb 2021 09:53:48 -
@@ -485,7 +485,7 @@ uvmpd_scan_inactive(struct pglist *pglst
if (p->pg_flags & PG_CLEAN) {
if (p->pg_flags & PQ_SWAPBACKED) {
/* this page now lives only in swap */
-   uvmexp.swpgonly++;
+   atomic_inc_int(&uvmexp.swpgonly);
}
 
/* zap all mappings with pmap_page_protect... */
@@ -963,7 +963,7 @@ uvmpd_drop(struct pglist *pglst)
if (p->pg_flags & PG_CLEAN) {
if (p->pg_flags & PQ_SWAPBACKED) {
/* this page now lives only in swap */
-   uvmexp.swpgonly++;
+   atomic_inc_int(&uvmexp.swpgonly);
}
 
/* zap all mappings with pmap_page_protect... */
Index: uvm/uvm_swap.c
===
RCS file: /cvs/src/sys/uvm/uvm_swap.c,v
retrieving revision 1.148
diff -u -p -r1.148 uvm_swap.c

Merge issignal() and CURSIG()

2021-03-02 Thread Martin Pieuchot
The t/rw/msleep(9) functions call CURSIG(), which needs the KERNEL_LOCK().

To remove this requirement I'd like to start by merging CURSIG() with
its underlying function issignal().  The goal of this merge is to avoid
accessing shared values like `ps_siglist' multiple times.

The diff below moves the content of the CURSIG() macro into issignal()
which shows that many checks are redundant.

Comments, oks?

Index: kern/kern_sig.c
===
RCS file: /cvs/src/sys/kern/kern_sig.c,v
retrieving revision 1.273
diff -u -p -r1.273 kern_sig.c
--- kern/kern_sig.c 15 Feb 2021 09:35:59 -  1.273
+++ kern/kern_sig.c 2 Mar 2021 10:50:49 -
@@ -1035,7 +1035,7 @@ ptsignal(struct proc *p, int signum, enu
goto out;
/*
 * Process is sleeping and traced... make it runnable
-* so it can discover the signal in issignal() and stop
+* so it can discover the signal in cursig() and stop
 * for the parent.
 */
if (pr->ps_flags & PS_TRACED)
@@ -1159,28 +1159,36 @@ out:
 }
 
 /*
+ * Determine signal that should be delivered to process p, the current
+ * process, 0 if none.
+ *
  * If the current process has received a signal (should be caught or cause
  * termination, should interrupt current syscall), return the signal number.
  * Stop signals with default action are processed immediately, then cleared;
  * they aren't returned.  This is checked after each entry to the system for
- * a syscall or trap (though this can usually be done without calling issignal
- * by checking the pending signal masks in the CURSIG macro.) The normal call
- * sequence is
+ * a syscall or trap. The normal call sequence is
  *
- * while (signum = CURSIG(curproc))
+ * while (signum = cursig(curproc))
  * postsig(signum);
  *
  * Assumes that if the P_SINTR flag is set, we're holding both the
  * kernel and scheduler locks.
  */
 int
-issignal(struct proc *p)
+cursig(struct proc *p)
 {
struct process *pr = p->p_p;
-   int signum, mask, prop;
+   int sigpending, signum, mask, prop;
int dolock = (p->p_flag & P_SINTR) == 0;
int s;
 
+   sigpending = (p->p_siglist | pr->ps_siglist);
+   if (sigpending == 0)
+   return 0;
+
+   if (!ISSET(pr->ps_flags, PS_TRACED) && SIGPENDING(p) == 0)
+   return 0;
+
for (;;) {
mask = SIGPENDING(p);
if (pr->ps_flags & PS_PPWAIT)
@@ -1304,7 +1312,7 @@ issignal(struct proc *p)
 */
if ((prop & SA_CONT) == 0 &&
(pr->ps_flags & PS_TRACED) == 0)
-   printf("issignal\n");
+   printf("%s\n", __func__);
break;  /* == ignore */
default:
/*
@@ -1766,7 +1774,7 @@ sys___thrsigdivert(struct proc *p, void 
 
dosigsuspend(p, p->p_sigmask &~ mask);
for (;;) {
-   si.si_signo = CURSIG(p);
+   si.si_signo = cursig(p);
if (si.si_signo != 0) {
sigset_t smask = sigmask(si.si_signo);
if (smask & mask) {
@@ -1907,7 +1915,7 @@ userret(struct proc *p)
 
if (SIGPENDING(p) != 0) {
KERNEL_LOCK();
-   while ((signum = CURSIG(p)) != 0)
+   while ((signum = cursig(p)) != 0)
postsig(p, signum);
KERNEL_UNLOCK();
}
@@ -1923,7 +1931,7 @@ userret(struct proc *p)
p->p_sigmask = p->p_oldmask;
 
KERNEL_LOCK();
-   while ((signum = CURSIG(p)) != 0)
+   while ((signum = cursig(p)) != 0)
postsig(p, signum);
KERNEL_UNLOCK();
}
Index: kern/kern_synch.c
===
RCS file: /cvs/src/sys/kern/kern_synch.c,v
retrieving revision 1.176
diff -u -p -r1.176 kern_synch.c
--- kern/kern_synch.c   8 Feb 2021 10:51:02 -   1.176
+++ kern/kern_synch.c   2 Mar 2021 10:49:15 -
@@ -479,7 +479,7 @@ sleep_signal_check(void)
 
if ((err = single_thread_check(p, 1)) != 0)
return err;
-   if ((sig = CURSIG(p)) != 0) {
+   if ((sig = cursig(p)) != 0) {
if (p->p_p->ps_sigacts->ps_sigintr & sigmask(sig))
return EINTR;
else
Index: sys/signalvar.h
===
RCS file: /cvs/src/sys/sys/signalvar.h,v
retrieving revision 1.45
diff -u -p -r1.45 signalvar.h
--- sys/signalvar.h 8 Nov 2020 20:37:24 -   1.45
+++ sys/signalvar.h 2 Mar 2021 10:49:41 -
@@ -72,17 +72,6 @@ struct   sigacts {
(((p)->p_siglist | (p)->p_p->ps_siglist) & ~(p)->p_sigma

Re: uvm: modify `uvmexp.swpgonly' atomically

2021-03-03 Thread Martin Pieuchot
On 24/02/21(Wed) 11:33, Martin Pieuchot wrote:
> As soon as the upper part of the page fault handler is executed w/o
> KERNEL_LOCK(), uvm_anfree_list() will also be executed without it.
> 
> To not corrupt the value of `uvmexp.swpgonly' counter, use atomic
> operations to modify it.
> 
> ok?

Anyone?

> Index: uvm/uvm_anon.c
> ===
> RCS file: /cvs/src/sys/uvm/uvm_anon.c,v
> retrieving revision 1.51
> diff -u -p -r1.51 uvm_anon.c
> --- uvm/uvm_anon.c19 Jan 2021 13:21:36 -  1.51
> +++ uvm/uvm_anon.c24 Feb 2021 09:48:41 -
> @@ -120,9 +120,9 @@ uvm_anfree_list(struct vm_anon *anon, st
>   }
>   } else {
>   if (anon->an_swslot != 0) {
> - /* this page is no longer only in swap. */
> + /* This page is no longer only in swap. */
>   KASSERT(uvmexp.swpgonly > 0);
> - uvmexp.swpgonly--;
> + atomic_dec_int(&uvmexp.swpgonly);
>   }
>   }
>   anon->an_lock = NULL;
> Index: uvm/uvm_aobj.c
> ===
> RCS file: /cvs/src/sys/uvm/uvm_aobj.c,v
> retrieving revision 1.90
> diff -u -p -r1.90 uvm_aobj.c
> --- uvm/uvm_aobj.c11 Jan 2021 18:51:09 -  1.90
> +++ uvm/uvm_aobj.c24 Feb 2021 09:50:39 -
> @@ -381,7 +381,7 @@ uao_free(struct uvm_aobj *aobj)
>* this page is no longer
>* only in swap.
>*/
> - uvmexp.swpgonly--;
> + atomic_dec_int(&uvmexp.swpgonly);
>   }
>  
>   next = LIST_NEXT(elt, list);
> @@ -400,7 +400,7 @@ uao_free(struct uvm_aobj *aobj)
>   if (slot) {
>   uvm_swap_free(slot, 1);
>   /* this page is no longer only in swap. */
> - uvmexp.swpgonly--;
> + atomic_dec_int(&uvmexp.swpgonly);
>   }
>   }
>   free(aobj->u_swslots, M_UVMAOBJ, aobj->u_pages * sizeof(int));
> @@ -1549,6 +1549,6 @@ uao_dropswap_range(struct uvm_object *uo
>*/
>   if (swpgonlydelta > 0) {
>   KASSERT(uvmexp.swpgonly >= swpgonlydelta);
> - uvmexp.swpgonly -= swpgonlydelta;
> + atomic_add_int(&uvmexp.swpgonly, -swpgonlydelta);
>   }
>  }
> Index: uvm/uvm_km.c
> ===
> RCS file: /cvs/src/sys/uvm/uvm_km.c,v
> retrieving revision 1.139
> diff -u -p -r1.139 uvm_km.c
> --- uvm/uvm_km.c  15 Dec 2020 22:14:42 -  1.139
> +++ uvm/uvm_km.c  24 Feb 2021 09:52:19 -
> @@ -242,6 +242,7 @@ uvm_km_pgremove(struct uvm_object *uobj,
>   struct vm_page *pp;
>   voff_t curoff;
>   int slot;
> + int swpgonlydelta = 0;
>  
>   KASSERT(uobj->pgops == &aobj_pager);
>  
> @@ -262,8 +263,13 @@ uvm_km_pgremove(struct uvm_object *uobj,
>   uvm_pagefree(pp);
>   uvm_unlock_pageq();
>   } else if (slot != 0) {
> - uvmexp.swpgonly--;
> + swpgonlydelta++;
>   }
> + }
> +
> + if (swpgonlydelta > 0) {
> + KASSERT(uvmexp.swpgonly >= swpgonlydelta);
> + atomic_add_int(&uvmexp.swpgonly, -swpgonlydelta);
>   }
>  }
>  
> Index: uvm/uvm_pdaemon.c
> ===
> RCS file: /cvs/src/sys/uvm/uvm_pdaemon.c,v
> retrieving revision 1.88
> diff -u -p -r1.88 uvm_pdaemon.c
> --- uvm/uvm_pdaemon.c 24 Nov 2020 13:49:09 -  1.88
> +++ uvm/uvm_pdaemon.c 24 Feb 2021 09:53:48 -
> @@ -485,7 +485,7 @@ uvmpd_scan_inactive(struct pglist *pglst
>   if (p->pg_flags & PG_CLEAN) {
>   if (p->pg_flags & PQ_SWAPBACKED) {
>   /* this page now lives only in swap */
> - uvmexp.swpgonly++;
> + atomic_inc_int(&uvmexp.swpgonly);
>   }
>  
>   /* zap all mappings with pmap_page_protect... */
> @@ -963,7 +963,7 @@ uvmpd_drop(struct pglist *pglst)
>   if (p->pg_flags & PG_CLEAN) {
>

Kill SINGLE_PTRACE

2021-03-04 Thread Martin Pieuchot
SINGLE_PTRACE has almost the same semantics as SINGLE_SUSPEND.  The
difference is that there's no need to wait for other threads to be
parked.

Diff below changes single_thread_set() to be explicit when waiting is
required.  This allows us to get rid of SINGLE_PTRACE now and soon to
use SINGLE_SUSPEND around proc_stop(), even when the thread is not being
traced.
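
A former SINGLE_PTRACE user would then presumably look something like
this (sketch, not part of the diff):

        /* stop the siblings but do not wait for them to park yet */
        single_thread_set(p, SINGLE_SUSPEND, 0);
        /* ... */
        /* later, once it is safe to wait */
        single_thread_wait(p->p_p, 1);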

ok?

Index: kern/kern_exec.c
===
RCS file: /cvs/src/sys/kern/kern_exec.c,v
retrieving revision 1.219
diff -u -p -r1.219 kern_exec.c
--- kern/kern_exec.c15 Oct 2020 16:31:11 -  1.219
+++ kern/kern_exec.c4 Mar 2021 09:17:09 -
@@ -432,7 +432,7 @@ sys_execve(struct proc *p, void *v, regi
 * we're committed: any further errors will kill the process, so
 * kill the other threads now.
 */
-   single_thread_set(p, SINGLE_EXIT, 0);
+   single_thread_set(p, SINGLE_EXIT, 1);
 
/*
 * Prepare vmspace for remapping. Note that uvmspace_exec can replace
Index: kern/kern_exit.c
===
RCS file: /cvs/src/sys/kern/kern_exit.c,v
retrieving revision 1.196
diff -u -p -r1.196 kern_exit.c
--- kern/kern_exit.c15 Feb 2021 09:35:59 -  1.196
+++ kern/kern_exit.c4 Mar 2021 09:17:10 -
@@ -136,7 +136,7 @@ exit1(struct proc *p, int xexit, int xsi
} else {
/* nope, multi-threaded */
if (flags == EXIT_NORMAL)
-   single_thread_set(p, SINGLE_EXIT, 0);
+   single_thread_set(p, SINGLE_EXIT, 1);
else if (flags == EXIT_THREAD)
single_thread_check(p, 0);
}
Index: kern/kern_sig.c
===
RCS file: /cvs/src/sys/kern/kern_sig.c,v
retrieving revision 1.274
diff -u -p -r1.274 kern_sig.c
--- kern/kern_sig.c 4 Mar 2021 09:02:37 -   1.274
+++ kern/kern_sig.c 4 Mar 2021 09:17:10 -
@@ -1490,7 +1490,7 @@ sigexit(struct proc *p, int signum)
 
/* if there are other threads, pause them */
if (P_HASSIBLING(p))
-   single_thread_set(p, SINGLE_SUSPEND, 0);
+   single_thread_set(p, SINGLE_SUSPEND, 1);
 
if (coredump(p) == 0)
signum |= WCOREFLAG;
@@ -2000,14 +2000,12 @@ single_thread_check(struct proc *p, int 
  * where the other threads should stop:
  *  - SINGLE_SUSPEND: stop wherever they are, will later either be told to exit
  *(by setting to SINGLE_EXIT) or be released (via single_thread_clear())
- *  - SINGLE_PTRACE: stop wherever they are, will wait for them to stop
- *later (via single_thread_wait()) and released as with SINGLE_SUSPEND
  *  - SINGLE_UNWIND: just unwind to kernel boundary, will be told to exit
  *or released as with SINGLE_SUSPEND
  *  - SINGLE_EXIT: unwind to kernel boundary and exit
  */
 int
-single_thread_set(struct proc *p, enum single_thread_mode mode, int deep)
+single_thread_set(struct proc *p, enum single_thread_mode mode, int wait)
 {
struct process *pr = p->p_p;
struct proc *q;
@@ -2016,7 +2014,7 @@ single_thread_set(struct proc *p, enum s
KASSERT(curproc == p);
 
SCHED_LOCK(s);
-   error = single_thread_check_locked(p, deep, s);
+   error = single_thread_check_locked(p, (mode == SINGLE_UNWIND), s);
if (error) {
SCHED_UNLOCK(s);
return error;
@@ -2024,7 +2022,6 @@ single_thread_set(struct proc *p, enum s
 
switch (mode) {
case SINGLE_SUSPEND:
-   case SINGLE_PTRACE:
break;
case SINGLE_UNWIND:
atomic_setbits_int(&pr->ps_flags, PS_SINGLEUNWIND);
@@ -2063,8 +2060,7 @@ single_thread_set(struct proc *p, enum s
/* if it's not interruptible, then just have to wait */
if (q->p_flag & P_SINTR) {
/* merely need to suspend?  just stop it */
-   if (mode == SINGLE_SUSPEND ||
-   mode == SINGLE_PTRACE) {
+   if (mode == SINGLE_SUSPEND) {
q->p_stat = SSTOP;
break;
}
@@ -2089,7 +2085,7 @@ single_thread_set(struct proc *p, enum s
}
SCHED_UNLOCK(s);
 
-   if (mode != SINGLE_PTRACE)
+   if (wait)
single_thread_wait(pr, 1);
 
return 0;



Read `ps_single' once

2021-03-04 Thread Martin Pieuchot
Running t/rw/msleep(9) w/o KERNEL_LOCK() implies that a thread can
change the value of `ps_single' while one of its siblings might be
dereferencing it.  

To prevent inconsistencies in the code executed by sibling threads, the
diff below makes sure `ps_single' is dereferenced only once in various
parts of the kernel.
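
A sketch of the race being closed:

        struct proc *st;

        /* racy: two reads of `ps_single', a sibling may clear it in between */
        if (pr->ps_single)
                ptsignal(pr->ps_single, SIGKILL, STHREAD);

        /* safe: dereference `ps_single' only once */
        st = pr->ps_single;
        if (st != NULL)
                ptsignal(st, SIGKILL, STHREAD);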

ok?

Index: kern/kern_exit.c
===
RCS file: /cvs/src/sys/kern/kern_exit.c,v
retrieving revision 1.196
diff -u -p -r1.196 kern_exit.c
--- kern/kern_exit.c15 Feb 2021 09:35:59 -  1.196
+++ kern/kern_exit.c4 Mar 2021 09:29:27 -
@@ -274,6 +274,8 @@ exit1(struct proc *p, int xexit, int xsi
 */
if (qr->ps_flags & PS_TRACED &&
!(qr->ps_flags & PS_EXITING)) {
+   struct proc *st;
+
process_untrace(qr);
 
/*
@@ -281,9 +283,9 @@ exit1(struct proc *p, int xexit, int xsi
 * direct the signal to the active
 * thread to avoid deadlock.
 */
-   if (qr->ps_single)
-   ptsignal(qr->ps_single, SIGKILL,
-   STHREAD);
+   st = qr->ps_single;
+   if (st != NULL)
+   ptsignal(st, SIGKILL, STHREAD);
else
prsignal(qr, SIGKILL);
} else {
@@ -510,7 +512,7 @@ dowait4(struct proc *q, pid_t pid, int *
 {
int nfound;
struct process *pr;
-   struct proc *p;
+   struct proc *p, *st;
int error;
 
if (pid == 0)
@@ -541,10 +543,11 @@ loop:
proc_finish_wait(q, p);
return (0);
}
+
+   st = pr->ps_single;
if (pr->ps_flags & PS_TRACED &&
-   (pr->ps_flags & PS_WAITED) == 0 && pr->ps_single &&
-   pr->ps_single->p_stat == SSTOP &&
-   (pr->ps_single->p_flag & P_SUSPSINGLE) == 0) {
+   (pr->ps_flags & PS_WAITED) == 0 && st != NULL &&
+   st->p_stat == SSTOP && (st->p_flag & P_SUSPSINGLE) == 0) {
if (single_thread_wait(pr, 0))
goto loop;
 
Index: kern/sys_process.c
===
RCS file: /cvs/src/sys/kern/sys_process.c,v
retrieving revision 1.86
diff -u -p -r1.86 sys_process.c
--- kern/sys_process.c  8 Feb 2021 10:51:02 -   1.86
+++ kern/sys_process.c  4 Mar 2021 09:29:27 -
@@ -273,7 +273,7 @@ sys_ptrace(struct proc *p, void *v, regi
 int
 ptrace_ctrl(struct proc *p, int req, pid_t pid, caddr_t addr, int data)
 {
-   struct proc *t; /* target thread */
+   struct proc *st, *t;/* target thread */
struct process *tr; /* target process */
int error = 0;
int s;
@@ -433,8 +433,9 @@ ptrace_ctrl(struct proc *p, int req, pid
 * from where it stopped."
 */
 
-   if (pid < THREAD_PID_OFFSET && tr->ps_single)
-   t = tr->ps_single;
+   st = tr->ps_single;
+   if (pid < THREAD_PID_OFFSET && st != NULL)
+   t = st;
 
/* If the address parameter is not (int *)1, set the pc. */
if ((int *)addr != (int *)1)
@@ -464,8 +465,9 @@ ptrace_ctrl(struct proc *p, int req, pid
 * from where it stopped."
 */
 
-   if (pid < THREAD_PID_OFFSET && tr->ps_single)
-   t = tr->ps_single;
+   st = tr->ps_single;
+   if (pid < THREAD_PID_OFFSET && st != NULL)
+   t = st;
 
 #ifdef PT_STEP
/*
@@ -495,8 +497,9 @@ ptrace_ctrl(struct proc *p, int req, pid
break;
 
case PT_KILL:
-   if (pid < THREAD_PID_OFFSET && tr->ps_single)
-   t = tr->ps_single;
+   st = tr->ps_single;
+   if (pid < THREAD_PID_OFFSET && st != NULL)
+   t = st;
 
/* just send the process a KILL signal. */
data = SIGKILL;
@@ -536,6 +539,7 @@ int
 ptrace_kstate(struct proc *p, int req, pid_t pid, void *addr)
 {
struct process *tr; /* target process */
+   struct proc *st;
struct ptrace_event *pe = addr;
int error;
 
@@ -582,9 +586,9 @@ ptrace_kstate(struct proc *p, int req, p
tr->ps_ptmask = pe->pe_set_event;
break;
case PT_GET_PROCESS_STATE:
-   if (tr->ps_single)
-   

single_thread_clear() w/o KERNEL_LOCK()

2021-03-04 Thread Martin Pieuchot
single_thread_clear() manipulates the same data structures as
single_thread_set() and, as such, doesn't need the KERNEL_LOCK().

However cursig() does need some sort of serialization to ensure that
per-process data structures like signals, flags and the traced signal
number stay consistent.  So the diff below moves the assertion up in
preparation for more MP work.

ok?

Index: kern/kern_sig.c
===
RCS file: /cvs/src/sys/kern/kern_sig.c,v
retrieving revision 1.274
diff -u -p -r1.274 kern_sig.c
--- kern/kern_sig.c 4 Mar 2021 09:02:37 -   1.274
+++ kern/kern_sig.c 4 Mar 2021 09:35:47 -
@@ -1182,6 +1182,8 @@ cursig(struct proc *p)
int dolock = (p->p_flag & P_SINTR) == 0;
int s;
 
+   KERNEL_ASSERT_LOCKED();
+
sigpending = (p->p_siglist | pr->ps_siglist);
if (sigpending == 0)
return 0;
@@ -1225,11 +1227,7 @@ cursig(struct proc *p)
if (dolock)
SCHED_UNLOCK(s);
 
-   if (dolock)
-   KERNEL_LOCK();
single_thread_clear(p, 0);
-   if (dolock)
-   KERNEL_UNLOCK();
 
/*
 * If we are no longer being traced, or the parent
@@ -2128,7 +2126,6 @@ single_thread_clear(struct proc *p, int 
 
KASSERT(pr->ps_single == p);
KASSERT(curproc == p);
-   KERNEL_ASSERT_LOCKED();
 
SCHED_LOCK(s);
pr->ps_single = NULL;



Re: Kill SINGLE_PTRACE

2021-03-04 Thread Martin Pieuchot
On 04/03/21(Thu) 10:36, Claudio Jeker wrote:
> On Thu, Mar 04, 2021 at 10:28:50AM +0100, Martin Pieuchot wrote:
> > SINGLE_PTRACE has almost the same semantic as SINGLE_SUSPEND.  The
> > difference is that there's no need to wait for other threads to be
> > parked.
> > 
> > Diff below changes single_thread_set() to be explicit when waiting is
> > required.  This allows us to get rid of SINGLE_PTRACE now and soon to
> > use SINGLE_SUSPEND around proc_stop(), even when the thread is not being
> > traced.
> > 
> > ok?
> > 
> 
> 
> > @@ -2000,14 +2000,12 @@ single_thread_check(struct proc *p, int 
> >   * where the other threads should stop:
> >   *  - SINGLE_SUSPEND: stop wherever they are, will later either be told to 
> > exit
> >   *(by setting to SINGLE_EXIT) or be released (via 
> > single_thread_clear())
> > - *  - SINGLE_PTRACE: stop wherever they are, will wait for them to stop
> > - *later (via single_thread_wait()) and released as with SINGLE_SUSPEND
> >   *  - SINGLE_UNWIND: just unwind to kernel boundary, will be told to exit
> >   *or released as with SINGLE_SUSPEND
> >   *  - SINGLE_EXIT: unwind to kernel boundary and exit
> >   */
> >  int
> > -single_thread_set(struct proc *p, enum single_thread_mode mode, int deep)
> > +single_thread_set(struct proc *p, enum single_thread_mode mode, int wait)
> >  {
> > struct process *pr = p->p_p;
> > struct proc *q;
> > @@ -2016,7 +2014,7 @@ single_thread_set(struct proc *p, enum s
> > KASSERT(curproc == p);
> >  
> > SCHED_LOCK(s);
> > -   error = single_thread_check_locked(p, deep, s);
> > +   error = single_thread_check_locked(p, (mode == SINGLE_UNWIND), s);
> 
> Either the comment above or the code itself are not correct.
> SINGLE_EXIT is also supposed to unwind according to comment.

The comment documents what sibling threads are supposed to do once the
current one has called single_thread_set() with a given SINGLE_* option.

Sibling threads will continue to execute until they reach the next parking
point, i.e. a place where single_thread_check() is called.  Parking points
are divided into two categories: in the "deep" ones unwinding is preferred
for UNWIND and EXIT, in the others only a context switch occurs.

Every single_thread_set() call is in itself a parking point to prevent
races.  The only "deep" parking point is the one in sys_execve() for
obvious reasons.

So maybe we should rename SINGLE_UNWIND to SINGLE_EXEC, would that be
clearer?  If we go down this road we might also want to rename
SINGLE_SUSPEND to SINGLE_STOP to better describe the reason for parking
sibling threads.
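
To make the parking-point idea concrete, here is a minimal sketch (not part
of any diff; the function name is made up and the exact error handling is
my assumption):

	/*
	 * A sibling thread reaching a parking point.  "Deep" callers pass
	 * deep=1 because they cannot simply sleep and must unwind to the
	 * kernel boundary instead; everywhere else the thread just parks
	 * until single_thread_clear() releases it.
	 */
	int
	example_parking_point(struct proc *p, int deep)
	{
		int error;

		error = single_thread_check(p, deep);
		if (error)
			return (error);	/* SINGLE_UNWIND/SINGLE_EXIT: unwind */

		return (0);		/* parked, then released: continue */
	}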



Re: Read `ps_single' once

2021-03-04 Thread Martin Pieuchot
On 04/03/21(Thu) 11:01, Mark Kettenis wrote:
> > Date: Thu, 4 Mar 2021 10:54:48 +0100
> > From: Patrick Wildt 
> > 
> > Am Thu, Mar 04, 2021 at 10:42:24AM +0100 schrieb Mark Kettenis:
> > > > Date: Thu, 4 Mar 2021 10:34:24 +0100
> > > > From: Martin Pieuchot 
> > > > 
> > > > Running t/rw/msleep(9) w/o KERNEL_LOCK() implies that a thread can
> > > > change the value of `ps_single' while one of its siblings might be
> > > > dereferencing it.  
> > > > 
> > > > To prevent inconsistencies in the code executed by sibling thread, the
> > > > diff below makes sure `ps_single' is dereferenced only once in various
> > > > parts of the kernel.
> > > > 
> > > > ok?
> > > 
> > > I think that means that ps_single has to be declared "volatile".
> > 
> > Isn't there the READ_ONCE(x) macro, that does exactly that?
> 
> Not a big fan of READ_ONCE() and WRITE_ONCE(), but apparently those
> are needed to comply with the alpha memory model.  At least in some
> cases...

Updated diff using READ_ONCE(), ok?
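
To recap the pattern the diff applies (illustrative sketch only, the helper
names are made up):

	/* Before: two loads of `ps_single'; the second one may observe a
	 * different (possibly NULL) pointer than the first. */
	int
	single_is_stopped_before(struct process *pr)
	{
		return (pr->ps_single != NULL &&
		    pr->ps_single->p_stat == SSTOP);
	}

	/* After: a single load into a local; the NULL check and the
	 * dereference are guaranteed to use the same pointer. */
	int
	single_is_stopped_after(struct process *pr)
	{
		struct proc *st = READ_ONCE(pr->ps_single);

		return (st != NULL && st->p_stat == SSTOP);
	}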

Index: kern/kern_exit.c
===
RCS file: /cvs/src/sys/kern/kern_exit.c,v
retrieving revision 1.196
diff -u -p -r1.196 kern_exit.c
--- kern/kern_exit.c15 Feb 2021 09:35:59 -  1.196
+++ kern/kern_exit.c4 Mar 2021 10:15:22 -
@@ -274,6 +274,8 @@ exit1(struct proc *p, int xexit, int xsi
 */
if (qr->ps_flags & PS_TRACED &&
!(qr->ps_flags & PS_EXITING)) {
+   struct proc *st;
+
process_untrace(qr);
 
/*
@@ -281,9 +283,9 @@ exit1(struct proc *p, int xexit, int xsi
 * direct the signal to the active
 * thread to avoid deadlock.
 */
-   if (qr->ps_single)
-   ptsignal(qr->ps_single, SIGKILL,
-   STHREAD);
+   st = READ_ONCE(qr->ps_single);
+   if (st != NULL)
+   ptsignal(st, SIGKILL, STHREAD);
else
prsignal(qr, SIGKILL);
} else {
@@ -510,7 +512,7 @@ dowait4(struct proc *q, pid_t pid, int *
 {
int nfound;
struct process *pr;
-   struct proc *p;
+   struct proc *p, *st;
int error;
 
if (pid == 0)
@@ -541,10 +543,11 @@ loop:
proc_finish_wait(q, p);
return (0);
}
+
+   st = READ_ONCE(pr->ps_single);
if (pr->ps_flags & PS_TRACED &&
-   (pr->ps_flags & PS_WAITED) == 0 && pr->ps_single &&
-   pr->ps_single->p_stat == SSTOP &&
-   (pr->ps_single->p_flag & P_SUSPSINGLE) == 0) {
+   (pr->ps_flags & PS_WAITED) == 0 && st != NULL &&
+   st->p_stat == SSTOP && (st->p_flag & P_SUSPSINGLE) == 0) {
if (single_thread_wait(pr, 0))
goto loop;
 
Index: kern/sys_process.c
===
RCS file: /cvs/src/sys/kern/sys_process.c,v
retrieving revision 1.86
diff -u -p -r1.86 sys_process.c
--- kern/sys_process.c  8 Feb 2021 10:51:02 -   1.86
+++ kern/sys_process.c  4 Mar 2021 10:15:57 -
@@ -273,7 +273,7 @@ sys_ptrace(struct proc *p, void *v, regi
 int
 ptrace_ctrl(struct proc *p, int req, pid_t pid, caddr_t addr, int data)
 {
-   struct proc *t; /* target thread */
+   struct proc *st, *t;/* target thread */
struct process *tr; /* target process */
int error = 0;
int s;
@@ -433,8 +433,9 @@ ptrace_ctrl(struct proc *p, int req, pid
 * from where it stopped."
 */
 
-   if (pid < THREAD_PID_OFFSET && tr->ps_single)
-   t = tr->ps_single;
+   st = READ_ONCE(tr->ps_single);
+   if (pid < THREAD_PID_OFFSET && st != NULL)
+   t = st;
 
/* If the address parameter is not (int *)1, set the pc. */
if ((int *)addr != (int *)1)
@@ -464,8 +465,9 @@ ptrace_ctrl(struct proc *p, int req, pid
 * from where it stopped."
 */
 
-   if (p

Re: Read `ps_single' once

2021-03-05 Thread Martin Pieuchot
On 04/03/21(Thu) 11:45, Mark Kettenis wrote:
> > Date: Thu, 4 Mar 2021 11:19:23 +0100
> > From: Martin Pieuchot 
> > 
> > On 04/03/21(Thu) 11:01, Mark Kettenis wrote:
> > > > Date: Thu, 4 Mar 2021 10:54:48 +0100
> > > > From: Patrick Wildt 
> > > > 
> > > > Am Thu, Mar 04, 2021 at 10:42:24AM +0100 schrieb Mark Kettenis:
> > > > > > Date: Thu, 4 Mar 2021 10:34:24 +0100
> > > > > > From: Martin Pieuchot 
> > > > > > 
> > > > > > Running t/rw/msleep(9) w/o KERNEL_LOCK() implies that a thread can
> > > > > > change the value of `ps_single' while one of its siblings might be
> > > > > > dereferencing it.  
> > > > > > 
> > > > > > To prevent inconsistencies in the code executed by sibling thread, 
> > > > > > the
> > > > > > diff below makes sure `ps_single' is dereferenced only once in 
> > > > > > various
> > > > > > parts of the kernel.
> > > > > > 
> > > > > > ok?
> > > > > 
> > > > > I think that means that ps_single has to be declared "volatile".
> > > > 
> > > > Isn't there the READ_ONCE(x) macro, that does exactly that?
> > > 
> > > Not a big fan of READ_ONCE() and WRITE_ONCE(), but apparently those
> > > are needed to comply with the alpha memory model.  At least in some
> > > cases...
> > 
> > Updated diff using READ_ONCE(), ok?
> 
> If you use READ_ONCE() you shoul also use WRITE_ONCE() everywhere
> where you modify ps_single isn't it?

I don't know, I'm still learning how to do it.  I'd appreciate it if
somebody could come up with a READ_ONCE(9) manual explaining how this API
should be used.

Updated diff including the WRITE_ONCE().

Index: kern/kern_exit.c
===
RCS file: /cvs/src/sys/kern/kern_exit.c,v
retrieving revision 1.196
diff -u -p -r1.196 kern_exit.c
--- kern/kern_exit.c15 Feb 2021 09:35:59 -  1.196
+++ kern/kern_exit.c5 Mar 2021 10:28:05 -
@@ -274,6 +274,8 @@ exit1(struct proc *p, int xexit, int xsi
 */
if (qr->ps_flags & PS_TRACED &&
!(qr->ps_flags & PS_EXITING)) {
+   struct proc *st;
+
process_untrace(qr);
 
/*
@@ -281,9 +283,9 @@ exit1(struct proc *p, int xexit, int xsi
 * direct the signal to the active
 * thread to avoid deadlock.
 */
-   if (qr->ps_single)
-   ptsignal(qr->ps_single, SIGKILL,
-   STHREAD);
+   st = READ_ONCE(qr->ps_single);
+   if (st != NULL)
+   ptsignal(st, SIGKILL, STHREAD);
else
prsignal(qr, SIGKILL);
} else {
@@ -510,7 +512,7 @@ dowait4(struct proc *q, pid_t pid, int *
 {
int nfound;
struct process *pr;
-   struct proc *p;
+   struct proc *p, *st;
int error;
 
if (pid == 0)
@@ -541,10 +543,11 @@ loop:
proc_finish_wait(q, p);
return (0);
}
+
+   st = READ_ONCE(pr->ps_single);
if (pr->ps_flags & PS_TRACED &&
-   (pr->ps_flags & PS_WAITED) == 0 && pr->ps_single &&
-   pr->ps_single->p_stat == SSTOP &&
-   (pr->ps_single->p_flag & P_SUSPSINGLE) == 0) {
+   (pr->ps_flags & PS_WAITED) == 0 && st != NULL &&
+   st->p_stat == SSTOP && (st->p_flag & P_SUSPSINGLE) == 0) {
if (single_thread_wait(pr, 0))
goto loop;
 
Index: kern/kern_sig.c
===
RCS file: /cvs/src/sys/kern/kern_sig.c,v
retrieving revision 1.274
diff -u -p -r1.274 kern_sig.c
--- kern/kern_sig.c 4 Mar 2021 09:02:37 -   1.274
+++ kern/kern_sig.c 5 Mar 2021 10:28:05 -
@@ -2040,7 +2040,7 @@ single_thread_set(struct proc *p, enum s
}
pr->ps_singlecount = 0;
membar_producer();
-   pr->ps_single = p;
+   WRITE_ONCE(pr->ps_single, p);
TAILQ_FOREACH(q, &pr->ps_threads, p_thr_link) {

Re: Kill SINGLE_PTRACE

2021-03-05 Thread Martin Pieuchot
On 04/03/21(Thu) 12:25, Claudio Jeker wrote:
> On Thu, Mar 04, 2021 at 11:06:21AM +0100, Martin Pieuchot wrote:
> > [...]
> > The comment documents what sibling threads are supposed to do once the
> > current one has called single_thread_set() with a given SINGLE_* option.
> > 
> > Sibling threads will continue to execute until the next parking point
> > where single_thread_check() are.  Parking points are divided in two
> > categories.  In the "deep" ones unwinding is preferred for UNWIND and
> > EXIT, in the others only context switching occurs. 
> > 
> > Every single_thread_set() call is in itself a parking point to prevent
> > races.  The only "deep" parking point is the one in sys_execve() for
> > obvious reasons.
> 
> Actually this is where I got confused. This is the place where "deep" is
> the wrong word. The fact that SINGLE_UNWIND will abort the
> single_thread_set() if another thread is in the process to run single
> threaded should probably be added as a comment here. In the end this is
> here to prevent a race between two threads calling execve() at the same
> time.

I'd appreciate it if you could add the comment yourself if you're ok with
the diff as is.  I'm not sure I understand what you're suggesting, so it
seems simpler to me if you pick the words yourself.



Re: Read `ps_single' once

2021-03-08 Thread Martin Pieuchot
On 05/03/21(Fri) 11:30, Martin Pieuchot wrote:
> On 04/03/21(Thu) 11:45, Mark Kettenis wrote:
> > > Date: Thu, 4 Mar 2021 11:19:23 +0100
> > > From: Martin Pieuchot 
> > > 
> > > On 04/03/21(Thu) 11:01, Mark Kettenis wrote:
> > > > > Date: Thu, 4 Mar 2021 10:54:48 +0100
> > > > > From: Patrick Wildt 
> > > > > 
> > > > > Am Thu, Mar 04, 2021 at 10:42:24AM +0100 schrieb Mark Kettenis:
> > > > > > > Date: Thu, 4 Mar 2021 10:34:24 +0100
> > > > > > > From: Martin Pieuchot 
> > > > > > > 
> > > > > > > Running t/rw/msleep(9) w/o KERNEL_LOCK() implies that a thread can
> > > > > > > change the value of `ps_single' while one of its siblings might be
> > > > > > > dereferencing it.  
> > > > > > > 
> > > > > > > To prevent inconsistencies in the code executed by sibling 
> > > > > > > thread, the
> > > > > > > diff below makes sure `ps_single' is dereferenced only once in 
> > > > > > > various
> > > > > > > parts of the kernel.
> > > > > > > 
> > > > > > > ok?
> > > > > > 
> > > > > > I think that means that ps_single has to be declared "volatile".
> > > > > 
> > > > > Isn't there the READ_ONCE(x) macro, that does exactly that?
> > > > 
> > > > Not a big fan of READ_ONCE() and WRITE_ONCE(), but apparently those
> > > > are needed to comply with the alpha memory model.  At least in some
> > > > cases...
> > > 
> > > Updated diff using READ_ONCE(), ok?
> > 
> > If you use READ_ONCE() you shoul also use WRITE_ONCE() everywhere
> > where you modify ps_single isn't it?
> 
> I don't know, I'm learning how to do it.  I'd appreciate if somebody could
> come with a READ_ONCE(9) manual explaining how this API should be used.
> 
> Updated diff including the WRITE_ONCE().

Any ok?

> Index: kern/kern_exit.c
> ===
> RCS file: /cvs/src/sys/kern/kern_exit.c,v
> retrieving revision 1.196
> diff -u -p -r1.196 kern_exit.c
> --- kern/kern_exit.c  15 Feb 2021 09:35:59 -  1.196
> +++ kern/kern_exit.c  5 Mar 2021 10:28:05 -
> @@ -274,6 +274,8 @@ exit1(struct proc *p, int xexit, int xsi
>*/
>   if (qr->ps_flags & PS_TRACED &&
>   !(qr->ps_flags & PS_EXITING)) {
> + struct proc *st;
> +
>   process_untrace(qr);
>  
>   /*
> @@ -281,9 +283,9 @@ exit1(struct proc *p, int xexit, int xsi
>* direct the signal to the active
>* thread to avoid deadlock.
>*/
> - if (qr->ps_single)
> - ptsignal(qr->ps_single, SIGKILL,
> - STHREAD);
> + st = READ_ONCE(qr->ps_single);
> + if (st != NULL)
> + ptsignal(st, SIGKILL, STHREAD);
>   else
>   prsignal(qr, SIGKILL);
>   } else {
> @@ -510,7 +512,7 @@ dowait4(struct proc *q, pid_t pid, int *
>  {
>   int nfound;
>   struct process *pr;
> - struct proc *p;
> + struct proc *p, *st;
>   int error;
>  
>   if (pid == 0)
> @@ -541,10 +543,11 @@ loop:
>   proc_finish_wait(q, p);
>   return (0);
>   }
> +
> + st = READ_ONCE(pr->ps_single);
>   if (pr->ps_flags & PS_TRACED &&
> - (pr->ps_flags & PS_WAITED) == 0 && pr->ps_single &&
> - pr->ps_single->p_stat == SSTOP &&
> - (pr->ps_single->p_flag & P_SUSPSINGLE) == 0) {
> + (pr->ps_flags & PS_WAITED) == 0 && st != NULL &&
> + st->p_stat == SSTOP && (st->p_flag & P_SUSPSINGLE) == 0) {
>   if (single_thread_wait(pr, 0))
>   goto loop;
>  
> Index: kern/kern_sig.c
> ===
> RCS file: /cvs/src

Re: single_thread_clear() w/o KERNEL_LOCK()

2021-03-08 Thread Martin Pieuchot
On 04/03/21(Thu) 10:44, Martin Pieuchot wrote:
> single_thread_clear() manipulates the same data structures as
> single_thread_set() and, as such, doesn't need the KERNEL_LOCK().
> 
> However cursig() does need some sort of serialization to ensure that
> per-process data structures like signals, flags and traced-signum stay
> consistent.  So the diff below move the assertion up in preparation for
> more mp work.
> 
> ok?

Anyone?

> Index: kern/kern_sig.c
> ===
> RCS file: /cvs/src/sys/kern/kern_sig.c,v
> retrieving revision 1.274
> diff -u -p -r1.274 kern_sig.c
> --- kern/kern_sig.c   4 Mar 2021 09:02:37 -   1.274
> +++ kern/kern_sig.c   4 Mar 2021 09:35:47 -
> @@ -1182,6 +1182,8 @@ cursig(struct proc *p)
>   int dolock = (p->p_flag & P_SINTR) == 0;
>   int s;
>  
> + KERNEL_ASSERT_LOCKED();
> +
>   sigpending = (p->p_siglist | pr->ps_siglist);
>   if (sigpending == 0)
>   return 0;
> @@ -1225,11 +1227,7 @@ cursig(struct proc *p)
>   if (dolock)
>   SCHED_UNLOCK(s);
>  
> - if (dolock)
> - KERNEL_LOCK();
>   single_thread_clear(p, 0);
> - if (dolock)
> - KERNEL_UNLOCK();
>  
>   /*
>* If we are no longer being traced, or the parent
> @@ -2128,7 +2126,6 @@ single_thread_clear(struct proc *p, int 
>  
>   KASSERT(pr->ps_single == p);
>   KASSERT(curproc == p);
> - KERNEL_ASSERT_LOCKED();
>  
>   SCHED_LOCK(s);
>   pr->ps_single = NULL;
> 



Re: Read `ps_single' once

2021-03-08 Thread Martin Pieuchot
On 08/03/21(Mon) 11:57, Claudio Jeker wrote:
> On Mon, Mar 08, 2021 at 11:06:44AM +0100, Martin Pieuchot wrote:
> > On 05/03/21(Fri) 11:30, Martin Pieuchot wrote:
> > > On 04/03/21(Thu) 11:45, Mark Kettenis wrote:
> > > > > Date: Thu, 4 Mar 2021 11:19:23 +0100
> > > > > From: Martin Pieuchot 
> > > > > 
> > > > > On 04/03/21(Thu) 11:01, Mark Kettenis wrote:
> > > > > > > Date: Thu, 4 Mar 2021 10:54:48 +0100
> > > > > > > From: Patrick Wildt 
> > > > > > > 
> > > > > > > Am Thu, Mar 04, 2021 at 10:42:24AM +0100 schrieb Mark Kettenis:
> > > > > > > > > Date: Thu, 4 Mar 2021 10:34:24 +0100
> > > > > > > > > From: Martin Pieuchot 
> > > > > > > > > 
> > > > > > > > > Running t/rw/msleep(9) w/o KERNEL_LOCK() implies that a 
> > > > > > > > > thread can
> > > > > > > > > change the value of `ps_single' while one of its siblings 
> > > > > > > > > might be
> > > > > > > > > dereferencing it.  
> > > > > > > > > 
> > > > > > > > > To prevent inconsistencies in the code executed by sibling 
> > > > > > > > > thread, the
> > > > > > > > > diff below makes sure `ps_single' is dereferenced only once 
> > > > > > > > > in various
> > > > > > > > > parts of the kernel.
> > > > > > > > > 
> > > > > > > > > ok?
> > > > > > > > 
> > > > > > > > I think that means that ps_single has to be declared "volatile".
> > > > > > > 
> > > > > > > Isn't there the READ_ONCE(x) macro, that does exactly that?
> > > > > > 
> > > > > > Not a big fan of READ_ONCE() and WRITE_ONCE(), but apparently those
> > > > > > are needed to comply with the alpha memory model.  At least in some
> > > > > > cases...
> > > > > 
> > > > > Updated diff using READ_ONCE(), ok?
> > > > 
> > > > If you use READ_ONCE() you shoul also use WRITE_ONCE() everywhere
> > > > where you modify ps_single isn't it?
> > > 
> > > I don't know, I'm learning how to do it.  I'd appreciate if somebody could
> > > come with a READ_ONCE(9) manual explaining how this API should be used.
> > > 
> > > Updated diff including the WRITE_ONCE().
> > 
> > Any ok?
> 
> The one thing that bothers me is that we decided that ps_single needs the
> SCHED_LOCK but now this becomes a bit of a mishmash.

I hear what you're saying.

I'm currently concentrating on moving cursig() out of the KERNEL_LOCK()
and I'd appreciate not being blocked on discussions about which
locking/lock-free solution is best for making the parking code MP-safe.

This diff targets a specific problem which is to make sure `ps_single'
dereferences are coherent if this value is being modified w/o KERNEL_LOCK().
It doesn't revisit/clarify the relation between the uses of `ps_single'
in ptrace and parking code.  This can, IMHO, be done in a later step.

> > > @@ -510,7 +512,7 @@ dowait4(struct proc *q, pid_t pid, int *
> > >  {
> > >   int nfound;
> > >   struct process *pr;
> > > - struct proc *p;
> > > + struct proc *p, *st;
> > >   int error;
> > >  
> > >   if (pid == 0)
> > > @@ -541,10 +543,11 @@ loop:
> > >   proc_finish_wait(q, p);
> > >   return (0);
> > >   }
> > > +
> > > + st = READ_ONCE(pr->ps_single);
> > >   if (pr->ps_flags & PS_TRACED &&
> > > - (pr->ps_flags & PS_WAITED) == 0 && pr->ps_single &&
> > > - pr->ps_single->p_stat == SSTOP &&
> > > - (pr->ps_single->p_flag & P_SUSPSINGLE) == 0) {
> > > + (pr->ps_flags & PS_WAITED) == 0 && st != NULL &&
> > > + st->p_stat == SSTOP && (st->p_flag & P_SUSPSINGLE) == 0) {
> > >   if (single_thread_wait(pr, 0))
> > >   goto loop;
> > >  
> 
> Here you access p_stat and p_flag, as far as I remember p_stat is also
> protected by SCHED_LOCK. 

Re: Read `ps_single' once

2021-03-09 Thread Martin Pieuchot
On 08/03/21(Mon) 12:37, Claudio Jeker wrote:
> On Mon, Mar 08, 2021 at 12:11:54PM +0100, Martin Pieuchot wrote:
> [...]  
> > This diff targets a specific problem which is to make sure `ps_single'
> > dereferences are coherent if this value is being modified w/o KERNEL_LOCK().
> > It doesn't revisit/clarify the relation between the uses of `ps_single'
> > in ptrace and parking code.  This can, IMHO, be done in a later step.
> 
> It only ensures that ps_single is coherent but not that data read from
> that pointer is coherent.

Yes, that's exactly the point of this diff.

> > > > > @@ -541,10 +543,11 @@ loop:
> > > > >   proc_finish_wait(q, p);
> > > > >   return (0);
> > > > >   }
> > > > > +
> > > > > + st = READ_ONCE(pr->ps_single);
> > > > >   if (pr->ps_flags & PS_TRACED &&
> > > > > - (pr->ps_flags & PS_WAITED) == 0 && pr->ps_single &&
> > > > > - pr->ps_single->p_stat == SSTOP &&
> > > > > - (pr->ps_single->p_flag & P_SUSPSINGLE) == 0) {
> > > > > + (pr->ps_flags & PS_WAITED) == 0 && st != NULL &&
> > > > > + st->p_stat == SSTOP && (st->p_flag & P_SUSPSINGLE) 
> > > > > == 0) {
> > > > >   if (single_thread_wait(pr, 0))
> > > > >   goto loop;
> > > > >  
> > > 
> > > Here you access p_stat and p_flag, as far as I remember p_stat is also
> > > protected by SCHED_LOCK. p_flag is atomic and maybe the check should be
> > > turned. So this decision may not be stable.
> > 
> > This is an incoherency which is fine as long as this code is executed
> > with the KERNEL_LOCK().
> 
> It is not if the signal handling is no longer using the KERNEL_LOCK.

But it is currently ;)

> Then the thread could be in the process of being stopped and the race
> to enter single_thread_wait() could be lost. The KERNEL_LOCK() alone does
> not prevent single_thread_set() from running.

Sure, there are plenty of races in the existing code if the KERNEL_LOCK()
is removed.  I'm trying to move forward step by step.  I sent a simple diff
that fixes a simple problem.  Are you suggesting I should send a huge diff?



uvm: sync some comments with NetBSD

2021-03-18 Thread Martin Pieuchot
Diff below only touches comments in sys/uvm.  It reverts the commit from
2014 that turned three-line comments into one-line comments and syncs some
more blocks with NetBSD -current.  This helps reduce the diff with NetBSD.

ok?

Index: uvm/uvm_addr.c
===
RCS file: /cvs/src/sys/uvm/uvm_addr.c,v
retrieving revision 1.29
diff -u -p -r1.29 uvm_addr.c
--- uvm/uvm_addr.c  22 Sep 2020 14:31:08 -  1.29
+++ uvm/uvm_addr.c  17 Mar 2021 09:28:07 -
@@ -65,7 +65,9 @@ struct uaddr_rnd_state {
 #endif
 };
 
-/* Definition of a pivot in pivot selector. */
+/*
+ * Definition of a pivot in pivot selector.
+ */
 struct uaddr_pivot {
vaddr_t  addr;  /* End of prev. allocation. */
int  expire;/* Best before date. */
@@ -87,7 +89,11 @@ struct uaddr_pivot_state {
 extern const struct uvm_addr_functions uaddr_kernel_functions;
 struct uvm_addr_state uaddr_kbootstrap;
 
-/* Support functions. */
+
+/*
+ * Support functions.
+ */
+
 #ifndef SMALL_KERNEL
 struct vm_map_entry*uvm_addr_entrybyspace(struct uaddr_free_rbtree*,
vsize_t);
@@ -236,7 +242,9 @@ uvm_addr_fitspace(vaddr_t *min_result, v
if (fspace - before_gap - after_gap < sz)
return ENOMEM;
 
-   /* Calculate lowest address. */
+   /*
+* Calculate lowest address.
+*/
low_addr += before_gap;
low_addr = uvm_addr_align_forward(tmp = low_addr, align, offset);
if (low_addr < tmp) /* Overflow during alignment. */
@@ -244,7 +252,9 @@ uvm_addr_fitspace(vaddr_t *min_result, v
if (high_addr - after_gap - sz < low_addr)
return ENOMEM;
 
-   /* Calculate highest address. */
+   /*
+* Calculate highest address.
+*/
high_addr -= after_gap + sz;
high_addr = uvm_addr_align_backward(tmp = high_addr, align, offset);
if (high_addr > tmp)/* Overflow during alignment. */
@@ -341,7 +351,9 @@ uvm_addr_linsearch(struct vm_map *map, s
(before_gap & PAGE_MASK) == 0 && (after_gap & PAGE_MASK) == 0);
KASSERT(high + sz > high); /* Check for overflow. */
 
-   /* Hint magic. */
+   /*
+* Hint magic.
+*/
if (hint == 0)
hint = (direction == 1 ? low : high);
else if (hint > high) {
@@ -463,6 +475,7 @@ uaddr_destroy(struct uvm_addr_state *uad
  * If hint is set, search will start at the hint position.
  * Only searches forward.
  */
+
 const struct uvm_addr_functions uaddr_lin_functions = {
.uaddr_select = &uaddr_lin_select,
.uaddr_destroy = &uaddr_destroy,
@@ -489,7 +502,9 @@ uaddr_lin_select(struct vm_map *map, str
 {
vaddr_t guard_sz;
 
-   /* Deal with guardpages: search for space with one extra page. */
+   /*
+* Deal with guardpages: search for space with one extra page.
+*/
guard_sz = ((map->flags & VM_MAP_GUARDPAGES) == 0 ? 0 : PAGE_SIZE);
 
if (uaddr->uaddr_maxaddr - uaddr->uaddr_minaddr - guard_sz < sz)
@@ -716,6 +731,7 @@ uaddr_rnd_print(struct uvm_addr_state *u
 /*
  * Kernel allocation bootstrap logic.
  */
+
 const struct uvm_addr_functions uaddr_kernel_functions = {
.uaddr_select = &uaddr_kbootstrap_select,
.uaddr_destroy = &uaddr_kbootstrap_destroy,
@@ -839,7 +855,9 @@ uaddr_bestfit_select(struct vm_map *map,
if (entry == NULL)
return ENOMEM;
 
-   /* Walk the tree until we find an entry that fits.  */
+   /*
+* Walk the tree until we find an entry that fits.
+*/
while (uvm_addr_fitspace(&min, &max,
VMMAP_FREE_START(entry), VMMAP_FREE_END(entry),
sz, align, offset, 0, guardsz) != 0) {
@@ -848,7 +866,9 @@ uaddr_bestfit_select(struct vm_map *map,
return ENOMEM;
}
 
-   /* Return the address that generates the least fragmentation. */
+   /*
+* Return the address that generates the least fragmentation.
+*/
*entry_out = entry;
*addr_out = (min - VMMAP_FREE_START(entry) <=
VMMAP_FREE_END(entry) - guardsz - sz - max ?
@@ -1128,7 +1148,9 @@ uaddr_pivot_select(struct vm_map *map, s
if (pivot->addr == 0 || pivot->entry == NULL || pivot->expire == 0)
goto expired;   /* Pivot is invalid (null or expired). */
 
-   /* Attempt to use the pivot to map the entry. */
+   /*
+* Attempt to use the pivot to map the entry.
+*/
entry = pivot->entry;
if (pivot->dir > 0) {
if (uvm_addr_fitspace(&min, &max,
Index: uvm/uvm_amap.c
===
RCS file: /cvs/src/sys/uvm/uvm_amap.c,v
retrieving revision 1.87
diff -u -p -r1.87 uvm_amap.c
--- uvm/uvm_amap.c  19 Jan 2021 13:21:36 -  1.87
+++ uvm/uvm_amap.c  17 Mar 2021 11:15:29 -00

Re: uvm: sync some comments with NetBSD

2021-03-19 Thread Martin Pieuchot
On 18/03/21(Thu) 16:49, Mark Kettenis wrote:
> > Date: Thu, 18 Mar 2021 09:26:14 +0100
> > From: Martin Pieuchot 
> > 
> > Diff below only touches comments in sys/uvm.  It reverts the commit from
> > 2014 that turned three line comments into one line comments and sync
> > some more block with NetBSD -current.  This helps reducing the diff with
> > NetBSD.
> > 
> > ok?
> 
> A few nits below where I think you change the comments in a way that
> is misleading because our implementation differs from NetBSD.

Thanks, updated diff below.

Index: uvm/uvm_addr.c
===
RCS file: /cvs/src/sys/uvm/uvm_addr.c,v
retrieving revision 1.29
diff -u -p -r1.29 uvm_addr.c
--- uvm/uvm_addr.c  22 Sep 2020 14:31:08 -  1.29
+++ uvm/uvm_addr.c  18 Mar 2021 08:26:49 -
@@ -65,7 +65,9 @@ struct uaddr_rnd_state {
 #endif
 };
 
-/* Definition of a pivot in pivot selector. */
+/*
+ * Definition of a pivot in pivot selector.
+ */
 struct uaddr_pivot {
vaddr_t  addr;  /* End of prev. allocation. */
int  expire;/* Best before date. */
@@ -87,7 +89,11 @@ struct uaddr_pivot_state {
 extern const struct uvm_addr_functions uaddr_kernel_functions;
 struct uvm_addr_state uaddr_kbootstrap;
 
-/* Support functions. */
+
+/*
+ * Support functions.
+ */
+
 #ifndef SMALL_KERNEL
 struct vm_map_entry*uvm_addr_entrybyspace(struct uaddr_free_rbtree*,
vsize_t);
@@ -236,7 +242,9 @@ uvm_addr_fitspace(vaddr_t *min_result, v
if (fspace - before_gap - after_gap < sz)
return ENOMEM;
 
-   /* Calculate lowest address. */
+   /*
+* Calculate lowest address.
+*/
low_addr += before_gap;
low_addr = uvm_addr_align_forward(tmp = low_addr, align, offset);
if (low_addr < tmp) /* Overflow during alignment. */
@@ -244,7 +252,9 @@ uvm_addr_fitspace(vaddr_t *min_result, v
if (high_addr - after_gap - sz < low_addr)
return ENOMEM;
 
-   /* Calculate highest address. */
+   /*
+* Calculate highest address.
+*/
high_addr -= after_gap + sz;
high_addr = uvm_addr_align_backward(tmp = high_addr, align, offset);
if (high_addr > tmp)/* Overflow during alignment. */
@@ -341,7 +351,9 @@ uvm_addr_linsearch(struct vm_map *map, s
(before_gap & PAGE_MASK) == 0 && (after_gap & PAGE_MASK) == 0);
KASSERT(high + sz > high); /* Check for overflow. */
 
-   /* Hint magic. */
+   /*
+* Hint magic.
+*/
if (hint == 0)
hint = (direction == 1 ? low : high);
else if (hint > high) {
@@ -463,6 +475,7 @@ uaddr_destroy(struct uvm_addr_state *uad
  * If hint is set, search will start at the hint position.
  * Only searches forward.
  */
+
 const struct uvm_addr_functions uaddr_lin_functions = {
.uaddr_select = &uaddr_lin_select,
.uaddr_destroy = &uaddr_destroy,
@@ -489,7 +502,9 @@ uaddr_lin_select(struct vm_map *map, str
 {
vaddr_t guard_sz;
 
-   /* Deal with guardpages: search for space with one extra page. */
+   /*
+* Deal with guardpages: search for space with one extra page.
+*/
guard_sz = ((map->flags & VM_MAP_GUARDPAGES) == 0 ? 0 : PAGE_SIZE);
 
if (uaddr->uaddr_maxaddr - uaddr->uaddr_minaddr - guard_sz < sz)
@@ -716,6 +731,7 @@ uaddr_rnd_print(struct uvm_addr_state *u
 /*
  * Kernel allocation bootstrap logic.
  */
+
 const struct uvm_addr_functions uaddr_kernel_functions = {
.uaddr_select = &uaddr_kbootstrap_select,
.uaddr_destroy = &uaddr_kbootstrap_destroy,
@@ -839,7 +855,9 @@ uaddr_bestfit_select(struct vm_map *map,
if (entry == NULL)
return ENOMEM;
 
-   /* Walk the tree until we find an entry that fits.  */
+   /*
+* Walk the tree until we find an entry that fits.
+*/
while (uvm_addr_fitspace(&min, &max,
VMMAP_FREE_START(entry), VMMAP_FREE_END(entry),
sz, align, offset, 0, guardsz) != 0) {
@@ -848,7 +866,9 @@ uaddr_bestfit_select(struct vm_map *map,
return ENOMEM;
}
 
-   /* Return the address that generates the least fragmentation. */
+   /*
+* Return the address that generates the least fragmentation.
+*/
*entry_out = entry;
*addr_out = (min - VMMAP_FREE_START(entry) <=
VMMAP_FREE_END(entry) - guardsz - sz - max ?
@@ -1128,7 +1148,9 @@ uaddr_pivot_select(struct vm_map *map, s
if (pivot->addr == 0 || pivot->entry == NULL || pivot->expire == 0)
goto expired;   /* Pivot is invalid (null or expired). */
 
-   /* Attempt to use the pivot to map the entry. */
+ 

fork(2), PT_ATTACH & SIGTRAP

2021-03-20 Thread Martin Pieuchot
On SP systems, like bluhm@'s armv7 regression machine, the kern/ptrace2
test is failing due to a subtle behavior.  Diff below makes it pass.

http://bluhm.genua.de/regress/results/2021-03-19T15%3A17%3A02Z/logs/sys/kern/ptrace2/make.log

The failing test does a fork(2) and the parent issues a PT_ATTACH on the
child before the child has been scheduled for the first time.  Then the
parent goes to sleep in waitpid(), and when the child starts executing, the
check below overwrites the ptrace(2)-received SIGSTOP with a SIGTRAP.

This scenario doesn't seem to happen on MP machines because the child
starts executing on a different core right after sys_fork() has finished.

What is the purpose of this check?  Should it be relaxed or removed?
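
For reference, a minimal userland sketch of the sequence described above
(simplified from the regress test, error handling omitted):

	#include <sys/types.h>
	#include <sys/ptrace.h>
	#include <sys/wait.h>
	#include <unistd.h>

	int
	main(void)
	{
		int status;
		pid_t pid = fork();

		if (pid == 0) {
			/* child: on SP it has not run yet when the parent
			 * attaches below */
			_exit(0);
		}
		ptrace(PT_ATTACH, pid, (caddr_t)0, 0);
		waitpid(pid, &status, 0);
		/* expected: WSTOPSIG(status) == SIGSTOP, but on SP the check
		 * in fork_return() turns it into SIGTRAP */
		return (0);
	}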

Index: kern/kern_fork.c
===
RCS file: /cvs/src/sys/kern/kern_fork.c,v
retrieving revision 1.234
diff -u -p -r1.234 kern_fork.c
--- kern/kern_fork.c15 Feb 2021 09:35:59 -  1.234
+++ kern/kern_fork.c20 Mar 2021 11:59:18 -
@@ -86,9 +86,6 @@ fork_return(void *arg)
 {
struct proc *p = (struct proc *)arg;
 
-   if (p->p_p->ps_flags & PS_TRACED)
-   psignal(p, SIGTRAP);
-
child_return(p);
 }
 



kern_sig.c: use uppercase for defines

2021-03-20 Thread Martin Pieuchot
ok?

Index: kern/kern_sig.c
===
RCS file: /cvs/src/sys/kern/kern_sig.c,v
retrieving revision 1.278
diff -u -p -r1.278 kern_sig.c
--- kern/kern_sig.c 12 Mar 2021 10:13:28 -  1.278
+++ kern/kern_sig.c 20 Mar 2021 12:12:26 -
@@ -118,8 +118,8 @@ const int sigprop[NSIG + 1] = {
SA_IGNORE,  /* SIGTHR */
 };
 
-#definecontsigmask (sigmask(SIGCONT))
-#definestopsigmask (sigmask(SIGSTOP) | sigmask(SIGTSTP) | \
+#defineCONTSIGMASK (sigmask(SIGCONT))
+#defineSTOPSIGMASK (sigmask(SIGSTOP) | sigmask(SIGTSTP) | \
sigmask(SIGTTIN) | sigmask(SIGTTOU))
 
 void setsigvec(struct proc *, int, struct sigaction *);
@@ -996,11 +996,11 @@ ptsignal(struct proc *p, int signum, enu
siglist = (type == SPROCESS) ? &pr->ps_siglist : &p->p_siglist;
if (prop & SA_CONT) {
siglist = &p->p_siglist;
-   atomic_clearbits_int(siglist, stopsigmask);
+   atomic_clearbits_int(siglist, STOPSIGMASK);
}
if (prop & SA_STOP) {
siglist = &p->p_siglist;
-   atomic_clearbits_int(siglist, contsigmask);
+   atomic_clearbits_int(siglist, CONTSIGMASK);
atomic_clearbits_int(&p->p_flag, P_CONTINUED);
}
atomic_setbits_int(siglist, mask);
@@ -1194,7 +1194,7 @@ cursig(struct proc *p)
for (;;) {
mask = SIGPENDING(p);
if (pr->ps_flags & PS_PPWAIT)
-   mask &= ~stopsigmask;
+   mask &= ~STOPSIGMASK;
if (mask == 0)  /* no signal to send */
return (0);
signum = ffs((long)mask);



Stop/unstop process & xsig

2021-03-20 Thread Martin Pieuchot
Diff below refactors the routines to stop/unstop processes and to save the
signal number which will/can be transmitted in wait4(2).  It does the
following:

- Move the "hack" involving P_SINTR to avoid grabbing the SCHED_LOCK()
  recursively inside proc_stop().

- Introduce proc_unstop(), the symmetric routine to proc_stop().

- Manipulate `ps_xsig' only in proc_stop/unstop().
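
The caller-side change boils down to the following (sketch extracted from
the diff below, the dolock dance around SCHED_LOCK() omitted):

	/* Before: the caller stores the signal, locks the scheduler around
	 * proc_stop() and reads back what the debugger left in `ps_xsig'. */
	pr->ps_xsig = signum;
	SCHED_LOCK(s);
	proc_stop(p, 1);
	SCHED_UNLOCK(s);
	signum = pr->ps_xsig;

	/* After: proc_stop() takes the signal, does the locking itself and
	 * returns the possibly updated `ps_xsig'. */
	signum = proc_stop(p, signum, 1);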

Ok?

Index: kern/kern_sig.c
===
RCS file: /cvs/src/sys/kern/kern_sig.c,v
retrieving revision 1.278
diff -u -p -r1.278 kern_sig.c
--- kern/kern_sig.c 12 Mar 2021 10:13:28 -  1.278
+++ kern/kern_sig.c 20 Mar 2021 12:16:51 -
@@ -124,7 +124,7 @@ const int sigprop[NSIG + 1] = {
 
 void setsigvec(struct proc *, int, struct sigaction *);
 
-void proc_stop(struct proc *p, int);
+int proc_stop(struct proc *p, int, int);
 void proc_stop_sweep(void *);
 void *proc_stop_si;
 
@@ -1061,8 +1061,7 @@ ptsignal(struct proc *p, int signum, enu
if (pr->ps_flags & PS_PPWAIT)
goto out;
atomic_clearbits_int(siglist, mask);
-   pr->ps_xsig = signum;
-   proc_stop(p, 0);
+   proc_stop(p, signum, 0);
goto out;
}
/*
@@ -1170,17 +1169,12 @@ out:
  *
  * while (signum = cursig(curproc))
  * postsig(signum);
- *
- * Assumes that if the P_SINTR flag is set, we're holding both the
- * kernel and scheduler locks.
  */
 int
 cursig(struct proc *p)
 {
struct process *pr = p->p_p;
int sigpending, signum, mask, prop;
-   int dolock = (p->p_flag & P_SINTR) == 0;
-   int s;
 
KERNEL_ASSERT_LOCKED();
 
@@ -1217,31 +1211,22 @@ cursig(struct proc *p)
 */
if (((pr->ps_flags & (PS_TRACED | PS_PPWAIT)) == PS_TRACED) &&
signum != SIGKILL) {
-   pr->ps_xsig = signum;
 
single_thread_set(p, SINGLE_SUSPEND, 0);
-
-   if (dolock)
-   SCHED_LOCK(s);
-   proc_stop(p, 1);
-   if (dolock)
-   SCHED_UNLOCK(s);
-
+   signum = proc_stop(p, signum, 1);
single_thread_clear(p, 0);
 
/*
 * If we are no longer being traced, or the parent
 * didn't give us a signal, look for more signals.
 */
-   if ((pr->ps_flags & PS_TRACED) == 0 ||
-   pr->ps_xsig == 0)
+   if ((pr->ps_flags & PS_TRACED) == 0 || signum == 0)
continue;
 
/*
 * If the new signal is being masked, look for other
 * signals.
 */
-   signum = pr->ps_xsig;
mask = sigmask(signum);
if ((p->p_sigmask & mask) != 0)
continue;
@@ -1286,12 +1271,7 @@ cursig(struct proc *p)
(pr->ps_pgrp->pg_jobc == 0 &&
prop & SA_TTYSTOP))
break;  /* == ignore */
-   pr->ps_xsig = signum;
-   if (dolock)
-   SCHED_LOCK(s);
-   proc_stop(p, 1);
-   if (dolock)
-   SCHED_UNLOCK(s);
+   proc_stop(p, signum, 1);
break;
} else if (prop & SA_IGNORE) {
/*
@@ -1331,15 +1311,21 @@ keep:
  * Put the argument process into the stopped state and notify the parent
  * via wakeup.  Signals are handled elsewhere.  The process must not be
  * on the run queue.
+ *
+ * Assumes that if the P_SINTR flag is set, we're holding the scheduler
+ * lock.
  */
-void
-proc_stop(struct proc *p, int sw)
+int
+proc_stop(struct proc *p, int signum, int sw)
 {
struct process *pr = p->p_p;
+   int dolock = (p->p_flag & P_SINTR) == 0;
+   int s;
 
-#ifdef MULTIPROCESSOR
-   SCHED_ASSERT_LOCKED();
-#endif
+   pr->ps_xsig = signum;
+
+   if (dolock)
+   SCHED_LOCK(s);
 
p->p_stat = SSTOP;
atomic_clearbits_int(&pr->ps_flags, PS_WAITED);
@@ -1352,6 +1338,13 @@ proc_stop(struct proc *p, int sw)
softintr_schedule(proc_stop_si);
if (sw)
mi_switch();
+
+   if (dolock)
+   SCHED_UNLOCK(s);
+
+   signum = pr->ps_xsig;
+
+   return signum;
 }
 
 /*
@@ -1376,6 +1369,27 @@ proc_stop_sweep(void *v)
}
 }
 
+void
+proc_unstop(struct p

Re: fork(2), PT_ATTACH & SIGTRAP

2021-03-21 Thread Martin Pieuchot
On 21/03/21(Sun) 13:42, Mark Kettenis wrote:
> > Date: Sat, 20 Mar 2021 13:10:17 +0100
> > From: Martin Pieuchot 
> > 
> > On SP systems, like bluhm@'s armv7 regression machine, the kern/ptrace2
> > test is failing due to a subtle behavior.  Diff below makes it pass.
> > 
> > http://bluhm.genua.de/regress/results/2021-03-19T15%3A17%3A02Z/logs/sys/kern/ptrace2/make.log
> > 
> > The failing test does a fork(2) and the parent issues a PT_ATTACH on the
> > child before it has been scheduled for the first time.  Then the parent
> > goes to sleep in waitpid() and when the child starts executing the check
> > below overwrites the ptrace(2)-received SIGSTOP by a SIGTRAP.
> > 
> > This scenario doesn't seem to happen on MP machine because the child
> > starts to execute itself on a different core right after sys_fork() is
> > finished.
> > 
> > What is the purpose of this check?  Should it be relaxed or removed?
> 
> This is part of PT_SET_EVENT_MASK/PTRACE_FORK support:
> 
> https://github.com/openbsd/src/commit/f38bed7f869bd3503530c554b4860228ea4e8641
> 
> When reporting of the PTRACE_FORK event is requested, the debugger
> expects to see a SIGTRAP in both the parent and the child.  The code
> expects that the only way to have PS_TRACED set in the child from the
> start is when PTRACE_FORK is requested.  But the failing test shows
> there is a race with PT_ATTACH.

Thanks for the explanation.

> I think the solution is to have fork1() only run fork_return() if the
> FORK_PTRACE flag is set, and run child_return() otherwise.

Diff below does that and prevents the race, ok?

Index: kern/kern_fork.c
===
RCS file: /cvs/src/sys/kern/kern_fork.c,v
retrieving revision 1.234
diff -u -p -r1.234 kern_fork.c
--- kern/kern_fork.c15 Feb 2021 09:35:59 -  1.234
+++ kern/kern_fork.c21 Mar 2021 15:55:26 -
@@ -95,12 +95,15 @@ fork_return(void *arg)
 int
 sys_fork(struct proc *p, void *v, register_t *retval)
 {
+   void (*func)(void *) = child_return;
int flags;
 
flags = FORK_FORK;
-   if (p->p_p->ps_ptmask & PTRACE_FORK)
+   if (p->p_p->ps_ptmask & PTRACE_FORK) {
flags |= FORK_PTRACE;
-   return fork1(p, flags, fork_return, NULL, retval, NULL);
+   func = fork_return;
+   }
+   return fork1(p, flags, func, NULL, retval, NULL);
 }
 
 int



witness: skip first frame when saving stacktraces

2021-03-22 Thread Martin Pieuchot
The top frame is always `witness_checkorder', at least on amd64.  Diff
below makes use of stacktrace_save_at() to skip it.

Previous output:

lock order "&map->lock"(rwlock) -> "&amap->am_lock"(rwlock) first seen at:  
#0  witness_checkorder+0x4d7 [/home/os/openbsd/sys/sys/stacktrace.h:0]  
#1  rw_enter_write+0x43 [/home/os/openbsd/sys/kern/kern_rwlock.c:128]   
#2  amap_ref+0x24 [/home/os/openbsd/sys/uvm/uvm_amap.c:1341]
#3  uvm_mapent_clone+0x129 [/home/os/openbsd/sys/uvm/uvm_map.c:3826]
#4  uvm_map_extract+0x324 [/home/os/openbsd/sys/uvm/uvm_map.c:4582] 
#5  sys_kbind+0x2dd [/home/os/openbsd/sys/uvm/uvm_mmap.c:1174]  
#6  syscall+0x389 [/home/os/openbsd/sys/sys/syscall_mi.h:102]   
#7  Xsyscall+0x128   

With this diff:

lock order "&map->lock"(rwlock) -> "&amap->am_lock"(rwlock) first seen at:
#0  rw_enter_write+0x43 [/home/os/openbsd/sys/kern/kern_rwlock.c:128]
#1  amap_ref+0x24 [/home/os/openbsd/sys/uvm/uvm_amap.c:1341]
#2  uvm_mapent_clone+0x129 [/home/os/openbsd/sys/uvm/uvm_map.c:3826]
#3  uvm_map_extract+0x324 [/home/os/openbsd/sys/uvm/uvm_map.c:4582]
#4  sys_kbind+0x2dd [/home/os/openbsd/sys/uvm/uvm_mmap.c:1174]
#5  syscall+0x389 [/home/os/openbsd/sys/sys/syscall_mi.h:102]
#6  Xsyscall+0x128

ok?
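
The assumption here (sketch only, not from the diff) is that
stacktrace_save() behaves like stacktrace_save_at() with a skip count of 0,
and that each extra count drops one more frame from the top of the recorded
trace:

	struct stacktrace st;

	stacktrace_save_at(&st, 0);	/* trace starts at witness_checkorder */
	stacktrace_save_at(&st, 1);	/* trace starts at rw_enter_write */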

Index: kern/subr_witness.c
===
RCS file: /cvs/src/sys/kern/subr_witness.c,v
retrieving revision 1.46
diff -u -p -r1.46 subr_witness.c
--- kern/subr_witness.c 10 Mar 2021 10:21:47 -  1.46
+++ kern/subr_witness.c 22 Mar 2021 10:00:15 -
@@ -764,7 +764,7 @@ witness_checkorder(struct lock_object *l
 
if (witness_cold || witness_watch < 1 || panicstr != NULL || db_active)
return;
-   
+
if ((lock->lo_flags & LO_INITIALIZED) == 0) {
if (witness_uninitialized_report > 0) {
witness_uninitialized_report--;
@@ -2472,7 +2472,7 @@ witness_lock_order_add(struct witness *p
data->wlod_key = key;
w_lohash.wloh_array[hash] = data;
w_lohash.wloh_count++;
-   stacktrace_save(&data->wlod_stack);
+   stacktrace_save_at(&data->wlod_stack, 1);
return (1);
 }
 



malloc: use km_alloc(9) for kmemusage

2021-03-22 Thread Martin Pieuchot
Diff below converts a use of uvm_km_zalloc(9) to km_alloc(9); this memory
is never released, ok?
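
For reference, the km_alloc(9) prototype as I remember it (double-check the
man page); the round_page() is there because the size is expected to be a
multiple of PAGE_SIZE:

	void	*km_alloc(size_t sz, const struct kmem_va_mode *kv,
		    const struct kmem_pa_mode *kp,
		    const struct kmem_dyn_mode *kd);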

Index: kern/kern_malloc.c
===
RCS file: /cvs/src/sys/kern/kern_malloc.c,v
retrieving revision 1.144
diff -u -p -r1.144 kern_malloc.c
--- kern/kern_malloc.c  23 Feb 2021 13:50:16 -  1.144
+++ kern/kern_malloc.c  22 Mar 2021 10:23:42 -
@@ -580,8 +580,8 @@ kmeminit(void)
FALSE, &kmem_map_store);
kmembase = (char *)base;
kmemlimit = (char *)limit;
-   kmemusage = (struct kmemusage *) uvm_km_zalloc(kernel_map,
-   (vsize_t)(nkmempages * sizeof(struct kmemusage)));
+   kmemusage = km_alloc(round_page(nkmempages * sizeof(struct kmemusage)),
+   &kv_any, &kp_zero, &kd_waitok);
for (indx = 0; indx < MINBUCKET + 16; indx++) {
XSIMPLEQ_INIT(&bucket[indx].kb_freelist);
}



uvm_page_physload: use km_alloc(9)

2021-03-22 Thread Martin Pieuchot
Convert the last MI uvm_km_zalloc(9) to km_alloc(9), ok?

Index: uvm/uvm_page.c
===
RCS file: /cvs/src/sys/uvm/uvm_page.c,v
retrieving revision 1.155
diff -u -p -r1.155 uvm_page.c
--- uvm/uvm_page.c  19 Jan 2021 13:21:36 -  1.155
+++ uvm/uvm_page.c  22 Mar 2021 10:23:39 -
@@ -542,8 +542,8 @@ uvm_page_physload(paddr_t start, paddr_t
 
npages = end - start;  /* # of pages */
 
-   pgs = (struct vm_page *)uvm_km_zalloc(kernel_map,
-   npages * sizeof(*pgs));
+   pgs = km_alloc(npages * sizeof(*pgs), &kv_any, &kp_zero,
+   &kd_waitok);
if (pgs == NULL) {
printf("uvm_page_physload: can not malloc vm_page "
"structs for segment\n");



UVM return(val)

2021-03-23 Thread Martin Pieuchot
Diff below converts multiple "return(val)" and "return (val)" statements
to "return val".  I only changed those that help decrease the size of the
diff with NetBSD or that don't change it at all.

ok?

Index: uvm/uvm_amap.c
===
RCS file: /cvs/src/sys/uvm/uvm_amap.c,v
retrieving revision 1.88
diff -u -p -r1.88 uvm_amap.c
--- uvm/uvm_amap.c  20 Mar 2021 10:24:21 -  1.88
+++ uvm/uvm_amap.c  23 Mar 2021 12:14:26 -
@@ -342,7 +342,7 @@ amap_alloc1(int slots, int waitf, int la
amap = pool_get(&uvm_small_amap_pool[slots - 1],
pwaitf | PR_ZERO);
if (amap == NULL)
-   return(NULL);
+   return NULL;
 
amap->am_lock = NULL;
amap->am_ref = 1;
@@ -355,7 +355,7 @@ amap_alloc1(int slots, int waitf, int la
 
if (UVM_AMAP_SMALL(amap)) {
amap->am_small.ac_nslot = slots;
-   return (amap);
+   return amap;
}
 
amap->am_ncused = 0;
@@ -392,14 +392,14 @@ amap_alloc1(int slots, int waitf, int la
}
}
 
-   return(amap);
+   return amap;
 
 fail1:
free(amap->am_buckets, M_UVMAMAP, buckets * sizeof(*amap->am_buckets));
TAILQ_FOREACH_SAFE(chunk, &amap->am_chunks, ac_list, tmp)
pool_put(&uvm_amap_chunk_pool, chunk);
pool_put(&uvm_amap_pool, amap);
-   return (NULL);
+   return NULL;
 }
 
 static void
@@ -423,7 +423,7 @@ amap_alloc(vaddr_t sz, int waitf, int la
 
AMAP_B2SLOT(slots, sz); /* load slots */
if (slots > INT_MAX)
-   return (NULL);
+   return NULL;
 
amap = amap_alloc1(slots, waitf, lazyalloc);
if (amap != NULL) {
@@ -431,7 +431,7 @@ amap_alloc(vaddr_t sz, int waitf, int la
amap_list_insert(amap);
}
 
-   return(amap);
+   return amap;
 }
 
 
Index: uvm/uvm_anon.c
===
RCS file: /cvs/src/sys/uvm/uvm_anon.c,v
retrieving revision 1.53
diff -u -p -r1.53 uvm_anon.c
--- uvm/uvm_anon.c  20 Mar 2021 10:24:21 -  1.53
+++ uvm/uvm_anon.c  23 Mar 2021 12:01:03 -
@@ -67,7 +67,7 @@ uvm_analloc(void)
anon->an_page = NULL;
anon->an_swslot = 0;
}
-   return(anon);
+   return anon;
 }
 
 /*
Index: uvm/uvm_aobj.c
===
RCS file: /cvs/src/sys/uvm/uvm_aobj.c,v
retrieving revision 1.92
diff -u -p -r1.92 uvm_aobj.c
--- uvm/uvm_aobj.c  20 Mar 2021 10:24:21 -  1.92
+++ uvm/uvm_aobj.c  23 Mar 2021 12:17:00 -
@@ -211,7 +211,7 @@ uao_find_swhash_elt(struct uvm_aobj *aob
 */
LIST_FOREACH(elt, swhash, list) {
if (elt->tag == page_tag)
-   return(elt);
+   return elt;
}
 
if (!create)
@@ -234,7 +234,7 @@ uao_find_swhash_elt(struct uvm_aobj *aob
LIST_INSERT_HEAD(swhash, elt, list);
elt->tag = page_tag;
 
-   return(elt);
+   return elt;
 }
 
 /*
@@ -248,7 +248,7 @@ uao_find_swslot(struct uvm_aobj *aobj, i
 * if noswap flag is set, then we never return a slot
 */
if (aobj->u_flags & UAO_FLAG_NOSWAP)
-   return(0);
+   return 0;
 
/*
 * if hashing, look in hash table.
@@ -258,15 +258,15 @@ uao_find_swslot(struct uvm_aobj *aobj, i
uao_find_swhash_elt(aobj, pageidx, FALSE);
 
if (elt)
-   return(UAO_SWHASH_ELT_PAGESLOT(elt, pageidx));
+   return UAO_SWHASH_ELT_PAGESLOT(elt, pageidx);
else
-   return(0);
+   return 0;
}
 
/*
 * otherwise, look in the array
 */
-   return(aobj->u_swslots[pageidx]);
+   return aobj->u_swslots[pageidx];
 }
 
 /*
@@ -289,7 +289,7 @@ uao_set_swslot(struct uvm_object *uobj, 
 */
if (aobj->u_flags & UAO_FLAG_NOSWAP) {
if (slot == 0)
-   return(0);  /* a clear is ok */
+   return 0;   /* a clear is ok */
 
/* but a set is not */
printf("uao_set_swslot: uobj = %p\n", uobj);
@@ -309,7 +309,7 @@ uao_set_swslot(struct uvm_object *uobj, 
uao_find_swhash_elt(aobj, pageidx, slot ? TRUE : FALSE);
if (elt == NULL) {
KASSERT(slot == 0);
-   return (0);
+   return 0;
}
 
oldslot = UAO_SWHASH_ELT_PAGESLOT(elt, pageidx);
@@ -336,7 +336,7 @@ uao_set_swslot(struct uvm_object *uobj, 
oldslot = aobj->u_swslots[pageidx];
aobj->u_swslots[pageidx] = slot;
}
-   return (oldslot);
+   return oldslot;
 }
 /*
  * end of h

UAO_USES_SWHASH()

2021-03-29 Thread Martin Pieuchot
Introduce a new macro, UAO_USES_SWHASH(), and use it to reduce the diff
with NetBSD.  Also change some spaces into tabs for the same reason.

ok?

Index: uvm/uvm_aobj.c
===
RCS file: /cvs/src/sys/uvm/uvm_aobj.c,v
retrieving revision 1.93
diff -u -p -r1.93 uvm_aobj.c
--- uvm/uvm_aobj.c  26 Mar 2021 13:40:05 -  1.93
+++ uvm/uvm_aobj.c  29 Mar 2021 09:37:59 -
@@ -58,38 +58,39 @@
  * Note: for hash tables, we break the address space of the aobj into blocks
  * of UAO_SWHASH_CLUSTER_SIZE pages, which shall be a power of two.
  */
-#define UAO_SWHASH_CLUSTER_SHIFT 4
-#define UAO_SWHASH_CLUSTER_SIZE (1 << UAO_SWHASH_CLUSTER_SHIFT)
+#defineUAO_SWHASH_CLUSTER_SHIFT4
+#defineUAO_SWHASH_CLUSTER_SIZE (1 << UAO_SWHASH_CLUSTER_SHIFT)
 
 /* Get the "tag" for this page index. */
-#define UAO_SWHASH_ELT_TAG(PAGEIDX) \
-   ((PAGEIDX) >> UAO_SWHASH_CLUSTER_SHIFT)
+#defineUAO_SWHASH_ELT_TAG(idx) ((idx) >> 
UAO_SWHASH_CLUSTER_SHIFT)
+#define UAO_SWHASH_ELT_PAGESLOT_IDX(idx) \
+((idx) & (UAO_SWHASH_CLUSTER_SIZE - 1))
 
 /* Given an ELT and a page index, find the swap slot. */
-#define UAO_SWHASH_ELT_PAGESLOT_IDX(PAGEIDX) \
-   ((PAGEIDX) & (UAO_SWHASH_CLUSTER_SIZE - 1))
-#define UAO_SWHASH_ELT_PAGESLOT(ELT, PAGEIDX) \
-   ((ELT)->slots[(PAGEIDX) & (UAO_SWHASH_CLUSTER_SIZE - 1)])
+#defineUAO_SWHASH_ELT_PAGESLOT(elt, idx) \
+((elt)->slots[UAO_SWHASH_ELT_PAGESLOT_IDX(idx)])
 
 /* Given an ELT, return its pageidx base. */
-#define UAO_SWHASH_ELT_PAGEIDX_BASE(ELT) \
-   ((ELT)->tag << UAO_SWHASH_CLUSTER_SHIFT)
+#defineUAO_SWHASH_ELT_PAGEIDX_BASE(elt) \
+((elt)->tag << UAO_SWHASH_CLUSTER_SHIFT)
 
 /* The hash function. */
-#define UAO_SWHASH_HASH(AOBJ, PAGEIDX) \
-   (&(AOBJ)->u_swhash[(((PAGEIDX) >> UAO_SWHASH_CLUSTER_SHIFT) \
-   & (AOBJ)->u_swhashmask)])
+#defineUAO_SWHASH_HASH(aobj, idx) \
+(&(aobj)->u_swhash[(((idx) >> UAO_SWHASH_CLUSTER_SHIFT) \
+& (aobj)->u_swhashmask)])
 
 /*
  * The threshold which determines whether we will use an array or a
  * hash table to store the list of allocated swap blocks.
  */
-#define UAO_SWHASH_THRESHOLD (UAO_SWHASH_CLUSTER_SIZE * 4)
+#defineUAO_SWHASH_THRESHOLD(UAO_SWHASH_CLUSTER_SIZE * 4)
+#defineUAO_USES_SWHASH(aobj) \
+((aobj)->u_pages > UAO_SWHASH_THRESHOLD)
 
 /* The number of buckets in a hash, with an upper bound. */
-#define UAO_SWHASH_MAXBUCKETS 256
-#define UAO_SWHASH_BUCKETS(pages) \
-   (min((pages) >> UAO_SWHASH_CLUSTER_SHIFT, UAO_SWHASH_MAXBUCKETS))
+#defineUAO_SWHASH_MAXBUCKETS   256
+#defineUAO_SWHASH_BUCKETS(pages) \
+(min((pages) >> UAO_SWHASH_CLUSTER_SHIFT, UAO_SWHASH_MAXBUCKETS))
 
 
 /*
@@ -253,7 +254,7 @@ uao_find_swslot(struct uvm_aobj *aobj, i
/*
 * if hashing, look in hash table.
 */
-   if (aobj->u_pages > UAO_SWHASH_THRESHOLD) {
+   if (UAO_USES_SWHASH(aobj)) {
struct uao_swhash_elt *elt =
uao_find_swhash_elt(aobj, pageidx, FALSE);
 
@@ -299,7 +300,7 @@ uao_set_swslot(struct uvm_object *uobj, 
/*
 * are we using a hash table?  if so, add it in the hash.
 */
-   if (aobj->u_pages > UAO_SWHASH_THRESHOLD) {
+   if (UAO_USES_SWHASH(aobj)) {
/*
 * Avoid allocating an entry just to free it again if
 * the page had not swap slot in the first place, and
@@ -351,7 +352,7 @@ static void
 uao_free(struct uvm_aobj *aobj)
 {
 
-   if (aobj->u_pages > UAO_SWHASH_THRESHOLD) {
+   if (UAO_USES_SWHASH(aobj)) {
int i, hashbuckets = aobj->u_swhashmask + 1;
 
/*
@@ -441,7 +442,7 @@ uao_shrink_hash(struct uvm_object *uobj,
unsigned long new_hashmask;
int i;
 
-   KASSERT(aobj->u_pages > UAO_SWHASH_THRESHOLD);
+   KASSERT(UAO_USES_SWHASH(aobj));
 
/*
 * If the size of the hash table doesn't change, all we need to do is
@@ -743,7 +744,7 @@ uao_create(vsize_t size, int flags)
mflags = M_WAITOK;
 
/* allocate hash table or array depending on object size */
-   if (aobj->u_pages > UAO_SWHASH_THRESHOLD) {
+   if (UAO_USES_SWHASH(aobj)) {
aobj->u_swhash = hashinit(UAO_SWHASH_BUCKETS(pages),
M_UVMAOBJ, mflags, &aobj->u_swhashmask);
if (aobj->u_swhash == NULL) {
@@ -1370,7 +1371,7 @@ uao_pagein(struct uvm_aobj *aobj, int st
 {
boolean_t rv;
 
-   if (aobj->u_pages > UAO_SWHASH_THRESHOLD) {
+   if (UAO_USES_SWHASH(aobj)) {
struct uao_swhash_elt *elt;
int bucket;
 
@@ -1504,7 +1505,7 @@ uao_dropswap_range(struct uvm_object *uo
end = INT64_MAX;
}
 
-   if (aobj->u_pages > UAO_SWHASH_THRE

Re: [OpenBSD -current] Change event timer in main loop with kqueue

2021-03-30 Thread Martin Pieuchot
On 21/03/21(Sun) 11:27, Visa Hankala wrote:
> On Sat, Feb 27, 2021 at 01:36:29PM +, Visa Hankala wrote:
> > The kernel does not reschedule the timer when the user changes the
> > timeout period. The new period will take effect only after the current
> > period has expired. This is not explained in the manual page, though.
> > 
> > With the recent kqueue changes, it is straightforward to make the kernel
> > modify an existing timer. I think the clearest behaviour is to reset the
> > timer completely when it is modified. If there are pending events, they
> > should be cancelled because they do not necessarily correspond to the
> > new settings.
> > 
> > When f_modify and f_process are present in kqread_filtops, filt_timer
> > is not used. filt_timerexpire() activates timer knotes directly using
> > knote_activate() instead of KNOTE().
> > 
> > However, the current behaviour has been around so long that one can
> > argue that it is an actual feature. BSDs are not consistent with this,
> > though. FreeBSD resets the timer immediately, whereas NetBSD and
> > DragonFly BSD apply the new period after expiry.
> > 
> > I guess the resetting is harmless in most cases but might wreak havoc
> > at least with software that keeps poking its timers before expiry.
> 
> I have received too little feedback to commit this.
> 
> The most important question is, should the timer behaviour be changed?

I don't know if there is code depending on this specific behavior, but I
believe that, when it comes to BSD APIs exported to userland, being
aligned with FreeBSD is helpful.  That's what I learned when working on
kqueue(2) backends while porting OSS.
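
For the userland-visible difference, here's a small sketch (assuming a
standard EVFILT_TIMER setup; the function name is made up):

	#include <sys/types.h>
	#include <sys/event.h>
	#include <sys/time.h>

	int
	rearm_timer(int kq)
	{
		struct kevent kev;

		/* Arm a periodic 5000ms timer with ident 1. */
		EV_SET(&kev, 1, EVFILT_TIMER, EV_ADD, 0, 5000, NULL);
		if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
			return (-1);

		/* Re-add the same ident with a 100ms period.  With the diff
		 * below the timer and any pending event are reset right away
		 * (FreeBSD behaviour); without it the new period only takes
		 * effect once the current 5000ms period has expired. */
		EV_SET(&kev, 1, EVFILT_TIMER, EV_ADD, 0, 100, NULL);
		return (kevent(kq, &kev, 1, NULL, 0, NULL));
	}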

> > Index: lib/libc/sys/kqueue.2
> > ===
> > RCS file: src/lib/libc/sys/kqueue.2,v
> > retrieving revision 1.43
> > diff -u -p -r1.43 kqueue.2
> > --- lib/libc/sys/kqueue.2   14 Nov 2020 10:16:15 -  1.43
> > +++ lib/libc/sys/kqueue.2   27 Feb 2021 12:54:27 -
> > @@ -468,6 +468,11 @@ contains the number of times the timeout
> >  This filter automatically sets the
> >  .Dv EV_CLEAR
> >  flag internally.
> > +.Pp
> > +If an existing timer is re-added, the existing timer and related pending 
> > events
> > +will be cancelled.
> > +The timer will be re-started using the timeout period
> > +.Fa data .
> >  .It Dv EVFILT_DEVICE
> >  Takes a descriptor as the identifier and the events to watch for in
> >  .Fa fflags ,
> > Index: sys/kern/kern_event.c
> > ===
> > RCS file: src/sys/kern/kern_event.c,v
> > retrieving revision 1.161
> > diff -u -p -r1.161 kern_event.c
> > --- sys/kern/kern_event.c   24 Feb 2021 14:59:52 -  1.161
> > +++ sys/kern/kern_event.c   27 Feb 2021 12:54:27 -
> > @@ -135,7 +135,8 @@ int filt_fileattach(struct knote *kn);
> >  void   filt_timerexpire(void *knx);
> >  intfilt_timerattach(struct knote *kn);
> >  void   filt_timerdetach(struct knote *kn);
> > -intfilt_timer(struct knote *kn, long hint);
> > +intfilt_timermodify(struct kevent *kev, struct knote *kn);
> > +intfilt_timerprocess(struct knote *kn, struct kevent *kev);
> >  void   filt_seltruedetach(struct knote *kn);
> >  
> >  const struct filterops kqread_filtops = {
> > @@ -163,7 +164,9 @@ const struct filterops timer_filtops = {
> > .f_flags= 0,
> > .f_attach   = filt_timerattach,
> > .f_detach   = filt_timerdetach,
> > -   .f_event= filt_timer,
> > +   .f_event= NULL,
> > +   .f_modify   = filt_timermodify,
> > +   .f_process  = filt_timerprocess,
> >  };
> >  
> >  struct pool knote_pool;
> > @@ -444,15 +447,48 @@ filt_timerdetach(struct knote *kn)
> > struct timeout *to;
> >  
> > to = (struct timeout *)kn->kn_hook;
> > -   timeout_del(to);
>  > +  timeout_del_barrier(to);
> > free(to, M_KEVENT, sizeof(*to));
> > kq_ntimeouts--;
> >  }
> >  
> >  int
> > -filt_timer(struct knote *kn, long hint)
> > +filt_timermodify(struct kevent *kev, struct knote *kn)
> > +{
> > +   struct timeout *to = kn->kn_hook;
> > +   int s;
> > +
> > +   /* Reset the timer. Any pending events are discarded. */
> > +
> > +   timeout_del_barrier(to);
> > +
> > +   s = splhigh();
> > +   if (kn->kn_status & KN_QUEUED)
> > +   knote_dequeue(kn);
> > +   kn->kn_status &= ~KN_ACTIVE;
> > +   splx(s);
> > +
> > +   kn->kn_data = 0;
> > +   knote_modify(kev, kn);
> > +   /* Reinit timeout to invoke tick adjustment again. */
> > +   timeout_set(to, filt_timerexpire, kn);
> > +   filt_timer_timeout_add(kn);
> > +
> > +   return (0);
> > +}
> > +
> > +int
> > +filt_timerprocess(struct knote *kn, struct kevent *kev)
> >  {
> > -   return (kn->kn_data != 0);
> > +   int active, s;
> > +
> > +   s = splsoftclock();
> > +   active = (kn->kn_data != 0);
> > +   if (active)
> > +   knote_submit(kn, kev);
> > +   splx(s);
> > +
> > +   return (active);
> >  }
> > 

Re: patch: change swblk_t type and use it in blist

2022-08-05 Thread Martin Pieuchot
On 05/08/22(Fri) 18:10, Sebastien Marie wrote:
> Hi,
> 
> When initially ported blist from DragonFlyBSD, we used custom type bsblk_t 
> and 
> bsbmp_t instead of the one used by DragonFlyBSD (swblk_t and u_swblk_t).
> 
> The reason was swblk_t is already defined on OpenBSD, and was incompatible 
> with 
> blist (int32_t). It is defined, but not used (outside some regress file which 
> seems to be not affected by type change).
> 
> This diff changes the __swblk_t definition in sys/_types.h to be 'unsigned 
> long', and switch back blist to use swblk_t (and u_swblk_t, even if it isn't 
> 'unsigned swblk_t').
> 
> It makes the diff with DragonFlyBSD more thin. I added a comment with the git 
> id 
> used for the initial port.
> 
> I tested it on i386 and amd64 (kernel and userland).
> 
> By changing bitmap type from 'u_long' to 'u_swblk_t' ('u_int64_t'), it makes 
> the 
> regress the same on 64 and 32bits archs (and it success on both).
> 
> Comments or OK ?

Makes sense to me.  I'm not a standards/type lawyer so I don't know if
this is fine for userland, but I'm ok with it.

> diff /home/semarie/repos/openbsd/src
> commit - 73f52ef7130cefbe5a8fe028eedaad0e54be7303
> path + /home/semarie/repos/openbsd/src
> blob - e05867429cdd81c434f9ca589c1fb8c6d25957f8
> file + sys/sys/_types.h
> --- sys/sys/_types.h
> +++ sys/sys/_types.h
> @@ -60,7 +60,7 @@ typedef __uint8_t   __sa_family_t;  /* sockaddr 
> address f
>  typedef  __int32_t   __segsz_t;  /* segment size */
>  typedef  __uint32_t  __socklen_t;/* length type for network 
> syscalls */
>  typedef  long__suseconds_t;  /* microseconds (signed) */
> -typedef  __int32_t   __swblk_t;  /* swap offset */
> +typedef  unsigned long   __swblk_t;  /* swap offset */
>  typedef  __int64_t   __time_t;   /* epoch time */
>  typedef  __int32_t   __timer_t;  /* POSIX timer identifiers */
>  typedef  __uint32_t  __uid_t;/* user id */
> blob - 102ca95dd45ba6d9cab0f3fcbb033d6043ec1606
> file + sys/sys/blist.h
> --- sys/sys/blist.h
> +++ sys/sys/blist.h
> @@ -1,4 +1,5 @@
>  /* $OpenBSD: blist.h,v 1.1 2022/07/29 17:47:12 semarie Exp $ */
> +/* DragonFlyBSD:7b80531f545c7d3c51c1660130c71d01f6bccbe0:/sys/sys/blist.h */
>  /*
>   * Copyright (c) 2003,2004 The DragonFly Project.  All rights reserved.
>   * 
> @@ -65,15 +66,13 @@
>  #include 
>  #endif
>  
> -#define  SWBLK_BITS 64
> -typedef u_long bsbmp_t;
> -typedef u_long bsblk_t;
> +typedef u_int64_tu_swblk_t;
>  
>  /*
>   * note: currently use SWAPBLK_NONE as an absolute value rather then
>   * a flag bit.
>   */
> -#define SWAPBLK_NONE ((bsblk_t)-1)
> +#define SWAPBLK_NONE ((swblk_t)-1)
>  
>  /*
>   * blmeta and bl_bitmap_t MUST be a power of 2 in size.
> @@ -81,39 +80,39 @@ typedef u_long bsblk_t;
>  
>  typedef struct blmeta {
>   union {
> - bsblk_t bmu_avail;  /* space available under us */
> - bsbmp_t bmu_bitmap; /* bitmap if we are a leaf  */
> + swblk_t bmu_avail;  /* space available under us */
> + u_swblk_t   bmu_bitmap; /* bitmap if we are a leaf  */
>   } u;
> - bsblk_t bm_bighint; /* biggest contiguous block hint*/
> + swblk_t bm_bighint; /* biggest contiguous block hint*/
>  } blmeta_t;
>  
>  typedef struct blist {
> - bsblk_t bl_blocks;  /* area of coverage */
> + swblk_t bl_blocks;  /* area of coverage */
>   /* XXX int64_t bl_radix */
> - bsblk_t bl_radix;   /* coverage radix   */
> - bsblk_t bl_skip;/* starting skip*/
> - bsblk_t bl_free;/* number of free blocks*/
> + swblk_t bl_radix;   /* coverage radix   */
> + swblk_t bl_skip;/* starting skip*/
> + swblk_t bl_free;/* number of free blocks*/
>   blmeta_t*bl_root;   /* root of radix tree   */
> - bsblk_t bl_rootblks;/* bsblk_t blks allocated for tree */
> + swblk_t bl_rootblks;/* swblk_t blks allocated for tree */
>  } *blist_t;
>  
> -#define BLIST_META_RADIX (sizeof(bsbmp_t)*8/2)   /* 2 bits per */
> -#define BLIST_BMAP_RADIX (sizeof(bsbmp_t)*8) /* 1 bit per */
> +#define BLIST_META_RADIX (sizeof(u_swblk_t)*8/2) /* 2 bits per */
> +#define BLIST_BMAP_RADIX (sizeof(u_swblk_t)*8)   /* 1 bit per */
>  
>  /*
>   * The radix may exceed the size of a 64 bit signed (or unsigned) int
> - * when the maximal number of blocks is allocated.  With a 32-bit bsblk_t
> + * when the maximal number of blocks is allocated.  With a 32-bit swblk_t
>   * this corresponds to ~1G x PAGE_SIZE = 4096GB.  The swap code usually
>   * divides this by 4, leaving us with a capability of up to four 1TB swap
>   * devices.
>   *
> - * With a 64-bi

Re: uvm_swap: introduce uvm_swap_data_lock

2022-08-17 Thread Martin Pieuchot
On 16/01/22(Sun) 15:35, Martin Pieuchot wrote:
> On 30/12/21(Thu) 23:38, Theo Buehler wrote:
> > The diff below does two things: it adds a uvm_swap_data_lock mutex and
> > trades it for the KERNEL_LOCK in uvm_swapisfull() and uvm_swap_markbad()
> 
> Why is it enough?  Which fields is the lock protecting in these
> functions?  Is it `uvmexp.swpages', could that be documented?

It is documented in the diff below.

> 
> What about `nswapdev'?  Why is the rwlock grabbed before reading it in
> sys_swapctl()?

Because it is always modified with the lock, I added some documentation.

> What about `swpginuse'?

This is still under KERNEL_LOCK(), documented below.

> If the mutex/rwlock are used to protect the global `swap_priority' could
> that be also documented?  Once this is documented it should be trivial to
> see that some places are missing some locking.  Is it intentional?
> 
> > The uvm_swap_data_lock protects all swap data structures, so needs to be
> > grabbed a few times, many of them already documented in the comments.
> > 
> > For review, I suggest comparing to what NetBSD did and also going
> > through the consumers (swaplist_insert, swaplist_find, swaplist_trim)
> > and check that they are properly locked when called, or that there is
> > the KERNEL_LOCK() in place when swap data structures are manipulated.
> 
> I'd suggest using the KASSERT(rw_write_held()) idiom to further reduce
> the differences with NetBSD.

Done.

> > In swapmount() I introduced locking since that's needed to be able to
> > assert that the proper locks are held in swaplist_{insert,find,trim}.
> 
> Could the KERNEL_LOCK() in uvm_swap_get() be pushed a bit further down?
> What about `uvmexp.nswget' and `uvmexp.swpgonly' in there?

This has been done as part of another change.  This diff uses an atomic
operation to increase `nswget' in case multiple threads fault on a page
in swap at the same time.
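
To make the intended split explicit, here is a rough sketch (not the diff
itself, and the exact atomic primitive is an assumption on my part):
structural swap data is only touched with the new mutex held, while a plain
counter like `nswget' is bumped atomically.

	mtx_enter(&uvm_swap_data_lock);
	/* walk or modify `swap_priority' and the per-priority swapdev lists */
	mtx_leave(&uvm_swap_data_lock);

	/* simple shared counters are bumped without holding the mutex */
	atomic_inc_int(&uvmexp.nswget);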

Updated diff below, ok?

Index: uvm/uvm_swap.c
===
RCS file: /cvs/src/sys/uvm/uvm_swap.c,v
retrieving revision 1.163
diff -u -p -r1.163 uvm_swap.c
--- uvm/uvm_swap.c  6 Aug 2022 13:44:04 -   1.163
+++ uvm/uvm_swap.c  17 Aug 2022 11:46:20 -
@@ -45,6 +45,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -84,13 +85,16 @@
  * the system maintains a global data structure describing all swap
  * partitions/files.   there is a sorted LIST of "swappri" structures
  * which describe "swapdev"'s at that priority.   this LIST is headed
- * by the "swap_priority" global var.each "swappri" contains a 
+ * by the "swap_priority" global var.each "swappri" contains a
  * TAILQ of "swapdev" structures at that priority.
  *
  * locking:
  *  - swap_syscall_lock (sleep lock): this lock serializes the swapctl
  *system call and prevents the swap priority list from changing
  *while we are in the middle of a system call (e.g. SWAP_STATS).
+ *  - uvm_swap_data_lock (mutex): this lock protects all swap data
+ *structures including the priority list, the swapdev structures,
+ *and the swapmap arena.
  *
  * each swap device has the following info:
  *  - swap device in use (could be disabled, preventing future use)
@@ -106,7 +110,7 @@
  * userland controls and configures swap with the swapctl(2) system call.
  * the sys_swapctl performs the following operations:
  *  [1] SWAP_NSWAP: returns the number of swap devices currently configured
- *  [2] SWAP_STATS: given a pointer to an array of swapent structures 
+ *  [2] SWAP_STATS: given a pointer to an array of swapent structures
  * (passed in via "arg") of a size passed in via "misc" ... we load
  * the current swap config into the array.
  *  [3] SWAP_ON: given a pathname in arg (could be device or file) and a
@@ -208,9 +212,10 @@ struct extent *swapmap;/* controls the
 
 /* list of all active swap devices [by priority] */
 LIST_HEAD(swap_priority, swappri);
-struct swap_priority swap_priority;
+struct swap_priority swap_priority;/* [S] */
 
 /* locks */
+struct mutex uvm_swap_data_lock = MUTEX_INITIALIZER(IPL_NONE);
 struct rwlock swap_syscall_lock = RWLOCK_INITIALIZER("swplk");
 
 struct mutex oommtx = MUTEX_INITIALIZER(IPL_VM);
@@ -224,7 +229,7 @@ void swapdrum_add(struct swapdev *, in
 struct swapdev *swapdrum_getsdp(int);
 
 struct swapdev *swaplist_find(struct vnode *, int);
-voidswaplist_insert(struct swapdev *, 
+voidswaplist_insert(struct swapdev *,
 struct swappri *, int);
 voidswaplist_trim(void);
 
@@ -472,16 +477,19 @@ uvm_swap_finicrypt_all(void)
 /*
  * swaplist_in

Fix a race in uvm_pseg_release()

2022-08-18 Thread Martin Pieuchot
The lock must be grabbed before iterating on the global array, ok?

Index: uvm/uvm_pager.c
===
RCS file: /cvs/src/sys/uvm/uvm_pager.c,v
retrieving revision 1.88
diff -u -p -r1.88 uvm_pager.c
--- uvm/uvm_pager.c 15 Aug 2022 03:21:04 -  1.88
+++ uvm/uvm_pager.c 18 Aug 2022 10:31:16 -
@@ -209,6 +209,7 @@ uvm_pseg_release(vaddr_t segaddr)
struct uvm_pseg *pseg;
vaddr_t va = 0;
 
+   mtx_enter(&uvm_pseg_lck);
for (pseg = &psegs[0]; pseg != &psegs[PSEG_NUMSEGS]; pseg++) {
if (pseg->start <= segaddr &&
segaddr < pseg->start + MAX_PAGER_SEGS * MAXBSIZE)
@@ -222,7 +223,6 @@ uvm_pseg_release(vaddr_t segaddr)
/* test for no remainder */
KDASSERT(segaddr == pseg->start + id * MAXBSIZE);
 
-   mtx_enter(&uvm_pseg_lck);
 
KASSERT(UVM_PSEG_INUSE(pseg, id));
 



Simplify locking code in pdaemon

2022-08-18 Thread Martin Pieuchot
Use a "slock" variable as done in multiple places to simplify the code.
The locking stays the same.  This is just a first step to simplify this
mess.

Also get rid of the return value of the function, it is never checked.

ok?
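
The pattern the diff converges to, for both the anon and the object case,
looks like this (sketch only, the real change is below):

	slock = anon->an_lock;		/* or uobj->vmobjlock for object pages */
	if (rw_enter(slock, RW_WRITE|RW_NOSLEEP))
		continue;		/* lock failed, skip this page */
	/* ... inspect the page with its owner locked ... */
	rw_exit(slock);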

Index: uvm/uvm_pdaemon.c
===
RCS file: /cvs/src/sys/uvm/uvm_pdaemon.c,v
retrieving revision 1.101
diff -u -p -r1.101 uvm_pdaemon.c
--- uvm/uvm_pdaemon.c   28 Jun 2022 19:31:30 -  1.101
+++ uvm/uvm_pdaemon.c   18 Aug 2022 10:44:52 -
@@ -102,7 +102,7 @@ extern void drmbackoff(long);
  */
 
 void   uvmpd_scan(struct uvm_pmalloc *);
-boolean_t  uvmpd_scan_inactive(struct uvm_pmalloc *, struct pglist *);
+void   uvmpd_scan_inactive(struct uvm_pmalloc *, struct pglist *);
 void   uvmpd_tune(void);
 void   uvmpd_drop(struct pglist *);
 
@@ -377,17 +377,16 @@ uvm_aiodone_daemon(void *arg)
  * => we handle the building of swap-backed clusters
  * => we return TRUE if we are exiting because we met our target
  */
-
-boolean_t
+void
 uvmpd_scan_inactive(struct uvm_pmalloc *pma, struct pglist *pglst)
 {
-   boolean_t retval = FALSE;   /* assume we haven't hit target */
int free, result;
struct vm_page *p, *nextpg;
struct uvm_object *uobj;
struct vm_page *pps[SWCLUSTPAGES], **ppsp;
int npages;
struct vm_page *swpps[SWCLUSTPAGES];/* XXX: see below */
+   struct rwlock *slock;
int swnpages, swcpages; /* XXX: see below */
int swslot;
struct vm_anon *anon;
@@ -402,7 +401,6 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
 */
swslot = 0;
swnpages = swcpages = 0;
-   free = 0;
dirtyreacts = 0;
p = NULL;
 
@@ -431,18 +429,14 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
 */
uobj = NULL;
anon = NULL;
-
if (p) {
/*
-* update our copy of "free" and see if we've met
-* our target
+* see if we've met our target
 */
free = uvmexp.free - BUFPAGES_DEFICIT;
if (((pma == NULL || (pma->pm_flags & UVM_PMA_FREED)) &&
(free + uvmexp.paging >= uvmexp.freetarg << 2)) ||
dirtyreacts == UVMPD_NUMDIRTYREACTS) {
-   retval = TRUE;
-
if (swslot == 0) {
/* exit now if no swap-i/o pending */
break;
@@ -450,9 +444,9 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
 
/* set p to null to signal final swap i/o */
p = NULL;
+   nextpg = NULL;
}
}
-
if (p) {/* if (we have a new page to consider) */
/*
 * we are below target and have a new page to consider.
@@ -460,11 +454,12 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
uvmexp.pdscans++;
nextpg = TAILQ_NEXT(p, pageq);
 
+   anon = p->uanon;
+   uobj = p->uobject;
if (p->pg_flags & PQ_ANON) {
-   anon = p->uanon;
KASSERT(anon != NULL);
-   if (rw_enter(anon->an_lock,
-   RW_WRITE|RW_NOSLEEP)) {
+   slock = anon->an_lock;
+   if (rw_enter(slock, RW_WRITE|RW_NOSLEEP)) {
/* lock failed, skip this page */
continue;
}
@@ -474,23 +469,20 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
 */
if (pmap_is_referenced(p)) {
uvm_pageactivate(p);
-   rw_exit(anon->an_lock);
+   rw_exit(slock);
uvmexp.pdreact++;
continue;
}
if (p->pg_flags & PG_BUSY) {
-   rw_exit(anon->an_lock);
+   rw_exit(slock);
uvmexp.pdbusy++;
-   /* someone else owns page, skip it */
continue;
}
uvmexp.pdanscan++;
} else {
-   uobj = p->uobject;
  

pdaemon locking tweak

2022-08-29 Thread Martin Pieuchot
Diff below refactors the pdaemon's locking by introducing a new *trylock()
function for a given page.  This is shamelessly stolen from NetBSD.

This is part of my ongoing effort to untangle the locks used by the page
daemon.

ok?

Index: uvm//uvm_pdaemon.c
===
RCS file: /cvs/src/sys/uvm/uvm_pdaemon.c,v
retrieving revision 1.102
diff -u -p -r1.102 uvm_pdaemon.c
--- uvm//uvm_pdaemon.c  22 Aug 2022 12:03:32 -  1.102
+++ uvm//uvm_pdaemon.c  29 Aug 2022 11:36:59 -
@@ -101,6 +101,7 @@ extern void drmbackoff(long);
  * local prototypes
  */
 
+struct rwlock  *uvmpd_trylockowner(struct vm_page *);
 void   uvmpd_scan(struct uvm_pmalloc *);
 void   uvmpd_scan_inactive(struct uvm_pmalloc *, struct pglist *);
 void   uvmpd_tune(void);
@@ -367,6 +368,34 @@ uvm_aiodone_daemon(void *arg)
 }
 
 
+/*
+ * uvmpd_trylockowner: trylock the page's owner.
+ *
+ * => return the locked rwlock on success.  otherwise, return NULL.
+ */
+struct rwlock *
+uvmpd_trylockowner(struct vm_page *pg)
+{
+
+   struct uvm_object *uobj = pg->uobject;
+   struct rwlock *slock;
+
+   if (uobj != NULL) {
+   slock = uobj->vmobjlock;
+   } else {
+   struct vm_anon *anon = pg->uanon;
+
+   KASSERT(anon != NULL);
+   slock = anon->an_lock;
+   }
+
+   if (rw_enter(slock, RW_WRITE|RW_NOSLEEP)) {
+   return NULL;
+   }
+
+   return slock;
+}
+
 
 /*
  * uvmpd_scan_inactive: scan an inactive list for pages to clean or free.
@@ -454,53 +483,44 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
uvmexp.pdscans++;
nextpg = TAILQ_NEXT(p, pageq);
 
+   /*
+* move referenced pages back to active queue
+* and skip to next page.
+*/
+   if (pmap_is_referenced(p)) {
+   uvm_pageactivate(p);
+   uvmexp.pdreact++;
+   continue;
+   }
+
anon = p->uanon;
uobj = p->uobject;
-   if (p->pg_flags & PQ_ANON) {
+
+   /*
+* first we attempt to lock the object that this page
+* belongs to.  if our attempt fails we skip on to
+* the next page (no harm done).  it is important to
+* "try" locking the object as we are locking in the
+* wrong order (pageq -> object) and we don't want to
+* deadlock.
+*/
+   slock = uvmpd_trylockowner(p);
+   if (slock == NULL) {
+   continue;
+   }
+
+   if (p->pg_flags & PG_BUSY) {
+   rw_exit(slock);
+   uvmexp.pdbusy++;
+   continue;
+   }
+
+   /* does the page belong to an object? */
+   if (uobj != NULL) {
+   uvmexp.pdobscan++;
+   } else {
KASSERT(anon != NULL);
-   slock = anon->an_lock;
-   if (rw_enter(slock, RW_WRITE|RW_NOSLEEP)) {
-   /* lock failed, skip this page */
-   continue;
-   }
-   /*
-* move referenced pages back to active queue
-* and skip to next page.
-*/
-   if (pmap_is_referenced(p)) {
-   uvm_pageactivate(p);
-   rw_exit(slock);
-   uvmexp.pdreact++;
-   continue;
-   }
-   if (p->pg_flags & PG_BUSY) {
-   rw_exit(slock);
-   uvmexp.pdbusy++;
-   continue;
-   }
uvmexp.pdanscan++;
-   } else {
-   KASSERT(uobj != NULL);
-   slock = uobj->vmobjlock;
-   if (rw_enter(slock, RW_WRITE|RW_NOSLEEP)) {
-   continue;
-   }
-   /*
-* move referenced pages back to active queue
-* and skip to
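
For readers less familiar with the trick, here is a small userland analogue
(pthreads, purely illustrative, not kernel code) of what the try-lock buys
us when two code paths take the same two locks in opposite order:

#include <pthread.h>
#include <stdio.h>

/*
 * One thread acquires queue -> owner, the other owner -> queue.  Because
 * the first thread only try-locks the owner and backs off on failure,
 * the classic AB/BA deadlock cannot happen.
 */
static pthread_mutex_t queue_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t owner_lock = PTHREAD_MUTEX_INITIALIZER;

static void *
scanner(void *arg)		/* pagedaemon-like order: queue -> owner */
{
	int i;

	for (i = 0; i < 100000; i++) {
		pthread_mutex_lock(&queue_lock);
		if (pthread_mutex_trylock(&owner_lock) == 0) {
			/* both locks held: safe to work on the "page" */
			pthread_mutex_unlock(&owner_lock);
		}
		/* on trylock failure: skip this "page", no harm done */
		pthread_mutex_unlock(&queue_lock);
	}
	return NULL;
}

static void *
faulter(void *arg)		/* fault-like order: owner -> queue */
{
	int i;

	for (i = 0; i < 100000; i++) {
		pthread_mutex_lock(&owner_lock);
		pthread_mutex_lock(&queue_lock);
		pthread_mutex_unlock(&queue_lock);
		pthread_mutex_unlock(&owner_lock);
	}
	return NULL;
}

int
main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, scanner, NULL);
	pthread_create(&b, NULL, faulter, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	printf("done, no deadlock\n");
	return 0;
}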

uvmpd_dropswap()

2022-08-29 Thread Martin Pieuchot
Small refactoring to introduce uvmpd_dropswap().  This will make an
upcoming rewrite of the pdaemon smaller & easier to review :o)

ok?

Index: uvm/uvm_pdaemon.c
===
RCS file: /cvs/src/sys/uvm/uvm_pdaemon.c,v
retrieving revision 1.102
diff -u -p -r1.102 uvm_pdaemon.c
--- uvm/uvm_pdaemon.c   22 Aug 2022 12:03:32 -  1.102
+++ uvm/uvm_pdaemon.c   29 Aug 2022 11:55:52 -
@@ -105,6 +105,7 @@ voiduvmpd_scan(struct uvm_pmalloc *);
 void   uvmpd_scan_inactive(struct uvm_pmalloc *, struct pglist *);
 void   uvmpd_tune(void);
 void   uvmpd_drop(struct pglist *);
+void   uvmpd_dropswap(struct vm_page *);
 
 /*
  * uvm_wait: wait (sleep) for the page daemon to free some pages
@@ -367,6 +368,23 @@ uvm_aiodone_daemon(void *arg)
 }
 
 
+/*
+ * uvmpd_dropswap: free any swap allocated to this page.
+ *
+ * => called with owner locked.
+ */
+void
+uvmpd_dropswap(struct vm_page *pg)
+{
+   struct vm_anon *anon = pg->uanon;
+
+   if ((pg->pg_flags & PQ_ANON) && anon->an_swslot) {
+   uvm_swap_free(anon->an_swslot, 1);
+   anon->an_swslot = 0;
+   } else if (pg->pg_flags & PQ_AOBJ) {
+   uao_dropswap(pg->uobject, pg->offset >> PAGE_SHIFT);
+   }
+}
 
 /*
  * uvmpd_scan_inactive: scan an inactive list for pages to clean or free.
@@ -566,16 +584,7 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
KASSERT(uvmexp.swpginuse <= uvmexp.swpages);
if ((p->pg_flags & PQ_SWAPBACKED) &&
uvmexp.swpginuse == uvmexp.swpages) {
-
-   if ((p->pg_flags & PQ_ANON) &&
-   p->uanon->an_swslot) {
-   uvm_swap_free(p->uanon->an_swslot, 1);
-   p->uanon->an_swslot = 0;
-   }
-   if (p->pg_flags & PQ_AOBJ) {
-   uao_dropswap(p->uobject,
-p->offset >> PAGE_SHIFT);
-   }
+   uvmpd_dropswap(p);
}
 
/*
@@ -599,16 +608,7 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
 */
if (swap_backed) {
/* free old swap slot (if any) */
-   if (anon) {
-   if (anon->an_swslot) {
-   uvm_swap_free(anon->an_swslot,
-   1);
-   anon->an_swslot = 0;
-   }
-   } else {
-   uao_dropswap(uobj,
-p->offset >> PAGE_SHIFT);
-   }
+   uvmpd_dropswap(p);
 
/* start new cluster (if necessary) */
if (swslot == 0) {



Re: pdaemon locking tweak

2022-08-30 Thread Martin Pieuchot
On 30/08/22(Tue) 15:28, Jonathan Gray wrote:
> On Mon, Aug 29, 2022 at 01:46:20PM +0200, Martin Pieuchot wrote:
> > Diff below refactors the pdaemon's locking by introducing a new *trylock()
> > function for a given page.  This is shamelessly stolen from NetBSD.
> > 
> > This is part of my ongoing effort to untangle the locks used by the page
> > daemon.
> > 
> > ok?
> 
> if (pmap_is_referenced(p)) {
>   uvm_pageactivate(p);
> 
> is no longer under held slock.  Which I believe is intended,
> just not obvious looking at the diff.
> 
> The page queue is already locked on entry to uvmpd_scan_inactive()

Thanks for spotting this.  Indeed the locking required for
uvm_pageactivate() is different in my local tree.  For now
let's keep the existing order of operations.

Updated diff below.

Index: uvm/uvm_pdaemon.c
===
RCS file: /cvs/src/sys/uvm/uvm_pdaemon.c,v
retrieving revision 1.103
diff -u -p -r1.103 uvm_pdaemon.c
--- uvm/uvm_pdaemon.c   30 Aug 2022 08:30:58 -  1.103
+++ uvm/uvm_pdaemon.c   30 Aug 2022 08:39:19 -
@@ -101,6 +101,7 @@ extern void drmbackoff(long);
  * local prototypes
  */
 
+struct rwlock  *uvmpd_trylockowner(struct vm_page *);
 void   uvmpd_scan(struct uvm_pmalloc *);
 void   uvmpd_scan_inactive(struct uvm_pmalloc *, struct pglist *);
 void   uvmpd_tune(void);
@@ -367,6 +368,34 @@ uvm_aiodone_daemon(void *arg)
}
 }
 
+/*
+ * uvmpd_trylockowner: trylock the page's owner.
+ *
+ * => return the locked rwlock on success.  otherwise, return NULL.
+ */
+struct rwlock *
+uvmpd_trylockowner(struct vm_page *pg)
+{
+
+   struct uvm_object *uobj = pg->uobject;
+   struct rwlock *slock;
+
+   if (uobj != NULL) {
+   slock = uobj->vmobjlock;
+   } else {
+   struct vm_anon *anon = pg->uanon;
+
+   KASSERT(anon != NULL);
+   slock = anon->an_lock;
+   }
+
+   if (rw_enter(slock, RW_WRITE|RW_NOSLEEP)) {
+   return NULL;
+   }
+
+   return slock;
+}
+
 
 /*
  * uvmpd_dropswap: free any swap allocated to this page.
@@ -474,51 +503,43 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
 
anon = p->uanon;
uobj = p->uobject;
-   if (p->pg_flags & PQ_ANON) {
+
+   /*
+* first we attempt to lock the object that this page
+* belongs to.  if our attempt fails we skip on to
+* the next page (no harm done).  it is important to
+* "try" locking the object as we are locking in the
+* wrong order (pageq -> object) and we don't want to
+* deadlock.
+*/
+   slock = uvmpd_trylockowner(p);
+   if (slock == NULL) {
+   continue;
+   }
+
+   /*
+* move referenced pages back to active queue
+* and skip to next page.
+*/
+   if (pmap_is_referenced(p)) {
+   uvm_pageactivate(p);
+   rw_exit(slock);
+   uvmexp.pdreact++;
+   continue;
+   }
+
+   if (p->pg_flags & PG_BUSY) {
+   rw_exit(slock);
+   uvmexp.pdbusy++;
+   continue;
+   }
+
+   /* does the page belong to an object? */
+   if (uobj != NULL) {
+   uvmexp.pdobscan++;
+   } else {
KASSERT(anon != NULL);
-   slock = anon->an_lock;
-   if (rw_enter(slock, RW_WRITE|RW_NOSLEEP)) {
-   /* lock failed, skip this page */
-   continue;
-   }
-   /*
-* move referenced pages back to active queue
-* and skip to next page.
-*/
-   if (pmap_is_referenced(p)) {
-   uvm_pageactivate(p);
-   rw_exit(slock);
-   uvmexp.pdreact++;
-   continue;
-   }
-   if (p->pg_flags & PG_BUSY) {
- 

Re: ps(1): add -d (descendancy) option to display parent/child process relationships

2022-09-01 Thread Martin Pieuchot
On 01/09/22(Thu) 03:37, Job Snijders wrote:
> Dear all,
> 
> Some ps(1) implementations have an '-d' ('descendancy') option. Through
> ASCII art parent/child process relationships are grouped and displayed.
> Here is an example:
> 
> $ ps ad -O ppid,user
>   PID  PPID USER TT  STATTIME COMMAND
> 18180 12529 job  pb  I+p  0:00.01 `-- -sh (sh)
> 26689 56460 job  p3  Ip   0:00.01   `-- -ksh (ksh)
>  5153 26689 job  p3  I+p  0:40.18 `-- mutt
> 62046 25272 job  p4  Sp   0:00.25   `-- -ksh (ksh)
> 61156 62046 job  p4  R+/0 0:00.00 `-- ps -ad -O ppid
> 26816  2565 job  p5  Ip   0:00.01   `-- -ksh (ksh)
> 79431 26816 root p5  Ip   0:00.16 `-- /bin/ksh
> 43915 79431 _rpki-cl p5  S+pU 0:06.97   `-- rpki-client
> 70511 43915 _rpki-cl p5  I+pU 0:01.26 |-- rpki-client: parser (rpki-client)
> 96992 43915 _rpki-cl p5  I+pU 0:00.00 |-- rpki-client: rsync (rpki-client)
> 49160 43915 _rpki-cl p5  S+p  0:01.52 |-- rpki-client: http (rpki-client)
> 99329 43915 _rpki-cl p5  S+p  0:03.20 `-- rpki-client: rrdp (rpki-client)
> 
> The functionality is similar to pstree(1) in the ports collection.
> 
> The below changeset borrows heavily from the following two
> implementations:
> 
> 
> https://github.com/freebsd/freebsd-src/commit/044fce530f89a819827d351de364d208a30e9645.patch
> 
> https://github.com/NetBSD/src/commit/b82f6d00d93d880d3976c4f1e88c33d88a8054ad.patch
> 
> Thoughts?

I'd love to have such feature in base.

> Index: extern.h
> ===
> RCS file: /cvs/src/bin/ps/extern.h,v
> retrieving revision 1.23
> diff -u -p -r1.23 extern.h
> --- extern.h  5 Jan 2022 04:10:36 -   1.23
> +++ extern.h  1 Sep 2022 03:31:36 -
> @@ -44,44 +44,44 @@ extern VAR var[];
>  extern VARENT *vhead;
>  
>  __BEGIN_DECLS
> -void  command(const struct kinfo_proc *, VARENT *);
> -void  cputime(const struct kinfo_proc *, VARENT *);
> +void  command(const struct pinfo *, VARENT *);
> +void  cputime(const struct pinfo *, VARENT *);
>  int   donlist(void);
> -void  elapsed(const struct kinfo_proc *, VARENT *);
> +void  elapsed(const struct pinfo *, VARENT *);
>  doublegetpcpu(const struct kinfo_proc *);
> -doublegetpmem(const struct kinfo_proc *);
> -void  gname(const struct kinfo_proc *, VARENT *);
> -void  supgid(const struct kinfo_proc *, VARENT *);
> -void  supgrp(const struct kinfo_proc *, VARENT *);
> -void  logname(const struct kinfo_proc *, VARENT *);
> -void  longtname(const struct kinfo_proc *, VARENT *);
> -void  lstarted(const struct kinfo_proc *, VARENT *);
> -void  maxrss(const struct kinfo_proc *, VARENT *);
> +doublegetpmem(const struct pinfo *);
> +void  gname(const struct pinfo *, VARENT *);
> +void  supgid(const struct pinfo *, VARENT *);
> +void  supgrp(const struct pinfo *, VARENT *);
> +void  logname(const struct pinfo *, VARENT *);
> +void  longtname(const struct pinfo *, VARENT *);
> +void  lstarted(const struct pinfo *, VARENT *);
> +void  maxrss(const struct pinfo *, VARENT *);
>  void  nlisterr(struct nlist *);
> -void  p_rssize(const struct kinfo_proc *, VARENT *);
> -void  pagein(const struct kinfo_proc *, VARENT *);
> +void  p_rssize(const struct pinfo *, VARENT *);
> +void  pagein(const struct pinfo *, VARENT *);
>  void  parsefmt(char *);
> -void  pcpu(const struct kinfo_proc *, VARENT *);
> -void  pmem(const struct kinfo_proc *, VARENT *);
> -void  pri(const struct kinfo_proc *, VARENT *);
> +void  pcpu(const struct pinfo *, VARENT *);
> +void  pmem(const struct pinfo *, VARENT *);
> +void  pri(const struct pinfo *, VARENT *);
>  void  printheader(void);
> -void  pvar(const struct kinfo_proc *kp, VARENT *);
> -void  pnice(const struct kinfo_proc *kp, VARENT *);
> -void  rgname(const struct kinfo_proc *, VARENT *);
> -void  rssize(const struct kinfo_proc *, VARENT *);
> -void  runame(const struct kinfo_proc *, VARENT *);
> +void  pvar(const struct pinfo *, VARENT *);
> +void  pnice(const struct pinfo *, VARENT *);
> +void  rgname(const struct pinfo *, VARENT *);
> +void  rssize(const struct pinfo *, VARENT *);
> +void  runame(const struct pinfo *, VARENT *);
>  void  showkey(void);
> -void  started(const struct kinfo_proc *, VARENT *);
> -void  printstate(const struct kinfo_proc *, VARENT *);
> -void  printpledge(const struct kinfo_proc *, VARENT *);
> -void  tdev(const struct kinfo_proc *, VARENT *);
> -void  tname(const struct kinfo_proc *, VARENT *);
> -void  tsize(const struct kinfo_proc *, VARENT *);
> -void  dsize(const struct kinfo_proc *, VARENT *);
> -void  ssize(const struct kinfo_proc *, VARENT *);
> -void  ucomm(const struct kinfo_proc *, VARENT *);
> -void  curwd(const struct kinfo_proc *, VARENT *);
> -void  euname(const struct kinfo_proc *, VARENT *);
> -void  vsize(const struct kinfo_proc *, VARENT 
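
To make the descendancy idea concrete, here is a tiny standalone sketch (the
process table and helper below are made up; the actual diff threads the
kinfo_proc entries through a new struct pinfo and uses sysctl(2)) that
groups a flat pid/ppid list into the same kind of ASCII-art tree:

#include <stdio.h>

/* hypothetical flat process table: pid, ppid, command */
struct pent {
	int pid, ppid;
	const char *comm;
};

static const struct pent ptab[] = {
	{ 1,   0,   "init" },
	{ 100, 1,   "ksh" },
	{ 200, 100, "mutt" },
	{ 300, 1,   "ksh" },
	{ 400, 300, "rpki-client" },
	{ 401, 400, "rpki-client: parser" },
	{ 402, 400, "rpki-client: rsync" },
};

static void
print_tree(int ppid, int depth)
{
	size_t i;

	for (i = 0; i < sizeof(ptab) / sizeof(ptab[0]); i++) {
		if (ptab[i].ppid != ppid)
			continue;
		/* indent two spaces per level; always use `-- for brevity */
		printf("%5d %*s`-- %s\n", ptab[i].pid, depth * 2, "",
		    ptab[i].comm);
		print_tree(ptab[i].pid, depth + 1);
	}
}

int
main(void)
{
	print_tree(0, 0);
	return 0;
}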

Unmap page in uvm_anon_release()

2022-09-10 Thread Martin Pieuchot
Diff below fixes a bug exposed when swapping on arm64.  When an anon is
released, make sure all the pmap references to the related page are
removed.

We could move the pmap_page_protect(pg, PROT_NONE) inside uvm_pagefree()
to avoid future issue but that's for a later refactoring.

With this diff I can no longer reproduce the SIGBUS issue on the
rockpro64 and swapping is stable as long as I/O from sdmmc(4) work.

This should be good enough to commit the diff that got reverted, but I'll
wait to be sure there's no regression.

ok?

Index: uvm/uvm_anon.c
===
RCS file: /cvs/src/sys/uvm/uvm_anon.c,v
retrieving revision 1.54
diff -u -p -r1.54 uvm_anon.c
--- uvm/uvm_anon.c  26 Mar 2021 13:40:05 -  1.54
+++ uvm/uvm_anon.c  10 Sep 2022 12:10:34 -
@@ -255,6 +255,7 @@ uvm_anon_release(struct vm_anon *anon)
KASSERT(anon->an_ref == 0);
 
uvm_lock_pageq();
+   pmap_page_protect(pg, PROT_NONE);
uvm_pagefree(pg);
uvm_unlock_pageq();
KASSERT(anon->an_page == NULL);
Index: uvm/uvm_fault.c
===
RCS file: /cvs/src/sys/uvm/uvm_fault.c,v
retrieving revision 1.132
diff -u -p -r1.132 uvm_fault.c
--- uvm/uvm_fault.c 31 Aug 2022 01:27:04 -  1.132
+++ uvm/uvm_fault.c 10 Sep 2022 12:10:34 -
@@ -396,7 +396,6 @@ uvmfault_anonget(struct uvm_faultinfo *u
 * anon and try again.
 */
if (pg->pg_flags & PG_RELEASED) {
-   pmap_page_protect(pg, PROT_NONE);
KASSERT(anon->an_ref == 0);
/*
 * Released while we had unlocked amap.



Re: Unmap page in uvm_anon_release()

2022-09-10 Thread Martin Pieuchot
On 10/09/22(Sat) 15:12, Mark Kettenis wrote:
> > Date: Sat, 10 Sep 2022 14:18:02 +0200
> > From: Martin Pieuchot 
> > 
> > Diff below fixes a bug exposed when swapping on arm64.  When an anon is
> > released, make sure all the pmap references to the related page are
> > removed.
> 
> I'm a little bit puzzled by this.  So these pages are still mapped
> even though there are no references to the anon anymore?

I don't know.  I just realised that all the code paths leading to
uvm_pagefree() get rid of the pmap references by calling pmap_page_protect()
except a couple of them in the aiodone daemon and the clustering code in
the pager.

This can't hurt and makes the existing code coherent.  Maybe it just
hides the bug, I don't know.



uvm_vnode locking & documentation

2022-09-10 Thread Martin Pieuchot
Previous fix from gnezdo@ pointed out that `u_flags' accesses should be
serialized by `vmobjlock'.  Diff below documents this and fixes the
remaining places where the lock isn't yet taken.  One exception still
remains, the first loop of uvm_vnp_sync().  This cannot be fixed right
now due to possible deadlocks but that's not a reason for not documenting
& fixing the rest of this file.

This has been tested on amd64 and arm64.

Comments?  Oks?
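
The rule being documented boils down to the following pattern (taken from
the uvn_attach() hunk below): `u_flags' is only touched with the vmobjlock
held, and waiters sleep on the uvn while it is blocked.

	rw_enter(uvn->u_obj.vmobjlock, RW_WRITE);
	while (uvn->u_flags & UVM_VNODE_BLOCKED) {
		uvn->u_flags |= UVM_VNODE_WANTED;
		rwsleep_nsec(uvn, uvn->u_obj.vmobjlock, PVM, "uvn_attach",
		    INFSLP);
	}
	/* ... examine or update the uvn ... */
	rw_exit(uvn->u_obj.vmobjlock);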

Index: uvm/uvm_vnode.c
===
RCS file: /cvs/src/sys/uvm/uvm_vnode.c,v
retrieving revision 1.128
diff -u -p -r1.128 uvm_vnode.c
--- uvm/uvm_vnode.c 10 Sep 2022 16:14:36 -  1.128
+++ uvm/uvm_vnode.c 10 Sep 2022 18:23:57 -
@@ -68,11 +68,8 @@
  * we keep a simpleq of vnodes that are currently being sync'd.
  */
 
-LIST_HEAD(uvn_list_struct, uvm_vnode);
-struct uvn_list_struct uvn_wlist;  /* writeable uvns */
-
-SIMPLEQ_HEAD(uvn_sq_struct, uvm_vnode);
-struct uvn_sq_struct uvn_sync_q;   /* sync'ing uvns */
+LIST_HEAD(, uvm_vnode) uvn_wlist;  /* [K] writeable uvns */
+SIMPLEQ_HEAD(, uvm_vnode)  uvn_sync_q; /* [S] sync'ing uvns */
 struct rwlock uvn_sync_lock;   /* locks sync operation */
 
 extern int rebooting;
@@ -144,41 +141,40 @@ uvn_attach(struct vnode *vp, vm_prot_t a
struct partinfo pi;
u_quad_t used_vnode_size = 0;
 
-   /* first get a lock on the uvn. */
-   while (uvn->u_flags & UVM_VNODE_BLOCKED) {
-   uvn->u_flags |= UVM_VNODE_WANTED;
-   tsleep_nsec(uvn, PVM, "uvn_attach", INFSLP);
-   }
-
/* if we're mapping a BLK device, make sure it is a disk. */
if (vp->v_type == VBLK && bdevsw[major(vp->v_rdev)].d_type != D_DISK) {
return NULL;
}
 
+   /* first get a lock on the uvn. */
+   rw_enter(uvn->u_obj.vmobjlock, RW_WRITE);
+   while (uvn->u_flags & UVM_VNODE_BLOCKED) {
+   uvn->u_flags |= UVM_VNODE_WANTED;
+   rwsleep_nsec(uvn, uvn->u_obj.vmobjlock, PVM, "uvn_attach",
+   INFSLP);
+   }
+
/*
 * now uvn must not be in a blocked state.
 * first check to see if it is already active, in which case
 * we can bump the reference count, check to see if we need to
 * add it to the writeable list, and then return.
 */
-   rw_enter(uvn->u_obj.vmobjlock, RW_WRITE);
if (uvn->u_flags & UVM_VNODE_VALID) {   /* already active? */
KASSERT(uvn->u_obj.uo_refs > 0);
 
uvn->u_obj.uo_refs++;   /* bump uvn ref! */
-   rw_exit(uvn->u_obj.vmobjlock);
 
/* check for new writeable uvn */
if ((accessprot & PROT_WRITE) != 0 &&
(uvn->u_flags & UVM_VNODE_WRITEABLE) == 0) {
-   LIST_INSERT_HEAD(&uvn_wlist, uvn, u_wlist);
-   /* we are now on wlist! */
uvn->u_flags |= UVM_VNODE_WRITEABLE;
+   LIST_INSERT_HEAD(&uvn_wlist, uvn, u_wlist);
}
+   rw_exit(uvn->u_obj.vmobjlock);
 
return (&uvn->u_obj);
}
-   rw_exit(uvn->u_obj.vmobjlock);
 
/*
 * need to call VOP_GETATTR() to get the attributes, but that could
@@ -189,6 +185,7 @@ uvn_attach(struct vnode *vp, vm_prot_t a
 * it.
 */
uvn->u_flags = UVM_VNODE_ALOCK;
+   rw_exit(uvn->u_obj.vmobjlock);
 
if (vp->v_type == VBLK) {
/*
@@ -213,9 +210,11 @@ uvn_attach(struct vnode *vp, vm_prot_t a
}
 
if (result != 0) {
+   rw_enter(uvn->u_obj.vmobjlock, RW_WRITE);
if (uvn->u_flags & UVM_VNODE_WANTED)
wakeup(uvn);
uvn->u_flags = 0;
+   rw_exit(uvn->u_obj.vmobjlock);
return NULL;
}
 
@@ -236,18 +235,19 @@ uvn_attach(struct vnode *vp, vm_prot_t a
uvn->u_nio = 0;
uvn->u_size = used_vnode_size;
 
-   /* if write access, we need to add it to the wlist */
-   if (accessprot & PROT_WRITE) {
-   LIST_INSERT_HEAD(&uvn_wlist, uvn, u_wlist);
-   uvn->u_flags |= UVM_VNODE_WRITEABLE;/* we are on wlist! */
-   }
-
/*
 * add a reference to the vnode.   this reference will stay as long
 * as there is a valid mapping of the vnode.   dropped when the
 * reference count goes to zero.
 */
vref(vp);
+
+   /* if write access, we need to add it to the wlist */
+   if (accessprot & PROT_WRITE) {
+   uvn->u_flags |= UVM_VNODE_WRITEABLE;
+   LIST_INSERT_HEAD(&uvn_wlist, uvn, u_wlist);
+   }
+
if (oldflags & UVM_VNODE_WANTED)
wakeup(uvn);
 
@@ -273,6 +273,7 @@ uvn_reference(struct uvm_object *uobj)
struct uvm_vnode *uvn = (struct uvm_vnode *) uobj;
 #endif
 
+

Towards unlocking mmap(2) & munmap(2)

2022-09-11 Thread Martin Pieuchot
Diff below adds a minimalist set of assertions to ensure proper locks
are held in uvm_mapanon() and uvm_unmap_remove() which are the guts of
mmap(2) for anons and munmap(2).

Please test it with WITNESS enabled and report back.
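
The vm_map_assert_*() helpers themselves are not shown in this diff; one
plausible shape (an assumption on my part, based on the existing
rw_assert_*(9) interfaces, the mutex assert macro, and the map's INTRSAFE
flag) would be:

void
vm_map_assert_anylock(struct vm_map *map)
{
	if ((map->flags & VM_MAP_INTRSAFE) == 0)
		rw_assert_anylock(&map->lock);
	else
		MUTEX_ASSERT_LOCKED(&map->mtx);
}

void
vm_map_assert_wrlock(struct vm_map *map)
{
	if ((map->flags & VM_MAP_INTRSAFE) == 0)
		rw_assert_wrlock(&map->lock);
	else
		MUTEX_ASSERT_LOCKED(&map->mtx);
}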

Index: uvm/uvm_addr.c
===
RCS file: /cvs/src/sys/uvm/uvm_addr.c,v
retrieving revision 1.31
diff -u -p -r1.31 uvm_addr.c
--- uvm/uvm_addr.c  21 Feb 2022 10:26:20 -  1.31
+++ uvm/uvm_addr.c  11 Sep 2022 09:08:10 -
@@ -416,6 +416,8 @@ uvm_addr_invoke(struct vm_map *map, stru
!(hint >= uaddr->uaddr_minaddr && hint < uaddr->uaddr_maxaddr))
return ENOMEM;
 
+   vm_map_assert_anylock(map);
+
error = (*uaddr->uaddr_functions->uaddr_select)(map, uaddr,
entry_out, addr_out, sz, align, offset, prot, hint);
 
Index: uvm/uvm_fault.c
===
RCS file: /cvs/src/sys/uvm/uvm_fault.c,v
retrieving revision 1.132
diff -u -p -r1.132 uvm_fault.c
--- uvm/uvm_fault.c 31 Aug 2022 01:27:04 -  1.132
+++ uvm/uvm_fault.c 11 Sep 2022 08:57:35 -
@@ -1626,6 +1626,7 @@ uvm_fault_unwire_locked(vm_map_t map, va
struct vm_page *pg;
 
KASSERT((map->flags & VM_MAP_INTRSAFE) == 0);
+   vm_map_assert_anylock(map);
 
/*
 * we assume that the area we are unwiring has actually been wired
Index: uvm/uvm_map.c
===
RCS file: /cvs/src/sys/uvm/uvm_map.c,v
retrieving revision 1.294
diff -u -p -r1.294 uvm_map.c
--- uvm/uvm_map.c   15 Aug 2022 15:53:45 -  1.294
+++ uvm/uvm_map.c   11 Sep 2022 09:37:44 -
@@ -162,6 +162,8 @@ int  uvm_map_inentry_recheck(u_long, v
 struct p_inentry *);
 boolean_t   uvm_map_inentry_fix(struct proc *, struct p_inentry *,
 vaddr_t, int (*)(vm_map_entry_t), u_long);
+boolean_t   uvm_map_is_stack_remappable(struct vm_map *,
+vaddr_t, vsize_t);
 /*
  * Tree management functions.
  */
@@ -491,6 +493,8 @@ uvmspace_dused(struct vm_map *map, vaddr
vaddr_t stack_begin, stack_end; /* Position of stack. */
 
KASSERT(map->flags & VM_MAP_ISVMSPACE);
+   vm_map_assert_anylock(map);
+
vm = (struct vmspace *)map;
stack_begin = MIN((vaddr_t)vm->vm_maxsaddr, (vaddr_t)vm->vm_minsaddr);
stack_end = MAX((vaddr_t)vm->vm_maxsaddr, (vaddr_t)vm->vm_minsaddr);
@@ -570,6 +574,8 @@ uvm_map_isavail(struct vm_map *map, stru
if (addr + sz < addr)
return 0;
 
+   vm_map_assert_anylock(map);
+
/*
 * Kernel memory above uvm_maxkaddr is considered unavailable.
 */
@@ -1446,6 +1452,8 @@ uvm_map_mkentry(struct vm_map *map, stru
entry->guard = 0;
entry->fspace = 0;
 
+   vm_map_assert_wrlock(map);
+
/* Reset free space in first. */
free = uvm_map_uaddr_e(map, first);
uvm_mapent_free_remove(map, free, first);
@@ -1573,6 +1581,8 @@ boolean_t
 uvm_map_lookup_entry(struct vm_map *map, vaddr_t address,
 struct vm_map_entry **entry)
 {
+   vm_map_assert_anylock(map);
+
*entry = uvm_map_entrybyaddr(&map->addr, address);
return *entry != NULL && !UVM_ET_ISHOLE(*entry) &&
(*entry)->start <= address && (*entry)->end > address;
@@ -1692,6 +1702,8 @@ uvm_map_is_stack_remappable(struct vm_ma
vaddr_t end = addr + sz;
struct vm_map_entry *first, *iter, *prev = NULL;
 
+   vm_map_assert_anylock(map);
+
if (!uvm_map_lookup_entry(map, addr, &first)) {
printf("map stack 0x%lx-0x%lx of map %p failed: no mapping\n",
addr, end, map);
@@ -1843,6 +1855,8 @@ uvm_mapent_mkfree(struct vm_map *map, st
vaddr_t  addr;  /* Start of freed range. */
vaddr_t  end;   /* End of freed range. */
 
+   UVM_MAP_REQ_WRITE(map);
+
prev = *prev_ptr;
if (prev == entry)
*prev_ptr = prev = NULL;
@@ -1971,10 +1985,7 @@ uvm_unmap_remove(struct vm_map *map, vad
if (start >= end)
return;
 
-   if ((map->flags & VM_MAP_INTRSAFE) == 0)
-   splassert(IPL_NONE);
-   else
-   splassert(IPL_VM);
+   vm_map_assert_wrlock(map);
 
/* Find first affected entry. */
entry = uvm_map_entrybyaddr(&map->addr, start);
@@ -4027,6 +4038,8 @@ uvm_map_checkprot(struct vm_map *map, va
 {
struct vm_map_entry *entry;
 
+   vm_map_assert_anylock(map);
+
if (start < map->min_offset || end > map->max_offset || start > end)
return FALSE;
if (start == end)
@@ -4886,6 +4899,7 @@ uvm_map_freelist_update(struct vm_map *m
 vaddr_t b_start, vaddr_t b_end, vaddr_t s_start, vaddr_t s_end, int flags)
 {
KDASSERT(b_end >= b_star

Re: Towards unlocking mmap(2) & munmap(2)

2022-09-14 Thread Martin Pieuchot
On 14/09/22(Wed) 15:47, Klemens Nanni wrote:
> On 14.09.22 18:55, Mike Larkin wrote:
> > On Sun, Sep 11, 2022 at 12:26:31PM +0200, Martin Pieuchot wrote:
> > > Diff below adds a minimalist set of assertions to ensure proper locks
> > > are held in uvm_mapanon() and uvm_unmap_remove() which are the guts of
> > > mmap(2) for anons and munmap(2).
> > > 
> > > Please test it with WITNESS enabled and report back.
> > > 
> > 
> > Do you want this tested in conjunction with the aiodoned diff or by itself?
> 
> This diff looks like a subset of the previous uvm lock assertion diff
> that came out of the previous "unlock mmap(2) for anonymous mappings"
> thread[0].
> 
> https://marc.info/?l=openbsd-tech&m=164423248318212&w=2
> 
> It didn't land eventually, I **think** syzcaller was a blocker which we
> only realised once it was committed and picked up by syzcaller.
> 
> Now it's been some time and more UVM changes landed, but the majority
> (if not all) lock assertions and comments from the above linked diff
> should still hold true.
> 
> mpi, I can dust off and resend that diff, If you want.
> Nothing for release, but perhaps it helps testing your current efforts.

Please hold on, this diff is known to trigger a KASSERT() with witness.
I'll send an update version soon.

Thank you for disregarding this diff for the moment.



Re: Towards unlocking mmap(2) & munmap(2)

2022-10-20 Thread Martin Pieuchot
On 11/09/22(Sun) 12:26, Martin Pieuchot wrote:
> Diff below adds a minimalist set of assertions to ensure proper locks
> are held in uvm_mapanon() and uvm_unmap_remove() which are the guts of
> mmap(2) for anons and munmap(2).
> 
> Please test it with WITNESS enabled and report back.

New version of the diff that includes a lock/unlock dance  in 
uvm_map_teardown().  While grabbing this lock should not be strictly
necessary because no other reference to the map should exist when the
reaper is holding it, it helps make progress with asserts.  Grabbing
the lock is easy and it can also save us a lot of time if there are any
reference counting bugs (like we've discovered w/ vnode and swapping).

Please test and report back.

Index: uvm/uvm_addr.c
===
RCS file: /cvs/src/sys/uvm/uvm_addr.c,v
retrieving revision 1.31
diff -u -p -r1.31 uvm_addr.c
--- uvm/uvm_addr.c  21 Feb 2022 10:26:20 -  1.31
+++ uvm/uvm_addr.c  20 Oct 2022 14:09:30 -
@@ -416,6 +416,8 @@ uvm_addr_invoke(struct vm_map *map, stru
!(hint >= uaddr->uaddr_minaddr && hint < uaddr->uaddr_maxaddr))
return ENOMEM;
 
+   vm_map_assert_anylock(map);
+
error = (*uaddr->uaddr_functions->uaddr_select)(map, uaddr,
entry_out, addr_out, sz, align, offset, prot, hint);
 
Index: uvm/uvm_fault.c
===
RCS file: /cvs/src/sys/uvm/uvm_fault.c,v
retrieving revision 1.132
diff -u -p -r1.132 uvm_fault.c
--- uvm/uvm_fault.c 31 Aug 2022 01:27:04 -  1.132
+++ uvm/uvm_fault.c 20 Oct 2022 14:09:30 -
@@ -1626,6 +1626,7 @@ uvm_fault_unwire_locked(vm_map_t map, va
struct vm_page *pg;
 
KASSERT((map->flags & VM_MAP_INTRSAFE) == 0);
+   vm_map_assert_anylock(map);
 
/*
 * we assume that the area we are unwiring has actually been wired
Index: uvm/uvm_map.c
===
RCS file: /cvs/src/sys/uvm/uvm_map.c,v
retrieving revision 1.298
diff -u -p -r1.298 uvm_map.c
--- uvm/uvm_map.c   16 Oct 2022 16:16:37 -  1.298
+++ uvm/uvm_map.c   20 Oct 2022 14:09:31 -
@@ -491,6 +491,8 @@ uvmspace_dused(struct vm_map *map, vaddr
vaddr_t stack_begin, stack_end; /* Position of stack. */
 
KASSERT(map->flags & VM_MAP_ISVMSPACE);
+   vm_map_assert_anylock(map);
+
vm = (struct vmspace *)map;
stack_begin = MIN((vaddr_t)vm->vm_maxsaddr, (vaddr_t)vm->vm_minsaddr);
stack_end = MAX((vaddr_t)vm->vm_maxsaddr, (vaddr_t)vm->vm_minsaddr);
@@ -570,6 +572,8 @@ uvm_map_isavail(struct vm_map *map, stru
if (addr + sz < addr)
return 0;
 
+   vm_map_assert_anylock(map);
+
/*
 * Kernel memory above uvm_maxkaddr is considered unavailable.
 */
@@ -1457,6 +1461,8 @@ uvm_map_mkentry(struct vm_map *map, stru
entry->guard = 0;
entry->fspace = 0;
 
+   vm_map_assert_wrlock(map);
+
/* Reset free space in first. */
free = uvm_map_uaddr_e(map, first);
uvm_mapent_free_remove(map, free, first);
@@ -1584,6 +1590,8 @@ boolean_t
 uvm_map_lookup_entry(struct vm_map *map, vaddr_t address,
 struct vm_map_entry **entry)
 {
+   vm_map_assert_anylock(map);
+
*entry = uvm_map_entrybyaddr(&map->addr, address);
return *entry != NULL && !UVM_ET_ISHOLE(*entry) &&
(*entry)->start <= address && (*entry)->end > address;
@@ -1704,6 +1712,8 @@ uvm_map_is_stack_remappable(struct vm_ma
vaddr_t end = addr + sz;
struct vm_map_entry *first, *iter, *prev = NULL;
 
+   vm_map_assert_anylock(map);
+
if (!uvm_map_lookup_entry(map, addr, &first)) {
printf("map stack 0x%lx-0x%lx of map %p failed: no mapping\n",
addr, end, map);
@@ -1868,6 +1878,8 @@ uvm_mapent_mkfree(struct vm_map *map, st
vaddr_t  addr;  /* Start of freed range. */
vaddr_t  end;   /* End of freed range. */
 
+   UVM_MAP_REQ_WRITE(map);
+
prev = *prev_ptr;
if (prev == entry)
*prev_ptr = prev = NULL;
@@ -1996,10 +2008,7 @@ uvm_unmap_remove(struct vm_map *map, vad
if (start >= end)
return 0;
 
-   if ((map->flags & VM_MAP_INTRSAFE) == 0)
-   splassert(IPL_NONE);
-   else
-   splassert(IPL_VM);
+   vm_map_assert_wrlock(map);
 
/* Find first affected entry. */
entry = uvm_map_entrybyaddr(&map->addr, start);
@@ -2526,6 +2535,8 @@ uvm_map_teardown(struct vm_map *map)
 
KASSERT((map->flags & VM_MAP_INTRSAFE) == 0);
 
+   vm_map_lock(map);
+
/* Remove address selectors. */
uvm_addr_destro

Re: Towards unlocking mmap(2) & munmap(2)

2022-10-28 Thread Martin Pieuchot
On 20/10/22(Thu) 16:17, Martin Pieuchot wrote:
> On 11/09/22(Sun) 12:26, Martin Pieuchot wrote:
> > Diff below adds a minimalist set of assertions to ensure proper locks
> > are held in uvm_mapanon() and uvm_unmap_remove() which are the guts of
> > mmap(2) for anons and munmap(2).
> > 
> > Please test it with WITNESS enabled and report back.
> 
> New version of the diff that includes a lock/unlock dance  in 
> uvm_map_teardown().  While grabbing this lock should not be strictly
> necessary because no other reference to the map should exist when the
> reaper is holding it, it helps make progress with asserts.  Grabbing
> the lock is easy and it can also save us a lot of time if there are any
> reference counting bugs (like we've discovered w/ vnode and swapping).

Here's an updated version that adds a lock/unlock dance in
uvm_map_deallocate() to satisfy the assert in uvm_unmap_remove().
Thanks to tb@ for pointing this out.

I received a lot of positive feedback and test reports, so I'm now asking for
oks.


Index: uvm/uvm_addr.c
===
RCS file: /cvs/src/sys/uvm/uvm_addr.c,v
retrieving revision 1.31
diff -u -p -r1.31 uvm_addr.c
--- uvm/uvm_addr.c  21 Feb 2022 10:26:20 -  1.31
+++ uvm/uvm_addr.c  28 Oct 2022 08:41:30 -
@@ -416,6 +416,8 @@ uvm_addr_invoke(struct vm_map *map, stru
!(hint >= uaddr->uaddr_minaddr && hint < uaddr->uaddr_maxaddr))
return ENOMEM;
 
+   vm_map_assert_anylock(map);
+
error = (*uaddr->uaddr_functions->uaddr_select)(map, uaddr,
entry_out, addr_out, sz, align, offset, prot, hint);
 
Index: uvm/uvm_fault.c
===
RCS file: /cvs/src/sys/uvm/uvm_fault.c,v
retrieving revision 1.132
diff -u -p -r1.132 uvm_fault.c
--- uvm/uvm_fault.c 31 Aug 2022 01:27:04 -  1.132
+++ uvm/uvm_fault.c 28 Oct 2022 08:41:30 -
@@ -1626,6 +1626,7 @@ uvm_fault_unwire_locked(vm_map_t map, va
struct vm_page *pg;
 
KASSERT((map->flags & VM_MAP_INTRSAFE) == 0);
+   vm_map_assert_anylock(map);
 
/*
 * we assume that the area we are unwiring has actually been wired
Index: uvm/uvm_map.c
===
RCS file: /cvs/src/sys/uvm/uvm_map.c,v
retrieving revision 1.301
diff -u -p -r1.301 uvm_map.c
--- uvm/uvm_map.c   24 Oct 2022 15:11:56 -  1.301
+++ uvm/uvm_map.c   28 Oct 2022 08:46:28 -
@@ -491,6 +491,8 @@ uvmspace_dused(struct vm_map *map, vaddr
vaddr_t stack_begin, stack_end; /* Position of stack. */
 
KASSERT(map->flags & VM_MAP_ISVMSPACE);
+   vm_map_assert_anylock(map);
+
vm = (struct vmspace *)map;
stack_begin = MIN((vaddr_t)vm->vm_maxsaddr, (vaddr_t)vm->vm_minsaddr);
stack_end = MAX((vaddr_t)vm->vm_maxsaddr, (vaddr_t)vm->vm_minsaddr);
@@ -570,6 +572,8 @@ uvm_map_isavail(struct vm_map *map, stru
if (addr + sz < addr)
return 0;
 
+   vm_map_assert_anylock(map);
+
/*
 * Kernel memory above uvm_maxkaddr is considered unavailable.
 */
@@ -1457,6 +1461,8 @@ uvm_map_mkentry(struct vm_map *map, stru
entry->guard = 0;
entry->fspace = 0;
 
+   vm_map_assert_wrlock(map);
+
/* Reset free space in first. */
free = uvm_map_uaddr_e(map, first);
uvm_mapent_free_remove(map, free, first);
@@ -1584,6 +1590,8 @@ boolean_t
 uvm_map_lookup_entry(struct vm_map *map, vaddr_t address,
 struct vm_map_entry **entry)
 {
+   vm_map_assert_anylock(map);
+
*entry = uvm_map_entrybyaddr(&map->addr, address);
return *entry != NULL && !UVM_ET_ISHOLE(*entry) &&
(*entry)->start <= address && (*entry)->end > address;
@@ -1704,6 +1712,8 @@ uvm_map_is_stack_remappable(struct vm_ma
vaddr_t end = addr + sz;
struct vm_map_entry *first, *iter, *prev = NULL;
 
+   vm_map_assert_anylock(map);
+
if (!uvm_map_lookup_entry(map, addr, &first)) {
printf("map stack 0x%lx-0x%lx of map %p failed: no mapping\n",
addr, end, map);
@@ -1868,6 +1878,8 @@ uvm_mapent_mkfree(struct vm_map *map, st
vaddr_t  addr;  /* Start of freed range. */
vaddr_t  end;   /* End of freed range. */
 
+   UVM_MAP_REQ_WRITE(map);
+
prev = *prev_ptr;
if (prev == entry)
*prev_ptr = prev = NULL;
@@ -1996,10 +2008,7 @@ uvm_unmap_remove(struct vm_map *map, vad
if (start >= end)
return 0;
 
-   if ((map->flags & VM_MAP_INTRSAFE) == 0)
-   splassert(IPL_NONE);
-   else
-   splassert(IPL_VM);
+   vm_map_assert_wrl

Re: Towards unlocking mmap(2) & munmap(2)

2022-10-30 Thread Martin Pieuchot
On 30/10/22(Sun) 12:40, Klemens Nanni wrote:
> On Fri, Oct 28, 2022 at 11:08:55AM +0200, Martin Pieuchot wrote:
> > On 20/10/22(Thu) 16:17, Martin Pieuchot wrote:
> > > On 11/09/22(Sun) 12:26, Martin Pieuchot wrote:
> > > > Diff below adds a minimalist set of assertions to ensure proper locks
> > > > are held in uvm_mapanon() and uvm_unmap_remove() which are the guts of
> > > > mmap(2) for anons and munmap(2).
> > > > 
> > > > Please test it with WITNESS enabled and report back.
> > > 
> > > New version of the diff that includes a lock/unlock dance  in 
> > > uvm_map_teardown().  While grabbing this lock should not be strictly
> > > necessary because no other reference to the map should exist when the
> > > reaper is holding it, it helps make progress with asserts.  Grabbing
> > > the lock is easy and it can also save us a lot of time if there are any
> > > reference counting bugs (like we've discovered w/ vnode and swapping).
> > 
> > Here's an updated version that adds a lock/unlock dance in
> > uvm_map_deallocate() to satisfy the assert in uvm_unmap_remove().
> > Thanks to tb@ for pointing this out.
> > 
> > I received a lot of positive feedback and test reports, so I'm now asking for
> > oks.
> 
> regress on i386/GENERIC.MP+WITNESS with this diff shows

This isn't related to this diff.



Re: Towards unlocking mmap(2) & munmap(2)

2022-10-30 Thread Martin Pieuchot
On 30/10/22(Sun) 12:45, Klemens Nanni wrote:
> On Sun, Oct 30, 2022 at 12:40:02PM +, Klemens Nanni wrote:
> > regress on i386/GENERIC.MP+WITNESS with this diff shows
> 
> Another one;  This machine has three read-only NFS mounts, but none of
> them are used during builds or regress.

It's the same.  See archives of bugs@ for discussion about this lock
order reversal and a potential fix from visa@.

> 
> This one is most certainly from the NFS regress tests themselves:
> 127.0.0.1:/mnt/regress-nfs-server  3548  2088  1284   62%    /mnt/regress-nfs-client
> 
> witness: lock order reversal:
>  1st 0xd6381eb8 vmmaplk (&map->lock)
>  2nd 0xf5c98d24 nfsnode (&np->n_lock)
> lock order data w2 -> w1 missing
> lock order "&map->lock"(rwlock) -> "&np->n_lock"(rrwlock) first seen at:
> #0  rw_enter+0x57
> #1  rrw_enter+0x3d
> #2  nfs_lock+0x27
> #3  VOP_LOCK+0x50
> #4  vn_lock+0x91
> #5  vn_rdwr+0x64
> #6  vndstrategy+0x2bd
> #7  physio+0x18f
> #8  vndwrite+0x1a
> #9  spec_write+0x74
> #10 VOP_WRITE+0x3f
> #11 vn_write+0xde
> #12 dofilewritev+0xbb
> #13 sys_pwrite+0x55
> #14 syscall+0x2ec
> #15 Xsyscall_untramp+0xa9
> 



Please test: unlock mprotect/mmap/munmap

2022-11-06 Thread Martin Pieuchot
These 3 syscalls should now be ready to run w/o KERNEL_LOCK().  This
will reduce contention a lot.  I'd be happy to receive test reports
from many architectures and possible workloads.

Do not forget to run "make syscalls" before building the kernel.

Index: syscalls.master
===
RCS file: /cvs/src/sys/kern/syscalls.master,v
retrieving revision 1.234
diff -u -p -r1.234 syscalls.master
--- syscalls.master 25 Oct 2022 16:10:31 -  1.234
+++ syscalls.master 6 Nov 2022 10:50:45 -
@@ -126,7 +126,7 @@
struct sigaction *osa); }
 47 STD NOLOCK  { gid_t sys_getgid(void); }
 48 STD NOLOCK  { int sys_sigprocmask(int how, sigset_t mask); }
-49 STD { void *sys_mmap(void *addr, size_t len, int prot, \
+49 STD NOLOCK  { void *sys_mmap(void *addr, size_t len, int prot, \
int flags, int fd, off_t pos); }
 50 STD { int sys_setlogin(const char *namebuf); }
 #ifdef ACCOUNTING
@@ -171,8 +171,8 @@
const struct kevent *changelist, int nchanges, \
struct kevent *eventlist, int nevents, \
const struct timespec *timeout); }
-73 STD { int sys_munmap(void *addr, size_t len); }
-74 STD { int sys_mprotect(void *addr, size_t len, \
+73 STD NOLOCK  { int sys_munmap(void *addr, size_t len); }
+74 STD NOLOCK  { int sys_mprotect(void *addr, size_t len, \
int prot); }
 75 STD { int sys_madvise(void *addr, size_t len, \
int behav); }



Re: push kernel lock down in ifioctl()

2022-11-07 Thread Martin Pieuchot
On 07/11/22(Mon) 15:16, Klemens Nanni wrote:
> Not all interface ioctls need the kernel lock, but they all grab it.
> 
> Here's a mechanical diff splitting the single lock/unlock around
> ifioctl() into individual lock/unlock dances inside ifioctl().
> 
> From there we can unlock individual ioctls piece by piece.
> 
> Survives regress on sparc64 and didn't blow up on my amd64 notebook yet.
> 
> Feedback? Objection? OK?

Makes sense.  Your diff is missing the kern/sys_socket.c chunk.

This stuff is hairy.  I'd suggest moving very very carefully.  For
example, I wouldn't bother releasing the KERNEL_LOCK() before the
if_put().  Yes, what you're suggesting is correct.  Or at least should
be...
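
In other words (sketch of the suggestion, not a replacement diff), keep the
if_put() inside the locked section instead of unlocking first:

	KERNEL_LOCK();

	ifp = if_unit(ifr->ifr_name);
	if (ifp == NULL) {
		KERNEL_UNLOCK();
		return (ENXIO);
	}

	/* ... ioctl handling ... */

	if_put(ifp);
	KERNEL_UNLOCK();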

> Index: net/if.c
> ===
> RCS file: /cvs/src/sys/net/if.c,v
> retrieving revision 1.665
> diff -u -p -r1.665 if.c
> --- net/if.c  8 Sep 2022 10:22:06 -   1.665
> +++ net/if.c  7 Nov 2022 15:13:01 -
> @@ -1942,19 +1942,25 @@ ifioctl(struct socket *so, u_long cmd, c
>   case SIOCIFCREATE:
>   if ((error = suser(p)) != 0)
>   return (error);
> + KERNEL_LOCK();
>   error = if_clone_create(ifr->ifr_name, 0);
> + KERNEL_UNLOCK();
>   return (error);
>   case SIOCIFDESTROY:
>   if ((error = suser(p)) != 0)
>   return (error);
> + KERNEL_LOCK();
>   error = if_clone_destroy(ifr->ifr_name);
> + KERNEL_UNLOCK();
>   return (error);
>   case SIOCSIFGATTR:
>   if ((error = suser(p)) != 0)
>   return (error);
> + KERNEL_LOCK();
>   NET_LOCK();
>   error = if_setgroupattribs(data);
>   NET_UNLOCK();
> + KERNEL_UNLOCK();
>   return (error);
>   case SIOCGIFCONF:
>   case SIOCIFGCLONERS:
> @@ -1973,12 +1979,19 @@ ifioctl(struct socket *so, u_long cmd, c
>   case SIOCGIFRDOMAIN:
>   case SIOCGIFGROUP:
>   case SIOCGIFLLPRIO:
> - return (ifioctl_get(cmd, data));
> + KERNEL_LOCK();
> + error = ifioctl_get(cmd, data);
> + KERNEL_UNLOCK();
> + return (error);
>   }
>  
> + KERNEL_LOCK();
> +
>   ifp = if_unit(ifr->ifr_name);
> - if (ifp == NULL)
> + if (ifp == NULL) {
> + KERNEL_UNLOCK();
>   return (ENXIO);
> + }
>   oif_flags = ifp->if_flags;
>   oif_xflags = ifp->if_xflags;
>  
> @@ -2396,6 +2409,8 @@ forceup:
>  
>   if (((oif_flags ^ ifp->if_flags) & IFF_UP) != 0)
>   getmicrotime(&ifp->if_lastchange);
> +
> + KERNEL_UNLOCK();
>  
>   if_put(ifp);
>  
> 



Re: Please test: unlock mprotect/mmap/munmap

2022-11-08 Thread Martin Pieuchot
On 08/11/22(Tue) 11:12, Mark Kettenis wrote:
> > Date: Tue, 8 Nov 2022 10:32:14 +0100
> > From: Christian Weisgerber 
> > 
> > Martin Pieuchot:
> > 
> > > These 3 syscalls should now be ready to run w/o KERNEL_LOCK().  This
> > > will reduce contention a lot.  I'd be happy to hear from test reports
> > > on many architectures and possible workloads.
> > 
> > This survived a full amd64 package build.
> 
> \8/
> 
> I think that means it should be comitted.

I agree.  This has been tested on i386, riscv64, m88k, arm64, amd64 (of
course) and sparc64.  I'm pretty confident.



Re: xenstore.c: return error number

2022-11-08 Thread Martin Pieuchot
On 01/11/22(Tue) 15:26, Masato Asou wrote:
> Hi,
> 
> Return an error number instead of calling panic().

Makes sense to me.  Do you know how this error can occur?  Is it a logic
error or are we trusting values produced by a third party?

> comment, ok?
> --
> ASOU Masato
> 
> diff --git a/sys/dev/pv/xenstore.c b/sys/dev/pv/xenstore.c
> index 1e4f15d30eb..dc89ba0fa6d 100644
> --- a/sys/dev/pv/xenstore.c
> +++ b/sys/dev/pv/xenstore.c
> @@ -118,6 +118,7 @@ struct xs_msg {
>   struct xs_msghdr xsm_hdr;
>   uint32_t xsm_read;
>   uint32_t xsm_dlen;
> + int  xsm_error;
>   uint8_t *xsm_data;
>   TAILQ_ENTRY(xs_msg)  xsm_link;
>  };
> @@ -566,9 +567,7 @@ xs_intr(void *arg)
>   }
>  
>   if (xsm->xsm_hdr.xmh_len > xsm->xsm_dlen)
> - panic("message too large: %d vs %d for type %d, rid %u",
> - xsm->xsm_hdr.xmh_len, xsm->xsm_dlen, xsm->xsm_hdr.xmh_type,
> - xsm->xsm_hdr.xmh_rid);
> + xsm->xsm_error = EMSGSIZE;
>  
>   len = MIN(xsm->xsm_hdr.xmh_len - xsm->xsm_read, avail);
>   if (len) {
> @@ -800,7 +799,9 @@ xs_cmd(struct xs_transaction *xst, int cmd, const char 
> *path,
>   error = xs_geterror(xsm);
>   DPRINTF("%s: xenstore request %d \"%s\" error %s\n",
>   xs->xs_sc->sc_dev.dv_xname, cmd, path, xsm->xsm_data);
> - } else if (mode == READ) {
> + } else if (xsm->xsm_error != 0)
> + error = xsm->xsm_error;
> + else if (mode == READ) {
>   KASSERT(iov && iov_cnt);
>   error = xs_parse(xst, xsm, iov, iov_cnt);
>   }
> 



Mark sched_yield(2) as NOLOCK

2022-11-08 Thread Martin Pieuchot
Now that mmap/munmap/mprotect(2) are no longer creating contention it is
possible to see that sched_yield(2) is one of the syscalls waiting for
the KERNEL_LOCK() to be released.  However, taking the kernel lock there is
no longer necessary.

Traversing `ps_threads' requires either the KERNEL_LOCK() or the
SCHED_LOCK() and we are holding both in this case.  So let's drop the
requirement for the KERNEL_LOCK().

ok?

Index: kern/syscalls.master
===
RCS file: /cvs/src/sys/kern/syscalls.master,v
retrieving revision 1.235
diff -u -p -r1.235 syscalls.master
--- kern/syscalls.master8 Nov 2022 11:05:57 -   1.235
+++ kern/syscalls.master8 Nov 2022 13:09:10 -
@@ -531,7 +531,7 @@
 #else
 297UNIMPL
 #endif
-298STD { int sys_sched_yield(void); }
+298STD NOLOCK  { int sys_sched_yield(void); }
 299STD NOLOCK  { pid_t sys_getthrid(void); }
 300OBSOL   t32___thrsleep
 301STD NOLOCK  { int sys___thrwakeup(const volatile void *ident, \



Re: push kernel lock inside ifioctl_get()

2022-11-08 Thread Martin Pieuchot
On 08/11/22(Tue) 15:28, Klemens Nanni wrote:
> After this mechanical move, I can unlock the individual SIOCG* in there.

I'd suggest grabbing the KERNEL_LOCK() after NET_LOCK_SHARED().
Otherwise you might spin for the first one then release it when going
to sleep.
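
Concretely, for one of the cases below that would look like this (sketch
only, reordering the calls from your diff):

	case SIOCGIFCONF:
		NET_LOCK_SHARED();
		KERNEL_LOCK();
		error = ifconf(data);
		KERNEL_UNLOCK();
		NET_UNLOCK_SHARED();
		return (error);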

> OK?
> 
> Index: if.c
> ===
> RCS file: /cvs/src/sys/net/if.c,v
> retrieving revision 1.667
> diff -u -p -r1.667 if.c
> --- if.c  8 Nov 2022 15:20:24 -   1.667
> +++ if.c  8 Nov 2022 15:26:07 -
> @@ -2426,33 +2426,43 @@ ifioctl_get(u_long cmd, caddr_t data)
>   size_t bytesdone;
>   const char *label;
>  
> - KERNEL_LOCK();
> -
>   switch(cmd) {
>   case SIOCGIFCONF:
> + KERNEL_LOCK();
>   NET_LOCK_SHARED();
>   error = ifconf(data);
>   NET_UNLOCK_SHARED();
> + KERNEL_UNLOCK();
>   return (error);
>   case SIOCIFGCLONERS:
> + KERNEL_LOCK();
>   error = if_clone_list((struct if_clonereq *)data);
> + KERNEL_UNLOCK();
>   return (error);
>   case SIOCGIFGMEMB:
> + KERNEL_LOCK();
>   NET_LOCK_SHARED();
>   error = if_getgroupmembers(data);
>   NET_UNLOCK_SHARED();
> + KERNEL_UNLOCK();
>   return (error);
>   case SIOCGIFGATTR:
> + KERNEL_LOCK();
>   NET_LOCK_SHARED();
>   error = if_getgroupattribs(data);
>   NET_UNLOCK_SHARED();
> + KERNEL_UNLOCK();
>   return (error);
>   case SIOCGIFGLIST:
> + KERNEL_LOCK();
>   NET_LOCK_SHARED();
>   error = if_getgrouplist(data);
>   NET_UNLOCK_SHARED();
> + KERNEL_UNLOCK();
>   return (error);
>   }
> +
> + KERNEL_LOCK();
>  
>   ifp = if_unit(ifr->ifr_name);
>   if (ifp == NULL) {
> 



btrace: string comparison in filters

2022-11-11 Thread Martin Pieuchot
Diff below adds support for the common following idiom:

syscall:open:entry
/comm == "ksh"/
{
...
}

String comparison is tricky as it can be combined with any other
expression in filters, like:

syscall:mmap:entry
/comm == "cc" && pid != 4589/
{
...
}

I don't have the energy to change the parser, so I went for the easy
solution of treating any "stupid" string comparison as 'true' while
printing a warning.  I'd love it if somebody with some yacc knowledge
could come up with a better solution.

ok?

Index: usr.sbin/btrace/bt_parse.y
===
RCS file: /cvs/src/usr.sbin/btrace/bt_parse.y,v
retrieving revision 1.46
diff -u -p -r1.46 bt_parse.y
--- usr.sbin/btrace/bt_parse.y  28 Apr 2022 21:04:24 -  1.46
+++ usr.sbin/btrace/bt_parse.y  11 Nov 2022 14:34:37 -
@@ -218,6 +218,7 @@ variable: lvar  { $$ = bl_find($1); }
 factor : '(' expr ')'  { $$ = $2; }
| NUMBER{ $$ = ba_new($1, B_AT_LONG); }
| BUILTIN   { $$ = ba_new(NULL, $1); }
+   | CSTRING   { $$ = ba_new($1, B_AT_STR); }
| staticv
| variable
| mentry
Index: usr.sbin/btrace/btrace.c
===
RCS file: /cvs/src/usr.sbin/btrace/btrace.c,v
retrieving revision 1.64
diff -u -p -r1.64 btrace.c
--- usr.sbin/btrace/btrace.c11 Nov 2022 10:51:39 -  1.64
+++ usr.sbin/btrace/btrace.c11 Nov 2022 14:44:15 -
@@ -434,14 +434,23 @@ rules_setup(int fd)
struct bt_rule *r, *rbegin = NULL;
struct bt_probe *bp;
struct bt_stmt *bs;
+   struct bt_arg *ba;
int dokstack = 0, on = 1;
uint64_t evtflags;
 
TAILQ_FOREACH(r, &g_rules, br_next) {
evtflags = 0;
-   SLIST_FOREACH(bs, &r->br_action, bs_next) {
-   struct bt_arg *ba;
 
+   if (r->br_filter != NULL &&
+   r->br_filter->bf_condition != NULL)  {
+
+   bs = r->br_filter->bf_condition;
+   ba = SLIST_FIRST(&bs->bs_args);
+
+   evtflags |= ba2dtflags(ba);
+   }
+
+   SLIST_FOREACH(bs, &r->br_action, bs_next) {
SLIST_FOREACH(ba, &bs->bs_args, ba_next)
evtflags |= ba2dtflags(ba);
 
@@ -1175,6 +1184,36 @@ baexpr2long(struct bt_arg *ba, struct dt
lhs = ba->ba_value;
rhs = SLIST_NEXT(lhs, ba_next);
 
+   /*
+* String comparisons also use '==' and '!='.
+*/
+   if (lhs->ba_type == B_AT_STR ||
+   (rhs != NULL && rhs->ba_type == B_AT_STR)) {
+   char lstr[STRLEN], rstr[STRLEN];
+
+   strlcpy(lstr, ba2str(lhs, dtev), sizeof(lstr));
+   strlcpy(rstr, ba2str(rhs, dtev), sizeof(rstr));
+
+   result = strncmp(lstr, rstr, STRLEN) == 0;
+
+   switch (ba->ba_type) {
+   case B_AT_OP_EQ:
+   break;
+   case B_AT_OP_NE:
+   result = !result;
+   break;
+   default:
+   warnx("operation '%d' unsupported on strings",
+   ba->ba_type);
+   result = 1;
+   }
+
+   debug("ba=%p eval '(%s %s %s) = %d'\n", ba, lstr, ba_name(ba),
+  rstr, result);
+
+   goto out;
+   }
+
lval = ba2long(lhs, dtev);
if (rhs == NULL) {
rval = 0;
@@ -1233,9 +1272,10 @@ baexpr2long(struct bt_arg *ba, struct dt
xabort("unsupported operation %d", ba->ba_type);
}
 
-   debug("ba=%p eval '%ld %s %ld = %d'\n", ba, lval, ba_name(ba),
+   debug("ba=%p eval '(%ld %s %ld) = %d'\n", ba, lval, ba_name(ba),
   rval, result);
 
+out:
--recursions;
 
return result;
@@ -1245,10 +1285,15 @@ const char *
 ba_name(struct bt_arg *ba)
 {
switch (ba->ba_type) {
+   case B_AT_STR:
+   return (const char *)ba->ba_value;
+   case B_AT_LONG:
+   return ba2str(ba, NULL);
case B_AT_NIL:
return "0";
case B_AT_VAR:
case B_AT_MAP:
+   case B_AT_HIST:
break;
case B_AT_BI_PID:
return "pid";
@@ -1326,7 +1371,8 @@ ba_name(struct bt_arg *ba)
xabort("unsupported type %d", ba->ba_type);
}
 
-   assert(ba->ba_type == B_AT_VAR || ba->ba_type == B_AT_MAP);
+   assert(ba->ba_type == B_AT_VAR || ba->ba_type == B_AT_MAP ||
+   ba->ba_type == B_AT_HIST);
 
static char buf[64];
size_t sz;
@@ -1516,9 +1562,13 @@ ba2str(struct bt_arg *ba, struct dt_evt 
 int
 ba2dtflags(struct bt_arg *ba)
 {
+   static long recursions;
struct bt_arg *bval;
int flags = 0;
 
+   if (++recursions >= __MAXOPER

Get rid of UVM_VNODE_CANPERSIST

2022-11-15 Thread Martin Pieuchot
UVM vnode objects include a reference count to keep track of the number
of processes that have the corresponding pages mapped in their VM space.

When the last process referencing a given library or executable dies,
the reaper will munmap this object on its behalf.  When this happens it
doesn't free the associated pages, in order to speed up possible re-use
of the file.  Instead the pages are placed on the inactive list but stay
ready to be pmap_enter()'d without requiring I/O as soon as a new
process needs to access them.
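
Schematically the lifecycle looks like this (a paraphrase of the code
paths touched by the diff below, not literal kernel code):

	/* last munmap: uvn_detach() keeps the pages around */
	if (uvn->u_flags & UVM_VNODE_CANPERSIST) {
		/* deactivate the pages instead of freeing them */
		uvn_flush(uobj, 0, 0, PGO_DEACTIVATE|PGO_ALLPAGES);
	}
	vrele(vp);	/* the vnode reference is dropped either way */

	/* next mmap of the same file: uvn_attach() reuses the pages */
	if (uvn->u_obj.uo_refs == 0)
		vref(vp);	/* regain the reference dropped above */
	uvn->u_obj.uo_refs++;	/* pages are mapped again without I/O */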

The mechanism to keep pages populated, known as UVM_VNODE_CANPERSIST,
doesn't work well with swapping [0].  For some reason, when the page
daemon wants to free pages on the inactive list, it tries to flush them
to disk and hits a panic(9) because it needs a valid reference to the
vnode to do so.

This indicates that the mechanism described above, which seems to work
fine for RO mappings, is currently buggy in more complex situations.
Flushing the pages when the last reference to the UVM object is dropped
also doesn't seem to be enough, as bluhm@ reported [1].

The diff below, which has already been committed and reverted, gets rid of
the UVM_VNODE_CANPERSIST logic.  I'd like to commit it again now that
the arm64 caching bug has been found and fixed.

Getting rid of this logic means more I/O will be generated and pages
might be recycled sooner.  I'm aware this might introduce a small
slowdown; however, I believe we should work towards loading files from the
buffer cache to save I/O cycles instead of maintaining another layer of cache.
Such work isn't trivial and making sure the vnode <-> UVM relation is
simple and well understood is the first step in this direction.

I'd appreciate it if the diff below could be tested on many architectures,
including the offending rpi4.

Comments?  Oks?

[0] https://marc.info/?l=openbsd-bugs&m=164846737707559&w=2 
[1] https://marc.info/?l=openbsd-bugs&m=166843373415030&w=2

Index: uvm/uvm_vnode.c
===
RCS file: /cvs/src/sys/uvm/uvm_vnode.c,v
retrieving revision 1.130
diff -u -p -r1.130 uvm_vnode.c
--- uvm/uvm_vnode.c 20 Oct 2022 13:31:52 -  1.130
+++ uvm/uvm_vnode.c 15 Nov 2022 13:28:28 -
@@ -161,11 +161,8 @@ uvn_attach(struct vnode *vp, vm_prot_t a
 * add it to the writeable list, and then return.
 */
if (uvn->u_flags & UVM_VNODE_VALID) {   /* already active? */
+   KASSERT(uvn->u_obj.uo_refs > 0);
 
-   /* regain vref if we were persisting */
-   if (uvn->u_obj.uo_refs == 0) {
-   vref(vp);
-   }
uvn->u_obj.uo_refs++;   /* bump uvn ref! */
 
/* check for new writeable uvn */
@@ -235,14 +232,14 @@ uvn_attach(struct vnode *vp, vm_prot_t a
KASSERT(uvn->u_obj.uo_refs == 0);
uvn->u_obj.uo_refs++;
oldflags = uvn->u_flags;
-   uvn->u_flags = UVM_VNODE_VALID|UVM_VNODE_CANPERSIST;
+   uvn->u_flags = UVM_VNODE_VALID;
uvn->u_nio = 0;
uvn->u_size = used_vnode_size;
 
/*
 * add a reference to the vnode.   this reference will stay as long
 * as there is a valid mapping of the vnode.   dropped when the
-* reference count goes to zero [and we either free or persist].
+* reference count goes to zero.
 */
vref(vp);
 
@@ -323,16 +320,6 @@ uvn_detach(struct uvm_object *uobj)
 */
vp->v_flag &= ~VTEXT;
 
-   /*
-* we just dropped the last reference to the uvn.   see if we can
-* let it "stick around".
-*/
-   if (uvn->u_flags & UVM_VNODE_CANPERSIST) {
-   /* won't block */
-   uvn_flush(uobj, 0, 0, PGO_DEACTIVATE|PGO_ALLPAGES);
-   goto out;
-   }
-
/* its a goner! */
uvn->u_flags |= UVM_VNODE_DYING;
 
@@ -382,7 +369,6 @@ uvn_detach(struct uvm_object *uobj)
/* wake up any sleepers */
if (oldflags & UVM_VNODE_WANTED)
wakeup(uvn);
-out:
rw_exit(uobj->vmobjlock);
 
/* drop our reference to the vnode. */
@@ -498,8 +484,8 @@ uvm_vnp_terminate(struct vnode *vp)
}
 
/*
-* done.   now we free the uvn if its reference count is zero
-* (true if we are zapping a persisting uvn).   however, if we are
+* done.   now we free the uvn if its reference count is zero.
+* however, if we are
 * terminating a uvn with active mappings we let it live ... future
 * calls down to the vnode layer will fail.
 */
@@ -507,14 +493,14 @@ uvm_vnp_terminate(struct vnode *vp)
if (uvn->u_obj.uo_refs) {
/*
 * uvn must live on it is dead-vnode state until all references
-* are gone.   restore flags.clear CANPERSIST state.
+* are gone.   restore flags.
 */
uvn->u_flags &= ~(UVM_VNODE_DYING|U
