> Date: Mon, 16 Nov 2020 10:11:50 -0300
> From: Martin Pieuchot <[email protected]>
>
> On 13/11/20(Fri) 21:05, Mark Kettenis wrote:
> > [...]
> > > Careful reviewers will spot an off-by-one change in the check for
> > > pagedaemon's reserved memory. My understanding is that it's a bugfix,
> > > is it correct?
> >
> > You mean for uvm_pagealloc(). I'd say yes. But this does mean that
> > in some sense the pagedaemon reserve grows by a page at the cost of
> > the kernel reserve. So I think it would be a good idea to bump the
> > kernel reserve a bit. Maybe to 8 pages?
>
> Fine with me. Could you do it?
I think it should be done as part of this diff.
> > > I also started to document the locking for some of the `uvmexp' fields.
> > > I'm well aware that reading `uvmexp.free' is done without the correct
> > > lock in the pagedaemon, this will be addressed soon. The other accesses
> > > shouldn't matter much as they are not used to make decisions.
> > >
> > > This should hopefully be good enough to make uvm_pagealloc() completely
> > > mp-safe, something that matters much because it's called from
> > > pmap_enter(9)
> > > on some architectures.
> > >
> > > ok?
> >
> > Some additional comments below...
>
> Updated diff addressing your comments below, ok?
Yes, looks good, but I'd like you to include the kernel reserve bump.
And maybe ping beck@ and ask him whether he is ok as well as this is
related to the buffer cache and page daemon.
> Index: uvm/uvm_extern.h
> ===================================================================
> RCS file: /cvs/src/sys/uvm/uvm_extern.h,v
> retrieving revision 1.154
> diff -u -p -r1.154 uvm_extern.h
> --- uvm/uvm_extern.h 19 Oct 2020 08:19:46 -0000 1.154
> +++ uvm/uvm_extern.h 16 Nov 2020 13:09:39 -0000
> @@ -142,14 +142,15 @@ typedef int vm_prot_t;
> #define UVM_PGA_ZERO 0x0002 /* returned page must be zeroed
> */
>
> /*
> - * flags for uvm_pglistalloc()
> + * flags for uvm_pglistalloc() also used by uvm_pmr_getpages()
> */
> #define UVM_PLA_WAITOK 0x0001 /* may sleep */
> #define UVM_PLA_NOWAIT 0x0002 /* can't sleep (need one of the
> two) */
> #define UVM_PLA_ZERO 0x0004 /* zero all pages before returning */
> #define UVM_PLA_TRYCONTIG 0x0008 /* try to allocate contig physmem */
> #define UVM_PLA_FAILOK 0x0010 /* caller can handle failure */
> -#define UVM_PLA_NOWAKE 0x0020 /* don't wake the page daemon
> on failure */
> +#define UVM_PLA_NOWAKE 0x0020 /* don't wake page daemon on
> failure */
> +#define UVM_PLA_USERESERVE 0x0040 /* can allocate from kernel reserve */
>
> /*
> * lockflags that control the locking behavior of various functions.
> Index: uvm/uvm_page.c
> ===================================================================
> RCS file: /cvs/src/sys/uvm/uvm_page.c,v
> retrieving revision 1.150
> diff -u -p -r1.150 uvm_page.c
> --- uvm/uvm_page.c 22 Sep 2020 14:31:08 -0000 1.150
> +++ uvm/uvm_page.c 16 Nov 2020 13:11:19 -0000
> @@ -733,32 +733,11 @@ uvm_pglistalloc(psize_t size, paddr_t lo
> size = atop(round_page(size));
>
> /*
> - * check to see if we need to generate some free pages waking
> - * the pagedaemon.
> - */
> - if ((uvmexp.free - BUFPAGES_DEFICIT) < uvmexp.freemin ||
> - ((uvmexp.free - BUFPAGES_DEFICIT) < uvmexp.freetarg &&
> - (uvmexp.inactive + BUFPAGES_INACT) < uvmexp.inactarg))
> - wakeup(&uvm.pagedaemon);
> -
> - /*
> * XXX uvm_pglistalloc is currently only used for kernel
> * objects. Unlike the checks in uvm_pagealloc, below, here
> - * we are always allowed to use the kernel reserve. However, we
> - * have to enforce the pagedaemon reserve here or allocations
> - * via this path could consume everything and we can't
> - * recover in the page daemon.
> + * we are always allowed to use the kernel reserve.
> */
> - again:
> - if ((uvmexp.free <= uvmexp.reserve_pagedaemon + size &&
> - !((curproc == uvm.pagedaemon_proc) ||
> - (curproc == syncerproc)))) {
> - if (flags & UVM_PLA_WAITOK) {
> - uvm_wait("uvm_pglistalloc");
> - goto again;
> - }
> - return (ENOMEM);
> - }
> + flags |= UVM_PLA_USERESERVE;
>
> if ((high & PAGE_MASK) != PAGE_MASK) {
> printf("uvm_pglistalloc: Upper boundary 0x%lx "
> @@ -871,7 +850,7 @@ uvm_pagerealloc_multi(struct uvm_object
> }
>
> /*
> - * uvm_pagealloc_strat: allocate vm_page from a particular free list.
> + * uvm_pagealloc: allocate vm_page from a particular free list.
> *
> * => return null if no pages free
> * => wake up pagedaemon if number of free pages drops below low water mark
> @@ -886,37 +865,21 @@ uvm_pagealloc(struct uvm_object *obj, vo
> struct vm_page *pg;
> struct pglist pgl;
> int pmr_flags;
> - boolean_t use_reserve;
>
> KASSERT(obj == NULL || anon == NULL);
> + KASSERT(anon == NULL || off == 0);
> KASSERT(off == trunc_page(off));
>
> - /*
> - * check to see if we need to generate some free pages waking
> - * the pagedaemon.
> - */
> - if ((uvmexp.free - BUFPAGES_DEFICIT) < uvmexp.freemin ||
> - ((uvmexp.free - BUFPAGES_DEFICIT) < uvmexp.freetarg &&
> - (uvmexp.inactive + BUFPAGES_INACT) < uvmexp.inactarg))
> - wakeup(&uvm.pagedaemon);
> + pmr_flags = UVM_PLA_NOWAIT;
>
> /*
> - * fail if any of these conditions is true:
> - * [1] there really are no free pages, or
> - * [2] only kernel "reserved" pages remain and
> - * the page isn't being allocated to a kernel object.
> - * [3] only pagedaemon "reserved" pages remain and
> - * the requestor isn't the pagedaemon.
> + * We're allowed to use the kernel reserve if the page is
> + * being allocated to a kernel object.
> */
> - use_reserve = (flags & UVM_PGA_USERESERVE) ||
> - (obj && UVM_OBJ_IS_KERN_OBJECT(obj));
> - if ((uvmexp.free <= uvmexp.reserve_kernel && !use_reserve) ||
> - (uvmexp.free <= uvmexp.reserve_pagedaemon &&
> - !((curproc == uvm.pagedaemon_proc) ||
> - (curproc == syncerproc))))
> - goto fail;
> + if ((flags & UVM_PGA_USERESERVE) ||
> + (obj != NULL && UVM_OBJ_IS_KERN_OBJECT(obj)))
> + pmr_flags |= UVM_PLA_USERESERVE;
>
> - pmr_flags = UVM_PLA_NOWAIT;
> if (flags & UVM_PGA_ZERO)
> pmr_flags |= UVM_PLA_ZERO;
> TAILQ_INIT(&pgl);
> Index: uvm/uvm_pmemrange.c
> ===================================================================
> RCS file: /cvs/src/sys/uvm/uvm_pmemrange.c,v
> retrieving revision 1.59
> diff -u -p -r1.59 uvm_pmemrange.c
> --- uvm/uvm_pmemrange.c 18 Feb 2020 12:13:40 -0000 1.59
> +++ uvm/uvm_pmemrange.c 16 Nov 2020 13:03:51 -0000
> @@ -869,6 +869,7 @@ uvm_pmr_getpages(psize_t count, paddr_t
> KASSERT(boundary == 0 || powerof2(boundary));
> KASSERT(boundary == 0 || maxseg * boundary >= count);
> KASSERT(TAILQ_EMPTY(result));
> + KASSERT(!(flags & UVM_PLA_WAITOK) ^ !(flags & UVM_PLA_NOWAIT));
>
> /*
> * TRYCONTIG is a noop if you only want a single segment.
> @@ -938,7 +939,41 @@ uvm_pmr_getpages(psize_t count, paddr_t
> */
> desperate = 0;
>
> +again:
> uvm_lock_fpageq();
> +
> + /*
> + * check to see if we need to generate some free pages waking
> + * the pagedaemon.
> + */
> + if ((uvmexp.free - BUFPAGES_DEFICIT) < uvmexp.freemin ||
> + ((uvmexp.free - BUFPAGES_DEFICIT) < uvmexp.freetarg &&
> + (uvmexp.inactive + BUFPAGES_INACT) < uvmexp.inactarg))
> + wakeup(&uvm.pagedaemon);
> +
> + /*
> + * fail if any of these conditions is true:
> + * [1] there really are no free pages, or
> + * [2] only kernel "reserved" pages remain and
> + * the UVM_PLA_USERESERVE flag wasn't used.
> + * [3] only pagedaemon "reserved" pages remain and
> + * the requestor isn't the pagedaemon nor the syncer.
> + */
> + if ((uvmexp.free <= uvmexp.reserve_kernel) &&
> + !(flags & UVM_PLA_USERESERVE)) {
> + uvm_unlock_fpageq();
> + return ENOMEM;
> + }
> +
> + if ((uvmexp.free <= (uvmexp.reserve_pagedaemon + count)) &&
> + (curproc != uvm.pagedaemon_proc) && (curproc != syncerproc)) {
> + uvm_unlock_fpageq();
> + if (flags & UVM_PLA_WAITOK) {
> + uvm_wait("uvm_pmr_getpages");
> + goto again;
> + }
> + return ENOMEM;
> + }
>
> retry: /* Return point after sleeping. */
> fcount = 0;
> Index: uvm/uvmexp.h
> ===================================================================
> RCS file: /cvs/src/sys/uvm/uvmexp.h,v
> retrieving revision 1.5
> diff -u -p -r1.5 uvmexp.h
> --- uvm/uvmexp.h 23 Apr 2020 07:57:27 -0000 1.5
> +++ uvm/uvmexp.h 13 Nov 2020 13:51:45 -0000
> @@ -39,6 +39,10 @@
> /*
> * uvmexp: global data structures that are exported to parts of the kernel
> * other than the vm system.
> + *
> + * Locks used to protect struct members in this file:
> + * I immutable after creation
> + * F uvm_lock_fpageq
> */
> struct uvmexp {
> /* vm_page constants */
> @@ -47,16 +51,16 @@ struct uvmexp {
> int pageshift; /* page shift */
>
> /* vm_page counters */
> - int npages; /* number of pages we manage */
> - int free; /* number of free pages */
> + int npages; /* [I] number of pages we manage */
> + int free; /* [F] number of free pages */
> int active; /* number of active pages */
> int inactive; /* number of pages that we free'd but may want back */
> int paging; /* number of pages in the process of being paged out */
> int wired; /* number of wired pages */
>
> - int zeropages; /* number of zero'd pages */
> - int reserve_pagedaemon; /* number of pages reserved for pagedaemon */
> - int reserve_kernel; /* number of pages reserved for kernel */
> + int zeropages; /* [F] number of zero'd pages */
> + int reserve_pagedaemon; /* [I] # of pages reserved for pagedaemon */
> + int reserve_kernel; /* [I] # of pages reserved for kernel */
> int unused01; /* formerly anonpages */
> int vnodepages; /* XXX # of pages used by vnode page cache */
> int vtextpages; /* XXX # of pages used by vtext vnodes */
>