The uvmexp.free* variables are read in uvm_pagealloc() & uvm_pglistalloc()
before grabbing the `fpageqlock'.

`uvmexp.free' is always modified by the pmemrange allocator under the
above mentioned lock.  To avoid races and the chance of missing a wakeup,
the diff below moves the checks inside the critical section.

Note that this doesn't solve the race with reading `freemin', `freetarg',
`inactive' and `inactarg'.  These are currently updated under another
lock and will be addressed in an upcoming diff.

To fix this race I introduced a new flag, UVM_PLA_USERESERVE.  Note that
these flags are now used by uvm_pmr_getpages(), so the comment has been
updated to reflect that.

Careful reviewers will spot an off-by-one change in the check for the
pagedaemon's reserved memory.  My understanding is that it's a bugfix --
is that correct?

I also started to document the locking for some of the `uvmexp' fields.
I'm well aware that reading `uvmexp.free' is done without the correct
lock in the pagedaemon, this will be addressed soon.  The other accesses
shouldn't matter much as they are not used to make decisions.

This should hopefully be good enough to make uvm_pagealloc() completely
mp-safe, which matters because it's called from pmap_enter(9) on some
architectures.

ok?

Index: uvm/uvm_extern.h
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_extern.h,v
retrieving revision 1.154
diff -u -p -r1.154 uvm_extern.h
--- uvm/uvm_extern.h    19 Oct 2020 08:19:46 -0000      1.154
+++ uvm/uvm_extern.h    13 Nov 2020 14:05:08 -0000
@@ -142,14 +142,15 @@ typedef int               vm_prot_t;
 #define        UVM_PGA_ZERO            0x0002  /* returned page must be zeroed 
*/
 
 /*
- * flags for uvm_pglistalloc()
+ * flags for uvm_pmr_getpages()
  */
 #define UVM_PLA_WAITOK         0x0001  /* may sleep */
 #define UVM_PLA_NOWAIT         0x0002  /* can't sleep (need one of the two) */
 #define UVM_PLA_ZERO           0x0004  /* zero all pages before returning */
 #define UVM_PLA_TRYCONTIG      0x0008  /* try to allocate contig physmem */
 #define UVM_PLA_FAILOK         0x0010  /* caller can handle failure */
-#define UVM_PLA_NOWAKE         0x0020  /* don't wake the page daemon on 
failure */
+#define UVM_PLA_NOWAKE         0x0020  /* don't wake page daemon on failure */
+#define UVM_PLA_USERESERVE     0x0040  /* can allocate from kernel reserve */
 
 /*
  * lockflags that control the locking behavior of various functions.
Index: uvm/uvm_page.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_page.c,v
retrieving revision 1.150
diff -u -p -r1.150 uvm_page.c
--- uvm/uvm_page.c      22 Sep 2020 14:31:08 -0000      1.150
+++ uvm/uvm_page.c      13 Nov 2020 13:52:25 -0000
@@ -733,32 +733,11 @@ uvm_pglistalloc(psize_t size, paddr_t lo
        size = atop(round_page(size));
 
        /*
-        * check to see if we need to generate some free pages waking
-        * the pagedaemon.
-        */
-       if ((uvmexp.free - BUFPAGES_DEFICIT) < uvmexp.freemin ||
-           ((uvmexp.free - BUFPAGES_DEFICIT) < uvmexp.freetarg &&
-           (uvmexp.inactive + BUFPAGES_INACT) < uvmexp.inactarg))
-               wakeup(&uvm.pagedaemon);
-
-       /*
         * XXX uvm_pglistalloc is currently only used for kernel
         * objects. Unlike the checks in uvm_pagealloc, below, here
-        * we are always allowed to use the kernel reserve. However, we
-        * have to enforce the pagedaemon reserve here or allocations
-        * via this path could consume everything and we can't
-        * recover in the page daemon.
+        * we are always allowed to use the kernel reserve.
         */
- again:
-       if ((uvmexp.free <= uvmexp.reserve_pagedaemon + size &&
-           !((curproc == uvm.pagedaemon_proc) ||
-               (curproc == syncerproc)))) {
-               if (flags & UVM_PLA_WAITOK) {
-                       uvm_wait("uvm_pglistalloc");
-                       goto again;
-               }
-               return (ENOMEM);
-       }
+       flags |= UVM_PLA_USERESERVE;
 
        if ((high & PAGE_MASK) != PAGE_MASK) {
                printf("uvm_pglistalloc: Upper boundary 0x%lx "
@@ -871,7 +850,7 @@ uvm_pagerealloc_multi(struct uvm_object 
 }
 
 /*
- * uvm_pagealloc_strat: allocate vm_page from a particular free list.
+ * uvm_pagealloc: allocate vm_page from a particular free list.
  *
  * => return null if no pages free
  * => wake up pagedaemon if number of free pages drops below low water mark
@@ -886,37 +865,16 @@ uvm_pagealloc(struct uvm_object *obj, vo
        struct vm_page *pg;
        struct pglist pgl;
        int pmr_flags;
-       boolean_t use_reserve;
 
        KASSERT(obj == NULL || anon == NULL);
+       KASSERT(anon == NULL || off == 0);
        KASSERT(off == trunc_page(off));
 
-       /*
-        * check to see if we need to generate some free pages waking
-        * the pagedaemon.
-        */
-       if ((uvmexp.free - BUFPAGES_DEFICIT) < uvmexp.freemin ||
-           ((uvmexp.free - BUFPAGES_DEFICIT) < uvmexp.freetarg &&
-           (uvmexp.inactive + BUFPAGES_INACT) < uvmexp.inactarg))
-               wakeup(&uvm.pagedaemon);
-
-       /*
-        * fail if any of these conditions is true:
-        * [1]  there really are no free pages, or
-        * [2]  only kernel "reserved" pages remain and
-        *        the page isn't being allocated to a kernel object.
-        * [3]  only pagedaemon "reserved" pages remain and
-        *        the requestor isn't the pagedaemon.
-        */
-       use_reserve = (flags & UVM_PGA_USERESERVE) ||
-               (obj && UVM_OBJ_IS_KERN_OBJECT(obj));
-       if ((uvmexp.free <= uvmexp.reserve_kernel && !use_reserve) ||
-           (uvmexp.free <= uvmexp.reserve_pagedaemon &&
-            !((curproc == uvm.pagedaemon_proc) ||
-             (curproc == syncerproc))))
-               goto fail;
-
        pmr_flags = UVM_PLA_NOWAIT;
+       if ((flags & UVM_PGA_USERESERVE) ||
+           (obj != NULL && UVM_OBJ_IS_KERN_OBJECT(obj)))
+               pmr_flags |= UVM_PLA_USERESERVE;
+
        if (flags & UVM_PGA_ZERO)
                pmr_flags |= UVM_PLA_ZERO;
        TAILQ_INIT(&pgl);
Index: uvm/uvm_pmemrange.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_pmemrange.c,v
retrieving revision 1.59
diff -u -p -r1.59 uvm_pmemrange.c
--- uvm/uvm_pmemrange.c 18 Feb 2020 12:13:40 -0000      1.59
+++ uvm/uvm_pmemrange.c 13 Nov 2020 13:48:14 -0000
@@ -869,6 +869,7 @@ uvm_pmr_getpages(psize_t count, paddr_t 
        KASSERT(boundary == 0 || powerof2(boundary));
        KASSERT(boundary == 0 || maxseg * boundary >= count);
        KASSERT(TAILQ_EMPTY(result));
+       KASSERT(!(flags & UVM_PLA_WAITOK) ^ !(flags & UVM_PLA_NOWAIT));
 
        /*
         * TRYCONTIG is a noop if you only want a single segment.
@@ -938,7 +939,41 @@ uvm_pmr_getpages(psize_t count, paddr_t 
         */
        desperate = 0;
 
+again:
        uvm_lock_fpageq();
+
+       /*
+        * check to see if we need to generate some free pages waking
+        * the pagedaemon.
+        */
+       if ((uvmexp.free - BUFPAGES_DEFICIT) < uvmexp.freemin ||
+           ((uvmexp.free - BUFPAGES_DEFICIT) < uvmexp.freetarg &&
+           (uvmexp.inactive + BUFPAGES_INACT) < uvmexp.inactarg))
+               wakeup(&uvm.pagedaemon);
+
+       /*
+        * fail if any of these conditions is true:
+        * [1]  there really are no free pages, or
+        * [2]  only kernel "reserved" pages remain and
+        *        the page isn't being allocated to a kernel object.
+        * [3]  only pagedaemon "reserved" pages remain and
+        *        the requestor isn't the pagedaemon nor the syncer.
+        */
+       if ((uvmexp.free <= uvmexp.reserve_kernel) &&
+           !(flags & UVM_PLA_USERESERVE)) {
+               uvm_unlock_fpageq();
+               return ENOMEM;
+       }
+
+       if ((uvmexp.free <= (uvmexp.reserve_pagedaemon + count)) &&
+           (curproc != uvm.pagedaemon_proc) && (curproc != syncerproc)) {
+               uvm_unlock_fpageq();
+               if (flags & UVM_PLA_WAITOK) {
+                       uvm_wait("uvm_pmr_getpages");
+                       goto again;
+               }
+               return ENOMEM;
+       }
 
 retry:         /* Return point after sleeping. */
        fcount = 0;
Index: uvm/uvmexp.h
===================================================================
RCS file: /cvs/src/sys/uvm/uvmexp.h,v
retrieving revision 1.5
diff -u -p -r1.5 uvmexp.h
--- uvm/uvmexp.h        23 Apr 2020 07:57:27 -0000      1.5
+++ uvm/uvmexp.h        13 Nov 2020 13:51:45 -0000
@@ -39,6 +39,10 @@
 /*
  * uvmexp: global data structures that are exported to parts of the kernel
  * other than the vm system.
+ *
+ *  Locks used to protect struct members in this file:
+ *     I       immutable after creation
+ *     F       uvm_lock_fpageq
  */
 struct uvmexp {
        /* vm_page constants */
@@ -47,16 +51,16 @@ struct uvmexp {
        int pageshift;  /* page shift */
 
        /* vm_page counters */
-       int npages;     /* number of pages we manage */
-       int free;       /* number of free pages */
+       int npages;     /* [I] number of pages we manage */
+       int free;       /* [F] number of free pages */
        int active;     /* number of active pages */
        int inactive;   /* number of pages that we free'd but may want back */
        int paging;     /* number of pages in the process of being paged out */
        int wired;      /* number of wired pages */
 
-       int zeropages;          /* number of zero'd pages */
-       int reserve_pagedaemon; /* number of pages reserved for pagedaemon */
-       int reserve_kernel;     /* number of pages reserved for kernel */
+       int zeropages;          /* [F] number of zero'd pages */
+       int reserve_pagedaemon; /* [I] # of pages reserved for pagedaemon */
+       int reserve_kernel;     /* [I] # of pages reserved for kernel */
        int unused01;           /* formerly anonpages */
        int vnodepages;         /* XXX # of pages used by vnode page cache */
        int vtextpages;         /* XXX # of pages used by vtext vnodes */

Reply via email to