On 27/06/22(Mon) 15:44, Martin Pieuchot wrote:
> The diff below contains 3 parts that can be committed independently.  All
> 3 of them are necessary to allow the pagedaemon to make progress in OOM
> situations and to satisfy all the allocations waiting for pages in
> specific ranges.
> 
> * The uvm/uvm_pager.c part reserves a second segment for the page daemon.
>   This is necessary to ensure that the two uvm_pagermapin() calls needed
>   by uvm_swap_io() succeed in an emergency OOM situation.  (The 2nd
>   segment is needed when encryption or bouncing is required.)
> 
> * The uvm/uvm_swap.c part pre-allocates 16 pages in the DMA-reachable
>   region for the same reason.  Note that a sleeping point is introduced
>   because the pagedaemon is faster than the asynchronous I/O and in OOM
>   situations it tends to stay busy building clusters that it then discards
>   because no memory is available.
> 
> * The uvm/uvm_pdaemon.c part changes the inner loop scanning the inactive
>   list of pages to account for a given memory range.  Without this the
>   daemon could spin forever doing nothing because the global limits have
>   already been reached.
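
To make the last point a bit more concrete, here is a minimal sketch of the
range check that the uvm_pdaemon.c part introduces.  The helper name below is
made up for illustration only; the diff open-codes the same test directly in
uvmpd_scan_inactive():

/*
 * Sketch only: does a page fall inside the physical range that the
 * waiting allocation (pma) is constrained to?
 */
static inline int
uvmpd_match_constraint(struct uvm_pmalloc *pma, struct vm_page *pg)
{
        paddr_t paddr = atop(VM_PAGE_TO_PHYS(pg));

        return (paddr >= pma->pm_constraint.ucr_low &&
            paddr < pma->pm_constraint.ucr_high);
}

With that, the inner loop starts at the first inactive page matching the
constraint and keeps scanning past the global free target until the waiting
allocation has been satisfied (UVM_PMA_FREED).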

Here's an updated diff with a fix on top:

 * in uvm/uvm_swap.c, make sure uvm_swap_allocpages() is allowed to sleep
   when coming from uvm_fault().  This makes the faulting process wait
   instead of dying when there aren't any free pages to do the bouncing.
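
   A minimal sketch of the change (not a verbatim copy of the diff): in the
   bounce path of uvm_swap_io() the flag passed to uvm_swap_allocpages() is
   now derived from the I/O mode, so a synchronous pagein coming from
   uvm_fault() may sleep for free pages, while asynchronous pageouts keep the
   nowait behaviour and, when issued by the pagedaemon, fall back to the
   pre-allocated reserve of SWCLUSTPAGES pages (the 16 pages mentioned above,
   assuming 4K pages):

       plaflags = async ? UVM_PLA_NOWAIT : UVM_PLA_WAITOK;
       if (uvm_swap_allocpages(tpps, npages, plaflags)) {
               /* No bounce pages available: unwind, let the caller retry. */
               pool_put(&bufpool, bp);
               uvm_pagermapout(kva, npages);
               return (VM_PAGER_AGAIN);
       }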

I'd appreciate more reviews and tests!

Index: uvm/uvm_pager.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_pager.c,v
retrieving revision 1.80
diff -u -p -r1.80 uvm_pager.c
--- uvm/uvm_pager.c     28 Jun 2022 12:10:37 -0000      1.80
+++ uvm/uvm_pager.c     28 Jun 2022 15:25:30 -0000
@@ -58,8 +58,8 @@ const struct uvm_pagerops *uvmpagerops[]
  * The number of uvm_pseg instances is dynamic using an array segs.
  * At most UVM_PSEG_COUNT instances can exist.
  *
- * psegs[0] always exists (so that the pager can always map in pages).
- * psegs[0] element 0 is always reserved for the pagedaemon.
+ * psegs[0/1] always exist (so that the pager can always map in pages).
+ * psegs[0/1] element 0 is always reserved for the pagedaemon.
  *
  * Any other pseg is automatically created when no space is available
  * and automatically destroyed when it is no longer in use.
@@ -93,6 +93,7 @@ uvm_pager_init(void)
 
        /* init pager map */
        uvm_pseg_init(&psegs[0]);
+       uvm_pseg_init(&psegs[1]);
        mtx_init(&uvm_pseg_lck, IPL_VM);
 
        /* init ASYNC I/O queue */
@@ -168,9 +169,10 @@ pager_seg_restart:
                                goto pager_seg_fail;
                }
 
-               /* Keep index 0 reserved for pagedaemon. */
-               if (pseg == &psegs[0] && curproc != uvm.pagedaemon_proc)
-                       i = 1;
+               /* Keep indexes 0,1 reserved for pagedaemon. */
+               if ((pseg == &psegs[0] || pseg == &psegs[1]) &&
+                   (curproc != uvm.pagedaemon_proc))
+                       i = 2;
                else
                        i = 0;
 
@@ -229,7 +231,7 @@ uvm_pseg_release(vaddr_t segaddr)
        pseg->use &= ~(1 << id);
        wakeup(&psegs);
 
-       if (pseg != &psegs[0] && UVM_PSEG_EMPTY(pseg)) {
+       if ((pseg != &psegs[0] && pseg != &psegs[1]) && UVM_PSEG_EMPTY(pseg)) {
                va = pseg->start;
                pseg->start = 0;
        }
Index: uvm/uvm_pdaemon.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_pdaemon.c,v
retrieving revision 1.99
diff -u -p -r1.99 uvm_pdaemon.c
--- uvm/uvm_pdaemon.c   12 May 2022 12:49:31 -0000      1.99
+++ uvm/uvm_pdaemon.c   28 Jun 2022 13:59:49 -0000
@@ -101,8 +101,8 @@ extern void drmbackoff(long);
  * local prototypes
  */
 
-void           uvmpd_scan(void);
-boolean_t      uvmpd_scan_inactive(struct pglist *);
+void           uvmpd_scan(struct uvm_pmalloc *);
+boolean_t      uvmpd_scan_inactive(struct uvm_pmalloc *, struct pglist *);
 void           uvmpd_tune(void);
 void           uvmpd_drop(struct pglist *);
 
@@ -281,7 +281,7 @@ uvm_pageout(void *arg)
                if (pma != NULL ||
                    ((uvmexp.free - BUFPAGES_DEFICIT) < uvmexp.freetarg) ||
                    ((uvmexp.inactive + BUFPAGES_INACT) < uvmexp.inactarg)) {
-                       uvmpd_scan();
+                       uvmpd_scan(pma);
                }
 
                /*
@@ -379,15 +379,15 @@ uvm_aiodone_daemon(void *arg)
  */
 
 boolean_t
-uvmpd_scan_inactive(struct pglist *pglst)
+uvmpd_scan_inactive(struct uvm_pmalloc *pma, struct pglist *pglst)
 {
        boolean_t retval = FALSE;       /* assume we haven't hit target */
        int free, result;
        struct vm_page *p, *nextpg;
        struct uvm_object *uobj;
-       struct vm_page *pps[MAXBSIZE >> PAGE_SHIFT], **ppsp;
+       struct vm_page *pps[SWCLUSTPAGES], **ppsp;
        int npages;
-       struct vm_page *swpps[MAXBSIZE >> PAGE_SHIFT];  /* XXX: see below */
+       struct vm_page *swpps[SWCLUSTPAGES];    /* XXX: see below */
        int swnpages, swcpages;                         /* XXX: see below */
        int swslot;
        struct vm_anon *anon;
@@ -404,8 +404,27 @@ uvmpd_scan_inactive(struct pglist *pglst
        swnpages = swcpages = 0;
        free = 0;
        dirtyreacts = 0;
+       p = NULL;
 
-       for (p = TAILQ_FIRST(pglst); p != NULL || swslot != 0; p = nextpg) {
+       /* Start with the first page on the list that fits in pma's ranges. */
+       if (pma != NULL) {
+               paddr_t paddr;
+
+               TAILQ_FOREACH(p, pglst, pageq) {
+                       paddr = atop(VM_PAGE_TO_PHYS(p));
+                       if (paddr >= pma->pm_constraint.ucr_low &&
+                           paddr < pma->pm_constraint.ucr_high)
+                               break;
+               }
+
+       }
+
+       if (p == NULL) {
+               p = TAILQ_FIRST(pglst);
+               pma = NULL;
+       }
+
+       for (; p != NULL || swslot != 0; p = nextpg) {
                /*
                 * note that p can be NULL iff we have traversed the whole
                 * list and need to do one final swap-backed clustered pageout.
@@ -419,8 +438,8 @@ uvmpd_scan_inactive(struct pglist *pglst
                         * our target
                         */
                        free = uvmexp.free - BUFPAGES_DEFICIT;
-
-                       if (free + uvmexp.paging >= uvmexp.freetarg << 2 ||
+                       if (((pma == NULL || (pma->pm_flags & UVM_PMA_FREED)) &&
+                           (free + uvmexp.paging >= uvmexp.freetarg << 2)) ||
                            dirtyreacts == UVMPD_NUMDIRTYREACTS) {
                                retval = TRUE;
 
@@ -531,7 +550,8 @@ uvmpd_scan_inactive(struct pglist *pglst
                         * this page is dirty, skip it if we'll have met our
                         * free target when all the current pageouts complete.
                         */
-                       if (free + uvmexp.paging > uvmexp.freetarg << 2) {
+                       if ((pma == NULL || (pma->pm_flags & UVM_PMA_FREED)) &&
+                           (free + uvmexp.paging > uvmexp.freetarg << 2)) {
                                if (anon) {
                                        rw_exit(anon->an_lock);
                                } else {
@@ -611,7 +631,7 @@ uvmpd_scan_inactive(struct pglist *pglst
 
                                /* start new cluster (if necessary) */
                                if (swslot == 0) {
-                                       swnpages = MAXBSIZE >> PAGE_SHIFT;
+                                       swnpages = SWCLUSTPAGES;
                                        swslot = uvm_swap_alloc(&swnpages,
                                            TRUE);
                                        if (swslot == 0) {
@@ -867,7 +887,7 @@ uvmpd_scan_inactive(struct pglist *pglst
  */
 
 void
-uvmpd_scan(void)
+uvmpd_scan(struct uvm_pmalloc *pma)
 {
        int free, inactive_shortage, swap_shortage, pages_freed;
        struct vm_page *p, *nextpg;
@@ -910,7 +930,7 @@ uvmpd_scan(void)
         * low bit of uvmexp.pdrevs (which we bump by one each call).
         */
        pages_freed = uvmexp.pdfreed;
-       (void) uvmpd_scan_inactive(&uvm.page_inactive);
+       (void) uvmpd_scan_inactive(pma, &uvm.page_inactive);
        pages_freed = uvmexp.pdfreed - pages_freed;
 
        /*
Index: uvm/uvm_swap.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_swap.c,v
retrieving revision 1.156
diff -u -p -r1.156 uvm_swap.c
--- uvm/uvm_swap.c      7 Jun 2022 12:02:52 -0000       1.156
+++ uvm/uvm_swap.c      28 Jun 2022 15:16:58 -0000
@@ -213,6 +213,10 @@ struct swap_priority swap_priority;
 /* locks */
 struct rwlock swap_syscall_lock = RWLOCK_INITIALIZER("swplk");
 
+struct mutex oommtx = MUTEX_INITIALIZER(IPL_VM);
+struct vm_page *oompps[SWCLUSTPAGES];
+int oom = 0;
+
 /*
  * prototypes
  */
@@ -235,7 +239,7 @@ void sw_reg_start(struct swapdev *);
 int uvm_swap_io(struct vm_page **, int, int, int);
 
 void swapmount(void);
-boolean_t uvm_swap_allocpages(struct vm_page **, int);
+int uvm_swap_allocpages(struct vm_page **, int, int);
 
 #ifdef UVM_SWAP_ENCRYPT
 /* for swap encrypt */
@@ -253,6 +257,8 @@ void uvm_swap_initcrypt(struct swapdev *
 void
 uvm_swap_init(void)
 {
+       int error;
+
        /*
         * first, init the swap list, its counter, and its lock.
         * then get a handle on the vnode for /dev/drum by using
@@ -281,6 +287,10 @@ uvm_swap_init(void)
        pool_init(&vndbuf_pool, sizeof(struct vndbuf), 0, IPL_BIO, 0,
            "swp vnd", NULL);
 
+       /* allocate pages for OOM situations. */
+       error = uvm_swap_allocpages(oompps, SWCLUSTPAGES, UVM_PLA_NOWAIT);
+       KASSERT(error == 0);
+
        /* Setup the initial swap partition */
        swapmount();
 }
@@ -323,16 +333,35 @@ uvm_swap_initcrypt(struct swapdev *sdp, 
 
 #endif /* UVM_SWAP_ENCRYPT */
 
-boolean_t
-uvm_swap_allocpages(struct vm_page **pps, int npages)
+int
+uvm_swap_allocpages(struct vm_page **pps, int npages, int flags)
 {
        struct pglist   pgl;
-       int i;
+       int error, i;
+
+       KASSERT(npages <= SWCLUSTPAGES);
 
        TAILQ_INIT(&pgl);
-       if (uvm_pglistalloc(npages * PAGE_SIZE, dma_constraint.ucr_low,
-           dma_constraint.ucr_high, 0, 0, &pgl, npages, UVM_PLA_NOWAIT))
-               return FALSE;
+again:
+       error = uvm_pglistalloc(npages * PAGE_SIZE, dma_constraint.ucr_low,
+           dma_constraint.ucr_high, 0, 0, &pgl, npages, flags);
+       if (error && (curproc == uvm.pagedaemon_proc)) {
+               mtx_enter(&oommtx);
+               if (oom) {
+                       msleep_nsec(&oom, &oommtx, PVM | PNORELOCK,
+                        "oom", INFSLP);
+                       goto again;
+               }
+               oom = 1;
+               for (i = 0; i < npages; i++) {
+                       pps[i] = oompps[i];
+                       atomic_setbits_int(&pps[i]->pg_flags, PG_BUSY);
+               }
+               mtx_leave(&oommtx);
+               return 0;
+       }
+       if (error)
+               return error;
 
        for (i = 0; i < npages; i++) {
                pps[i] = TAILQ_FIRST(&pgl);
@@ -341,7 +370,7 @@ uvm_swap_allocpages(struct vm_page **pps
                TAILQ_REMOVE(&pgl, pps[i], pageq);
        }
 
-       return TRUE;
+       return 0;
 }
 
 void
@@ -349,10 +378,23 @@ uvm_swap_freepages(struct vm_page **pps,
 {
        int i;
 
+       if (pps[0] == oompps[0]) {
+               for (i = 0; i < npages; i++)
+                       uvm_pageclean(pps[i]);
+
+               mtx_enter(&oommtx);
+               KASSERT(oom == 1);
+               oom = 0;
+               mtx_leave(&oommtx);
+               wakeup(&oom);
+               return;
+       }
+
        uvm_lock_pageq();
        for (i = 0; i < npages; i++)
                uvm_pagefree(pps[i]);
        uvm_unlock_pageq();
+
 }
 
 #ifdef UVM_SWAP_ENCRYPT
@@ -1587,7 +1629,8 @@ uvm_swap_io(struct vm_page **pps, int st
        int     result, s, mapinflags, pflag, bounce = 0, i;
        boolean_t write, async;
        vaddr_t bouncekva;
-       struct vm_page *tpps[MAXBSIZE >> PAGE_SHIFT];
+       struct vm_page *tpps[SWCLUSTPAGES];
+       int pdaemon = (curproc == uvm.pagedaemon_proc);
 #ifdef UVM_SWAP_ENCRYPT
        struct swapdev *sdp;
        int     encrypt = 0;
@@ -1601,16 +1644,23 @@ uvm_swap_io(struct vm_page **pps, int st
        /* convert starting drum slot to block number */
        startblk = btodb((u_int64_t)startslot << PAGE_SHIFT);
 
+       pflag = (async || pdaemon) ? PR_NOWAIT : PR_WAITOK;
+       bp = pool_get(&bufpool, pflag | PR_ZERO);
+       if (bp == NULL)
+               return (VM_PAGER_AGAIN);
+
        /*
-        * first, map the pages into the kernel (XXX: currently required
+        * map the pages into the kernel (XXX: currently required
         * by buffer system).
         */
        mapinflags = !write ? UVMPAGER_MAPIN_READ : UVMPAGER_MAPIN_WRITE;
        if (!async)
                mapinflags |= UVMPAGER_MAPIN_WAITOK;
        kva = uvm_pagermapin(pps, npages, mapinflags);
-       if (kva == 0)
+       if (kva == 0) {
+               pool_put(&bufpool, bp);
                return (VM_PAGER_AGAIN);
+       }
 
 #ifdef UVM_SWAP_ENCRYPT
        if (write) {
@@ -1658,46 +1708,31 @@ uvm_swap_io(struct vm_page **pps, int st
        }
 
        if (bounce)  {
-               int swmapflags;
+               int swmapflags, plaflags;
 
                /* We always need write access. */
                swmapflags = UVMPAGER_MAPIN_READ;
-               if (!async)
+               plaflags = UVM_PLA_NOWAIT;
+               if (!async) {
                        swmapflags |= UVMPAGER_MAPIN_WAITOK;
-
-               if (!uvm_swap_allocpages(tpps, npages)) {
+                       plaflags = UVM_PLA_WAITOK;
+               }
+               if (uvm_swap_allocpages(tpps, npages, plaflags)) {
+                       pool_put(&bufpool, bp);
                        uvm_pagermapout(kva, npages);
                        return (VM_PAGER_AGAIN);
                }
 
                bouncekva = uvm_pagermapin(tpps, npages, swmapflags);
                if (bouncekva == 0) {
+                       KASSERT(tpps[0] != oompps[0]);
+                       pool_put(&bufpool, bp);
                        uvm_pagermapout(kva, npages);
                        uvm_swap_freepages(tpps, npages);
                        return (VM_PAGER_AGAIN);
                }
        }
 
-       /*
-        * now allocate a buf for the i/o.
-        * [make sure we don't put the pagedaemon to sleep...]
-        */
-       pflag = (async || curproc == uvm.pagedaemon_proc) ? PR_NOWAIT :
-           PR_WAITOK;
-       bp = pool_get(&bufpool, pflag | PR_ZERO);
-
-       /*
-        * if we failed to get a swapbuf, return "try again"
-        */
-       if (bp == NULL) {
-               if (bounce) {
-                       uvm_pagermapout(bouncekva, npages);
-                       uvm_swap_freepages(tpps, npages);
-               }
-               uvm_pagermapout(kva, npages);
-               return (VM_PAGER_AGAIN);
-       }
-
        /* encrypt to swap */
        if (write && bounce) {
                int i, opages;
@@ -1789,8 +1824,7 @@ uvm_swap_io(struct vm_page **pps, int st
 
        /* for async ops we must set up the iodone handler. */
        if (async) {
-               bp->b_flags |= B_CALL | (curproc == uvm.pagedaemon_proc ?
-                                        B_PDAEMON : 0);
+               bp->b_flags |= B_CALL | (pdaemon ? B_PDAEMON : 0);
                bp->b_iodone = uvm_aio_biodone;
        }
 
Index: uvm/uvm_swap.h
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_swap.h,v
retrieving revision 1.18
diff -u -p -r1.18 uvm_swap.h
--- uvm/uvm_swap.h      29 Sep 2020 11:47:41 -0000      1.18
+++ uvm/uvm_swap.h      28 Jun 2022 13:59:49 -0000
@@ -32,6 +32,7 @@
 #ifndef _UVM_UVM_SWAP_H_
 #define _UVM_UVM_SWAP_H_
 
+#define        SWCLUSTPAGES    (MAXBSIZE >> PAGE_SHIFT)
 #define        SWSLOT_BAD      (-1)
 
 #ifdef _KERNEL
