The diff below contains 3 parts that can be committed independently.  All
3 of them are necessary to allow the pagedaemon to make progress in OOM
situations and to satisfy all the allocations waiting for pages in
specific ranges.

* uvm/uvm_pager.c part reserves a second segment for the page daemon.
  This is necessary to ensure the two uvm_pagermapin() calls needed by
  uvm_swap_io() succeed in emergency OOM situation.  (the 2nd segment is
  necessary when encryption or bouncing is required)

* uvm/uvm_swap.c part pre-allocates 16 pages in the DMA-reachable region
  for the same reason.  Note that a sleeping point is introduced because
  the pagedaemon is faster than the asynchronous I/O and in OOM
  situations it tends to stay busy building clusters that it then
  discards because no memory is available.

* uvm/uvm_pdaemon.c part changes the inner loop scanning the inactive
  list of pages to account for a given memory range.  Without this the
  daemon could spin infinitely doing nothing because the global limits
  are reached.

A lot could be improved, but this at least makes swapping work in OOM
situations.

Index: uvm/uvm_pager.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_pager.c,v
retrieving revision 1.78
diff -u -p -r1.78 uvm_pager.c
--- uvm/uvm_pager.c     18 Feb 2022 09:04:38 -0000      1.78
+++ uvm/uvm_pager.c     27 Jun 2022 08:44:41 -0000
@@ -58,8 +58,8 @@ const struct uvm_pagerops *uvmpagerops[]
  * The number of uvm_pseg instances is dynamic using an array segs.
  * At most UVM_PSEG_COUNT instances can exist.
  *
- * psegs[0] always exists (so that the pager can always map in pages).
- * psegs[0] element 0 is always reserved for the pagedaemon.
+ * psegs[0/1] always exist (so that the pager can always map in pages).
+ * psegs[0/1] element 0 are always reserved for the pagedaemon.
  *
  * Any other pseg is automatically created when no space is available
  * and automatically destroyed when it is no longer in use.
@@ -93,6 +93,7 @@ uvm_pager_init(void)
 
        /* init pager map */
        uvm_pseg_init(&psegs[0]);
+       uvm_pseg_init(&psegs[1]);
        mtx_init(&uvm_pseg_lck, IPL_VM);
 
        /* init ASYNC I/O queue */
@@ -168,9 +169,10 @@ pager_seg_restart:
                                goto pager_seg_fail;
                }
 
-               /* Keep index 0 reserved for pagedaemon. */
-               if (pseg == &psegs[0] && curproc != uvm.pagedaemon_proc)
-                       i = 1;
+               /* Keep indexes 0,1 reserved for pagedaemon. */
+               if ((pseg == &psegs[0] || pseg == &psegs[1]) &&
+                   (curproc != uvm.pagedaemon_proc))
+                       i = 2;
                else
                        i = 0;
 
@@ -229,7 +231,7 @@ uvm_pseg_release(vaddr_t segaddr)
        pseg->use &= ~(1 << id);
        wakeup(&psegs);
 
-       if (pseg != &psegs[0] && UVM_PSEG_EMPTY(pseg)) {
+       if ((pseg != &psegs[0] && pseg != &psegs[1]) && UVM_PSEG_EMPTY(pseg)) {
                va = pseg->start;
                pseg->start = 0;
        }
Index: uvm/uvm_pdaemon.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_pdaemon.c,v
retrieving revision 1.99
diff -u -p -r1.99 uvm_pdaemon.c
--- uvm/uvm_pdaemon.c   12 May 2022 12:49:31 -0000      1.99
+++ uvm/uvm_pdaemon.c   27 Jun 2022 13:24:54 -0000
@@ -101,8 +101,8 @@ extern void drmbackoff(long);
  * local prototypes
  */
 
-void           uvmpd_scan(void);
-boolean_t      uvmpd_scan_inactive(struct pglist *);
+void           uvmpd_scan(struct uvm_pmalloc *);
+boolean_t      uvmpd_scan_inactive(struct uvm_pmalloc *, struct pglist *);
 void           uvmpd_tune(void);
 void           uvmpd_drop(struct pglist *);
 
@@ -281,7 +281,7 @@ uvm_pageout(void *arg)
                if (pma != NULL ||
                    ((uvmexp.free - BUFPAGES_DEFICIT) < uvmexp.freetarg) ||
                    ((uvmexp.inactive + BUFPAGES_INACT) < uvmexp.inactarg)) {
-                       uvmpd_scan();
+                       uvmpd_scan(pma);
                }
 
                /*
@@ -379,15 +379,15 @@ uvm_aiodone_daemon(void *arg)
  */
 
 boolean_t
-uvmpd_scan_inactive(struct pglist *pglst)
+uvmpd_scan_inactive(struct uvm_pmalloc *pma, struct pglist *pglst)
 {
        boolean_t retval = FALSE;       /* assume we haven't hit target */
        int free, result;
        struct vm_page *p, *nextpg;
        struct uvm_object *uobj;
-       struct vm_page *pps[MAXBSIZE >> PAGE_SHIFT], **ppsp;
+       struct vm_page *pps[SWCLUSTPAGES], **ppsp;
        int npages;
-       struct vm_page *swpps[MAXBSIZE >> PAGE_SHIFT];  /* XXX: see below */
+       struct vm_page *swpps[SWCLUSTPAGES];    /* XXX: see below */
        int swnpages, swcpages;                         /* XXX: see below */
        int swslot;
        struct vm_anon *anon;
@@ -404,8 +404,27 @@ uvmpd_scan_inactive(struct pglist *pglst
        swnpages = swcpages = 0;
        free = 0;
        dirtyreacts = 0;
+       p = NULL;
 
-       for (p = TAILQ_FIRST(pglst); p != NULL || swslot != 0; p = nextpg) {
+       /* Start with the first page on the list that fit in pma's ranges */
+       if (pma != NULL) {
+               paddr_t paddr;
+
+               TAILQ_FOREACH(p, pglst, pageq) {
+                       paddr = atop(VM_PAGE_TO_PHYS(p));
+                       if (paddr >= pma->pm_constraint.ucr_low &&
+                           paddr < pma->pm_constraint.ucr_high)
+                               break;
+               }
+
+       }
+
+       if (p == NULL) {
+               p = TAILQ_FIRST(pglst);
+               pma = NULL;
+       }
+
+       for (; p != NULL || swslot != 0; p = nextpg) {
                /*
                 * note that p can be NULL iff we have traversed the whole
                 * list and need to do one final swap-backed clustered pageout.
@@ -419,8 +438,8 @@ uvmpd_scan_inactive(struct pglist *pglst
                         * our target
                         */
                        free = uvmexp.free - BUFPAGES_DEFICIT;
-
-                       if (free + uvmexp.paging >= uvmexp.freetarg << 2 ||
+                       if (((pma == NULL || (pma->pm_flags & UVM_PMA_FREED)) &&
+                           (free + uvmexp.paging >= uvmexp.freetarg << 2)) ||
                            dirtyreacts == UVMPD_NUMDIRTYREACTS) {
                                retval = TRUE;
 
@@ -531,7 +550,8 @@ uvmpd_scan_inactive(struct pglist *pglst
                         * this page is dirty, skip it if we'll have met our
                         * free target when all the current pageouts complete.
                         */
-                       if (free + uvmexp.paging > uvmexp.freetarg << 2) {
+                       if ((pma == NULL || (pma->pm_flags & UVM_PMA_FREED)) &&
+                           (free + uvmexp.paging > uvmexp.freetarg << 2)) {
                                if (anon) {
                                        rw_exit(anon->an_lock);
                                } else {
@@ -611,7 +631,7 @@ uvmpd_scan_inactive(struct pglist *pglst
 
                                /* start new cluster (if necessary) */
                                if (swslot == 0) {
-                                       swnpages = MAXBSIZE >> PAGE_SHIFT;
+                                       swnpages = SWCLUSTPAGES;
                                        swslot = uvm_swap_alloc(&swnpages,
                                            TRUE);
                                        if (swslot == 0) {
@@ -867,7 +887,7 @@ uvmpd_scan_inactive(struct pglist *pglst
  */
 
 void
-uvmpd_scan(void)
+uvmpd_scan(struct uvm_pmalloc *pma)
 {
        int free, inactive_shortage, swap_shortage, pages_freed;
        struct vm_page *p, *nextpg;
@@ -910,7 +930,7 @@ uvmpd_scan(void)
         * low bit of uvmexp.pdrevs (which we bump by one each call).
         */
        pages_freed = uvmexp.pdfreed;
-       (void) uvmpd_scan_inactive(&uvm.page_inactive);
+       (void) uvmpd_scan_inactive(pma, &uvm.page_inactive);
        pages_freed = uvmexp.pdfreed - pages_freed;
 
        /*
Index: uvm/uvm_swap.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_swap.c,v
retrieving revision 1.156
diff -u -p -r1.156 uvm_swap.c
--- uvm/uvm_swap.c      7 Jun 2022 12:02:52 -0000       1.156
+++ uvm/uvm_swap.c      27 Jun 2022 13:23:36 -0000
@@ -213,6 +213,10 @@ struct swap_priority swap_priority;
 /* locks */
 struct rwlock swap_syscall_lock = RWLOCK_INITIALIZER("swplk");
 
+struct mutex oommtx = MUTEX_INITIALIZER(IPL_VM);
+struct vm_page *oompps[SWCLUSTPAGES];
+int oom = 0;
+
 /*
  * prototypes
  */
@@ -235,7 +239,7 @@ void sw_reg_start(struct swapdev *);
 int uvm_swap_io(struct vm_page **, int, int, int);
 
 void swapmount(void);
-boolean_t uvm_swap_allocpages(struct vm_page **, int);
+int uvm_swap_allocpages(struct vm_page **, int);
 
 #ifdef UVM_SWAP_ENCRYPT
 /* for swap encrypt */
@@ -253,6 +257,8 @@ void uvm_swap_initcrypt(struct swapdev *
 void
 uvm_swap_init(void)
 {
+       int error;
+
        /*
         * first, init the swap list, its counter, and its lock.
         * then get a handle on the vnode for /dev/drum by using
@@ -281,6 +287,10 @@ uvm_swap_init(void)
        pool_init(&vndbuf_pool, sizeof(struct vndbuf), 0, IPL_BIO, 0,
            "swp vnd", NULL);
 
+       /* allocate pages for OOM situations. */
+       error = uvm_swap_allocpages(oompps, SWCLUSTPAGES);
+       KASSERT(error == 0);
+
        /* Setup the initial swap partition */
        swapmount();
 }
@@ -323,16 +333,35 @@ uvm_swap_initcrypt(struct swapdev *sdp, 
 
 #endif /* UVM_SWAP_ENCRYPT */
 
-boolean_t
+int
 uvm_swap_allocpages(struct vm_page **pps, int npages)
 {
        struct pglist   pgl;
-       int i;
+       int error, i;
+
+       KASSERT(npages <= SWCLUSTPAGES);
 
        TAILQ_INIT(&pgl);
-       if (uvm_pglistalloc(npages * PAGE_SIZE, dma_constraint.ucr_low,
-           dma_constraint.ucr_high, 0, 0, &pgl, npages, UVM_PLA_NOWAIT))
-               return FALSE;
+again:
+       error = uvm_pglistalloc(npages * PAGE_SIZE, dma_constraint.ucr_low,
+           dma_constraint.ucr_high, 0, 0, &pgl, npages, UVM_PLA_NOWAIT);
+       if (error && (curproc == uvm.pagedaemon_proc)) {
+               mtx_enter(&oommtx);
+               if (oom) {
+                       msleep_nsec(&oom, &oommtx, PVM | PNORELOCK,
+                        "oom", INFSLP);
+                       goto again;
+               }
+               oom = 1;
+               for (i = 0; i < npages; i++) {
+                       pps[i] = oompps[i];
+                       atomic_setbits_int(&pps[i]->pg_flags, PG_BUSY);
+               }
+               mtx_leave(&oommtx);
+               return 0;
+       }
+       if (error)
+               return error;
 
        for (i = 0; i < npages; i++) {
                pps[i] = TAILQ_FIRST(&pgl);
@@ -341,7 +370,7 @@ uvm_swap_allocpages(struct vm_page **pps
                TAILQ_REMOVE(&pgl, pps[i], pageq);
        }
 
-       return TRUE;
+       return 0;
 }
 
 void
@@ -349,10 +378,23 @@ uvm_swap_freepages(struct vm_page **pps,
 {
        int i;
 
+       if (pps[0] == oompps[0]) {
+               for (i = 0; i < npages; i++)
+                       uvm_pageclean(pps[i]);
+
+               mtx_enter(&oommtx);
+               KASSERT(oom == 1);
+               oom = 0;
+               mtx_leave(&oommtx);
+               wakeup(&oom);
+               return;
+       }
+
        uvm_lock_pageq();
        for (i = 0; i < npages; i++)
                uvm_pagefree(pps[i]);
        uvm_unlock_pageq();
+
 }
 
 #ifdef UVM_SWAP_ENCRYPT
@@ -1587,7 +1629,8 @@ uvm_swap_io(struct vm_page **pps, int st
        int     result, s, mapinflags, pflag, bounce = 0, i;
        boolean_t write, async;
        vaddr_t bouncekva;
-       struct vm_page *tpps[MAXBSIZE >> PAGE_SHIFT];
+       struct vm_page *tpps[SWCLUSTPAGES];
+       int pdaemon = (curproc == uvm.pagedaemon_proc);
 #ifdef UVM_SWAP_ENCRYPT
        struct swapdev *sdp;
        int     encrypt = 0;
@@ -1601,16 +1644,23 @@ uvm_swap_io(struct vm_page **pps, int st
        /* convert starting drum slot to block number */
        startblk = btodb((u_int64_t)startslot << PAGE_SHIFT);
 
+       pflag = (async || pdaemon) ? PR_NOWAIT : PR_WAITOK;
+       bp = pool_get(&bufpool, pflag | PR_ZERO);
+       if (bp == NULL)
+               return (VM_PAGER_AGAIN);
+
        /*
-        * first, map the pages into the kernel (XXX: currently required
+        * map the pages into the kernel (XXX: currently required
         * by buffer system).
         */
        mapinflags = !write ? UVMPAGER_MAPIN_READ : UVMPAGER_MAPIN_WRITE;
        if (!async)
                mapinflags |= UVMPAGER_MAPIN_WAITOK;
        kva = uvm_pagermapin(pps, npages, mapinflags);
-       if (kva == 0)
+       if (kva == 0) {
+               pool_put(&bufpool, bp);
                return (VM_PAGER_AGAIN);
+       }
 
 #ifdef UVM_SWAP_ENCRYPT
        if (write) {
@@ -1664,40 +1714,22 @@ uvm_swap_io(struct vm_page **pps, int st
                swmapflags = UVMPAGER_MAPIN_READ;
                if (!async)
                        swmapflags |= UVMPAGER_MAPIN_WAITOK;
-
-               if (!uvm_swap_allocpages(tpps, npages)) {
+               if (uvm_swap_allocpages(tpps, npages)) {
+                       pool_put(&bufpool, bp);
                        uvm_pagermapout(kva, npages);
                        return (VM_PAGER_AGAIN);
                }
 
                bouncekva = uvm_pagermapin(tpps, npages, swmapflags);
                if (bouncekva == 0) {
+                       KASSERT(tpps[0] != oompps[0]);
+                       pool_put(&bufpool, bp);
                        uvm_pagermapout(kva, npages);
                        uvm_swap_freepages(tpps, npages);
                        return (VM_PAGER_AGAIN);
                }
        }
 
-       /*
-        * now allocate a buf for the i/o.
-        * [make sure we don't put the pagedaemon to sleep...]
-        */
-       pflag = (async || curproc == uvm.pagedaemon_proc) ? PR_NOWAIT :
-           PR_WAITOK;
-       bp = pool_get(&bufpool, pflag | PR_ZERO);
-
-       /*
-        * if we failed to get a swapbuf, return "try again"
-        */
-       if (bp == NULL) {
-               if (bounce) {
-                       uvm_pagermapout(bouncekva, npages);
-                       uvm_swap_freepages(tpps, npages);
-               }
-               uvm_pagermapout(kva, npages);
-               return (VM_PAGER_AGAIN);
-       }
-
        /* encrypt to swap */
        if (write && bounce) {
                int i, opages;
@@ -1789,8 +1821,7 @@ uvm_swap_io(struct vm_page **pps, int st
 
        /* for async ops we must set up the iodone handler. */
        if (async) {
-               bp->b_flags |= B_CALL | (curproc == uvm.pagedaemon_proc ?
-                                        B_PDAEMON : 0);
+               bp->b_flags |= B_CALL | (pdaemon ? B_PDAEMON : 0);
                bp->b_iodone = uvm_aio_biodone;
        }
 
Index: uvm/uvm_swap.h
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_swap.h,v
retrieving revision 1.18
diff -u -p -r1.18 uvm_swap.h
--- uvm/uvm_swap.h      29 Sep 2020 11:47:41 -0000      1.18
+++ uvm/uvm_swap.h      27 Jun 2022 10:08:51 -0000
@@ -32,6 +32,7 @@
 #ifndef _UVM_UVM_SWAP_H_
 #define _UVM_UVM_SWAP_H_
 
+#define        SWCLUSTPAGES    (MAXBSIZE >> PAGE_SHIFT)
 #define        SWSLOT_BAD      (-1)
 
 #ifdef _KERNEL

Reply via email to