On 27/06/22(Mon) 15:44, Martin Pieuchot wrote:
> Diff below contains 3 parts that can be committed independently.  All 3
> are necessary to allow the pagedaemon to make progress in an OOM
> situation and to satisfy all the allocations waiting for pages in
> specific ranges.
>
> * The uvm/uvm_pager.c part reserves a second segment for the page daemon.
>   This is necessary to ensure the two uvm_pagermapin() calls needed by
>   uvm_swap_io() succeed in an emergency OOM situation.  (The 2nd segment
>   is needed when encryption or bouncing is required.)
>
> * The uvm/uvm_swap.c part pre-allocates 16 pages in the DMA-reachable
>   region for the same reason.  Note that a sleeping point is introduced
>   because the pagedaemon is faster than the asynchronous I/O and in an
>   OOM situation it tends to stay busy building clusters that it then
>   discards because no memory is available.
>
> * The uvm/uvm_pdaemon.c part changes the inner loop scanning the
>   inactive list of pages to account for a given memory range.  Without
>   this the daemon could spin forever doing nothing because the global
>   limits are already reached.
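For clarity, the uvm_pdaemon.c change described in that last point boils
down to a physical-address range check on the inactive queue.  Below is a
rough user-space model of that check; the struct names, the pfn field and
the little main() are stand-ins invented for the illustration, not the
kernel's real struct uvm_pmalloc / struct vm_page, and only the
ucr_low/ucr_high comparison mirrors the actual diff.

#include <stdio.h>
#include <sys/queue.h>

/* Stand-in for the pm_constraint range carried by struct uvm_pmalloc. */
struct constraint {
	unsigned long ucr_low;	/* first acceptable page frame number */
	unsigned long ucr_high;	/* first unacceptable page frame number */
};

/* Stand-in for struct vm_page on the inactive queue. */
struct page {
	unsigned long pfn;	/* physical frame number */
	TAILQ_ENTRY(page) pageq;
};
TAILQ_HEAD(pglist, page);

/* Return the first page on the list that satisfies the constraint. */
static struct page *
first_in_range(struct pglist *pglst, const struct constraint *c)
{
	struct page *p;

	TAILQ_FOREACH(p, pglst, pageq) {
		if (p->pfn >= c->ucr_low && p->pfn < c->ucr_high)
			return p;
	}
	return NULL;	/* caller falls back to the head of the list */
}

int
main(void)
{
	struct pglist pglst = TAILQ_HEAD_INITIALIZER(pglst);
	struct page pages[3] = {
		{ .pfn = 0x50000 }, { .pfn = 0x800 }, { .pfn = 0x900 }
	};
	struct constraint dma = { .ucr_low = 0, .ucr_high = 0x1000 };
	struct page *p;
	int i;

	for (i = 0; i < 3; i++)
		TAILQ_INSERT_TAIL(&pglst, &pages[i], pageq);

	p = first_in_range(&pglst, &dma);
	printf("first page in range: pfn %#lx\n", p ? p->pfn : 0);
	return 0;
}

The scan then starts at that page instead of at the head of the queue, so
the daemon pages out memory that can actually satisfy the waiting
constrained allocation instead of spinning on the global targets.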
Here's an updated diff with a fix on top:

* in uvm/uvm_swap.c make sure uvm_swap_allocpages() is allowed to sleep
  when coming from uvm_fault().  This makes the faulting process wait
  instead of dying when there are no free pages to do the bouncing.

I'd appreciate more reviews and tests!  (A small user-space sketch of the
OOM reserve idea is appended after the diff.)

Index: uvm/uvm_pager.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_pager.c,v
retrieving revision 1.80
diff -u -p -r1.80 uvm_pager.c
--- uvm/uvm_pager.c	28 Jun 2022 12:10:37 -0000	1.80
+++ uvm/uvm_pager.c	28 Jun 2022 15:25:30 -0000
@@ -58,8 +58,8 @@ const struct uvm_pagerops *uvmpagerops[]
  * The number of uvm_pseg instances is dynamic using an array segs.
  * At most UVM_PSEG_COUNT instances can exist.
  *
- * psegs[0] always exists (so that the pager can always map in pages).
- * psegs[0] element 0 is always reserved for the pagedaemon.
+ * psegs[0/1] always exist (so that the pager can always map in pages).
+ * psegs[0/1] element 0 are always reserved for the pagedaemon.
  *
  * Any other pseg is automatically created when no space is available
  * and automatically destroyed when it is no longer in use.
@@ -93,6 +93,7 @@ uvm_pager_init(void)

 	/* init pager map */
 	uvm_pseg_init(&psegs[0]);
+	uvm_pseg_init(&psegs[1]);
 	mtx_init(&uvm_pseg_lck, IPL_VM);

 	/* init ASYNC I/O queue */
@@ -168,9 +169,10 @@ pager_seg_restart:
 			goto pager_seg_fail;
 		}

-		/* Keep index 0 reserved for pagedaemon. */
-		if (pseg == &psegs[0] && curproc != uvm.pagedaemon_proc)
-			i = 1;
+		/* Keep indexes 0,1 reserved for pagedaemon. */
+		if ((pseg == &psegs[0] || pseg == &psegs[1]) &&
+		    (curproc != uvm.pagedaemon_proc))
+			i = 2;
 		else
 			i = 0;

@@ -229,7 +231,7 @@ uvm_pseg_release(vaddr_t segaddr)
 	pseg->use &= ~(1 << id);
 	wakeup(&psegs);

-	if (pseg != &psegs[0] && UVM_PSEG_EMPTY(pseg)) {
+	if ((pseg != &psegs[0] && pseg != &psegs[1]) && UVM_PSEG_EMPTY(pseg)) {
 		va = pseg->start;
 		pseg->start = 0;
 	}
Index: uvm/uvm_pdaemon.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_pdaemon.c,v
retrieving revision 1.99
diff -u -p -r1.99 uvm_pdaemon.c
--- uvm/uvm_pdaemon.c	12 May 2022 12:49:31 -0000	1.99
+++ uvm/uvm_pdaemon.c	28 Jun 2022 13:59:49 -0000
@@ -101,8 +101,8 @@ extern void drmbackoff(long);
  * local prototypes
  */

-void		uvmpd_scan(void);
-boolean_t	uvmpd_scan_inactive(struct pglist *);
+void		uvmpd_scan(struct uvm_pmalloc *);
+boolean_t	uvmpd_scan_inactive(struct uvm_pmalloc *, struct pglist *);
 void		uvmpd_tune(void);
 void		uvmpd_drop(struct pglist *);

@@ -281,7 +281,7 @@ uvm_pageout(void *arg)
 		if (pma != NULL ||
 		    ((uvmexp.free - BUFPAGES_DEFICIT) < uvmexp.freetarg) ||
 		    ((uvmexp.inactive + BUFPAGES_INACT) < uvmexp.inactarg)) {
-			uvmpd_scan();
+			uvmpd_scan(pma);
 		}

 		/*
@@ -379,15 +379,15 @@ uvm_aiodone_daemon(void *arg)
  */

 boolean_t
-uvmpd_scan_inactive(struct pglist *pglst)
+uvmpd_scan_inactive(struct uvm_pmalloc *pma, struct pglist *pglst)
 {
 	boolean_t retval = FALSE;	/* assume we haven't hit target */
 	int free, result;
 	struct vm_page *p, *nextpg;
 	struct uvm_object *uobj;
-	struct vm_page *pps[MAXBSIZE >> PAGE_SHIFT], **ppsp;
+	struct vm_page *pps[SWCLUSTPAGES], **ppsp;
 	int npages;
-	struct vm_page *swpps[MAXBSIZE >> PAGE_SHIFT];	/* XXX: see below */
+	struct vm_page *swpps[SWCLUSTPAGES];		/* XXX: see below */
 	int swnpages, swcpages;				/* XXX: see below */
 	int swslot;
 	struct vm_anon *anon;
@@ -404,8 +404,27 @@ uvmpd_scan_inactive(struct pglist *pglst
 	swnpages = swcpages = 0;
 	free = 0;
 	dirtyreacts = 0;
+	p = NULL;

-	for (p = TAILQ_FIRST(pglst); p != NULL || swslot != 0; p = nextpg) {
+	/* Start with the first page on the list that fit in pma's ranges */
+	if (pma != NULL) {
+		paddr_t paddr;
+
+		TAILQ_FOREACH(p, pglst, pageq) {
+			paddr = atop(VM_PAGE_TO_PHYS(p));
+			if (paddr >= pma->pm_constraint.ucr_low &&
+			    paddr < pma->pm_constraint.ucr_high)
+				break;
+		}
+
+	}
+
+	if (p == NULL) {
+		p = TAILQ_FIRST(pglst);
+		pma = NULL;
+	}
+
+	for (; p != NULL || swslot != 0; p = nextpg) {
 		/*
 		 * note that p can be NULL iff we have traversed the whole
 		 * list and need to do one final swap-backed clustered pageout.
@@ -419,8 +438,8 @@ uvmpd_scan_inactive(struct pglist *pglst
 		 * our target
 		 */
 		free = uvmexp.free - BUFPAGES_DEFICIT;
-
-		if (free + uvmexp.paging >= uvmexp.freetarg << 2 ||
+		if (((pma == NULL || (pma->pm_flags & UVM_PMA_FREED)) &&
+		    (free + uvmexp.paging >= uvmexp.freetarg << 2)) ||
 		    dirtyreacts == UVMPD_NUMDIRTYREACTS) {
 			retval = TRUE;

@@ -531,7 +550,8 @@ uvmpd_scan_inactive(struct pglist *pglst
 		 * this page is dirty, skip it if we'll have met our
 		 * free target when all the current pageouts complete.
 		 */
-		if (free + uvmexp.paging > uvmexp.freetarg << 2) {
+		if ((pma == NULL || (pma->pm_flags & UVM_PMA_FREED)) &&
+		    (free + uvmexp.paging > uvmexp.freetarg << 2)) {
 			if (anon) {
 				rw_exit(anon->an_lock);
 			} else {
@@ -611,7 +631,7 @@ uvmpd_scan_inactive(struct pglist *pglst

 			/* start new cluster (if necessary) */
 			if (swslot == 0) {
-				swnpages = MAXBSIZE >> PAGE_SHIFT;
+				swnpages = SWCLUSTPAGES;
 				swslot = uvm_swap_alloc(&swnpages, TRUE);
 				if (swslot == 0) {
@@ -867,7 +887,7 @@ uvmpd_scan_inactive(struct pglist *pglst
  */

 void
-uvmpd_scan(void)
+uvmpd_scan(struct uvm_pmalloc *pma)
 {
 	int free, inactive_shortage, swap_shortage, pages_freed;
 	struct vm_page *p, *nextpg;
@@ -910,7 +930,7 @@ uvmpd_scan(void)
 	 * low bit of uvmexp.pdrevs (which we bump by one each call).
 	 */
 	pages_freed = uvmexp.pdfreed;
-	(void) uvmpd_scan_inactive(&uvm.page_inactive);
+	(void) uvmpd_scan_inactive(pma, &uvm.page_inactive);
 	pages_freed = uvmexp.pdfreed - pages_freed;

 	/*
Index: uvm/uvm_swap.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_swap.c,v
retrieving revision 1.156
diff -u -p -r1.156 uvm_swap.c
--- uvm/uvm_swap.c	7 Jun 2022 12:02:52 -0000	1.156
+++ uvm/uvm_swap.c	28 Jun 2022 15:16:58 -0000
@@ -213,6 +213,10 @@ struct swap_priority swap_priority;

 /* locks */
 struct rwlock swap_syscall_lock = RWLOCK_INITIALIZER("swplk");

+struct mutex oommtx = MUTEX_INITIALIZER(IPL_VM);
+struct vm_page *oompps[SWCLUSTPAGES];
+int oom = 0;
+
 /*
  * prototypes
  */
@@ -235,7 +239,7 @@ void		 sw_reg_start(struct swapdev *);
 int		 uvm_swap_io(struct vm_page **, int, int, int);

 void		 swapmount(void);
-boolean_t	 uvm_swap_allocpages(struct vm_page **, int);
+int		 uvm_swap_allocpages(struct vm_page **, int, int);

 #ifdef UVM_SWAP_ENCRYPT
 /* for swap encrypt */
@@ -253,6 +257,8 @@ void		 uvm_swap_initcrypt(struct swapdev
 void
 uvm_swap_init(void)
 {
+	int error;
+
 	/*
 	 * first, init the swap list, its counter, and its lock.
	 * then get a handle on the vnode for /dev/drum by using
@@ -281,6 +287,10 @@ uvm_swap_init(void)
 	pool_init(&vndbuf_pool, sizeof(struct vndbuf), 0, IPL_BIO, 0,
 	    "swp vnd", NULL);

+	/* allocate pages for OOM situations. */
+	error = uvm_swap_allocpages(oompps, SWCLUSTPAGES, UVM_PLA_NOWAIT);
+	KASSERT(error == 0);
+
 	/* Setup the initial swap partition */
 	swapmount();
 }
@@ -323,16 +333,35 @@ uvm_swap_initcrypt(struct swapdev *sdp,

 #endif /* UVM_SWAP_ENCRYPT */

-boolean_t
-uvm_swap_allocpages(struct vm_page **pps, int npages)
+int
+uvm_swap_allocpages(struct vm_page **pps, int npages, int flags)
 {
 	struct pglist	pgl;
-	int i;
+	int error, i;
+
+	KASSERT(npages <= SWCLUSTPAGES);

 	TAILQ_INIT(&pgl);
-	if (uvm_pglistalloc(npages * PAGE_SIZE, dma_constraint.ucr_low,
-	    dma_constraint.ucr_high, 0, 0, &pgl, npages, UVM_PLA_NOWAIT))
-		return FALSE;
+again:
+	error = uvm_pglistalloc(npages * PAGE_SIZE, dma_constraint.ucr_low,
+	    dma_constraint.ucr_high, 0, 0, &pgl, npages, flags);
+	if (error && (curproc == uvm.pagedaemon_proc)) {
+		mtx_enter(&oommtx);
+		if (oom) {
+			msleep_nsec(&oom, &oommtx, PVM | PNORELOCK,
+			    "oom", INFSLP);
+			goto again;
+		}
+		oom = 1;
+		for (i = 0; i < npages; i++) {
+			pps[i] = oompps[i];
+			atomic_setbits_int(&pps[i]->pg_flags, PG_BUSY);
+		}
+		mtx_leave(&oommtx);
+		return 0;
+	}
+	if (error)
+		return error;

 	for (i = 0; i < npages; i++) {
 		pps[i] = TAILQ_FIRST(&pgl);
@@ -341,7 +370,7 @@ uvm_swap_allocpages(struct vm_page **pps
 		TAILQ_REMOVE(&pgl, pps[i], pageq);
 	}

-	return TRUE;
+	return 0;
 }

 void
@@ -349,10 +378,23 @@ uvm_swap_freepages(struct vm_page **pps,
 {
 	int i;

+	if (pps[0] == oompps[0]) {
+		for (i = 0; i < npages; i++)
+			uvm_pageclean(pps[i]);
+
+		mtx_enter(&oommtx);
+		KASSERT(oom == 1);
+		oom = 0;
+		mtx_leave(&oommtx);
+		wakeup(&oom);
+		return;
+	}
+
 	uvm_lock_pageq();
 	for (i = 0; i < npages; i++)
 		uvm_pagefree(pps[i]);
 	uvm_unlock_pageq();
+
 }

 #ifdef UVM_SWAP_ENCRYPT
@@ -1587,7 +1629,8 @@ uvm_swap_io(struct vm_page **pps, int st
 	int	result, s, mapinflags, pflag, bounce = 0, i;
 	boolean_t write, async;
 	vaddr_t bouncekva;
-	struct vm_page *tpps[MAXBSIZE >> PAGE_SHIFT];
+	struct vm_page *tpps[SWCLUSTPAGES];
+	int pdaemon = (curproc == uvm.pagedaemon_proc);
 #ifdef UVM_SWAP_ENCRYPT
 	struct swapdev *sdp;
 	int	encrypt = 0;
@@ -1601,16 +1644,23 @@ uvm_swap_io(struct vm_page **pps, int st
 	/* convert starting drum slot to block number */
 	startblk = btodb((u_int64_t)startslot << PAGE_SHIFT);

+	pflag = (async || pdaemon) ? PR_NOWAIT : PR_WAITOK;
+	bp = pool_get(&bufpool, pflag | PR_ZERO);
+	if (bp == NULL)
+		return (VM_PAGER_AGAIN);
+
 	/*
-	 * first, map the pages into the kernel (XXX: currently required
+	 * map the pages into the kernel (XXX: currently required
 	 * by buffer system).
 	 */
 	mapinflags = !write ? UVMPAGER_MAPIN_READ : UVMPAGER_MAPIN_WRITE;
 	if (!async)
 		mapinflags |= UVMPAGER_MAPIN_WAITOK;
 	kva = uvm_pagermapin(pps, npages, mapinflags);
-	if (kva == 0)
+	if (kva == 0) {
+		pool_put(&bufpool, bp);
 		return (VM_PAGER_AGAIN);
+	}

 #ifdef UVM_SWAP_ENCRYPT
 	if (write) {
@@ -1658,46 +1708,31 @@ uvm_swap_io(struct vm_page **pps, int st
 	}

 	if (bounce) {
-		int swmapflags;
+		int swmapflags, plaflags;

 		/* We always need write access. */
 		swmapflags = UVMPAGER_MAPIN_READ;
-		if (!async)
+		plaflags = UVM_PLA_NOWAIT;
+		if (!async) {
 			swmapflags |= UVMPAGER_MAPIN_WAITOK;
-
-		if (!uvm_swap_allocpages(tpps, npages)) {
+			plaflags = UVM_PLA_WAITOK;
+		}
+		if (uvm_swap_allocpages(tpps, npages, plaflags)) {
+			pool_put(&bufpool, bp);
 			uvm_pagermapout(kva, npages);
 			return (VM_PAGER_AGAIN);
 		}

 		bouncekva = uvm_pagermapin(tpps, npages, swmapflags);
 		if (bouncekva == 0) {
+			KASSERT(tpps[0] != oompps[0]);
+			pool_put(&bufpool, bp);
 			uvm_pagermapout(kva, npages);
 			uvm_swap_freepages(tpps, npages);
 			return (VM_PAGER_AGAIN);
 		}
 	}

-	/*
-	 * now allocate a buf for the i/o.
-	 * [make sure we don't put the pagedaemon to sleep...]
-	 */
-	pflag = (async || curproc == uvm.pagedaemon_proc) ? PR_NOWAIT :
-	    PR_WAITOK;
-	bp = pool_get(&bufpool, pflag | PR_ZERO);
-
-	/*
-	 * if we failed to get a swapbuf, return "try again"
-	 */
-	if (bp == NULL) {
-		if (bounce) {
-			uvm_pagermapout(bouncekva, npages);
-			uvm_swap_freepages(tpps, npages);
-		}
-		uvm_pagermapout(kva, npages);
-		return (VM_PAGER_AGAIN);
-	}
-
 	/* encrypt to swap */
 	if (write && bounce) {
 		int i, opages;
@@ -1789,8 +1824,7 @@ uvm_swap_io(struct vm_page **pps, int st

 	/* for async ops we must set up the iodone handler. */
 	if (async) {
-		bp->b_flags |= B_CALL | (curproc == uvm.pagedaemon_proc ?
-		    B_PDAEMON : 0);
+		bp->b_flags |= B_CALL | (pdaemon ? B_PDAEMON : 0);
 		bp->b_iodone = uvm_aio_biodone;
 	}

Index: uvm/uvm_swap.h
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_swap.h,v
retrieving revision 1.18
diff -u -p -r1.18 uvm_swap.h
--- uvm/uvm_swap.h	29 Sep 2020 11:47:41 -0000	1.18
+++ uvm/uvm_swap.h	28 Jun 2022 13:59:49 -0000
@@ -32,6 +32,7 @@
 #ifndef _UVM_UVM_SWAP_H_
 #define _UVM_UVM_SWAP_H_

+#define SWCLUSTPAGES	(MAXBSIZE >> PAGE_SHIFT)
 #define SWSLOT_BAD	(-1)

 #ifdef _KERNEL
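As promised above, here is a rough user-space analogue of the OOM page
reserve added in uvm_swap.c, with pthread primitives standing in for the
kernel's mutex, msleep_nsec() and wakeup().  Everything here (reserve_init,
reserve_pages, alloc_pages, free_pages, the 4096-byte "pages") is invented
for the sketch and is not part of the diff; only the shape of the logic
mirrors the patch: try the normal allocation, fall back to the single
shared reserve, and sleep and retry if the reserve is already handed out.

#include <pthread.h>
#include <stdlib.h>

#define NRESERVE	16
#define PAGESZ		4096

static pthread_mutex_t oommtx = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t oomcv = PTHREAD_COND_INITIALIZER;
static void *reserve_pages[NRESERVE];	/* plays the role of oompps[] */
static int oom;				/* reserve currently handed out? */

/* One-time setup, like uvm_swap_init() pre-allocating the reserve. */
void
reserve_init(void)
{
	int i;

	for (i = 0; i < NRESERVE; i++)
		reserve_pages[i] = malloc(PAGESZ);
}

/*
 * Try a regular allocation first; the "pagedaemon" may fall back to the
 * reserve, or sleep until the reserve comes back and then retry.
 */
int
alloc_pages(void **pps, int npages, int is_pagedaemon)
{
	void *chunk;
	int i;

again:
	chunk = calloc(npages, PAGESZ);	/* stand-in for uvm_pglistalloc() */
	if (chunk != NULL) {
		for (i = 0; i < npages; i++)
			pps[i] = (char *)chunk + i * PAGESZ;
		return 0;
	}
	if (!is_pagedaemon)
		return -1;	/* ordinary callers just report failure */

	pthread_mutex_lock(&oommtx);
	if (oom) {
		/* Reserve already out: wait and retry, like msleep_nsec() + goto again. */
		pthread_cond_wait(&oomcv, &oommtx);
		pthread_mutex_unlock(&oommtx);
		goto again;
	}
	oom = 1;
	for (i = 0; i < npages; i++)
		pps[i] = reserve_pages[i];
	pthread_mutex_unlock(&oommtx);
	return 0;
}

/* Returning the reserve wakes whoever is waiting on it, like wakeup(&oom). */
void
free_pages(void **pps, int npages)
{
	(void)npages;
	if (pps[0] == reserve_pages[0]) {
		pthread_mutex_lock(&oommtx);
		oom = 0;
		pthread_mutex_unlock(&oommtx);
		pthread_cond_broadcast(&oomcv);
		return;
	}
	free(pps[0]);	/* the regular chunk starts at pps[0] */
}

A caller would run reserve_init() once at startup, then the pagedaemon path
would call alloc_pages(pps, 16, 1) and free_pages() when the I/O completes.
Because there is exactly one reserve, returning it is what lets the next
cluster be built; that is the sleeping point mentioned in the original mail.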