The diff below contains three parts that can be committed
independently. All three are necessary to allow the pagedaemon to make
progress in OOM situations and to satisfy all the allocations waiting
for pages in specific ranges.
* The uvm/uvm_pager.c part reserves a second segment for the page
daemon. This is necessary to ensure that the two uvm_pagermapin()
calls needed by uvm_swap_io() succeed in an emergency OOM situation.
(The second segment is needed when encryption or bouncing is required;
see the first sketch below.)
* The uvm/uvm_swap.c part pre-allocates 16 pages in the DMA-reachable
region for the same reason (see the second sketch below). Note that a
sleeping point is introduced because the pagedaemon is faster than the
asynchronous I/O and, in an OOM situation, it tends to stay busy
building clusters that it then discards because no memory is
available.
* The uvm/uvm_pdaemon.c part changes the inner loop that scans the
inactive list of pages to account for a given memory range (see the
third sketch below). Without this, the daemon could spin forever
without making progress because the global limits have already been
reached.
A lot could be improved, but this at least makes swapping work in OOM
situations.
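
To make the intent of each part easier to follow, here are three small
stand-alone userland sketches, one per part. They are illustrative
only: the helper names, types and sizes are invented and deliberately
simplified; they are not the kernel code.

The first one mirrors the slot reservation done in uvm_pager.c:
ordinary callers skip the low slots of a permanent segment, so the
pagedaemon's two back-to-back mappings can always succeed:

#include <stdio.h>

#define PSEG_NUMSEGS	16	/* slots per segment (invented size) */
#define RESERVED	2	/* low slots kept for the pagedaemon */

struct pseg {
	unsigned int use;	/* bitmap of allocated slots */
};

/*
 * Allocate a slot.  Ordinary callers skip the reserved low slots of a
 * permanent segment; the pagedaemon may take any free slot.
 */
static int
seg_alloc(struct pseg *pseg, int is_pagedaemon, int is_permanent)
{
	int i;

	for (i = (is_permanent && !is_pagedaemon) ? RESERVED : 0;
	    i < PSEG_NUMSEGS; i++) {
		if ((pseg->use & (1U << i)) == 0) {
			pseg->use |= 1U << i;
			return i;
		}
	}
	return -1;		/* segment full */
}

int
main(void)
{
	struct pseg seg = { .use = 0 };

	/* Ordinary users exhaust everything above the reserve... */
	while (seg_alloc(&seg, 0, 1) != -1)
		continue;
	/* ...yet the pagedaemon still gets two slots for its mappings. */
	printf("pagedaemon slot: %d\n", seg_alloc(&seg, 1, 1));
	printf("pagedaemon slot: %d\n", seg_alloc(&seg, 1, 1));
	return 0;
}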
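
The second one shows the emergency-reserve scheme of the uvm_swap.c
part: a fixed set of pre-allocated pages is handed out when the normal
allocation fails, and a caller finding the reserve busy sleeps until
it is returned. pthread primitives stand in for the kernel's
mtx_enter()/msleep_nsec()/wakeup(); reserve_take() and reserve_put()
are invented names:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define NRESERVE 16		/* mirrors SWCLUSTPAGES (64k / 4k pages) */

static void *reserve[NRESERVE];	/* pre-allocated at init, like oompps[] */
static int oom;			/* reserve currently handed out? */
static pthread_mutex_t oommtx = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t oomcv = PTHREAD_COND_INITIALIZER;

static void
reserve_init(void)
{
	int i;

	for (i = 0; i < NRESERVE; i++)
		reserve[i] = malloc(4096);	/* stand-in for a page */
}

/*
 * Hand out the whole reserve; sleep while another caller holds it.
 * This is the fallback taken when the normal allocation fails.
 */
static void
reserve_take(void **pps, int npages)
{
	int i;

	pthread_mutex_lock(&oommtx);
	while (oom)
		pthread_cond_wait(&oomcv, &oommtx);
	oom = 1;
	for (i = 0; i < npages; i++)
		pps[i] = reserve[i];
	pthread_mutex_unlock(&oommtx);
}

/* Give the reserve back once the I/O completed; wake any sleeper. */
static void
reserve_put(void)
{
	pthread_mutex_lock(&oommtx);
	oom = 0;
	pthread_mutex_unlock(&oommtx);
	pthread_cond_broadcast(&oomcv);
}

int
main(void)
{
	void *pps[NRESERVE];

	reserve_init();
	reserve_take(pps, NRESERVE);	/* normal allocation "failed" */
	printf("reserve handed out: %d pages\n", NRESERVE);
	reserve_put();			/* swap I/O done, pages come back */
	return 0;
}

Handing out all NRESERVE pages at once keeps the sketch simple; the
real code additionally marks each oompps[] page PG_BUSY.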
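
The third one corresponds to the uvm_pdaemon.c change: start the
inactive-list scan at the first page whose physical address falls
inside the allocator's constraint, and fall back to the list head
(dropping the constraint) when nothing matches. The kernel code
compares page frame numbers via atop(); the sketch uses plain
addresses to stay short:

#include <stdio.h>

typedef unsigned long paddr_t;

struct constraint {
	paddr_t ucr_low, ucr_high;	/* physical address range */
};

struct page {
	paddr_t phys;			/* physical address of the page */
	struct page *next;
};

/*
 * Return the first page whose physical address falls inside the
 * constraint, or NULL when nothing on the list matches.
 */
static struct page *
first_in_range(struct page *head, const struct constraint *c)
{
	struct page *p;

	for (p = head; p != NULL; p = p->next)
		if (p->phys >= c->ucr_low && p->phys < c->ucr_high)
			return p;
	return NULL;
}

int
main(void)
{
	struct page pg[3] = {
		{ 0x200000, &pg[1] },	/* above the range, skipped */
		{ 0x001000, &pg[2] },	/* first match */
		{ 0x002000, NULL },
	};
	struct constraint dma = { 0x0, 0x100000 };	/* invented range */
	struct page *p = first_in_range(&pg[0], &dma);

	/* As in the diff: fall back to the list head, no constraint. */
	if (p == NULL)
		p = &pg[0];
	printf("scan starts at page %#lx\n", p->phys);
	return 0;
}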
Index: uvm/uvm_pager.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_pager.c,v
retrieving revision 1.78
diff -u -p -r1.78 uvm_pager.c
--- uvm/uvm_pager.c 18 Feb 2022 09:04:38 -0000 1.78
+++ uvm/uvm_pager.c 27 Jun 2022 08:44:41 -0000
@@ -58,8 +58,8 @@ const struct uvm_pagerops *uvmpagerops[]
* The number of uvm_pseg instances is dynamic using an array segs.
* At most UVM_PSEG_COUNT instances can exist.
*
- * psegs[0] always exists (so that the pager can always map in pages).
- * psegs[0] element 0 is always reserved for the pagedaemon.
+ * psegs[0/1] always exist (so that the pager can always map in pages).
+ * psegs[0/1] element 0 is always reserved for the pagedaemon.
*
* Any other pseg is automatically created when no space is available
* and automatically destroyed when it is no longer in use.
@@ -93,6 +93,7 @@ uvm_pager_init(void)
/* init pager map */
uvm_pseg_init(&psegs[0]);
+ uvm_pseg_init(&psegs[1]);
mtx_init(&uvm_pseg_lck, IPL_VM);
/* init ASYNC I/O queue */
@@ -168,9 +169,10 @@ pager_seg_restart:
goto pager_seg_fail;
}
- /* Keep index 0 reserved for pagedaemon. */
- if (pseg == &psegs[0] && curproc != uvm.pagedaemon_proc)
- i = 1;
+ /* Keep indexes 0,1 reserved for pagedaemon. */
+ if ((pseg == &psegs[0] || pseg == &psegs[1]) &&
+ (curproc != uvm.pagedaemon_proc))
+ i = 2;
else
i = 0;
@@ -229,7 +231,7 @@ uvm_pseg_release(vaddr_t segaddr)
pseg->use &= ~(1 << id);
wakeup(&psegs);
- if (pseg != &psegs[0] && UVM_PSEG_EMPTY(pseg)) {
+ if ((pseg != &psegs[0] && pseg != &psegs[1]) && UVM_PSEG_EMPTY(pseg)) {
va = pseg->start;
pseg->start = 0;
}
Index: uvm/uvm_pdaemon.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_pdaemon.c,v
retrieving revision 1.99
diff -u -p -r1.99 uvm_pdaemon.c
--- uvm/uvm_pdaemon.c 12 May 2022 12:49:31 -0000 1.99
+++ uvm/uvm_pdaemon.c 27 Jun 2022 13:24:54 -0000
@@ -101,8 +101,8 @@ extern void drmbackoff(long);
* local prototypes
*/
-void uvmpd_scan(void);
-boolean_t uvmpd_scan_inactive(struct pglist *);
+void uvmpd_scan(struct uvm_pmalloc *);
+boolean_t uvmpd_scan_inactive(struct uvm_pmalloc *, struct pglist *);
void uvmpd_tune(void);
void uvmpd_drop(struct pglist *);
@@ -281,7 +281,7 @@ uvm_pageout(void *arg)
if (pma != NULL ||
((uvmexp.free - BUFPAGES_DEFICIT) < uvmexp.freetarg) ||
((uvmexp.inactive + BUFPAGES_INACT) < uvmexp.inactarg)) {
- uvmpd_scan();
+ uvmpd_scan(pma);
}
/*
@@ -379,15 +379,15 @@ uvm_aiodone_daemon(void *arg)
*/
boolean_t
-uvmpd_scan_inactive(struct pglist *pglst)
+uvmpd_scan_inactive(struct uvm_pmalloc *pma, struct pglist *pglst)
{
boolean_t retval = FALSE; /* assume we haven't hit target */
int free, result;
struct vm_page *p, *nextpg;
struct uvm_object *uobj;
- struct vm_page *pps[MAXBSIZE >> PAGE_SHIFT], **ppsp;
+ struct vm_page *pps[SWCLUSTPAGES], **ppsp;
int npages;
- struct vm_page *swpps[MAXBSIZE >> PAGE_SHIFT]; /* XXX: see below */
+ struct vm_page *swpps[SWCLUSTPAGES]; /* XXX: see below */
int swnpages, swcpages; /* XXX: see below */
int swslot;
struct vm_anon *anon;
@@ -404,8 +404,27 @@ uvmpd_scan_inactive(struct pglist *pglst
swnpages = swcpages = 0;
free = 0;
dirtyreacts = 0;
+ p = NULL;
- for (p = TAILQ_FIRST(pglst); p != NULL || swslot != 0; p = nextpg) {
+ /* Start with the first page on the list that fits in pma's range */
+ if (pma != NULL) {
+ paddr_t paddr;
+
+ TAILQ_FOREACH(p, pglst, pageq) {
+ paddr = atop(VM_PAGE_TO_PHYS(p));
+ if (paddr >= pma->pm_constraint.ucr_low &&
+ paddr < pma->pm_constraint.ucr_high)
+ break;
+ }
+
+ }
+
+ if (p == NULL) {
+ p = TAILQ_FIRST(pglst);
+ pma = NULL;
+ }
+
+ for (; p != NULL || swslot != 0; p = nextpg) {
/*
* note that p can be NULL iff we have traversed the whole
* list and need to do one final swap-backed clustered pageout.
@@ -419,8 +438,8 @@ uvmpd_scan_inactive(struct pglist *pglst
* our target
*/
free = uvmexp.free - BUFPAGES_DEFICIT;
-
- if (free + uvmexp.paging >= uvmexp.freetarg << 2 ||
+ if (((pma == NULL || (pma->pm_flags & UVM_PMA_FREED)) &&
+ (free + uvmexp.paging >= uvmexp.freetarg << 2)) ||
dirtyreacts == UVMPD_NUMDIRTYREACTS) {
retval = TRUE;
@@ -531,7 +550,8 @@ uvmpd_scan_inactive(struct pglist *pglst
* this page is dirty, skip it if we'll have met our
* free target when all the current pageouts complete.
*/
- if (free + uvmexp.paging > uvmexp.freetarg << 2) {
+ if ((pma == NULL || (pma->pm_flags & UVM_PMA_FREED)) &&
+ (free + uvmexp.paging > uvmexp.freetarg << 2)) {
if (anon) {
rw_exit(anon->an_lock);
} else {
@@ -611,7 +631,7 @@ uvmpd_scan_inactive(struct pglist *pglst
/* start new cluster (if necessary) */
if (swslot == 0) {
- swnpages = MAXBSIZE >> PAGE_SHIFT;
+ swnpages = SWCLUSTPAGES;
swslot = uvm_swap_alloc(&swnpages,
TRUE);
if (swslot == 0) {
@@ -867,7 +887,7 @@ uvmpd_scan_inactive(struct pglist *pglst
*/
void
-uvmpd_scan(void)
+uvmpd_scan(struct uvm_pmalloc *pma)
{
int free, inactive_shortage, swap_shortage, pages_freed;
struct vm_page *p, *nextpg;
@@ -910,7 +930,7 @@ uvmpd_scan(void)
* low bit of uvmexp.pdrevs (which we bump by one each call).
*/
pages_freed = uvmexp.pdfreed;
- (void) uvmpd_scan_inactive(&uvm.page_inactive);
+ (void) uvmpd_scan_inactive(pma, &uvm.page_inactive);
pages_freed = uvmexp.pdfreed - pages_freed;
/*
Index: uvm/uvm_swap.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_swap.c,v
retrieving revision 1.156
diff -u -p -r1.156 uvm_swap.c
--- uvm/uvm_swap.c 7 Jun 2022 12:02:52 -0000 1.156
+++ uvm/uvm_swap.c 27 Jun 2022 13:23:36 -0000
@@ -213,6 +213,10 @@ struct swap_priority swap_priority;
/* locks */
struct rwlock swap_syscall_lock = RWLOCK_INITIALIZER("swplk");
+struct mutex oommtx = MUTEX_INITIALIZER(IPL_VM);
+struct vm_page *oompps[SWCLUSTPAGES];
+int oom = 0;
+
/*
* prototypes
*/
@@ -235,7 +239,7 @@ void sw_reg_start(struct swapdev *);
int uvm_swap_io(struct vm_page **, int, int, int);
void swapmount(void);
-boolean_t uvm_swap_allocpages(struct vm_page **, int);
+int uvm_swap_allocpages(struct vm_page **, int);
#ifdef UVM_SWAP_ENCRYPT
/* for swap encrypt */
@@ -253,6 +257,8 @@ void uvm_swap_initcrypt(struct swapdev *
void
uvm_swap_init(void)
{
+ int error;
+
/*
* first, init the swap list, its counter, and its lock.
* then get a handle on the vnode for /dev/drum by using
@@ -281,6 +287,10 @@ uvm_swap_init(void)
pool_init(&vndbuf_pool, sizeof(struct vndbuf), 0, IPL_BIO, 0,
"swp vnd", NULL);
+ /* allocate pages for OOM situations. */
+ error = uvm_swap_allocpages(oompps, SWCLUSTPAGES);
+ KASSERT(error == 0);
+
/* Setup the initial swap partition */
swapmount();
}
@@ -323,16 +333,35 @@ uvm_swap_initcrypt(struct swapdev *sdp,
#endif /* UVM_SWAP_ENCRYPT */
-boolean_t
+int
uvm_swap_allocpages(struct vm_page **pps, int npages)
{
struct pglist pgl;
- int i;
+ int error, i;
+
+ KASSERT(npages <= SWCLUSTPAGES);
TAILQ_INIT(&pgl);
- if (uvm_pglistalloc(npages * PAGE_SIZE, dma_constraint.ucr_low,
- dma_constraint.ucr_high, 0, 0, &pgl, npages, UVM_PLA_NOWAIT))
- return FALSE;
+again:
+ error = uvm_pglistalloc(npages * PAGE_SIZE, dma_constraint.ucr_low,
+ dma_constraint.ucr_high, 0, 0, &pgl, npages, UVM_PLA_NOWAIT);
+ if (error && (curproc == uvm.pagedaemon_proc)) {
+ mtx_enter(&oommtx);
+ if (oom) {
+ msleep_nsec(&oom, &oommtx, PVM | PNORELOCK,
+ "oom", INFSLP);
+ goto again;
+ }
+ oom = 1;
+ for (i = 0; i < npages; i++) {
+ pps[i] = oompps[i];
+ atomic_setbits_int(&pps[i]->pg_flags, PG_BUSY);
+ }
+ mtx_leave(&oommtx);
+ return 0;
+ }
+ if (error)
+ return error;
for (i = 0; i < npages; i++) {
pps[i] = TAILQ_FIRST(&pgl);
@@ -341,7 +370,7 @@ uvm_swap_allocpages(struct vm_page **pps
TAILQ_REMOVE(&pgl, pps[i], pageq);
}
- return TRUE;
+ return 0;
}
void
@@ -349,10 +378,23 @@ uvm_swap_freepages(struct vm_page **pps,
{
int i;
+ if (pps[0] == oompps[0]) {
+ for (i = 0; i < npages; i++)
+ uvm_pageclean(pps[i]);
+
+ mtx_enter(&oommtx);
+ KASSERT(oom == 1);
+ oom = 0;
+ mtx_leave(&oommtx);
+ wakeup(&oom);
+ return;
+ }
+
uvm_lock_pageq();
for (i = 0; i < npages; i++)
uvm_pagefree(pps[i]);
uvm_unlock_pageq();
+
}
#ifdef UVM_SWAP_ENCRYPT
@@ -1587,7 +1629,8 @@ uvm_swap_io(struct vm_page **pps, int st
int result, s, mapinflags, pflag, bounce = 0, i;
boolean_t write, async;
vaddr_t bouncekva;
- struct vm_page *tpps[MAXBSIZE >> PAGE_SHIFT];
+ struct vm_page *tpps[SWCLUSTPAGES];
+ int pdaemon = (curproc == uvm.pagedaemon_proc);
#ifdef UVM_SWAP_ENCRYPT
struct swapdev *sdp;
int encrypt = 0;
@@ -1601,16 +1644,23 @@ uvm_swap_io(struct vm_page **pps, int st
/* convert starting drum slot to block number */
startblk = btodb((u_int64_t)startslot << PAGE_SHIFT);
+ pflag = (async || pdaemon) ? PR_NOWAIT : PR_WAITOK;
+ bp = pool_get(&bufpool, pflag | PR_ZERO);
+ if (bp == NULL)
+ return (VM_PAGER_AGAIN);
+
/*
- * first, map the pages into the kernel (XXX: currently required
+ * map the pages into the kernel (XXX: currently required
* by buffer system).
*/
mapinflags = !write ? UVMPAGER_MAPIN_READ : UVMPAGER_MAPIN_WRITE;
if (!async)
mapinflags |= UVMPAGER_MAPIN_WAITOK;
kva = uvm_pagermapin(pps, npages, mapinflags);
- if (kva == 0)
+ if (kva == 0) {
+ pool_put(&bufpool, bp);
return (VM_PAGER_AGAIN);
+ }
#ifdef UVM_SWAP_ENCRYPT
if (write) {
@@ -1664,40 +1714,22 @@ uvm_swap_io(struct vm_page **pps, int st
swmapflags = UVMPAGER_MAPIN_READ;
if (!async)
swmapflags |= UVMPAGER_MAPIN_WAITOK;
-
- if (!uvm_swap_allocpages(tpps, npages)) {
+ if (uvm_swap_allocpages(tpps, npages)) {
+ pool_put(&bufpool, bp);
uvm_pagermapout(kva, npages);
return (VM_PAGER_AGAIN);
}
bouncekva = uvm_pagermapin(tpps, npages, swmapflags);
if (bouncekva == 0) {
+ KASSERT(tpps[0] != oompps[0]);
+ pool_put(&bufpool, bp);
uvm_pagermapout(kva, npages);
uvm_swap_freepages(tpps, npages);
return (VM_PAGER_AGAIN);
}
}
- /*
- * now allocate a buf for the i/o.
- * [make sure we don't put the pagedaemon to sleep...]
- */
- pflag = (async || curproc == uvm.pagedaemon_proc) ? PR_NOWAIT :
- PR_WAITOK;
- bp = pool_get(&bufpool, pflag | PR_ZERO);
-
- /*
- * if we failed to get a swapbuf, return "try again"
- */
- if (bp == NULL) {
- if (bounce) {
- uvm_pagermapout(bouncekva, npages);
- uvm_swap_freepages(tpps, npages);
- }
- uvm_pagermapout(kva, npages);
- return (VM_PAGER_AGAIN);
- }
-
/* encrypt to swap */
if (write && bounce) {
int i, opages;
@@ -1789,8 +1821,7 @@ uvm_swap_io(struct vm_page **pps, int st
/* for async ops we must set up the iodone handler. */
if (async) {
- bp->b_flags |= B_CALL | (curproc == uvm.pagedaemon_proc ?
- B_PDAEMON : 0);
+ bp->b_flags |= B_CALL | (pdaemon ? B_PDAEMON : 0);
bp->b_iodone = uvm_aio_biodone;
}
Index: uvm/uvm_swap.h
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_swap.h,v
retrieving revision 1.18
diff -u -p -r1.18 uvm_swap.h
--- uvm/uvm_swap.h 29 Sep 2020 11:47:41 -0000 1.18
+++ uvm/uvm_swap.h 27 Jun 2022 10:08:51 -0000
@@ -32,6 +32,7 @@
#ifndef _UVM_UVM_SWAP_H_
#define _UVM_UVM_SWAP_H_
+#define SWCLUSTPAGES (MAXBSIZE >> PAGE_SHIFT)
#define SWSLOT_BAD (-1)
#ifdef _KERNEL