On 27/06/22(Mon) 15:44, Martin Pieuchot wrote:
> The diff below contains 3 parts that can be committed independently. All
> 3 of them are necessary to allow the pagedaemon to make progress in OOM
> situations and to satisfy all the allocations waiting for pages in
> specific ranges.
>
> * The uvm/uvm_pager.c part reserves a second segment for the page daemon.
> This is necessary to ensure that the two uvm_pagermapin() calls needed by
> uvm_swap_io() succeed in an emergency OOM situation. (The 2nd segment is
> needed when encryption or bouncing is required.)
>
> * The uvm/uvm_swap.c part pre-allocates 16 pages in the DMA-reachable
> region for the same reason. Note that a sleeping point is introduced
> because the pagedaemon is faster than the asynchronous I/O and in OOM
> situations it tends to stay busy building clusters that it then discards
> because no memory is available.
>
> * The uvm/uvm_pdaemon.c part changes the inner loop that scans the
> inactive list of pages to account for a given memory range. Without this
> the daemon could spin forever doing nothing because the global limits
> have already been reached.
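
For readers who want the idea without digging through the whole diff, here
is a simplified, userland-only sketch of the range filter the uvm_pdaemon.c
part adds: skip ahead to the first inactive page whose physical address
falls inside the waiting allocator's constraint, and fall back to the head
of the list if nothing matches.  All struct and function names below
(fake_page, first_page_in_range, ...) are invented for the sketch; the real
code walks uvm.page_inactive and compares atop(VM_PAGE_TO_PHYS(p)) against
pma->pm_constraint.

/*
 * Simplified userland illustration of the range filter added to
 * uvmpd_scan_inactive().  Not kernel code: the structs below stand in
 * for struct vm_page and struct uvm_constraint_range.
 */
#include <stdio.h>
#include <stddef.h>

struct fake_page {
    unsigned long     pfn;    /* physical frame of the page */
    struct fake_page *next;   /* next page on the inactive list */
};

struct fake_constraint {
    unsigned long low;        /* first acceptable frame */
    unsigned long high;       /* first unacceptable frame */
};

/* Return the first page whose frame lies in [low, high), or NULL. */
static struct fake_page *
first_page_in_range(struct fake_page *head, const struct fake_constraint *c)
{
    struct fake_page *p;

    for (p = head; p != NULL; p = p->next) {
        if (p->pfn >= c->low && p->pfn < c->high)
            return p;
    }
    return NULL;
}

int
main(void)
{
    struct fake_page pages[3] = {
        { .pfn = 0x200000, .next = &pages[1] },  /* outside the range */
        { .pfn = 0x000400, .next = &pages[2] },  /* DMA-reachable */
        { .pfn = 0x300000, .next = NULL },       /* outside the range */
    };
    struct fake_constraint dma = { .low = 0, .high = 0x100000 };
    struct fake_page *start = first_page_in_range(&pages[0], &dma);

    /* Like the diff: if nothing matches, start from the list head. */
    if (start == NULL)
        start = &pages[0];
    printf("scan starts at pfn %#lx\n", start->pfn);
    return 0;
}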
Here's an updated diff with a fix on top:
* In uvm/uvm_swap.c, make sure uvm_swap_allocpages() is allowed to sleep
when coming from uvm_fault(). This makes the faulting process wait
instead of dying when there aren't any free pages to do the bouncing.
I'd appreciate more reviews and tests!
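
To make the intent of the uvm_swap.c changes easier to review, here is a
compilable userland sketch of the emergency-reserve pattern they introduce:
try a normal allocation first and, on failure, hand out a set of pages that
was pre-allocated at init time; if the reserve is already out, sleep until
it comes back.  Everything below (page_get(), page_put(), the pthread
primitives) is a stand-in invented for illustration; the real code keeps
the reserve in oompps[], serializes on oommtx with the oom flag, sleeps
with msleep_nsec() and wakes the sleeper with wakeup(&oom), and only lets
the pagedaemon take the reserve path.

/*
 * Userland sketch of the emergency-reserve pattern used by
 * uvm_swap_allocpages()/uvm_swap_freepages() in the diff below.
 * Build with: cc -pthread sketch.c
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define RESERVE_NPAGES  16
#define FAKE_PAGE_SIZE  4096

static pthread_mutex_t oom_mtx = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  oom_cv = PTHREAD_COND_INITIALIZER;
static int             oom;                       /* reserve handed out? */
static void           *reserve[RESERVE_NPAGES];   /* filled once at init */

/*
 * Try a normal allocation; on (simulated) failure fall back to the
 * pre-allocated reserve, sleeping while another caller holds it.
 */
static void **
page_get(int npages, int simulate_oom)
{
    if (!simulate_oom) {
        void **pages = calloc(npages, sizeof(*pages));
        if (pages != NULL) {
            for (int i = 0; i < npages; i++)
                pages[i] = malloc(FAKE_PAGE_SIZE);
            return pages;
        }
    }
    pthread_mutex_lock(&oom_mtx);
    while (oom)                          /* ~ msleep_nsec(&oom, &oommtx, ...) */
        pthread_cond_wait(&oom_cv, &oom_mtx);
    oom = 1;
    pthread_mutex_unlock(&oom_mtx);
    return reserve;
}

/* Release pages; if they came from the reserve, wake any sleeper. */
static void
page_put(void **pages, int npages)
{
    if (pages == reserve) {
        pthread_mutex_lock(&oom_mtx);
        oom = 0;
        pthread_cond_broadcast(&oom_cv); /* ~ wakeup(&oom) */
        pthread_mutex_unlock(&oom_mtx);
        return;
    }
    for (int i = 0; i < npages; i++)
        free(pages[i]);
    free(pages);
}

int
main(void)
{
    for (int i = 0; i < RESERVE_NPAGES; i++)      /* done once at init */
        reserve[i] = malloc(FAKE_PAGE_SIZE);

    void **p = page_get(4, 1);                    /* force the reserve path */
    page_put(p, 4);
    printf("emergency reserve exercised\n");
    return 0;
}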
Index: uvm/uvm_pager.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_pager.c,v
retrieving revision 1.80
diff -u -p -r1.80 uvm_pager.c
--- uvm/uvm_pager.c 28 Jun 2022 12:10:37 -0000 1.80
+++ uvm/uvm_pager.c 28 Jun 2022 15:25:30 -0000
@@ -58,8 +58,8 @@ const struct uvm_pagerops *uvmpagerops[]
* The number of uvm_pseg instances is dynamic using an array segs.
* At most UVM_PSEG_COUNT instances can exist.
*
- * psegs[0] always exists (so that the pager can always map in pages).
- * psegs[0] element 0 is always reserved for the pagedaemon.
+ * psegs[0/1] always exist (so that the pager can always map in pages).
+ * psegs[0/1] element 0 is always reserved for the pagedaemon.
*
* Any other pseg is automatically created when no space is available
* and automatically destroyed when it is no longer in use.
@@ -93,6 +93,7 @@ uvm_pager_init(void)
/* init pager map */
uvm_pseg_init(&psegs[0]);
+ uvm_pseg_init(&psegs[1]);
mtx_init(&uvm_pseg_lck, IPL_VM);
/* init ASYNC I/O queue */
@@ -168,9 +169,10 @@ pager_seg_restart:
goto pager_seg_fail;
}
- /* Keep index 0 reserved for pagedaemon. */
- if (pseg == &psegs[0] && curproc != uvm.pagedaemon_proc)
- i = 1;
+ /* Keep indexes 0,1 reserved for pagedaemon. */
+ if ((pseg == &psegs[0] || pseg == &psegs[1]) &&
+ (curproc != uvm.pagedaemon_proc))
+ i = 2;
else
i = 0;
@@ -229,7 +231,7 @@ uvm_pseg_release(vaddr_t segaddr)
pseg->use &= ~(1 << id);
wakeup(&psegs);
- if (pseg != &psegs[0] && UVM_PSEG_EMPTY(pseg)) {
+ if ((pseg != &psegs[0] && pseg != &psegs[1]) && UVM_PSEG_EMPTY(pseg)) {
va = pseg->start;
pseg->start = 0;
}
Index: uvm/uvm_pdaemon.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_pdaemon.c,v
retrieving revision 1.99
diff -u -p -r1.99 uvm_pdaemon.c
--- uvm/uvm_pdaemon.c 12 May 2022 12:49:31 -0000 1.99
+++ uvm/uvm_pdaemon.c 28 Jun 2022 13:59:49 -0000
@@ -101,8 +101,8 @@ extern void drmbackoff(long);
* local prototypes
*/
-void uvmpd_scan(void);
-boolean_t uvmpd_scan_inactive(struct pglist *);
+void uvmpd_scan(struct uvm_pmalloc *);
+boolean_t uvmpd_scan_inactive(struct uvm_pmalloc *, struct pglist *);
void uvmpd_tune(void);
void uvmpd_drop(struct pglist *);
@@ -281,7 +281,7 @@ uvm_pageout(void *arg)
if (pma != NULL ||
((uvmexp.free - BUFPAGES_DEFICIT) < uvmexp.freetarg) ||
((uvmexp.inactive + BUFPAGES_INACT) < uvmexp.inactarg)) {
- uvmpd_scan();
+ uvmpd_scan(pma);
}
/*
@@ -379,15 +379,15 @@ uvm_aiodone_daemon(void *arg)
*/
boolean_t
-uvmpd_scan_inactive(struct pglist *pglst)
+uvmpd_scan_inactive(struct uvm_pmalloc *pma, struct pglist *pglst)
{
boolean_t retval = FALSE; /* assume we haven't hit target */
int free, result;
struct vm_page *p, *nextpg;
struct uvm_object *uobj;
- struct vm_page *pps[MAXBSIZE >> PAGE_SHIFT], **ppsp;
+ struct vm_page *pps[SWCLUSTPAGES], **ppsp;
int npages;
- struct vm_page *swpps[MAXBSIZE >> PAGE_SHIFT]; /* XXX: see below */
+ struct vm_page *swpps[SWCLUSTPAGES]; /* XXX: see below */
int swnpages, swcpages; /* XXX: see below */
int swslot;
struct vm_anon *anon;
@@ -404,8 +404,27 @@ uvmpd_scan_inactive(struct pglist *pglst
swnpages = swcpages = 0;
free = 0;
dirtyreacts = 0;
+ p = NULL;
- for (p = TAILQ_FIRST(pglst); p != NULL || swslot != 0; p = nextpg) {
+ /* Start with the first page on the list that fits in pma's range */
+ if (pma != NULL) {
+ paddr_t paddr;
+
+ TAILQ_FOREACH(p, pglst, pageq) {
+ paddr = atop(VM_PAGE_TO_PHYS(p));
+ if (paddr >= pma->pm_constraint.ucr_low &&
+ paddr < pma->pm_constraint.ucr_high)
+ break;
+ }
+
+ }
+
+ if (p == NULL) {
+ p = TAILQ_FIRST(pglst);
+ pma = NULL;
+ }
+
+ for (; p != NULL || swslot != 0; p = nextpg) {
/*
* note that p can be NULL iff we have traversed the whole
* list and need to do one final swap-backed clustered pageout.
@@ -419,8 +438,8 @@ uvmpd_scan_inactive(struct pglist *pglst
* our target
*/
free = uvmexp.free - BUFPAGES_DEFICIT;
-
- if (free + uvmexp.paging >= uvmexp.freetarg << 2 ||
+ if (((pma == NULL || (pma->pm_flags & UVM_PMA_FREED)) &&
+ (free + uvmexp.paging >= uvmexp.freetarg << 2)) ||
dirtyreacts == UVMPD_NUMDIRTYREACTS) {
retval = TRUE;
@@ -531,7 +550,8 @@ uvmpd_scan_inactive(struct pglist *pglst
* this page is dirty, skip it if we'll have met our
* free target when all the current pageouts complete.
*/
- if (free + uvmexp.paging > uvmexp.freetarg << 2) {
+ if ((pma == NULL || (pma->pm_flags & UVM_PMA_FREED)) &&
+ (free + uvmexp.paging > uvmexp.freetarg << 2)) {
if (anon) {
rw_exit(anon->an_lock);
} else {
@@ -611,7 +631,7 @@ uvmpd_scan_inactive(struct pglist *pglst
/* start new cluster (if necessary) */
if (swslot == 0) {
- swnpages = MAXBSIZE >> PAGE_SHIFT;
+ swnpages = SWCLUSTPAGES;
swslot = uvm_swap_alloc(&swnpages,
TRUE);
if (swslot == 0) {
@@ -867,7 +887,7 @@ uvmpd_scan_inactive(struct pglist *pglst
*/
void
-uvmpd_scan(void)
+uvmpd_scan(struct uvm_pmalloc *pma)
{
int free, inactive_shortage, swap_shortage, pages_freed;
struct vm_page *p, *nextpg;
@@ -910,7 +930,7 @@ uvmpd_scan(void)
* low bit of uvmexp.pdrevs (which we bump by one each call).
*/
pages_freed = uvmexp.pdfreed;
- (void) uvmpd_scan_inactive(&uvm.page_inactive);
+ (void) uvmpd_scan_inactive(pma, &uvm.page_inactive);
pages_freed = uvmexp.pdfreed - pages_freed;
/*
Index: uvm/uvm_swap.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_swap.c,v
retrieving revision 1.156
diff -u -p -r1.156 uvm_swap.c
--- uvm/uvm_swap.c 7 Jun 2022 12:02:52 -0000 1.156
+++ uvm/uvm_swap.c 28 Jun 2022 15:16:58 -0000
@@ -213,6 +213,10 @@ struct swap_priority swap_priority;
/* locks */
struct rwlock swap_syscall_lock = RWLOCK_INITIALIZER("swplk");
+struct mutex oommtx = MUTEX_INITIALIZER(IPL_VM);
+struct vm_page *oompps[SWCLUSTPAGES];
+int oom = 0;
+
/*
* prototypes
*/
@@ -235,7 +239,7 @@ void sw_reg_start(struct swapdev *);
int uvm_swap_io(struct vm_page **, int, int, int);
void swapmount(void);
-boolean_t uvm_swap_allocpages(struct vm_page **, int);
+int uvm_swap_allocpages(struct vm_page **, int, int);
#ifdef UVM_SWAP_ENCRYPT
/* for swap encrypt */
@@ -253,6 +257,8 @@ void uvm_swap_initcrypt(struct swapdev *
void
uvm_swap_init(void)
{
+ int error;
+
/*
* first, init the swap list, its counter, and its lock.
* then get a handle on the vnode for /dev/drum by using
@@ -281,6 +287,10 @@ uvm_swap_init(void)
pool_init(&vndbuf_pool, sizeof(struct vndbuf), 0, IPL_BIO, 0,
"swp vnd", NULL);
+ /* allocate pages for OOM situations. */
+ error = uvm_swap_allocpages(oompps, SWCLUSTPAGES, UVM_PLA_NOWAIT);
+ KASSERT(error == 0);
+
/* Setup the initial swap partition */
swapmount();
}
@@ -323,16 +333,35 @@ uvm_swap_initcrypt(struct swapdev *sdp,
#endif /* UVM_SWAP_ENCRYPT */
-boolean_t
-uvm_swap_allocpages(struct vm_page **pps, int npages)
+int
+uvm_swap_allocpages(struct vm_page **pps, int npages, int flags)
{
struct pglist pgl;
- int i;
+ int error, i;
+
+ KASSERT(npages <= SWCLUSTPAGES);
TAILQ_INIT(&pgl);
- if (uvm_pglistalloc(npages * PAGE_SIZE, dma_constraint.ucr_low,
- dma_constraint.ucr_high, 0, 0, &pgl, npages, UVM_PLA_NOWAIT))
- return FALSE;
+again:
+ error = uvm_pglistalloc(npages * PAGE_SIZE, dma_constraint.ucr_low,
+ dma_constraint.ucr_high, 0, 0, &pgl, npages, flags);
+ if (error && (curproc == uvm.pagedaemon_proc)) {
+ mtx_enter(&oommtx);
+ if (oom) {
+ msleep_nsec(&oom, &oommtx, PVM | PNORELOCK,
+ "oom", INFSLP);
+ goto again;
+ }
+ oom = 1;
+ for (i = 0; i < npages; i++) {
+ pps[i] = oompps[i];
+ atomic_setbits_int(&pps[i]->pg_flags, PG_BUSY);
+ }
+ mtx_leave(&oommtx);
+ return 0;
+ }
+ if (error)
+ return error;
for (i = 0; i < npages; i++) {
pps[i] = TAILQ_FIRST(&pgl);
@@ -341,7 +370,7 @@ uvm_swap_allocpages(struct vm_page **pps
TAILQ_REMOVE(&pgl, pps[i], pageq);
}
- return TRUE;
+ return 0;
}
void
@@ -349,10 +378,23 @@ uvm_swap_freepages(struct vm_page **pps,
{
int i;
+ if (pps[0] == oompps[0]) {
+ for (i = 0; i < npages; i++)
+ uvm_pageclean(pps[i]);
+
+ mtx_enter(&oommtx);
+ KASSERT(oom == 1);
+ oom = 0;
+ mtx_leave(&oommtx);
+ wakeup(&oom);
+ return;
+ }
+
uvm_lock_pageq();
for (i = 0; i < npages; i++)
uvm_pagefree(pps[i]);
uvm_unlock_pageq();
+
}
#ifdef UVM_SWAP_ENCRYPT
@@ -1587,7 +1629,8 @@ uvm_swap_io(struct vm_page **pps, int st
int result, s, mapinflags, pflag, bounce = 0, i;
boolean_t write, async;
vaddr_t bouncekva;
- struct vm_page *tpps[MAXBSIZE >> PAGE_SHIFT];
+ struct vm_page *tpps[SWCLUSTPAGES];
+ int pdaemon = (curproc == uvm.pagedaemon_proc);
#ifdef UVM_SWAP_ENCRYPT
struct swapdev *sdp;
int encrypt = 0;
@@ -1601,16 +1644,23 @@ uvm_swap_io(struct vm_page **pps, int st
/* convert starting drum slot to block number */
startblk = btodb((u_int64_t)startslot << PAGE_SHIFT);
+ pflag = (async || pdaemon) ? PR_NOWAIT : PR_WAITOK;
+ bp = pool_get(&bufpool, pflag | PR_ZERO);
+ if (bp == NULL)
+ return (VM_PAGER_AGAIN);
+
/*
- * first, map the pages into the kernel (XXX: currently required
+ * map the pages into the kernel (XXX: currently required
* by buffer system).
*/
mapinflags = !write ? UVMPAGER_MAPIN_READ : UVMPAGER_MAPIN_WRITE;
if (!async)
mapinflags |= UVMPAGER_MAPIN_WAITOK;
kva = uvm_pagermapin(pps, npages, mapinflags);
- if (kva == 0)
+ if (kva == 0) {
+ pool_put(&bufpool, bp);
return (VM_PAGER_AGAIN);
+ }
#ifdef UVM_SWAP_ENCRYPT
if (write) {
@@ -1658,46 +1708,31 @@ uvm_swap_io(struct vm_page **pps, int st
}
if (bounce) {
- int swmapflags;
+ int swmapflags, plaflags;
/* We always need write access. */
swmapflags = UVMPAGER_MAPIN_READ;
- if (!async)
+ plaflags = UVM_PLA_NOWAIT;
+ if (!async) {
swmapflags |= UVMPAGER_MAPIN_WAITOK;
-
- if (!uvm_swap_allocpages(tpps, npages)) {
+ plaflags = UVM_PLA_WAITOK;
+ }
+ if (uvm_swap_allocpages(tpps, npages, plaflags)) {
+ pool_put(&bufpool, bp);
uvm_pagermapout(kva, npages);
return (VM_PAGER_AGAIN);
}
bouncekva = uvm_pagermapin(tpps, npages, swmapflags);
if (bouncekva == 0) {
+ KASSERT(tpps[0] != oompps[0]);
+ pool_put(&bufpool, bp);
uvm_pagermapout(kva, npages);
uvm_swap_freepages(tpps, npages);
return (VM_PAGER_AGAIN);
}
}
- /*
- * now allocate a buf for the i/o.
- * [make sure we don't put the pagedaemon to sleep...]
- */
- pflag = (async || curproc == uvm.pagedaemon_proc) ? PR_NOWAIT :
- PR_WAITOK;
- bp = pool_get(&bufpool, pflag | PR_ZERO);
-
- /*
- * if we failed to get a swapbuf, return "try again"
- */
- if (bp == NULL) {
- if (bounce) {
- uvm_pagermapout(bouncekva, npages);
- uvm_swap_freepages(tpps, npages);
- }
- uvm_pagermapout(kva, npages);
- return (VM_PAGER_AGAIN);
- }
-
/* encrypt to swap */
if (write && bounce) {
int i, opages;
@@ -1789,8 +1824,7 @@ uvm_swap_io(struct vm_page **pps, int st
/* for async ops we must set up the iodone handler. */
if (async) {
- bp->b_flags |= B_CALL | (curproc == uvm.pagedaemon_proc ?
- B_PDAEMON : 0);
+ bp->b_flags |= B_CALL | (pdaemon ? B_PDAEMON : 0);
bp->b_iodone = uvm_aio_biodone;
}
Index: uvm/uvm_swap.h
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_swap.h,v
retrieving revision 1.18
diff -u -p -r1.18 uvm_swap.h
--- uvm/uvm_swap.h 29 Sep 2020 11:47:41 -0000 1.18
+++ uvm/uvm_swap.h 28 Jun 2022 13:59:49 -0000
@@ -32,6 +32,7 @@
#ifndef _UVM_UVM_SWAP_H_
#define _UVM_UVM_SWAP_H_
+#define SWCLUSTPAGES (MAXBSIZE >> PAGE_SHIFT)
#define SWSLOT_BAD (-1)
#ifdef _KERNEL