The branch main has been updated by markj:

URL: https://cgit.FreeBSD.org/src/commit/?id=095f6305772be1dae27e7af9d87db0387625440d
commit 095f6305772be1dae27e7af9d87db0387625440d
Author:     Mark Johnston <ma...@freebsd.org>
AuthorDate: 2025-01-06 18:15:28 +0000
Commit:     Mark Johnston <ma...@freebsd.org>
CommitDate: 2025-08-01 20:15:28 +0000

    vm_pageout: Scan inactive dirty pages less aggressively

    Consider a database workload where the bulk of RAM is used for a
    fixed-size file-backed cache.  Any leftover pages are used for
    filesystem caching or anonymous memory.  In particular, there is
    little memory pressure and the inactive queue is scanned rarely.

    Once in a while, the free page count dips a bit below the setpoint,
    triggering an inactive queue scan.  Since almost all of the memory
    there is used by the database cache, the scan encounters only
    referenced and/or dirty pages, moving them to the active and laundry
    queues.  In particular, it ends up completely depleting the inactive
    queue, even for a small, non-urgent free page shortage.  This scan
    might process many gigabytes worth of pages in one go, triggering VM
    object lock contention (on the DB cache file's VM object) and
    consuming CPU, which can cause application latency spikes.

    My conclusion from observing this behaviour is that we should abort
    scanning once we've encountered many dirty pages without meeting the
    shortage.  In general we've tried to make the page daemon control
    loops avoid large bursts of work, and if a scan fails to turn up
    clean pages, there's not much use in moving everything to the laundry
    queue at once.  In particular, pacing this work ensures that the page
    daemon isn't frequently acquiring and releasing the VM object lock
    over long periods, especially when multiple page daemon threads are
    in use.

    Modify the inactive scan to abort early if we encounter enough dirty
    pages without meeting the shortage.  If the shortage hasn't been met,
    this will trigger shortfall laundering, wherein the laundry thread
    will clean as many pages as needed to meet the instantaneous
    shortfall.  Laundered pages will be placed near the head of the
    inactive queue, so they will be immediately visible to the page
    daemon during its next scan of the inactive queue.

    Reviewed by:    alc, kib
    MFC after:      1 month
    Sponsored by:   Modirum MDPay
    Sponsored by:   Klara, Inc.
    Differential Revision:  https://reviews.freebsd.org/D48337
---
 sys/vm/swap_pager.c |  2 +-
 sys/vm/swap_pager.h |  1 +
 sys/vm/vm_pageout.c | 61 +++++++++++++++++++++++++++++++++++++++++------------
 3 files changed, 49 insertions(+), 15 deletions(-)

diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c
index c01b9e45a32b..676e585a6b53 100644
--- a/sys/vm/swap_pager.c
+++ b/sys/vm/swap_pager.c
@@ -386,7 +386,7 @@ swap_release_by_cred(vm_ooffset_t decr, struct ucred *cred)
 }
 
 static bool swap_pager_full = true; /* swap space exhaustion (task killing) */
-static bool swap_pager_almost_full = true; /* swap space exhaustion (w/hysteresis) */
+bool swap_pager_almost_full = true; /* swap space exhaustion (w/hysteresis) */
 static struct mtx swbuf_mtx;	/* to sync nsw_wcount_async */
 static int nsw_wcount_async;	/* limit async write buffers */
 static int nsw_wcount_async_max;/* assigned maximum	      */
diff --git a/sys/vm/swap_pager.h b/sys/vm/swap_pager.h
index 3287886026f7..da1457762c0b 100644
--- a/sys/vm/swap_pager.h
+++ b/sys/vm/swap_pager.h
@@ -68,6 +68,7 @@ struct swdevt {
 
 #ifdef _KERNEL
 
+extern bool swap_pager_almost_full;
 extern int swap_pager_avail;
 extern int nsw_cluster_max;
 
diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c
index b500eb8156bc..3f1be78342c9 100644
--- a/sys/vm/vm_pageout.c
+++ b/sys/vm/vm_pageout.c
@@ -80,6 +80,7 @@
 #include <sys/kernel.h>
 #include <sys/blockcount.h>
 #include <sys/eventhandler.h>
+#include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
@@ -183,26 +184,33 @@ SYSCTL_INT(_vm, OID_AUTO, pageout_oom_seq, CTLFLAG_RWTUN,
 	&vm_pageout_oom_seq, 0,
 	"back-to-back calls to oom detector to start OOM");
 
-static int act_scan_laundry_weight = 3;
-
 static int
-sysctl_act_scan_laundry_weight(SYSCTL_HANDLER_ARGS)
+sysctl_laundry_weight(SYSCTL_HANDLER_ARGS)
 {
-	int error, newval;
+	int error, val;
 
-	newval = act_scan_laundry_weight;
-	error = sysctl_handle_int(oidp, &newval, 0, req);
-	if (error || req->newptr == NULL)
+	val = *(int *)arg1;
+	error = sysctl_handle_int(oidp, &val, 0, req);
+	if (error != 0 || req->newptr == NULL)
 		return (error);
-	if (newval < 1)
+	if (val < arg2 || val > 100)
 		return (EINVAL);
-	act_scan_laundry_weight = newval;
+	*(int *)arg1 = val;
 	return (0);
 }
-SYSCTL_PROC(_vm, OID_AUTO, act_scan_laundry_weight, CTLFLAG_RWTUN | CTLTYPE_INT,
-    &act_scan_laundry_weight, 0, sysctl_act_scan_laundry_weight, "I",
+
+static int act_scan_laundry_weight = 3;
+SYSCTL_PROC(_vm, OID_AUTO, act_scan_laundry_weight,
+    CTLTYPE_INT | CTLFLAG_RWTUN, &act_scan_laundry_weight, 1,
+    sysctl_laundry_weight, "I",
     "weight given to clean vs. dirty pages in active queue scans");
 
+static int inact_scan_laundry_weight = 1;
+SYSCTL_PROC(_vm, OID_AUTO, inact_scan_laundry_weight,
+    CTLTYPE_INT | CTLFLAG_RWTUN, &inact_scan_laundry_weight, 0,
+    sysctl_laundry_weight, "I",
+    "weight given to clean vs. dirty pages in inactive queue scans");
+
 static u_int vm_background_launder_rate = 4096;
 SYSCTL_UINT(_vm, OID_AUTO, background_launder_rate, CTLFLAG_RWTUN,
     &vm_background_launder_rate, 0,
@@ -1417,7 +1425,8 @@ vm_pageout_scan_inactive(struct vm_domain *vmd, int page_shortage)
 	struct vm_pagequeue *pq;
 	vm_object_t object;
 	vm_page_astate_t old, new;
-	int act_delta, addl_page_shortage, starting_page_shortage, refs;
+	int act_delta, addl_page_shortage, dirty_count, dirty_thresh;
+	int starting_page_shortage, refs;
 
 	object = NULL;
 	vm_batchqueue_init(&rq);
@@ -1431,6 +1440,18 @@ vm_pageout_scan_inactive(struct vm_domain *vmd, int page_shortage)
 	 */
 	addl_page_shortage = 0;
 
+	/*
+	 * dirty_count is the number of pages encountered that require
+	 * laundering before reclamation is possible.  If we encounter a large
+	 * number of dirty pages, we may abort the scan without meeting the page
+	 * shortage in the hope that laundering will allow a future scan to meet
+	 * the target.
+	 */
+	dirty_count = 0;
+	dirty_thresh = inact_scan_laundry_weight * page_shortage;
+	if (dirty_thresh == 0)
+		dirty_thresh = INT_MAX;
+
 	/*
 	 * Start scanning the inactive queue for pages that we can free.  The
 	 * scan will stop when we reach the target or we have scanned the
@@ -1443,7 +1464,7 @@ vm_pageout_scan_inactive(struct vm_domain *vmd, int page_shortage)
 	pq = &vmd->vmd_pagequeues[PQ_INACTIVE];
 	vm_pagequeue_lock(pq);
 	vm_pageout_init_scan(&ss, pq, marker, NULL, pq->pq_cnt);
-	while (page_shortage > 0) {
+	while (page_shortage > 0 && dirty_count < dirty_thresh) {
 		/*
 		 * If we need to refill the scan batch queue, release any
 		 * optimistically held object lock.  This gives someone else a
@@ -1617,8 +1638,20 @@ free_page:
 			page_shortage--;
 			continue;
 		}
-		if ((object->flags & OBJ_DEAD) == 0)
+		if ((object->flags & OBJ_DEAD) == 0) {
 			vm_page_launder(m);
+
+			/*
+			 * If the page would be paged out to a swap device, and
+			 * no devices are configured or they are all nearly
+			 * full, then don't count it against our threshold,
+			 * since it most likely can't be used to meet our
+			 * target.
+			 */
+			if ((object->flags & OBJ_SWAP) == 0 ||
+			    !atomic_load_bool(&swap_pager_almost_full))
+				dirty_count++;
+		}
 skip_page:
 		vm_page_xunbusy(m);
 		continue;
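
To illustrate the pacing behaviour in isolation, here is a minimal userland sketch (not kernel code, and not part of the commit) of the logic the patch adds to vm_pageout_scan_inactive(); the fake_page structure, the queue contents, and the scan_inactive() wrapper are invented for the example.

/*
 * Standalone sketch of the inactive-scan pacing added by the patch above.
 * Userland demonstration only: struct fake_page and scan_inactive() do not
 * exist in the kernel.
 */
#include <limits.h>
#include <stdbool.h>
#include <stdio.h>

struct fake_page {
	bool	dirty;		/* needs laundering before it can be freed */
};

static int inact_scan_laundry_weight = 1;	/* mirrors the new sysctl */

/* Returns the number of pages freed before the scan stopped. */
static int
scan_inactive(struct fake_page *queue, int qlen, int page_shortage)
{
	int dirty_count, dirty_thresh, freed, i;

	dirty_count = 0;
	dirty_thresh = inact_scan_laundry_weight * page_shortage;
	if (dirty_thresh == 0)
		dirty_thresh = INT_MAX;	/* a weight of 0 disables the cap */

	freed = 0;
	for (i = 0; i < qlen && page_shortage > 0 &&
	    dirty_count < dirty_thresh; i++) {
		if (queue[i].dirty) {
			/* The kernel would vm_page_launder() the page here. */
			dirty_count++;
		} else {
			/* The kernel would free the page here. */
			freed++;
			page_shortage--;
		}
	}
	return (freed);
}

int
main(void)
{
	struct fake_page queue[10000];
	int i;

	/* A mostly dirty queue, as in the DB cache workload described above. */
	for (i = 0; i < 10000; i++)
		queue[i].dirty = (i % 100 != 0);

	/*
	 * With a shortage of 50 pages and a weight of 1, the scan gives up
	 * after encountering 50 dirty pages instead of walking all 10000
	 * entries; the unmet shortage is left to shortfall laundering.
	 */
	printf("freed %d pages\n", scan_inactive(queue, 10000, 50));
	return (0);
}

Setting inact_scan_laundry_weight to 0 disables the early abort: dirty_thresh falls back to INT_MAX, and the scan once again runs until the shortage is met or the queue is exhausted.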
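
The threshold is exposed as a read/write tunable, vm.inact_scan_laundry_weight, via the SYSCTL_PROC declaration in the diff. A small, hypothetical userland program to read it might look like this:

#include <sys/types.h>
#include <sys/sysctl.h>

#include <stdio.h>

int
main(void)
{
	int weight;
	size_t len = sizeof(weight);

	/* vm.inact_scan_laundry_weight is the knob added by the patch above. */
	if (sysctlbyname("vm.inact_scan_laundry_weight", &weight, &len,
	    NULL, 0) == -1) {
		perror("sysctlbyname");
		return (1);
	}
	printf("vm.inact_scan_laundry_weight = %d\n", weight);
	return (0);
}

Per the shared sysctl_laundry_weight handler, writes outside the range [arg2, 100] are rejected with EINVAL; the minimum (arg2) is 0 for the new inactive-queue knob and 1 for the existing active-queue knob.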