> Date: Mon, 21 Dec 2020 16:46:32 -0300
> From: Martin Pieuchot <[email protected]>
>
> During a page fault multiples counters are updated. They fall into two
> categories "fault counters" and "global statistics" both of which are
> currently represented by int-sized fields inside a global: `uvmexp'.
>
> Diff below makes use of the per-CPU counters_inc(9) API to make sure no
> update is lost with an unlocked fault handler. I only converted the
> fields touched by uvm_fault() to have a working solution and start a
> discussion.
>
> - Should we keep a single enum for all fields inside `uvmexp' or do we
> want to separate "statistics counters" which are mostly used sys/arch
> from "fault counters" which are only used in uvm/uvm_fault.c?
>
> - The counter_add(9) API deals with uint64_t and currently uvmexp uses
> int. Should we truncate or change the size of uvmexp fields or do
> something else?
>
> Comments?
I think this breaks "show uvmexp" in ddb.
You fear that using atomic operations for these counters would lead to
too much bus contention on systems with a large number of CPUs?
> Index: kern/init_main.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/init_main.c,v
> retrieving revision 1.302
> diff -u -p -r1.302 init_main.c
> --- kern/init_main.c 7 Dec 2020 16:55:28 -0000 1.302
> +++ kern/init_main.c 21 Dec 2020 19:37:13 -0000
> @@ -432,6 +432,7 @@ main(void *framep)
> #endif
>
> mbcpuinit(); /* enable per cpu mbuf data */
> + uvm_init_percpu();
>
> /* init exec and emul */
> init_exec();
> Index: uvm/uvm_extern.h
> ===================================================================
> RCS file: /cvs/src/sys/uvm/uvm_extern.h,v
> retrieving revision 1.155
> diff -u -p -r1.155 uvm_extern.h
> --- uvm/uvm_extern.h 1 Dec 2020 13:56:22 -0000 1.155
> +++ uvm/uvm_extern.h 21 Dec 2020 19:37:13 -0000
> @@ -289,6 +289,7 @@ void uvm_vsunlock_device(struct proc
> *
> void *);
> void uvm_pause(void);
> void uvm_init(void);
> +void uvm_init_percpu(void);
> int uvm_io(vm_map_t, struct uio *, int);
>
> #define UVM_IO_FIXPROT 0x01
> Index: uvm/uvm_fault.c
> ===================================================================
> RCS file: /cvs/src/sys/uvm/uvm_fault.c,v
> retrieving revision 1.109
> diff -u -p -r1.109 uvm_fault.c
> --- uvm/uvm_fault.c 8 Dec 2020 12:26:31 -0000 1.109
> +++ uvm/uvm_fault.c 21 Dec 2020 19:37:13 -0000
> @@ -35,6 +35,7 @@
> #include <sys/param.h>
> #include <sys/systm.h>
> #include <sys/kernel.h>
> +#include <sys/percpu.h>
> #include <sys/proc.h>
> #include <sys/malloc.h>
> #include <sys/mman.h>
> @@ -271,7 +272,7 @@ uvmfault_anonget(struct uvm_faultinfo *u
> int result;
>
> result = 0; /* XXX shut up gcc */
> - uvmexp.fltanget++;
> + counters_inc(uvmexp_counters, flt_anget);
> /* bump rusage counters */
> if (anon->an_page)
> curproc->p_ru.ru_minflt++;
> @@ -295,7 +296,7 @@ uvmfault_anonget(struct uvm_faultinfo *u
> if ((pg->pg_flags & (PG_BUSY|PG_RELEASED)) == 0)
> return (VM_PAGER_OK);
> atomic_setbits_int(&pg->pg_flags, PG_WANTED);
> - uvmexp.fltpgwait++;
> + counters_inc(uvmexp_counters, flt_pgwait);
>
> /*
> * the last unlock must be an atomic unlock+wait on
> @@ -310,7 +311,7 @@ uvmfault_anonget(struct uvm_faultinfo *u
>
> if (pg == NULL) { /* out of RAM. */
> uvmfault_unlockall(ufi, amap, NULL);
> - uvmexp.fltnoram++;
> + counters_inc(uvmexp_counters, flt_noram);
> uvm_wait("flt_noram1");
> /* ready to relock and try again */
> } else {
> @@ -325,7 +326,7 @@ uvmfault_anonget(struct uvm_faultinfo *u
> * it is ok to read an_swslot here because
> * we hold PG_BUSY on the page.
> */
> - uvmexp.pageins++;
> + counters_inc(uvmexp_counters, pageins);
> result = uvm_swap_get(pg, anon->an_swslot,
> PGO_SYNCIO);
>
> @@ -369,7 +370,7 @@ uvmfault_anonget(struct uvm_faultinfo *u
> uvm_anfree(anon); /* frees page for us */
> if (locked)
> uvmfault_unlockall(ufi, amap, NULL);
> - uvmexp.fltpgrele++;
> + counters_inc(uvmexp_counters, flt_pgrele);
> return (VM_PAGER_REFAULT); /* refault! */
> }
>
> @@ -426,7 +427,7 @@ uvmfault_anonget(struct uvm_faultinfo *u
> }
>
> /* try it again! */
> - uvmexp.fltanretry++;
> + counters_inc(uvmexp_counters, flt_anretry);
> continue;
>
> } /* while (1) */
> @@ -547,7 +548,7 @@ uvm_fault_check(struct uvm_faultinfo *uf
> /* need to clear */
> uvmfault_unlockmaps(ufi, FALSE);
> uvmfault_amapcopy(ufi);
> - uvmexp.fltamcopy++;
> + counters_inc(uvmexp_counters, flt_amcopy);
> return (ERESTART);
> } else {
> /*
> @@ -699,7 +700,7 @@ uvm_fault_upper(struct uvm_faultinfo *uf
> */
>
> if ((access_type & PROT_WRITE) != 0 && anon->an_ref > 1) {
> - uvmexp.flt_acow++;
> + counters_inc(uvmexp_counters, flt_acow);
> oanon = anon; /* oanon = old */
> anon = uvm_analloc();
> if (anon) {
> @@ -710,10 +711,10 @@ uvm_fault_upper(struct uvm_faultinfo *uf
> if (anon == NULL || pg == NULL) {
> uvmfault_unlockall(ufi, amap, NULL);
> if (anon == NULL)
> - uvmexp.fltnoanon++;
> + counters_inc(uvmexp_counters, flt_noanon);
> else {
> uvm_anfree(anon);
> - uvmexp.fltnoram++;
> + counters_inc(uvmexp_counters, flt_noram);
> }
>
> if (uvm_swapisfull())
> @@ -745,7 +746,7 @@ uvm_fault_upper(struct uvm_faultinfo *uf
> * thus, no one can get at it until we are done with it.
> */
> } else {
> - uvmexp.flt_anon++;
> + counters_inc(uvmexp_counters, flt_anon);
> oanon = anon;
> pg = anon->an_page;
> if (anon->an_ref > 1) /* disallow writes to ref > 1 anons */
> @@ -861,7 +862,7 @@ uvm_fault_upper_lookup(struct uvm_faulti
> uvm_lock_pageq();
> uvm_pageactivate(anon->an_page); /* reactivate */
> uvm_unlock_pageq();
> - uvmexp.fltnamap++;
> + counters_inc(uvmexp_counters, flt_namap);
>
> /*
> * Since this isn't the page that's actually faulting,
> @@ -909,7 +910,7 @@ uvm_fault(vm_map_t orig_map, vaddr_t vad
> struct vm_page *pages[UVM_MAXRANGE];
> int error = ERESTART;
>
> - uvmexp.faults++; /* XXX: locking? */
> + counters_inc(uvmexp_counters, faults);
> TRACEPOINT(uvm, fault, vaddr, fault_type, access_type, NULL);
>
> /* init the IN parameters in the ufi */
> @@ -994,7 +995,7 @@ uvm_fault_lower(struct uvm_faultinfo *uf
> * ("get" has the option of doing a pmap_enter for us)
> */
> if (uobj != NULL) {
> - uvmexp.fltlget++;
> + counters_inc(uvmexp_counters, flt_lget);
> gotpages = flt->npages;
> (void) uobj->pgops->pgo_get(uobj, ufi->entry->offset +
> (flt->startva - ufi->entry->start),
> @@ -1038,7 +1039,7 @@ uvm_fault_lower(struct uvm_faultinfo *uf
> uvm_lock_pageq();
> uvm_pageactivate(pages[lcv]); /* reactivate */
> uvm_unlock_pageq();
> - uvmexp.fltnomap++;
> + counters_inc(uvmexp_counters, flt_nomap);
>
> /*
> * Since this page isn't the page that's
> @@ -1109,7 +1110,7 @@ uvm_fault_lower(struct uvm_faultinfo *uf
>
> uvmfault_unlockall(ufi, amap, NULL);
>
> - uvmexp.fltget++;
> + counters_inc(uvmexp_counters, flt_get);
> gotpages = 1;
> uoff = (ufi->orig_rvaddr - ufi->entry->start) +
> ufi->entry->offset;
> result = uobj->pgops->pgo_get(uobj, uoff, &uobjpage, &gotpages,
> @@ -1183,7 +1184,7 @@ uvm_fault_lower(struct uvm_faultinfo *uf
> *
> * set "pg" to the page we want to map in (uobjpage, usually)
> */
> - uvmexp.flt_obj++;
> + counters_inc(uvmexp_counters, flt_obj);
> if (UVM_ET_ISCOPYONWRITE(ufi->entry))
> flt->enter_prot &= ~PROT_WRITE;
> pg = uobjpage; /* map in the actual object */
> @@ -1235,10 +1236,10 @@ uvm_fault_lower(struct uvm_faultinfo *uf
> /* unlock and fail ... */
> uvmfault_unlockall(ufi, amap, uobj);
> if (anon == NULL)
> - uvmexp.fltnoanon++;
> + counters_inc(uvmexp_counters, flt_noanon);
> else {
> uvm_anfree(anon);
> - uvmexp.fltnoram++;
> + counters_inc(uvmexp_counters, flt_noram);
> }
>
> if (uvm_swapisfull())
> @@ -1254,7 +1255,7 @@ uvm_fault_lower(struct uvm_faultinfo *uf
>
> /* fill in the data */
> if (uobjpage != PGO_DONTCARE) {
> - uvmexp.flt_prcopy++;
> + counters_inc(uvmexp_counters, flt_prcopy);
> /* copy page [pg now dirty] */
> uvm_pagecopy(uobjpage, pg);
>
> @@ -1277,7 +1278,7 @@ uvm_fault_lower(struct uvm_faultinfo *uf
> uvm_unlock_pageq();
> uobj = NULL;
> } else {
> - uvmexp.flt_przero++;
> + counters_inc(uvmexp_counters, flt_przero);
> /*
> * Page is zero'd and marked dirty by uvm_pagealloc()
> * above.
> @@ -1288,7 +1289,7 @@ uvm_fault_lower(struct uvm_faultinfo *uf
> ufi->orig_rvaddr - ufi->entry->start, anon, 0)) {
> uvmfault_unlockall(ufi, amap, NULL);
> uvm_anfree(anon);
> - uvmexp.fltnoamap++;
> + counters_inc(uvmexp_counters, flt_noamap);
>
> if (uvm_swapisfull())
> return (ENOMEM);
> @@ -1580,7 +1581,7 @@ uvmfault_relock(struct uvm_faultinfo *uf
> return TRUE;
> }
>
> - uvmexp.fltrelck++;
> + counters_inc(uvmexp_counters, flt_relck);
>
> /*
> * relock map. fail if version mismatch (in which case nothing
> @@ -1592,6 +1593,6 @@ uvmfault_relock(struct uvm_faultinfo *uf
> return(FALSE);
> }
>
> - uvmexp.fltrelckok++;
> + counters_inc(uvmexp_counters, flt_relckok);
> return(TRUE); /* got it! */
> }
> Index: uvm/uvm_init.c
> ===================================================================
> RCS file: /cvs/src/sys/uvm/uvm_init.c,v
> retrieving revision 1.40
> diff -u -p -r1.40 uvm_init.c
> --- uvm/uvm_init.c 11 May 2017 00:42:05 -0000 1.40
> +++ uvm/uvm_init.c 21 Dec 2020 19:37:13 -0000
> @@ -35,6 +35,7 @@
> #include <sys/param.h>
> #include <sys/systm.h>
> #include <sys/filedesc.h>
> +#include <sys/percpu.h>
> #include <sys/resourcevar.h>
> #include <sys/mman.h>
> #include <sys/malloc.h>
> @@ -52,6 +53,9 @@
> struct uvm uvm; /* decl */
> struct uvmexp uvmexp; /* decl */
>
> +COUNTERS_BOOT_MEMORY(uvmexp_countersboot, exp_ncounters);
> +struct cpumem *uvmexp_counters =
> COUNTERS_BOOT_INITIALIZER(uvmexp_countersboot);
> +
> #if defined(VM_MIN_KERNEL_ADDRESS)
> vaddr_t vm_min_kernel_address = VM_MIN_KERNEL_ADDRESS;
> #else
> @@ -184,4 +188,10 @@ uvm_init(void)
> uaddr_bestfit_create(vm_map_min(kmem_map),
> vm_map_max(kmem_map)));
> #endif /* !SMALL_KERNEL */
> +}
> +
> +void
> +uvm_init_percpu(void)
> +{
> + uvmexp_counters = counters_alloc_ncpus(uvmexp_counters, exp_ncounters);
> }
> Index: uvm/uvm_meter.c
> ===================================================================
> RCS file: /cvs/src/sys/uvm/uvm_meter.c,v
> retrieving revision 1.41
> diff -u -p -r1.41 uvm_meter.c
> --- uvm/uvm_meter.c 24 Jun 2020 22:03:45 -0000 1.41
> +++ uvm/uvm_meter.c 21 Dec 2020 19:37:13 -0000
> @@ -39,6 +39,7 @@
> #include <sys/param.h>
> #include <sys/systm.h>
> #include <sys/kernel.h>
> +#include <sys/percpu.h>
> #include <sys/proc.h>
> #include <sys/sysctl.h>
> #include <sys/vmmeter.h>
> @@ -178,9 +179,42 @@ uvm_sysctl(int *name, u_int namelen, voi
> return (sysctl_rdstruct(oldp, oldlenp, newp, &vmtotals,
> sizeof(vmtotals)));
>
> - case VM_UVMEXP:
> - return (sysctl_rdstruct(oldp, oldlenp, newp, &uvmexp,
> - sizeof(uvmexp)));
> + case VM_UVMEXP: {
> + struct uvmexp uexp;
> + uint64_t counters[exp_ncounters];
> +
> + memcpy(&uexp, &uvmexp, sizeof(uexp));
> +
> + counters_read(uvmexp_counters, counters, exp_ncounters);
> +
> + /* stat counters */
> + uexp.faults = (int)counters[faults];
> + uexp.pageins = (int)counters[pageins];
> +
> + /* fault subcounters */
> + uexp.fltnoram = (int)counters[flt_noram];
> + uexp.fltnoanon = (int)counters[flt_noanon];
> + uexp.fltnoamap = (int)counters[flt_noamap];
> + uexp.fltpgwait = (int)counters[flt_pgwait];
> + uexp.fltpgrele = (int)counters[flt_pgrele];
> + uexp.fltrelck = (int)counters[flt_relck];
> + uexp.fltrelckok = (int)counters[flt_relckok];
> + uexp.fltanget = (int)counters[flt_anget];
> + uexp.fltanretry = (int)counters[flt_anretry];
> + uexp.fltamcopy = (int)counters[flt_amcopy];
> + uexp.fltnamap = (int)counters[flt_namap];
> + uexp.fltnomap = (int)counters[flt_nomap];
> + uexp.fltlget = (int)counters[flt_lget];
> + uexp.fltget = (int)counters[flt_get];
> + uexp.flt_anon = (int)counters[flt_anon];
> + uexp.flt_acow = (int)counters[flt_acow];
> + uexp.flt_obj = (int)counters[flt_obj];
> + uexp.flt_prcopy = (int)counters[flt_prcopy];
> + uexp.flt_przero = (int)counters[flt_przero];
> +
> + return (sysctl_rdstruct(oldp, oldlenp, newp, &uexp,
> + sizeof(uexp)));
> + }
>
> case VM_NKMEMPAGES:
> return (sysctl_rdint(oldp, oldlenp, newp, nkmempages));
> Index: uvm/uvmexp.h
> ===================================================================
> RCS file: /cvs/src/sys/uvm/uvmexp.h,v
> retrieving revision 1.7
> diff -u -p -r1.7 uvmexp.h
> --- uvm/uvmexp.h 14 Dec 2020 13:29:18 -0000 1.7
> +++ uvm/uvmexp.h 21 Dec 2020 19:37:13 -0000
> @@ -156,4 +156,41 @@ struct _ps_strings {
> void *val;
> };
>
> +#ifdef _KERNEL
> +
> +/*
> + * Per-cpu UVM counters.
> + */
> +extern struct cpumem *uvmexp_counters;
> +
> +enum uvm_exp_counters {
> + /* stat counters */
> + faults, /* page fault count */
> + pageins, /* pagein operation count */
> +
> + /* fault subcounters */
> + flt_noram, /* number of times fault was out of ram */
> + flt_noanon, /* number of times fault was out of anons */
> + flt_noamap, /* number of times fault was out of amap chunks */
> + flt_pgwait, /* number of times fault had to wait on a page */
> + flt_pgrele, /* number of times fault found a released page */
> + flt_relck, /* number of times fault relock called */
> + flt_relckok, /* number of times fault relock is a success */
> + flt_anget, /* number of times fault gets anon page */
> + flt_anretry, /* number of times fault retrys an anon get */
> + flt_amcopy, /* number of times fault clears "needs copy" */
> + flt_namap, /* number of times fault maps a neighbor anon page */
> + flt_nomap, /* number of times fault maps a neighbor obj page */
> + flt_lget, /* number of times fault does a locked pgo_get */
> + flt_get, /* number of times fault does an unlocked get */
> + flt_anon, /* number of times fault anon (case 1a) */
> + flt_acow, /* number of times fault anon cow (case 1b) */
> + flt_obj, /* number of times fault is on object page (2a) */
> + flt_prcopy, /* number of times fault promotes with copy (2b) */
> + flt_przero, /* number of times fault promotes with zerofill (2b) */
> +
> + exp_ncounters
> +};
> +
> +#endif /* _KERNEL */
> #endif /*_UVM_UVMEXP_ */
>
>