> On 23 Dec 2014, at 11:38, David Gwynne <da...@gwynne.id.au> wrote: > > On Mon, Dec 22, 2014 at 10:54:16AM -0500, Ted Unangst wrote: >> On Mon, Dec 22, 2014 at 14:59, Mike Belopuhov wrote: >>> 1) how is it different from what we have now? >>> >>> 2) why can't you do it when you pool_put? >> >> Right now, if you allocate a bunch of things, then free them, the >> pages won't be freed because the timestamp is new. But you've already >> freed everything, so there won't be a future pool_put to trigger the >> final page freeing. This keeps pages from getting stuck. >> >>> 3) why can't you call it from uvm when there's memory pressure since >>> from what i understand pool_reclaim was supposed to work like that? >> >> Calling reclaim only when we're low on memory is sometimes too late. > > and we never do. reclaim is only called when sysctl kern.pool_debug > is fiddled with. > >> >>> 4) i assume you don't want to call pool_reclaim from pool_gc_pages >>> because of the mutex_enter_try benefits, but why does logic in >>> these functions differ, e.g. why did you omit the pr_itemsperpage >>> bit? >> >> We're not trying to release all free pages. Only the ones that are both >> too many and too old. This is the same logic that is already in >> pool_put. >> >>> 7) it looks like pool_reclaim_all should also raise an IPL since it >>> does the same thing. wasn't it noteced before? >> >> Likely. I don't think reclaim_all gets called very often. > > or at all, really. > > mikeb, id really appreciate it if you could benchmark with this diff. > > if you're interested in seeing the effect of freeing at different > intervals, you could try the diff below. it adds kern.pool.wait_free > and kern.pool.wait_gc tunables so you can set how long a page has > to be idle before a pool_put and the gc respectively will select a > page for freeing. > > id prefer to pick values that Just Work(tm) all the time, but being > able to test different values easily might be useful in the short > term.
ive been running the diff below in production for nearly a month now without issue. anyone want to ok this? if i commit this would anyone object? dlg > > Index: sbin/sysctl/sysctl.c > =================================================================== > RCS file: /cvs/src/sbin/sysctl/sysctl.c,v > retrieving revision 1.207 > diff -u -p -r1.207 sysctl.c > --- sbin/sysctl/sysctl.c 19 Nov 2014 18:04:54 -0000 1.207 > +++ sbin/sysctl/sysctl.c 22 Dec 2014 23:44:52 -0000 > @@ -44,6 +44,7 @@ > #include <sys/sched.h> > #include <sys/sensors.h> > #include <sys/vmmeter.h> > +#include <sys/pool.h> > #include <net/route.h> > #include <net/if.h> > > @@ -125,6 +126,7 @@ struct ctlname semname[] = CTL_KERN_SEMI > struct ctlname shmname[] = CTL_KERN_SHMINFO_NAMES; > struct ctlname watchdogname[] = CTL_KERN_WATCHDOG_NAMES; > struct ctlname tcname[] = CTL_KERN_TIMECOUNTER_NAMES; > +struct ctlname poolname[] = CTL_KERN_POOL_NAMES; > struct ctlname *vfsname; > #ifdef CTL_MACHDEP_NAMES > struct ctlname machdepname[] = CTL_MACHDEP_NAMES; > @@ -207,6 +209,7 @@ int sysctl_seminfo(char *, char **, int > int sysctl_shminfo(char *, char **, int *, int, int *); > int sysctl_watchdog(char *, char **, int *, int, int *); > int sysctl_tc(char *, char **, int *, int, int *); > +int sysctl_pool(char *, char **, int *, int, int *); > int sysctl_sensors(char *, char **, int *, int, int *); > void print_sensordev(char *, int *, u_int, struct sensordev *); > void print_sensor(struct sensor *); > @@ -388,6 +391,11 @@ parse(char *string, int flags) > return; > warnx("use dmesg to view %s", string); > return; > + case KERN_POOL: > + len = sysctl_pool(string, &bufp, mib, flags, &type); > + if (len < 0) > + return; > + break; > case KERN_PROC: > if (flags == 0) > return; > @@ -1633,6 +1641,7 @@ struct list semlist = { semname, KERN_SE > struct list shmlist = { shmname, KERN_SHMINFO_MAXID }; > struct list watchdoglist = { watchdogname, KERN_WATCHDOG_MAXID }; > struct list tclist = { tcname, KERN_TIMECOUNTER_MAXID }; > +struct list poollist = { poolname, KERN_POOL_MAXID }; > > /* > * handle vfs namei cache statistics > @@ -2302,6 +2311,25 @@ sysctl_tc(char *string, char **bufpp, in > return (-1); > mib[2] = indx; > *typep = tclist.list[indx].ctl_type; > + return (3); > +} > + > +/* > + * Handle pools support > + */ > +int > +sysctl_pool(char *string, char **bufpp, int mib[], int flags, int *typep) > +{ > + int indx; > + > + if (*bufpp == NULL) { > + listall(string, &poollist); > + return (-1); > + } > + if ((indx = findname(string, "third", bufpp, &poollist)) == -1) > + return (-1); > + mib[2] = indx; > + *typep = poollist.list[indx].ctl_type; > return (3); > } > > Index: sys/kern/init_main.c > =================================================================== > RCS file: /cvs/src/sys/kern/init_main.c,v > retrieving revision 1.227 > diff -u -p -r1.227 init_main.c > --- sys/kern/init_main.c 15 Dec 2014 02:24:23 -0000 1.227 > +++ sys/kern/init_main.c 22 Dec 2014 23:44:55 -0000 > @@ -121,6 +121,8 @@ extern struct timeout setperf_to; > void setperf_auto(void *); > #endif > > +extern struct timeout pool_gc_tick; > + > int cmask = CMASK; > extern struct user *proc0paddr; > > @@ -554,6 +556,11 @@ main(void *framep) > #ifndef SMALL_KERNEL > timeout_set(&setperf_to, setperf_auto, NULL); > #endif > + > + /* > + * Start the idle pool page garbage collector > + */ > + timeout_add_sec(&pool_gc_tick, 1); > > /* > * proc0: nothing to do, back to sleep > Index: sys/kern/kern_sysctl.c > =================================================================== > RCS file: /cvs/src/sys/kern/kern_sysctl.c,v > retrieving revision 1.277 > diff -u -p -r1.277 kern_sysctl.c > --- sys/kern/kern_sysctl.c 12 Dec 2014 07:45:46 -0000 1.277 > +++ sys/kern/kern_sysctl.c 22 Dec 2014 23:44:55 -0000 > @@ -496,7 +496,8 @@ kern_sysctl(int *name, u_int namelen, vo > case KERN_NPROCS: > return (sysctl_rdint(oldp, oldlenp, newp, nprocesses)); > case KERN_POOL: > - return (sysctl_dopool(name + 1, namelen - 1, oldp, oldlenp)); > + return (sysctl_pool(name + 1, namelen - 1, oldp, oldlenp, > + newp, newlen)); > case KERN_STACKGAPRANDOM: > stackgap = stackgap_random; > error = sysctl_int(oldp, oldlenp, newp, newlen, &stackgap); > Index: sys/kern/subr_pool.c > =================================================================== > RCS file: /cvs/src/sys/kern/subr_pool.c,v > retrieving revision 1.173 > diff -u -p -r1.173 subr_pool.c > --- sys/kern/subr_pool.c 22 Dec 2014 00:33:40 -0000 1.173 > +++ sys/kern/subr_pool.c 22 Dec 2014 23:44:55 -0000 > @@ -40,6 +40,8 @@ > #include <sys/syslog.h> > #include <sys/rwlock.h> > #include <sys/sysctl.h> > +#include <sys/task.h> > +#include <sys/timeout.h> > > #include <uvm/uvm_extern.h> > > @@ -160,6 +162,14 @@ void pool_print1(struct pool *, const c > > #define pool_sleep(pl) msleep(pl, &pl->pr_mtx, PSWP, pl->pr_wchan, 0) > > +/* stale page garbage collectors */ > +void pool_gc_sched(void *); > +struct timeout pool_gc_tick = TIMEOUT_INITIALIZER(pool_gc_sched, NULL); > +void pool_gc_pages(void *, void *); > +struct task pool_gc_task = TASK_INITIALIZER(pool_gc_pages, NULL, NULL); > +int pool_wait_free = 1; > +int pool_wait_gc = 8; > + > static inline int > phtree_compare(struct pool_item_header *a, struct pool_item_header *b) > { > @@ -678,7 +688,7 @@ pool_put(struct pool *pp, void *v) > /* is it time to free a page? */ > if (pp->pr_nidle > pp->pr_maxpages && > (ph = TAILQ_FIRST(&pp->pr_emptypages)) != NULL && > - (ticks - ph->ph_tick) > hz) { > + (ticks - ph->ph_tick) > (hz * pool_wait_free)) { > freeph = ph; > pool_p_remove(pp, freeph); > } > @@ -1269,11 +1279,13 @@ pool_walk(struct pool *pp, int full, > * kern.pool.name.<pool#> - the name for pool#. > */ > int > -sysctl_dopool(int *name, u_int namelen, char *oldp, size_t *oldlenp) > +sysctl_pool(int *name, u_int namelen, char *oldp, size_t *oldlenp, > + void *newp, size_t newlen) > { > struct kinfo_pool pi; > struct pool *pp; > int rv = ENOENT; > + int wait; > > switch (name[0]) { > case KERN_POOL_NPOOLS: > @@ -1284,6 +1296,35 @@ sysctl_dopool(int *name, u_int namelen, > case KERN_POOL_NAME: > case KERN_POOL_POOL: > break; > + > + case KERN_POOL_WAITFREE: > + if (namelen != 1) > + return (ENOTDIR); > + wait = pool_wait_free; > + rv = sysctl_int(oldp, oldlenp, newp, newlen, &wait); > + if (rv != 0) > + return (rv); > + if (newp == NULL) > + return (0); > + if (wait < 1 || wait >= pool_wait_gc) > + return (EINVAL); > + pool_wait_free = wait; > + return (0); > + > + case KERN_POOL_WAITGC: > + if (namelen != 1) > + return (ENOTDIR); > + wait = pool_wait_gc; > + rv = sysctl_int(oldp, oldlenp, newp, newlen, &wait); > + if (rv != 0) > + return (rv); > + if (newp == NULL) > + return (0); > + if (wait <= pool_wait_free || wait > 3600) > + return (EINVAL); > + pool_wait_gc = wait; > + return (0); > + > default: > return (EOPNOTSUPP); > } > @@ -1337,6 +1378,46 @@ done: > rw_exit_read(&pool_lock); > > return (rv); > +} > + > +void > +pool_gc_sched(void *x) > +{ > + task_add(systq, &pool_gc_task); > +} > + > +void > +pool_gc_pages(void *x, void *y) > +{ > + extern int ticks; > + struct pool *pp; > + struct pool_item_header *ph, *freeph; > + int s; > + > + rw_enter_read(&pool_lock); > + s = splvm(); > + SIMPLEQ_FOREACH(pp, &pool_head, pr_poollist) { > + if (!mtx_enter_try(&pp->pr_mtx)) > + continue; > + > + /* is it time to free a page? */ > + if (pp->pr_nidle > pp->pr_minpages && > + (ph = TAILQ_FIRST(&pp->pr_emptypages)) != NULL && > + (ticks - ph->ph_tick) > (hz * pool_wait_gc)) { > + freeph = ph; > + pool_p_remove(pp, freeph); > + } else > + freeph = NULL; > + > + mtx_leave(&pp->pr_mtx); > + > + if (freeph != NULL) > + pool_p_free(pp, freeph); > + } > + splx(s); /* XXX */ > + rw_exit_read(&pool_lock); > + > + timeout_add_sec(&pool_gc_tick, 1); > } > > /* > Index: sys/sys/pool.h > =================================================================== > RCS file: /cvs/src/sys/sys/pool.h,v > retrieving revision 1.55 > diff -u -p -r1.55 pool.h > --- sys/sys/pool.h 19 Dec 2014 02:15:25 -0000 1.55 > +++ sys/sys/pool.h 22 Dec 2014 23:44:55 -0000 > @@ -39,10 +39,24 @@ > * kern.pool.npools > * kern.pool.name.<number> > * kern.pool.pool.<number> > + * kern.pool.wait_free > + * kern.pool.wait_gc > */ > #define KERN_POOL_NPOOLS 1 > #define KERN_POOL_NAME 2 > #define KERN_POOL_POOL 3 > +#define KERN_POOL_WAITFREE 4 > +#define KERN_POOL_WAITGC 5 > +#define KERN_POOL_MAXID 6 > + > +#define CTL_KERN_POOL_NAMES { \ > + { 0, 0 }, \ > + { "npools", CTLTYPE_INT }, \ > + { "name", CTLTYPE_NODE }, \ > + { "pool", CTLTYPE_NODE }, \ > + { "wait_free", CTLTYPE_INT }, \ > + { "wait_gc", CTLTYPE_INT } \ > +} > > struct kinfo_pool { > unsigned int pr_size; /* size of a pool item */ > Index: sys/sys/sysctl.h > =================================================================== > RCS file: /cvs/src/sys/sys/sysctl.h,v > retrieving revision 1.153 > diff -u -p -r1.153 sysctl.h > --- sys/sys/sysctl.h 12 Dec 2014 07:45:46 -0000 1.153 > +++ sys/sys/sysctl.h 22 Dec 2014 23:44:55 -0000 > @@ -945,7 +945,7 @@ int sysctl_vnode(char *, size_t *, struc > #ifdef GPROF > int sysctl_doprof(int *, u_int, void *, size_t *, void *, size_t); > #endif > -int sysctl_dopool(int *, u_int, char *, size_t *); > +int sysctl_pool(int *, u_int, char *, size_t *, void *, size_t); > > int kern_sysctl(int *, u_int, void *, size_t *, void *, size_t, > struct proc *);