On Mon, Dec 22, 2014 at 10:54:16AM -0500, Ted Unangst wrote:
> On Mon, Dec 22, 2014 at 14:59, Mike Belopuhov wrote:
> > 1) how is it different from what we have now?
> > 
> > 2) why can't you do it when you pool_put?
> 
> Right now, if you allocate a bunch of things, then free them, the
> pages won't be freed because the timestamp is new. But you've already
> freed everything, so there won't be a future pool_put to trigger the
> final page freeing. This keeps pages from getting stuck.
> 
> > 3) why can't you call it from uvm when there's memory pressure since
> > from what i understand pool_reclaim was supposed to work like that?
> 
> Calling reclaim only when we're low on memory is sometimes too late.

and we never do. reclaim is only called when sysctl kern.pool_debug
is fiddled with.

> 
> > 4) i assume you don't want to call pool_reclaim from pool_gc_pages
> > because of the mutex_enter_try benefits, but why does logic in
> > these functions differ, e.g. why did you omit the pr_itemsperpage
> > bit?
> 
> We're not trying to release all free pages. Only the ones that are both
> too many and too old. This is the same logic that is already in
> pool_put.
> 
> > 7) it looks like pool_reclaim_all should also raise an IPL since it
> > does the same thing.  wasn't it noteced before?
> 
> Likely. I don't think reclaim_all gets called very often.

or at all, really.

mikeb, id really appreciate it if you could benchmark with this diff.

if you're interested in seeing the effect of freeing at different
intervals, you could try the diff below. it adds kern.pool.wait_free
and kern.pool.wait_gc tunables so you can set how long a page has
to be idle before a pool_put and the gc respectively will select a
page for freeing.

id prefer to pick values that Just Work(tm) all the time, but being
able to test different values easily might be useful in the short
term.

Index: sbin/sysctl/sysctl.c
===================================================================
RCS file: /cvs/src/sbin/sysctl/sysctl.c,v
retrieving revision 1.207
diff -u -p -r1.207 sysctl.c
--- sbin/sysctl/sysctl.c        19 Nov 2014 18:04:54 -0000      1.207
+++ sbin/sysctl/sysctl.c        22 Dec 2014 23:44:52 -0000
@@ -44,6 +44,7 @@
 #include <sys/sched.h>
 #include <sys/sensors.h>
 #include <sys/vmmeter.h>
+#include <sys/pool.h>
 #include <net/route.h>
 #include <net/if.h>
 
@@ -125,6 +126,7 @@ struct ctlname semname[] = CTL_KERN_SEMI
 struct ctlname shmname[] = CTL_KERN_SHMINFO_NAMES;
 struct ctlname watchdogname[] = CTL_KERN_WATCHDOG_NAMES;
 struct ctlname tcname[] = CTL_KERN_TIMECOUNTER_NAMES;
+struct ctlname poolname[] = CTL_KERN_POOL_NAMES;
 struct ctlname *vfsname;
 #ifdef CTL_MACHDEP_NAMES
 struct ctlname machdepname[] = CTL_MACHDEP_NAMES;
@@ -207,6 +209,7 @@ int sysctl_seminfo(char *, char **, int 
 int sysctl_shminfo(char *, char **, int *, int, int *);
 int sysctl_watchdog(char *, char **, int *, int, int *);
 int sysctl_tc(char *, char **, int *, int, int *);
+int sysctl_pool(char *, char **, int *, int, int *);
 int sysctl_sensors(char *, char **, int *, int, int *);
 void print_sensordev(char *, int *, u_int, struct sensordev *);
 void print_sensor(struct sensor *);
@@ -388,6 +391,11 @@ parse(char *string, int flags)
                                return;
                        warnx("use dmesg to view %s", string);
                        return;
+               case KERN_POOL:
+                       len = sysctl_pool(string, &bufp, mib, flags, &type);
+                       if (len < 0)
+                               return;
+                       break;
                case KERN_PROC:
                        if (flags == 0)
                                return;
@@ -1633,6 +1641,7 @@ struct list semlist = { semname, KERN_SE
 struct list shmlist = { shmname, KERN_SHMINFO_MAXID };
 struct list watchdoglist = { watchdogname, KERN_WATCHDOG_MAXID };
 struct list tclist = { tcname, KERN_TIMECOUNTER_MAXID };
+struct list poollist = { poolname, KERN_POOL_MAXID };
 
 /*
  * handle vfs namei cache statistics
@@ -2302,6 +2311,25 @@ sysctl_tc(char *string, char **bufpp, in
                return (-1);
        mib[2] = indx;
        *typep = tclist.list[indx].ctl_type;
+       return (3);
+}
+
+/*
+ * Handle pools support
+ */
+int
+sysctl_pool(char *string, char **bufpp, int mib[], int flags, int *typep)
+{
+       int indx;
+
+       if (*bufpp == NULL) {
+               listall(string, &poollist);
+               return (-1);
+       }
+       if ((indx = findname(string, "third", bufpp, &poollist)) == -1)
+               return (-1);
+       mib[2] = indx;
+       *typep = poollist.list[indx].ctl_type;
        return (3);
 }
 
Index: sys/kern/init_main.c
===================================================================
RCS file: /cvs/src/sys/kern/init_main.c,v
retrieving revision 1.227
diff -u -p -r1.227 init_main.c
--- sys/kern/init_main.c        15 Dec 2014 02:24:23 -0000      1.227
+++ sys/kern/init_main.c        22 Dec 2014 23:44:55 -0000
@@ -121,6 +121,8 @@ extern struct timeout setperf_to;
 void setperf_auto(void *);
 #endif
 
+extern struct timeout pool_gc_tick;
+
 int    cmask = CMASK;
 extern struct user *proc0paddr;
 
@@ -554,6 +556,11 @@ main(void *framep)
 #ifndef SMALL_KERNEL
        timeout_set(&setperf_to, setperf_auto, NULL);
 #endif
+
+       /*
+        * Start the idle pool page garbage collector
+        */
+       timeout_add_sec(&pool_gc_tick, 1);
 
         /*
          * proc0: nothing to do, back to sleep
Index: sys/kern/kern_sysctl.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_sysctl.c,v
retrieving revision 1.277
diff -u -p -r1.277 kern_sysctl.c
--- sys/kern/kern_sysctl.c      12 Dec 2014 07:45:46 -0000      1.277
+++ sys/kern/kern_sysctl.c      22 Dec 2014 23:44:55 -0000
@@ -496,7 +496,8 @@ kern_sysctl(int *name, u_int namelen, vo
        case KERN_NPROCS:
                return (sysctl_rdint(oldp, oldlenp, newp, nprocesses));
        case KERN_POOL:
-               return (sysctl_dopool(name + 1, namelen - 1, oldp, oldlenp));
+               return (sysctl_pool(name + 1, namelen - 1, oldp, oldlenp,
+                   newp, newlen));
        case KERN_STACKGAPRANDOM:
                stackgap = stackgap_random;
                error = sysctl_int(oldp, oldlenp, newp, newlen, &stackgap);
Index: sys/kern/subr_pool.c
===================================================================
RCS file: /cvs/src/sys/kern/subr_pool.c,v
retrieving revision 1.173
diff -u -p -r1.173 subr_pool.c
--- sys/kern/subr_pool.c        22 Dec 2014 00:33:40 -0000      1.173
+++ sys/kern/subr_pool.c        22 Dec 2014 23:44:55 -0000
@@ -40,6 +40,8 @@
 #include <sys/syslog.h>
 #include <sys/rwlock.h>
 #include <sys/sysctl.h>
+#include <sys/task.h>
+#include <sys/timeout.h>
 
 #include <uvm/uvm_extern.h>
 
@@ -160,6 +162,14 @@ void        pool_print1(struct pool *, const c
 
 #define pool_sleep(pl) msleep(pl, &pl->pr_mtx, PSWP, pl->pr_wchan, 0)
 
+/* stale page garbage collectors */
+void   pool_gc_sched(void *);
+struct timeout pool_gc_tick = TIMEOUT_INITIALIZER(pool_gc_sched, NULL);
+void   pool_gc_pages(void *, void *);
+struct task pool_gc_task = TASK_INITIALIZER(pool_gc_pages, NULL, NULL);
+int pool_wait_free = 1;
+int pool_wait_gc = 8;
+
 static inline int
 phtree_compare(struct pool_item_header *a, struct pool_item_header *b)
 {
@@ -678,7 +688,7 @@ pool_put(struct pool *pp, void *v)
        /* is it time to free a page? */
        if (pp->pr_nidle > pp->pr_maxpages &&
            (ph = TAILQ_FIRST(&pp->pr_emptypages)) != NULL &&
-           (ticks - ph->ph_tick) > hz) {
+           (ticks - ph->ph_tick) > (hz * pool_wait_free)) {
                freeph = ph;
                pool_p_remove(pp, freeph);
        }
@@ -1269,11 +1279,13 @@ pool_walk(struct pool *pp, int full,
  * kern.pool.name.<pool#> - the name for pool#.
  */
 int
-sysctl_dopool(int *name, u_int namelen, char *oldp, size_t *oldlenp)
+sysctl_pool(int *name, u_int namelen, char *oldp, size_t *oldlenp,
+    void *newp, size_t newlen)
 {
        struct kinfo_pool pi;
        struct pool *pp;
        int rv = ENOENT;
+       int wait;
 
        switch (name[0]) {
        case KERN_POOL_NPOOLS:
@@ -1284,6 +1296,35 @@ sysctl_dopool(int *name, u_int namelen, 
        case KERN_POOL_NAME:
        case KERN_POOL_POOL:
                break;
+
+       case KERN_POOL_WAITFREE:
+               if (namelen != 1)
+                       return (ENOTDIR);
+               wait = pool_wait_free;
+               rv = sysctl_int(oldp, oldlenp, newp, newlen, &wait);
+               if (rv != 0)
+                       return (rv);
+               if (newp == NULL)
+                       return (0);
+               if (wait < 1 || wait >= pool_wait_gc)
+                       return (EINVAL);
+               pool_wait_free = wait;
+               return (0);
+
+       case KERN_POOL_WAITGC:
+               if (namelen != 1)
+                       return (ENOTDIR);
+               wait = pool_wait_gc;
+               rv = sysctl_int(oldp, oldlenp, newp, newlen, &wait);
+               if (rv != 0)
+                       return (rv);
+               if (newp == NULL)
+                       return (0);
+               if (wait <= pool_wait_free || wait > 3600)
+                       return (EINVAL);
+               pool_wait_gc = wait;
+               return (0);
+
        default:
                return (EOPNOTSUPP);
        }
@@ -1337,6 +1378,46 @@ done:
        rw_exit_read(&pool_lock);
 
        return (rv);
+}
+
+void
+pool_gc_sched(void *x)
+{
+       task_add(systq, &pool_gc_task);
+}
+
+void
+pool_gc_pages(void *x, void *y)
+{
+       extern int ticks;
+       struct pool *pp;
+       struct pool_item_header *ph, *freeph;
+       int s;
+
+       rw_enter_read(&pool_lock);
+       s = splvm();
+       SIMPLEQ_FOREACH(pp, &pool_head, pr_poollist) {
+               if (!mtx_enter_try(&pp->pr_mtx))
+                       continue;
+
+               /* is it time to free a page? */
+               if (pp->pr_nidle > pp->pr_minpages &&
+                   (ph = TAILQ_FIRST(&pp->pr_emptypages)) != NULL &&
+                   (ticks - ph->ph_tick) > (hz * pool_wait_gc)) {
+                       freeph = ph;
+                       pool_p_remove(pp, freeph);
+               } else
+                       freeph = NULL;
+
+               mtx_leave(&pp->pr_mtx);
+
+               if (freeph != NULL)
+                       pool_p_free(pp, freeph);
+       }
+       splx(s); /* XXX */
+       rw_exit_read(&pool_lock);
+
+       timeout_add_sec(&pool_gc_tick, 1);
 }
 
 /*
Index: sys/sys/pool.h
===================================================================
RCS file: /cvs/src/sys/sys/pool.h,v
retrieving revision 1.55
diff -u -p -r1.55 pool.h
--- sys/sys/pool.h      19 Dec 2014 02:15:25 -0000      1.55
+++ sys/sys/pool.h      22 Dec 2014 23:44:55 -0000
@@ -39,10 +39,24 @@
  * kern.pool.npools
  * kern.pool.name.<number>
  * kern.pool.pool.<number>
+ * kern.pool.wait_free
+ * kern.pool.wait_gc
  */
 #define KERN_POOL_NPOOLS       1
 #define KERN_POOL_NAME         2
 #define KERN_POOL_POOL         3
+#define KERN_POOL_WAITFREE     4
+#define KERN_POOL_WAITGC       5
+#define KERN_POOL_MAXID                6
+
+#define CTL_KERN_POOL_NAMES { \
+       { 0, 0 }, \
+       { "npools", CTLTYPE_INT }, \
+       { "name", CTLTYPE_NODE }, \
+       { "pool", CTLTYPE_NODE }, \
+       { "wait_free", CTLTYPE_INT }, \
+       { "wait_gc", CTLTYPE_INT } \
+}
 
 struct kinfo_pool {
        unsigned int    pr_size;        /* size of a pool item */
Index: sys/sys/sysctl.h
===================================================================
RCS file: /cvs/src/sys/sys/sysctl.h,v
retrieving revision 1.153
diff -u -p -r1.153 sysctl.h
--- sys/sys/sysctl.h    12 Dec 2014 07:45:46 -0000      1.153
+++ sys/sys/sysctl.h    22 Dec 2014 23:44:55 -0000
@@ -945,7 +945,7 @@ int sysctl_vnode(char *, size_t *, struc
 #ifdef GPROF
 int sysctl_doprof(int *, u_int, void *, size_t *, void *, size_t);
 #endif
-int sysctl_dopool(int *, u_int, char *, size_t *);
+int sysctl_pool(int *, u_int, char *, size_t *, void *, size_t);
 
 int kern_sysctl(int *, u_int, void *, size_t *, void *, size_t,
                     struct proc *);

Reply via email to