Author: jeff
Date: Wed Oct 14 02:10:07 2015
New Revision: 289279
URL: https://svnweb.freebsd.org/changeset/base/289279

Log:
  Parallelize the buffer cache and rewrite getnewbuf().  This results in an
  8x performance improvement in a microbenchmark on a 4-socket machine.
  
   - Get buffer headers from a per-cpu uma cache that sits in front of the
     free queue.
   - Use a per-cpu quantum cache in vmem to eliminate contention for kva (see
     the sketch after the log entry).
   - Use multiple clean queues according to buffer cache size to eliminate
     clean queue lock contention.
   - Introduce a bufspace daemon that attempts to prevent getnewbuf() callers
     from blocking or doing direct recycling.
   - Close some bufspace allocation races that could lead to endless
     recycling.
   - Further the transition to a more modern style of small functions grouped
     by prefix in order to manage growing complexity.
  
  Sponsored by: EMC / Isilon
  Reviewed by:  kib
  Tested by:    pho
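
  A minimal sketch of how vmem's per-cpu quantum cache is enabled, assuming
  the in-tree <sys/vmem.h> interface.  The head/sys/vm/vm_init.c hunk is not
  visible in the truncated diff below, so the arena name, the 8 * PAGE_SIZE
  cache limit, and the helper names here are illustrative assumptions, not
  the committed code:

	#include <sys/param.h>
	#include <sys/systm.h>
	#include <sys/malloc.h>
	#include <sys/vmem.h>

	/* Hypothetical arena, standing in for buffer_arena. */
	static vmem_t *example_arena;

	static void
	example_arena_init(vmem_addr_t base, vmem_size_t size)
	{

		/*
		 * The fifth argument (qcache_max) enables vmem's built-in
		 * per-cpu quantum caches for allocations up to that size;
		 * passing 0 disables them.  Cached allocations bypass the
		 * arena lock entirely.
		 */
		example_arena = vmem_create("example arena", base, size,
		    PAGE_SIZE, 8 * PAGE_SIZE, M_WAITOK);
	}

	static int
	example_kva_alloc(vmem_size_t size, vmem_addr_t *addrp)
	{

		/* Sizes at or below qcache_max come from the per-cpu cache. */
		return (vmem_alloc(example_arena, size, M_BESTFIT | M_NOWAIT,
		    addrp));
	}

  With a nonzero qcache_max, concurrent getnewbuf()-style callers allocating
  small kva ranges no longer serialize on a single arena lock, which is the
  contention the log entry refers to.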

Modified:
  head/sys/kern/vfs_bio.c
  head/sys/vm/vm_init.c

Modified: head/sys/kern/vfs_bio.c
==============================================================================
--- head/sys/kern/vfs_bio.c     Wed Oct 14 00:43:29 2015        (r289278)
+++ head/sys/kern/vfs_bio.c     Wed Oct 14 02:10:07 2015        (r289279)
@@ -63,6 +63,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/proc.h>
 #include <sys/resourcevar.h>
 #include <sys/rwlock.h>
+#include <sys/smp.h>
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
 #include <sys/vmem.h>
@@ -100,6 +101,7 @@ caddr_t unmapped_buf;
 
 /* Used below and for softdep flushing threads in ufs/ffs/ffs_softdep.c */
 struct proc *bufdaemonproc;
+struct proc *bufspacedaemonproc;
 
 static int inmem(struct vnode *vp, daddr_t blkno);
 static void vm_hold_free_pages(struct buf *bp, int newbsize);
@@ -116,11 +118,18 @@ static void vfs_vmio_extend(struct buf *
 static int vfs_bio_clcheck(struct vnode *vp, int size,
                daddr_t lblkno, daddr_t blkno);
 static int buf_flush(struct vnode *vp, int);
+static int buf_recycle(bool);
+static int buf_scan(bool);
 static int flushbufqueues(struct vnode *, int, int);
 static void buf_daemon(void);
 static void bremfreel(struct buf *bp);
 static __inline void bd_wakeup(void);
 static int sysctl_runningspace(SYSCTL_HANDLER_ARGS);
+static void bufkva_reclaim(vmem_t *, int);
+static void bufkva_free(struct buf *);
+static int buf_import(void *, void **, int, int);
+static void buf_release(void *, void **, int);
+
 #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
     defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
 static int sysctl_bufspace(SYSCTL_HANDLER_ARGS);
@@ -145,23 +154,23 @@ static long bufkvaspace;
 SYSCTL_LONG(_vfs, OID_AUTO, bufkvaspace, CTLFLAG_RD, &bufkvaspace, 0,
     "Kernel virtual memory used for buffers");
 static long maxbufspace;
-SYSCTL_LONG(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RD, &maxbufspace, 0,
-    "Maximum allowed value of bufspace (including buf_daemon)");
+SYSCTL_LONG(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RW, &maxbufspace, 0,
+    "Maximum allowed value of bufspace (including metadata)");
 static long bufmallocspace;
 SYSCTL_LONG(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0,
     "Amount of malloced memory for buffers");
 static long maxbufmallocspace;
-SYSCTL_LONG(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace, 0,
-    "Maximum amount of malloced memory for buffers");
+SYSCTL_LONG(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace,
+    0, "Maximum amount of malloced memory for buffers");
 static long lobufspace;
-SYSCTL_LONG(_vfs, OID_AUTO, lobufspace, CTLFLAG_RD, &lobufspace, 0,
+SYSCTL_LONG(_vfs, OID_AUTO, lobufspace, CTLFLAG_RW, &lobufspace, 0,
     "Minimum amount of buffers we want to have");
 long hibufspace;
-SYSCTL_LONG(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD, &hibufspace, 0,
-    "Maximum allowed value of bufspace (excluding buf_daemon)");
-static int bufreusecnt;
-SYSCTL_INT(_vfs, OID_AUTO, bufreusecnt, CTLFLAG_RW, &bufreusecnt, 0,
-    "Number of times we have reused a buffer");
+SYSCTL_LONG(_vfs, OID_AUTO, hibufspace, CTLFLAG_RW, &hibufspace, 0,
+    "Maximum allowed value of bufspace (excluding metadata)");
+long bufspacethresh;
+SYSCTL_LONG(_vfs, OID_AUTO, bufspacethresh, CTLFLAG_RW, &bufspacethresh,
+    0, "Bufspace consumed before waking the daemon to free some");
 static int buffreekvacnt;
 SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt, 0,
     "Number of times we have freed the KVA space from some buffer");
@@ -205,10 +214,10 @@ SYSCTL_INT(_vfs, OID_AUTO, numfreebuffer
     "Number of free buffers");
 static int lofreebuffers;
 SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW, &lofreebuffers, 0,
-   "XXX Unused");
+   "Target number of free buffers");
 static int hifreebuffers;
 SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW, &hifreebuffers, 0,
-   "XXX Complicatedly unused");
+   "Threshold for clean buffer recycling");
 static int getnewbufcalls;
 SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW, &getnewbufcalls, 0,
    "Number of calls to getnewbuf");
@@ -219,6 +228,9 @@ static int mappingrestarts;
 SYSCTL_INT(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RW, &mappingrestarts, 0,
     "Number of times getblk has had to restart a buffer mapping for "
     "unmapped buffer");
+static int numbufallocfails;
+SYSCTL_INT(_vfs, OID_AUTO, numbufallocfails, CTLFLAG_RW, &numbufallocfails, 0,
+    "Number of times buffer allocations failed");
 static int flushbufqtarget = 100;
 SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0,
     "Amount of work to do in flushbufqueues when helping bufdaemon");
@@ -233,16 +245,6 @@ SYSCTL_INT(_vfs, OID_AUTO, unmapped_buf_
     "Permit the use of the unmapped i/o");
 
 /*
- * Lock for the non-dirty bufqueues
- */
-static struct mtx_padalign bqclean;
-
-/*
- * Lock for the dirty queue.
- */
-static struct mtx_padalign bqdirty;
-
-/*
  * This lock synchronizes access to bd_request.
  */
 static struct mtx_padalign bdlock;
@@ -271,6 +273,11 @@ static struct mtx_padalign bdirtylock;
 static int bd_request;
 
 /*
+ * Request/wakeup point for the bufspace daemon.
+ */
+static int bufspace_request;
+
+/*
  * Request for the buf daemon to write more buffers than is indicated by
  * lodirtybuf.  This may be necessary to push out excess dependencies or
  * defragment the address space where a simple count of the number of dirty
@@ -298,7 +305,7 @@ static int runningbufreq;
  * Synchronization (sleep/wakeup) variable for buffer requests.
  * Can contain the VFS_BIO_NEED flags defined below; setting/clearing is done
  * by and/or.
- * Used in numdirtywakeup(), bufspacewakeup(), bufcountadd(), bwillwrite(),
+ * Used in numdirtywakeup(), bufspace_wakeup(), bwillwrite(),
  * getnewbuf(), and getblk().
  */
 static volatile int needsbuffer;
@@ -311,14 +318,21 @@ static int bdirtywait;
 /*
  * Definitions for the buffer free lists.
  */
-#define BUFFER_QUEUES  4       /* number of free buffer queues */
-
 #define QUEUE_NONE     0       /* on no queue */
-#define QUEUE_CLEAN    1       /* non-B_DELWRI buffers */
+#define QUEUE_EMPTY    1       /* empty buffer headers */
 #define QUEUE_DIRTY    2       /* B_DELWRI buffers */
-#define QUEUE_EMPTY    3       /* empty buffer headers */
+#define QUEUE_CLEAN    3       /* non-B_DELWRI buffers */
 #define QUEUE_SENTINEL 1024    /* not a queue index, but mark for sentinel */
 
+/* Maximum number of clean buffer queues. */
+#define        CLEAN_QUEUES    16
+
+/* Configured number of clean queues. */
+static int clean_queues;
+
+/* Maximum number of buffer queues. */
+#define BUFFER_QUEUES  (QUEUE_CLEAN + CLEAN_QUEUES)
+
 /* Queues for free buffers with various properties */
 static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } };
 #ifdef INVARIANTS
@@ -326,15 +340,21 @@ static int bq_len[BUFFER_QUEUES];
 #endif
 
 /*
+ * Lock for each bufqueue
+ */
+static struct mtx_padalign bqlocks[BUFFER_QUEUES];
+
+/*
+ * per-cpu empty buffer cache.
+ */
+uma_zone_t buf_zone;
+
+/*
  * Single global constant for BUF_WMESG, to avoid getting multiple references.
  * buf_wmesg is referred from macros.
  */
 const char *buf_wmesg = BUF_WMESG;
 
-#define VFS_BIO_NEED_ANY       0x01    /* any freeable buffer */
-#define VFS_BIO_NEED_FREE      0x04    /* wait for free bufs, hi hysteresis */
-#define VFS_BIO_NEED_BUFSPACE  0x08    /* wait for buf space, lo hysteresis */
-
 static int
 sysctl_runningspace(SYSCTL_HANDLER_ARGS)
 {
@@ -382,6 +402,21 @@ sysctl_bufspace(SYSCTL_HANDLER_ARGS)
 }
 #endif
 
+static int
+bqcleanq(void)
+{
+       static int nextq;
+
+       return ((atomic_fetchadd_int(&nextq, 1) % clean_queues) + QUEUE_CLEAN);
+}
+
+static int
+bqisclean(int qindex)
+{
+
+       return (qindex >= QUEUE_CLEAN && qindex < QUEUE_CLEAN + CLEAN_QUEUES);
+}
+
 /*
  *     bqlock:
  *
@@ -391,9 +426,7 @@ static inline struct mtx *
 bqlock(int qindex)
 {
 
-       if (qindex == QUEUE_DIRTY)
-               return (struct mtx *)(&bqdirty);
-       return (struct mtx *)(&bqclean);
+       return (struct mtx *)&bqlocks[qindex];
 }
 
 /*
@@ -447,62 +480,255 @@ bdirtyadd(void)
 }
 
 /*
- *     bufspacewakeup:
+ *     bufspace_wakeup:
  *
  *     Called when buffer space is potentially available for recovery.
  *     getnewbuf() will block on this flag when it is unable to free 
  *     sufficient buffer space.  Buffer space becomes recoverable when 
  *     bp's get placed back in the queues.
  */
-static __inline void
-bufspacewakeup(void)
+static void
+bufspace_wakeup(void)
 {
-       int need_wakeup, on;
 
        /*
-        * If someone is waiting for bufspace, wake them up.  Even
-        * though we may not have freed the kva space yet, the waiting
-        * process will be able to now.
+        * If someone is waiting for bufspace, wake them up.
+        *
+        * Since needsbuffer is set prior to doing an additional queue
+        * scan it is safe to check for the flag prior to acquiring the
+        * lock.  The thread that is preparing to scan again before
+        * blocking would discover the buf we released.
         */
+       if (needsbuffer) {
+               rw_rlock(&nblock);
+               if (atomic_cmpset_int(&needsbuffer, 1, 0) == 1)
+                       wakeup(__DEVOLATILE(void *, &needsbuffer));
+               rw_runlock(&nblock);
+       }
+}
+
+/*
+ *     bufspace_daemonwakeup:
+ *
+ *     Wakeup the daemon responsible for freeing clean bufs.
+ */
+static void
+bufspace_daemonwakeup(void)
+{
        rw_rlock(&nblock);
-       for (;;) {
-               need_wakeup = 0;
-               on = needsbuffer;
-               if ((on & VFS_BIO_NEED_BUFSPACE) == 0)
-                       break;
-               need_wakeup = 1;
-               if (atomic_cmpset_rel_int(&needsbuffer, on,
-                   on & ~VFS_BIO_NEED_BUFSPACE))
-                       break;
+       if (bufspace_request == 0) {
+               bufspace_request = 1;
+               wakeup(&bufspace_request);
        }
-       if (need_wakeup)
-               wakeup(__DEVOLATILE(void *, &needsbuffer));
        rw_runlock(&nblock);
 }
 
 /*
- *     bufspaceadjust:
+ *     bufspace_adjust:
  *
  *     Adjust the reported bufspace for a KVA managed buffer, possibly
  *     waking any waiters.
  */
 static void
-bufspaceadjust(struct buf *bp, int bufsize)
+bufspace_adjust(struct buf *bp, int bufsize)
 {
+       long space;
        int diff;
 
        KASSERT((bp->b_flags & B_MALLOC) == 0,
-           ("bufspaceadjust: malloc buf %p", bp));
+           ("bufspace_adjust: malloc buf %p", bp));
        diff = bufsize - bp->b_bufsize;
        if (diff < 0) {
                atomic_subtract_long(&bufspace, -diff);
-               bufspacewakeup();
-       } else
-               atomic_add_long(&bufspace, diff);
+               bufspace_wakeup();
+       } else {
+               space = atomic_fetchadd_long(&bufspace, diff);
+               /* Wake up the daemon on the transition. */
+               if (space < bufspacethresh && space + diff >= bufspacethresh)
+                       bufspace_daemonwakeup();
+       }
        bp->b_bufsize = bufsize;
 }
 
 /*
+ *     bufspace_reserve:
+ *
+ *     Reserve bufspace before calling allocbuf().  Metadata has a
+ *     different space limit than data.
+ */
+static int
+bufspace_reserve(int size, bool metadata)
+{
+       long limit;
+       long space;
+
+       if (metadata)
+               limit = maxbufspace;
+       else
+               limit = hibufspace;
+       do {
+               space = bufspace;
+               if (space + size > limit)
+                       return (ENOSPC);
+       } while (atomic_cmpset_long(&bufspace, space, space + size) == 0);
+
+       /* Wake up the daemon on the transition. */
+       if (space < bufspacethresh && space + size >= bufspacethresh)
+               bufspace_daemonwakeup();
+
+       return (0);
+}
+
+/*
+ *     bufspace_release:
+ *
+ *     Release reserved bufspace after bufspace_adjust() has consumed it.
+ */
+static void
+bufspace_release(int size)
+{
+       atomic_subtract_long(&bufspace, size);
+       bufspace_wakeup();
+}
+
+/*
+ *     bufspace_wait:
+ *
+ *     Wait for bufspace, acting as the buf daemon if a locked vnode is
+ *     supplied.  needsbuffer must be set in a safe fashion prior to
+ *     polling for space.  The operation must be re-tried on return.
+ */
+static void
+bufspace_wait(struct vnode *vp, int gbflags, int slpflag, int slptimeo)
+{
+       struct thread *td;
+       int error, fl, norunbuf;
+
+       if ((gbflags & GB_NOWAIT_BD) != 0)
+               return;
+
+       td = curthread;
+       rw_wlock(&nblock);
+       while (needsbuffer != 0) {
+               if (vp != NULL && vp->v_type != VCHR &&
+                   (td->td_pflags & TDP_BUFNEED) == 0) {
+                       rw_wunlock(&nblock);
+                       /*
+                        * getblk() is called with a vnode locked, and
+                        * some majority of the dirty buffers may as
+                        * well belong to the vnode.  Flushing the
+                        * buffers there would make progress that
+                        * cannot be achieved by the buf_daemon, which
+                        * cannot lock the vnode.
+                        */
+                       norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) |
+                           (td->td_pflags & TDP_NORUNNINGBUF);
+
+                       /*
+                        * Play bufdaemon.  The getnewbuf() function
+                        * may be called while the thread owns lock
+                        * for another dirty buffer for the same
+                        * vnode, which makes it impossible to use
+                        * VOP_FSYNC() there, due to the buffer lock
+                        * recursion.
+                        */
+                       td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF;
+                       fl = buf_flush(vp, flushbufqtarget);
+                       td->td_pflags &= norunbuf;
+                       rw_wlock(&nblock);
+                       if (fl != 0)
+                               continue;
+                       if (needsbuffer == 0)
+                               break;
+               }
+               error = rw_sleep(__DEVOLATILE(void *, &needsbuffer), &nblock,
+                   (PRIBIO + 4) | slpflag, "newbuf", slptimeo);
+               if (error != 0)
+                       break;
+       }
+       rw_wunlock(&nblock);
+}
+
+
+/*
+ *     bufspace_daemon:
+ *
+ *     buffer space management daemon.  Tries to maintain some marginal
+ *     amount of free buffer space so that requesting processes neither
+ *     block nor work to reclaim buffers.
+ */
+static void
+bufspace_daemon(void)
+{
+       for (;;) {
+               kproc_suspend_check(bufspacedaemonproc);
+
+               /*
+                * Free buffers from the clean queue until we meet our
+                * targets.
+                *
+                * Theory of operation:  The buffer cache is most efficient
+                * when some free buffer headers and space are always
+                * available to getnewbuf().  This daemon attempts to prevent
+                * the excessive blocking and synchronization associated
+                * with shortfall.  It goes through three phases according
+                *      to demand:
+                *
+                * 1)   The daemon wakes up voluntarily once per second
+                *      during idle periods when the counters are below
+                *      the wakeup thresholds (bufspacethresh, lofreebuffers).
+                *
+                * 2)   The daemon wakes up as we cross the thresholds
+                *      ahead of any potential blocking.  This may bounce
+                *      slightly according to the rate of consumption and
+                *      release.
+                *
+                * 3)   The daemon and consumers are starved for working
+                *      clean buffers.  This is the 'bufspace' sleep below
+                *      which will inefficiently trade bufs with bqrelse
+                *      until we return to condition 2.
+                */
+               while (bufspace > lobufspace ||
+                   numfreebuffers < hifreebuffers) {
+                       if (buf_recycle(false) != 0) {
+                               atomic_set_int(&needsbuffer, 1);
+                               if (buf_recycle(false) != 0) {
+                                       rw_wlock(&nblock);
+                                       if (needsbuffer)
+                                               rw_sleep(__DEVOLATILE(void *,
+                                                   &needsbuffer), &nblock,
+                                                   PRIBIO|PDROP, "bufspace",
+                                                   hz/10);
+                                       else
+                                               rw_wunlock(&nblock);
+                               }
+                       }
+                       maybe_yield();
+               }
+
+               /*
+                * Re-check our limits under the exclusive nblock.
+                */
+               rw_wlock(&nblock);
+               if (bufspace < bufspacethresh &&
+                   numfreebuffers > lofreebuffers) {
+                       bufspace_request = 0;
+                       rw_sleep(&bufspace_request, &nblock, PRIBIO|PDROP,
+                           "-", hz);
+               } else
+                       rw_wunlock(&nblock);
+       }
+}
+
+static struct kproc_desc bufspace_kp = {
+       "bufspacedaemon",
+       bufspace_daemon,
+       &bufspacedaemonproc
+};
+SYSINIT(bufspacedaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start,
+    &bufspace_kp);
+
+/*
  *     bufmallocadjust:
  *
  *     Adjust the reported bufspace for a malloc managed buffer, possibly
@@ -516,10 +742,9 @@ bufmallocadjust(struct buf *bp, int bufs
        KASSERT((bp->b_flags & B_MALLOC) != 0,
            ("bufmallocadjust: non-malloc buf %p", bp));
        diff = bufsize - bp->b_bufsize;
-       if (diff < 0) {
+       if (diff < 0)
                atomic_subtract_long(&bufmallocspace, -diff);
-               bufspacewakeup();
-       } else
+       else
                atomic_add_long(&bufmallocspace, diff);
        bp->b_bufsize = bufsize;
 }
@@ -571,67 +796,6 @@ runningbufwakeup(struct buf *bp)
 }
 
 /*
- *     bufcountadd:
- *
- *     Called when a buffer has been added to one of the free queues to
- *     account for the buffer and to wakeup anyone waiting for free buffers.
- *     This typically occurs when large amounts of metadata are being handled
- *     by the buffer cache ( else buffer space runs out first, usually ).
- */
-static __inline void
-bufcountadd(struct buf *bp)
-{
-       int mask, need_wakeup, old, on;
-
-       KASSERT((bp->b_flags & B_INFREECNT) == 0,
-           ("buf %p already counted as free", bp));
-       bp->b_flags |= B_INFREECNT;
-       old = atomic_fetchadd_int(&numfreebuffers, 1);
-       KASSERT(old >= 0 && old < nbuf,
-           ("numfreebuffers climbed to %d", old + 1));
-       mask = VFS_BIO_NEED_ANY;
-       if (numfreebuffers >= hifreebuffers)
-               mask |= VFS_BIO_NEED_FREE;
-       rw_rlock(&nblock);
-       for (;;) {
-               need_wakeup = 0;
-               on = needsbuffer;
-               if (on == 0)
-                       break;
-               need_wakeup = 1;
-               if (atomic_cmpset_rel_int(&needsbuffer, on, on & ~mask))
-                       break;
-       }
-       if (need_wakeup)
-               wakeup(__DEVOLATILE(void *, &needsbuffer));
-       rw_runlock(&nblock);
-}
-
-/*
- *     bufcountsub:
- *
- *     Decrement the numfreebuffers count as needed.
- */
-static void
-bufcountsub(struct buf *bp)
-{
-       int old;
-
-       /*
-        * Fixup numfreebuffers count.  If the buffer is invalid or not
-        * delayed-write, the buffer was free and we must decrement
-        * numfreebuffers.
-        */
-       if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) {
-               KASSERT((bp->b_flags & B_INFREECNT) != 0,
-                   ("buf %p not counted in numfreebuffers", bp));
-               bp->b_flags &= ~B_INFREECNT;
-               old = atomic_fetchadd_int(&numfreebuffers, -1);
-               KASSERT(old > 0, ("numfreebuffers dropped to %d", old - 1));
-       }
-}
-
-/*
  *     waitrunningbufspace()
  *
  *     runningbufspace is a measure of the amount of I/O currently
@@ -847,8 +1011,10 @@ bufinit(void)
        int i;
 
        CTASSERT(MAXBCACHEBUF >= MAXBSIZE);
-       mtx_init(&bqclean, "bufq clean lock", NULL, MTX_DEF);
-       mtx_init(&bqdirty, "bufq dirty lock", NULL, MTX_DEF);
+       mtx_init(&bqlocks[QUEUE_DIRTY], "bufq dirty lock", NULL, MTX_DEF);
+       mtx_init(&bqlocks[QUEUE_EMPTY], "bufq empty lock", NULL, MTX_DEF);
+       for (i = QUEUE_CLEAN; i < QUEUE_CLEAN + CLEAN_QUEUES; i++)
+               mtx_init(&bqlocks[i], "bufq clean lock", NULL, MTX_DEF);
        mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF);
        rw_init(&nblock, "needsbuffer lock");
        mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF);
@@ -864,7 +1030,7 @@ bufinit(void)
        for (i = 0; i < nbuf; i++) {
                bp = &buf[i];
                bzero(bp, sizeof *bp);
-               bp->b_flags = B_INVAL | B_INFREECNT;
+               bp->b_flags = B_INVAL;
                bp->b_rcred = NOCRED;
                bp->b_wcred = NOCRED;
                bp->b_qindex = QUEUE_EMPTY;
@@ -881,18 +1047,19 @@ bufinit(void)
        /*
         * maxbufspace is the absolute maximum amount of buffer space we are 
         * allowed to reserve in KVM and in real terms.  The absolute maximum
-        * is nominally used by buf_daemon.  hibufspace is the nominal maximum
-        * used by most other processes.  The differential is required to 
-        * ensure that buf_daemon is able to run when other processes might 
-        * be blocked waiting for buffer space.
+        * is nominally used by metadata.  hibufspace is the nominal maximum
+        * used by most other requests.  The differential is required to 
+        * ensure that metadata deadlocks don't occur.
         *
         * maxbufspace is based on BKVASIZE.  Allocating buffers larger than
         * this may result in KVM fragmentation which is not handled optimally
-        * by the system.
+        * by the system. XXX This is less true with vmem.  We could use
+        * PAGE_SIZE.
         */
        maxbufspace = (long)nbuf * BKVASIZE;
        hibufspace = lmax(3 * maxbufspace / 4, maxbufspace - MAXBCACHEBUF * 10);
-       lobufspace = hibufspace - MAXBCACHEBUF;
+       lobufspace = (hibufspace / 20) * 19; /* 95% */
+       bufspacethresh = lobufspace + (hibufspace - lobufspace) / 2;
 
        /*
         * Note: The 16 MiB upper limit for hirunningspace was chosen
@@ -906,44 +1073,61 @@ bufinit(void)
            16 * 1024 * 1024), 1024 * 1024);
        lorunningspace = roundup((hirunningspace * 2) / 3, MAXBCACHEBUF);
 
-/*
- * Limit the amount of malloc memory since it is wired permanently into
- * the kernel space.  Even though this is accounted for in the buffer
- * allocation, we don't want the malloced region to grow uncontrolled.
- * The malloc scheme improves memory utilization significantly on average
- * (small) directories.
- */
+       /*
+        * Limit the amount of malloc memory since it is wired permanently into
+        * the kernel space.  Even though this is accounted for in the buffer
+        * allocation, we don't want the malloced region to grow uncontrolled.
+        * The malloc scheme improves memory utilization significantly on
+        * average (small) directories.
+        */
        maxbufmallocspace = hibufspace / 20;
 
-/*
- * Reduce the chance of a deadlock occuring by limiting the number
- * of delayed-write dirty buffers we allow to stack up.
- */
+       /*
+        * Reduce the chance of a deadlock occurring by limiting the number
+        * of delayed-write dirty buffers we allow to stack up.
+        */
        hidirtybuffers = nbuf / 4 + 20;
        dirtybufthresh = hidirtybuffers * 9 / 10;
        numdirtybuffers = 0;
-/*
- * To support extreme low-memory systems, make sure hidirtybuffers cannot
- * eat up all available buffer space.  This occurs when our minimum cannot
- * be met.  We try to size hidirtybuffers to 3/4 our buffer space assuming
- * BKVASIZE'd buffers.
- */
+       /*
+        * To support extreme low-memory systems, make sure hidirtybuffers
+        * cannot eat up all available buffer space.  This occurs when our
+        * minimum cannot be met.  We try to size hidirtybuffers to 3/4 our
+        * buffer space assuming BKVASIZE'd buffers.
+        */
        while ((long)hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) {
                hidirtybuffers >>= 1;
        }
        lodirtybuffers = hidirtybuffers / 2;
 
-/*
- * Try to keep the number of free buffers in the specified range,
- * and give special processes (e.g. like buf_daemon) access to an 
- * emergency reserve.
- */
-       lofreebuffers = nbuf / 18 + 5;
-       hifreebuffers = 2 * lofreebuffers;
+       /*
+        * lofreebuffers should be sufficient to avoid stalling waiting on
+        * buf headers under heavy utilization.  The bufs in per-cpu caches
+        * are counted as free but will be unavailable to threads executing
+        * on other cpus.
+        *
+        * hifreebuffers is the free target for the bufspace daemon.  This
+        * should be set appropriately to limit work per-iteration.
+        */
+       lofreebuffers = MIN((nbuf / 25) + (20 * mp_ncpus), 128 * mp_ncpus);
+       hifreebuffers = (3 * lofreebuffers) / 2;
        numfreebuffers = nbuf;
 
        bogus_page = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ |
            VM_ALLOC_NORMAL | VM_ALLOC_WIRED);
+
+       /* Setup the kva and free list allocators. */
+       vmem_set_reclaim(buffer_arena, bufkva_reclaim);
+       buf_zone = uma_zcache_create("buf free cache", sizeof(struct buf),
+           NULL, NULL, NULL, NULL, buf_import, buf_release, NULL, 0);
+
+       /*
+        * Size the clean queue according to the amount of buffer space.
+        * One queue per 256MB up to the max.  More queues give better
+        * concurrency but less accurate LRU.
+        */
+       clean_queues = MIN(howmany(maxbufspace, 256*1024*1024), CLEAN_QUEUES);
+
 }
 
 #ifdef INVARIANTS
@@ -1129,10 +1313,25 @@ binsfree(struct buf *bp, int qindex)
 {
        struct mtx *olock, *nlock;
 
-       BUF_ASSERT_XLOCKED(bp);
+       if (qindex != QUEUE_EMPTY) {
+               BUF_ASSERT_XLOCKED(bp);
+       }
+
+       /*
+        * Stick to the same clean queue for the lifetime of the buf to
+        * limit locking below.  Otherwise pick one sequentially.
+        */
+       if (qindex == QUEUE_CLEAN) {
+               if (bqisclean(bp->b_qindex))
+                       qindex = bp->b_qindex;
+               else
+                       qindex = bqcleanq();
+       }
 
+       /*
+        * Handle delayed bremfree() processing.
+        */
        nlock = bqlock(qindex);
-       /* Handle delayed bremfree() processing. */
        if (bp->b_flags & B_REMFREE) {
                olock = bqlock(bp->b_qindex);
                mtx_lock(olock);
@@ -1156,15 +1355,263 @@ binsfree(struct buf *bp, int qindex)
        bq_len[bp->b_qindex]++;
 #endif
        mtx_unlock(nlock);
+}
+
+/*
+ * buf_free:
+ *
+ *     Free a buffer to the buf zone once it no longer has valid contents.
+ */
+static void
+buf_free(struct buf *bp)
+{
+
+       if (bp->b_flags & B_REMFREE)
+               bremfreef(bp);
+       if (bp->b_vflags & BV_BKGRDINPROG)
+               panic("losing buffer 1");
+       if (bp->b_rcred != NOCRED) {
+               crfree(bp->b_rcred);
+               bp->b_rcred = NOCRED;
+       }
+       if (bp->b_wcred != NOCRED) {
+               crfree(bp->b_wcred);
+               bp->b_wcred = NOCRED;
+       }
+       if (!LIST_EMPTY(&bp->b_dep))
+               buf_deallocate(bp);
+       bufkva_free(bp);
+       BUF_UNLOCK(bp);
+       uma_zfree(buf_zone, bp);
+       atomic_add_int(&numfreebuffers, 1);
+       bufspace_wakeup();
+}
+
+/*
+ * buf_import:
+ *
+ *     Import bufs into the uma cache from the buf list.  The system still
+ *     expects a static array of bufs and much of the synchronization
+ *     around bufs assumes type stable storage.  As a result, UMA is used
+ *     only as a per-cpu cache of bufs still maintained on a global list.
+ */
+static int
+buf_import(void *arg, void **store, int cnt, int flags)
+{
+       struct buf *bp;
+       int i;
+
+       mtx_lock(&bqlocks[QUEUE_EMPTY]);
+       for (i = 0; i < cnt; i++) {
+               bp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
+               if (bp == NULL)
+                       break;
+               bremfreel(bp);
+               store[i] = bp;
+       }
+       mtx_unlock(&bqlocks[QUEUE_EMPTY]);
+
+       return (i);
+}
+
+/*
+ * buf_release:
+ *
+ *     Release bufs from the uma cache back to the buffer queues.
+ */
+static void
+buf_release(void *arg, void **store, int cnt)
+{
+        int i;
+
+        for (i = 0; i < cnt; i++)
+               binsfree(store[i], QUEUE_EMPTY);
+}
+
+/*
+ * buf_alloc:
+ *
+ *     Allocate an empty buffer header.
+ */
+static struct buf *
+buf_alloc(void)
+{
+       struct buf *bp;
+
+       bp = uma_zalloc(buf_zone, M_NOWAIT);
+       if (bp == NULL) {
+               bufspace_daemonwakeup();
+               atomic_add_int(&numbufallocfails, 1);
+               return (NULL);
+       }
+
+       /*
+        * Wake-up the bufspace daemon on transition.
+        */
+       if (atomic_fetchadd_int(&numfreebuffers, -1) == lofreebuffers)
+               bufspace_daemonwakeup();
+
+       if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
+               panic("getnewbuf_empty: Locked buf %p on free queue.", bp);
+       
+       KASSERT(bp->b_vp == NULL,
+           ("bp: %p still has vnode %p.", bp, bp->b_vp));
+       KASSERT((bp->b_flags & (B_DELWRI | B_NOREUSE)) == 0,
+           ("invalid buffer %p flags %#x", bp, bp->b_flags));
+       KASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0,
+           ("bp: %p still on a buffer list. xflags %X", bp, bp->b_xflags));
+       KASSERT(bp->b_npages == 0,
+           ("bp: %p still has %d vm pages\n", bp, bp->b_npages));
+       KASSERT(bp->b_kvasize == 0, ("bp: %p still has kva\n", bp));
+       KASSERT(bp->b_bufsize == 0, ("bp: %p still has bufspace\n", bp));
+
+       bp->b_flags = 0;
+       bp->b_ioflags = 0;
+       bp->b_xflags = 0;
+       bp->b_vflags = 0;
+       bp->b_vp = NULL;
+       bp->b_blkno = bp->b_lblkno = 0;
+       bp->b_offset = NOOFFSET;
+       bp->b_iodone = 0;
+       bp->b_error = 0;
+       bp->b_resid = 0;
+       bp->b_bcount = 0;
+       bp->b_npages = 0;
+       bp->b_dirtyoff = bp->b_dirtyend = 0;
+       bp->b_bufobj = NULL;
+       bp->b_pin_count = 0;
+       bp->b_data = bp->b_kvabase = unmapped_buf;
+       bp->b_fsprivate1 = NULL;
+       bp->b_fsprivate2 = NULL;
+       bp->b_fsprivate3 = NULL;
+       LIST_INIT(&bp->b_dep);
+
+       return (bp);
+}
+
+/*
+ *     buf_qrecycle:
+ *
+ *     Free a buffer from the given bufqueue.  kva controls whether the
+ *     freed buf must own some kva resources.  This is used for
+ *     defragmenting.
+ */
+static int
+buf_qrecycle(int qindex, bool kva)
+{
+       struct buf *bp, *nbp;
+
+       if (kva)
+               atomic_add_int(&bufdefragcnt, 1);
+       nbp = NULL;
+       mtx_lock(&bqlocks[qindex]);
+       nbp = TAILQ_FIRST(&bufqueues[qindex]);
+
+       /*
+        * Run scan, possibly freeing data and/or kva mappings on the fly,
+        * depending on whether we are defragmenting.
+        */
+       while ((bp = nbp) != NULL) {
+               /*
+                * Calculate next bp (we can only use it if we do not
+                * release the bqlock).
+                */
+               nbp = TAILQ_NEXT(bp, b_freelist);
+
+               /*
+                * If we are defragging then we need a buffer with 
+                * some kva to reclaim.
+                */
+               if (kva && bp->b_kvasize == 0)
+                       continue;
+
+               if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
+                       continue;
+
+               /*
+                * Skip buffers with background writes in progress.
+                */
+               if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
+                       BUF_UNLOCK(bp);
+                       continue;
+               }
+
+               KASSERT(bp->b_qindex == qindex,
+                   ("getnewbuf: inconsistent queue %d bp %p", qindex, bp));
+               /*
+                * NOTE:  nbp is now entirely invalid.  We can only restart
+                * the scan from this point on.
+                */
+               bremfreel(bp);
+               mtx_unlock(&bqlocks[qindex]);
+
+               /*
+                * Requeue the background write buffer with error and
+                * restart the scan.
+                */
+               if ((bp->b_vflags & BV_BKGRDERR) != 0) {
+                       bqrelse(bp);
+                       mtx_lock(&bqlocks[qindex]);
+                       nbp = TAILQ_FIRST(&bufqueues[qindex]);
+                       continue;
+               }
+               bp->b_flags |= B_INVAL;
+               brelse(bp);
+               return (0);
+       }
+       mtx_unlock(&bqlocks[qindex]);
+
+       return (ENOBUFS);
+}
+
+/*
+ *     buf_recycle:
+ *
+ *     Iterate through all clean queues until we find a buf to recycle or
+ *     exhaust the search.
+ */
+static int
+buf_recycle(bool kva)
+{
+       int qindex, first_qindex;
+
+       qindex = first_qindex = bqcleanq();
+       do {
+               if (buf_qrecycle(qindex, kva) == 0)
+                       return (0);
+               if (++qindex == QUEUE_CLEAN + clean_queues)
+                       qindex = QUEUE_CLEAN;
+       } while (qindex != first_qindex);
+
+       return (ENOBUFS);
+}
+
+/*
+ *     buf_scan:
+ *
+ *     Scan the clean queues looking for a buffer to recycle.  needsbuffer
+ *     is set on failure so that the caller may optionally bufspace_wait()
+ *     in a race-free fashion.
+ */
+static int
+buf_scan(bool defrag)
+{
+       int error;
 
        /*
-        * Something we can maybe free or reuse.
-        */
-       if (bp->b_bufsize && !(bp->b_flags & B_DELWRI))
-               bufspacewakeup();
-
-       if ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI))
-               bufcountadd(bp);
+        * To avoid heavy synchronization and wakeup races we set
+        * needsbuffer and re-poll before failing.  This ensures that
+        * no frees can be missed between an unsuccessful poll and
+        * going to sleep in a synchronized fashion.
+        */

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
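
For a concrete sense of the new bufinit() sizing in the hunk above, here is an
editorial worked example.  It assumes the default BKVASIZE of 16 KiB and
MAXBCACHEBUF of 64 KiB and picks nbuf = 64000; none of these numbers appear in
the commit itself:

  maxbufspace    = 64000 * 16 KiB                                 = 1000 MiB
  hibufspace     = max(3/4 * maxbufspace, maxbufspace - 640 KiB)  ~  999.4 MiB
  lobufspace     = (hibufspace / 20) * 19                         ~  949.4 MiB
  bufspacethresh = lobufspace + (hibufspace - lobufspace) / 2     ~  974.4 MiB
  clean_queues   = min(howmany(maxbufspace, 256 MiB), 16)         =    4

So the bufspace daemon is woken as buffer space crosses roughly 974 MiB,
reclaims until buffer space falls back under roughly 949 MiB and the free
buffer count recovers to hifreebuffers, and this configuration runs with 4
clean queues.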