Author: jeff
Date: Tue Feb 20 00:06:07 2018
New Revision: 329612
URL: https://svnweb.freebsd.org/changeset/base/329612

Log:
  Further parallelize the buffer cache.
  
  Provide multiple clean queues partitioned into 'domains'.  Each domain manages
  its own bufspace and has its own bufspace daemon.  Each domain has a set of
  subqueues indexed by the current cpuid to reduce lock contention on the
  cleanq.
  
  Refine the sleep/wakeup around the bufspace daemon to use atomics as much as
  possible.
  
  Add a B_REUSE flag that is used to requeue bufs during the scan to approximate
  LRU rather than locking the queue on every use of a frequently accessed buf.
  
  Implement bufspace_reserve with only atomic_fetchadd to avoid loop restarts.
  
  Reviewed by:  markj
  Tested by:    pho
  Sponsored by: Netflix, Dell/EMC Isilon
  Differential Revision:        https://reviews.freebsd.org/D14274
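
The bufspace_reserve() change in the last log item replaces a compare-and-swap
retry loop with a single optimistic atomic_fetchadd: the caller adds its size,
and if the result overshoots the limit it subtracts the size back out and
returns ENOSPC, so contending callers never restart.  The daemon wakeup
piggybacks on the returned old value and fires only on the transition across
the threshold.  A minimal userland sketch of that pattern follows; the struct,
the fixed limits and the printf stand-in for the wakeup are assumptions for
illustration, not kernel interfaces.

/*
 * Userland sketch of the loop-free reservation pattern used by
 * bufspace_reserve() in this revision: optimistically fetch-add the
 * size, undo on overshoot, wake the space daemon only on the
 * threshold-crossing transition.  Names and limits are illustrative.
 */
#include <errno.h>
#include <stdatomic.h>
#include <stdio.h>

struct space_domain {
        atomic_long     used;           /* bytes currently reserved */
        long            limit;          /* hard limit (hibufspace analogue) */
        long            thresh;         /* daemon wakeup threshold */
};

static void
space_daemon_wakeup(struct space_domain *dom)
{

        /* Stand-in for bufspace_daemon_wakeup(). */
        printf("wakeup: used=%ld thresh=%ld\n",
            atomic_load(&dom->used), dom->thresh);
}

static int
space_reserve(struct space_domain *dom, long size)
{
        long old, new;

        /* One fetch-add instead of a compare-and-swap retry loop. */
        old = atomic_fetch_add(&dom->used, size);
        new = old + size;
        if (new > dom->limit) {
                /* Overshot the limit: back the reservation out and fail. */
                atomic_fetch_sub(&dom->used, size);
                return (ENOSPC);
        }
        /* Wake the daemon only when this caller crosses the threshold. */
        if (old < dom->thresh && new >= dom->thresh)
                space_daemon_wakeup(dom);
        return (0);
}

int
main(void)
{
        struct space_domain dom = { .limit = 1024, .thresh = 768 };

        atomic_init(&dom.used, 0);
        printf("reserve 700: %d\n", space_reserve(&dom, 700));
        printf("reserve 100: %d\n", space_reserve(&dom, 100)); /* crosses thresh */
        printf("reserve 400: %d\n", space_reserve(&dom, 400)); /* ENOSPC */
        return (0);
}

Tolerating the momentary overshoot is the price of the loop-free path; the
rollback keeps the counter correct, and the transition-based wakeup lets the
bufspace daemon pull the domain back under its threshold.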

Modified:
  head/sys/kern/vfs_bio.c
  head/sys/kern/vfs_subr.c
  head/sys/sys/buf.h
  head/sys/sys/bufobj.h

Modified: head/sys/kern/vfs_bio.c
==============================================================================
--- head/sys/kern/vfs_bio.c     Mon Feb 19 22:56:04 2018        (r329611)
+++ head/sys/kern/vfs_bio.c     Tue Feb 20 00:06:07 2018        (r329612)
@@ -51,6 +51,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/conf.h>
+#include <sys/counter.h>
 #include <sys/buf.h>
 #include <sys/devicestat.h>
 #include <sys/eventhandler.h>
@@ -105,7 +106,6 @@ caddr_t unmapped_buf;
 
 /* Used below and for softdep flushing threads in ufs/ffs/ffs_softdep.c */
 struct proc *bufdaemonproc;
-struct proc *bufspacedaemonproc;
 
 static int inmem(struct vnode *vp, daddr_t blkno);
 static void vm_hold_free_pages(struct buf *bp, int newbsize);
@@ -124,11 +124,8 @@ static int vfs_bio_clcheck(struct vnode *vp, int size,
 static void breada(struct vnode *, daddr_t *, int *, int, struct ucred *, int,
                void (*)(struct buf *));
 static int buf_flush(struct vnode *vp, int);
-static int buf_recycle(bool);
-static int buf_scan(bool);
 static int flushbufqueues(struct vnode *, int, int);
 static void buf_daemon(void);
-static void bremfreel(struct buf *bp);
 static __inline void bd_wakeup(void);
 static int sysctl_runningspace(SYSCTL_HANDLER_ARGS);
 static void bufkva_reclaim(vmem_t *, int);
@@ -137,28 +134,17 @@ static int buf_import(void *, void **, int, int, int);
 static void buf_release(void *, void **, int);
 static void maxbcachebuf_adjust(void);
 
-#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
-    defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
 static int sysctl_bufspace(SYSCTL_HANDLER_ARGS);
-#endif
-
 int vmiodirenable = TRUE;
 SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW, &vmiodirenable, 0,
     "Use the VM system for directory writes");
 long runningbufspace;
 SYSCTL_LONG(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0,
     "Amount of presently outstanding async buffer io");
-static long bufspace;
-#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
-    defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
 SYSCTL_PROC(_vfs, OID_AUTO, bufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RD,
-    &bufspace, 0, sysctl_bufspace, "L", "Virtual memory used for buffers");
-#else
-SYSCTL_LONG(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0,
-    "Physical memory used for buffers");
-#endif
-static long bufkvaspace;
-SYSCTL_LONG(_vfs, OID_AUTO, bufkvaspace, CTLFLAG_RD, &bufkvaspace, 0,
+    NULL, 0, sysctl_bufspace, "L", "Physical memory used for buffers");
+static counter_u64_t bufkvaspace;
+SYSCTL_COUNTER_U64(_vfs, OID_AUTO, bufkvaspace, CTLFLAG_RD, &bufkvaspace,
     "Kernel virtual memory used for buffers");
 static long maxbufspace;
 SYSCTL_LONG(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RW, &maxbufspace, 0,
@@ -178,11 +164,11 @@ SYSCTL_LONG(_vfs, OID_AUTO, hibufspace, CTLFLAG_RW, &h
 long bufspacethresh;
 SYSCTL_LONG(_vfs, OID_AUTO, bufspacethresh, CTLFLAG_RW, &bufspacethresh,
     0, "Bufspace consumed before waking the daemon to free some");
-static int buffreekvacnt;
-SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt, 0,
+static counter_u64_t buffreekvacnt;
+SYSCTL_COUNTER_U64(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt,
     "Number of times we have freed the KVA space from some buffer");
-static int bufdefragcnt;
-SYSCTL_INT(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt, 0,
+static counter_u64_t bufdefragcnt;
+SYSCTL_COUNTER_U64(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt,
     "Number of times we have had to repeat buffer allocation to defragment");
 static long lorunningspace;
 SYSCTL_PROC(_vfs, OID_AUTO, lorunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE |
@@ -225,24 +211,26 @@ SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW, 
 static int hifreebuffers;
 SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW, &hifreebuffers, 0,
    "Threshold for clean buffer recycling");
-static int getnewbufcalls;
-SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW, &getnewbufcalls, 0,
-   "Number of calls to getnewbuf");
-static int getnewbufrestarts;
-SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW, &getnewbufrestarts, 0,
+static counter_u64_t getnewbufcalls;
+SYSCTL_COUNTER_U64(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RD,
+   &getnewbufcalls, "Number of calls to getnewbuf");
+static counter_u64_t getnewbufrestarts;
+SYSCTL_COUNTER_U64(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RD,
+    &getnewbufrestarts,
     "Number of times getnewbuf has had to restart a buffer acquisition");
-static int mappingrestarts;
-SYSCTL_INT(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RW, &mappingrestarts, 0,
+static counter_u64_t mappingrestarts;
+SYSCTL_COUNTER_U64(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RD,
+    &mappingrestarts,
     "Number of times getblk has had to restart a buffer mapping for "
     "unmapped buffer");
-static int numbufallocfails;
-SYSCTL_INT(_vfs, OID_AUTO, numbufallocfails, CTLFLAG_RW, &numbufallocfails, 0,
-    "Number of times buffer allocations failed");
+static counter_u64_t numbufallocfails;
+SYSCTL_COUNTER_U64(_vfs, OID_AUTO, numbufallocfails, CTLFLAG_RW,
+    &numbufallocfails, "Number of times buffer allocations failed");
 static int flushbufqtarget = 100;
 SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0,
     "Amount of work to do in flushbufqueues when helping bufdaemon");
-static long notbufdflushes;
-SYSCTL_LONG(_vfs, OID_AUTO, notbufdflushes, CTLFLAG_RD, &notbufdflushes, 0,
+static counter_u64_t notbufdflushes;
+SYSCTL_COUNTER_U64(_vfs, OID_AUTO, notbufdflushes, CTLFLAG_RD, &notbufdflushes,
     "Number of dirty buffer flushes done by the bufdaemon helpers");
 static long barrierwrites;
 SYSCTL_LONG(_vfs, OID_AUTO, barrierwrites, CTLFLAG_RW, &barrierwrites, 0,
@@ -266,11 +254,6 @@ static struct mtx_padalign __exclusive_cache_line bdlo
 static struct mtx_padalign __exclusive_cache_line rbreqlock;
 
 /*
- * Lock that protects needsbuffer and the sleeps/wakeups surrounding it.
- */
-static struct rwlock_padalign __exclusive_cache_line nblock;
-
-/*
  * Lock that protects bdirtywait.
  */
 static struct mtx_padalign __exclusive_cache_line bdirtylock;
@@ -283,11 +266,6 @@ static struct mtx_padalign __exclusive_cache_line bdir
 static int bd_request;
 
 /*
- * Request/wakeup point for the bufspace daemon.
- */
-static int bufspace_request;
-
-/*
  * Request for the buf daemon to write more buffers than is indicated by
  * lodirtybuf.  This may be necessary to push out excess dependencies or
  * defragment the address space where a simple count of the number of dirty
@@ -302,15 +280,6 @@ static int bd_speedupreq;
  */
 static int runningbufreq;
 
-/* 
- * Synchronization (sleep/wakeup) variable for buffer requests.
- * Can contain the VFS_BIO_NEED flags defined below; setting/clearing is done
- * by and/or.
- * Used in numdirtywakeup(), bufspace_wakeup(), bwillwrite(),
- * getnewbuf(), and getblk().
- */
-static volatile int needsbuffer;
-
 /*
  * Synchronization for bwillwrite() waiters.
  */
@@ -323,29 +292,69 @@ static int bdirtywait;
 #define QUEUE_EMPTY    1       /* empty buffer headers */
 #define QUEUE_DIRTY    2       /* B_DELWRI buffers */
 #define QUEUE_CLEAN    3       /* non-B_DELWRI buffers */
-#define QUEUE_SENTINEL 1024    /* not an queue index, but mark for sentinel */
+#define QUEUE_SENTINEL 4       /* not an queue index, but mark for sentinel */
 
-/* Maximum number of clean buffer queues. */
-#define        CLEAN_QUEUES    16
+struct bufqueue {
+       struct mtx_padalign     bq_lock;
+       TAILQ_HEAD(, buf)       bq_queue;
+       uint8_t                 bq_index;
+       uint16_t                bq_subqueue;
+       int                     bq_len;
+} __aligned(CACHE_LINE_SIZE);
 
+#define        BQ_LOCKPTR(bq)          (&(bq)->bq_lock)
+#define        BQ_LOCK(bq)             mtx_lock(BQ_LOCKPTR((bq)))
+#define        BQ_UNLOCK(bq)           mtx_unlock(BQ_LOCKPTR((bq)))
+#define        BQ_ASSERT_LOCKED(bq)    mtx_assert(BQ_LOCKPTR((bq)), MA_OWNED)
+
+struct bufqueue __exclusive_cache_line bqempty;
+struct bufqueue __exclusive_cache_line bqdirty;
+
+struct bufdomain {
+       struct bufqueue bd_subq[MAXCPU + 1]; /* Per-cpu sub queues + global */
+       struct bufqueue *bd_cleanq;
+       struct mtx_padalign bd_run_lock;
+       /* Constants */
+       long            bd_maxbufspace;
+       long            bd_hibufspace;
+       long            bd_lobufspace;
+       long            bd_bufspacethresh;
+       int             bd_hifreebuffers;
+       int             bd_lofreebuffers;
+       int             bd_lim;
+       /* atomics */
+       int             bd_wanted;
+       int  __aligned(CACHE_LINE_SIZE) bd_running;
+       long __aligned(CACHE_LINE_SIZE) bd_bufspace;
+       int __aligned(CACHE_LINE_SIZE)  bd_freebuffers;
+} __aligned(CACHE_LINE_SIZE);
+
+#define        BD_LOCKPTR(bd)          (&(bd)->bd_cleanq->bq_lock)
+#define        BD_LOCK(bd)             mtx_lock(BD_LOCKPTR((bd)))
+#define        BD_UNLOCK(bd)           mtx_unlock(BD_LOCKPTR((bd)))
+#define        BD_ASSERT_LOCKED(bd)    mtx_assert(BD_LOCKPTR((bd)), MA_OWNED)
+#define        BD_RUN_LOCKPTR(bd)      (&(bd)->bd_run_lock)
+#define        BD_RUN_LOCK(bd)         mtx_lock(BD_RUN_LOCKPTR((bd)))
+#define        BD_RUN_UNLOCK(bd)       mtx_unlock(BD_RUN_LOCKPTR((bd)))
+#define        BD_DOMAIN(bd)           (bd - bdclean)
+
+/* Maximum number of clean buffer domains. */
+#define        CLEAN_DOMAINS   8
+
 /* Configured number of clean queues. */
-static int clean_queues;
+static int __read_mostly clean_domains;
 
-/* Maximum number of buffer queues. */
-#define BUFFER_QUEUES  (QUEUE_CLEAN + CLEAN_QUEUES)
+struct bufdomain __exclusive_cache_line bdclean[CLEAN_DOMAINS];
 
-/* Queues for free buffers with various properties */
-static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } };
-#ifdef INVARIANTS
-static int bq_len[BUFFER_QUEUES];
-#endif
+static void bq_remove(struct bufqueue *bq, struct buf *bp);
+static void bq_insert(struct bufqueue *bq, struct buf *bp, bool unlock);
+static int buf_recycle(struct bufdomain *, bool kva);
+static void bq_init(struct bufqueue *bq, int qindex, int cpu,
+           const char *lockname);
+static void bd_init(struct bufdomain *bd);
+static int bd_flushall(struct bufdomain *bd);
 
 /*
- * Lock for each bufqueue
- */
-static struct mtx_padalign __exclusive_cache_line bqlocks[BUFFER_QUEUES];
-
-/*
  * per-cpu empty buffer cache.
  */
 uma_zone_t buf_zone;
@@ -391,46 +400,34 @@ sysctl_bufspace(SYSCTL_HANDLER_ARGS)
 {
        long lvalue;
        int ivalue;
+       int i;
 
+       lvalue = 0;
+       for (i = 0; i < clean_domains; i++)
+               lvalue += bdclean[i].bd_bufspace;
        if (sizeof(int) == sizeof(long) || req->oldlen >= sizeof(long))
-               return (sysctl_handle_long(oidp, arg1, arg2, req));
-       lvalue = *(long *)arg1;
+               return (sysctl_handle_long(oidp, &lvalue, 0, req));
        if (lvalue > INT_MAX)
                /* On overflow, still write out a long to trigger ENOMEM. */
                return (sysctl_handle_long(oidp, &lvalue, 0, req));
        ivalue = lvalue;
        return (sysctl_handle_int(oidp, &ivalue, 0, req));
 }
-#endif
-
+#else
 static int
-bqcleanq(void)
+sysctl_bufspace(SYSCTL_HANDLER_ARGS)
 {
-       static int nextq;
+       long lvalue;
+       int i;
 
-       return ((atomic_fetchadd_int(&nextq, 1) % clean_queues) + QUEUE_CLEAN);
+       lvalue = 0;
+       for (i = 0; i < clean_domains; i++)
+               lvalue += bdclean[i].bd_bufspace;
+       return (sysctl_handle_int(oidp, &lvalue, 0, req));
 }
+#endif
 
-static int
-bqisclean(int qindex)
-{
-
-       return (qindex >= QUEUE_CLEAN && qindex < QUEUE_CLEAN + CLEAN_QUEUES);
-}
-
 /*
- *     bqlock:
- *
- *     Return the appropriate queue lock based on the index.
- */
-static inline struct mtx *
-bqlock(int qindex)
-{
-
-       return (struct mtx *)&bqlocks[qindex];
-}
-
-/*
  *     bdirtywakeup:
  *
  *     Wakeup any bwillwrite() waiters.
@@ -481,47 +478,50 @@ bdirtyadd(void)
 }
 
 /*
- *     bufspace_wakeup:
+ *     bufspace_daemon_wakeup:
  *
- *     Called when buffer space is potentially available for recovery.
- *     getnewbuf() will block on this flag when it is unable to free 
- *     sufficient buffer space.  Buffer space becomes recoverable when 
- *     bp's get placed back in the queues.
+ *     Wakeup the daemons responsible for freeing clean bufs.
  */
 static void
-bufspace_wakeup(void)
+bufspace_daemon_wakeup(struct bufdomain *bd)
 {
 
        /*
-        * If someone is waiting for bufspace, wake them up.
-        *
-        * Since needsbuffer is set prior to doing an additional queue
-        * scan it is safe to check for the flag prior to acquiring the
-        * lock.  The thread that is preparing to scan again before
-        * blocking would discover the buf we released.
+        * avoid the lock if the daemon is running.
         */
-       if (needsbuffer) {
-               rw_rlock(&nblock);
-               if (atomic_cmpset_int(&needsbuffer, 1, 0) == 1)
-                       wakeup(__DEVOLATILE(void *, &needsbuffer));
-               rw_runlock(&nblock);
+       if (atomic_fetchadd_int(&bd->bd_running, 1) == 0) {
+               BD_RUN_LOCK(bd);
+               atomic_store_int(&bd->bd_running, 1);
+               wakeup(&bd->bd_running);
+               BD_RUN_UNLOCK(bd);
        }
 }
 
 /*
- *     bufspace_daemonwakeup:
+ *     bufspace_daemon_wait:
  *
- *     Wakeup the daemon responsible for freeing clean bufs.
+ *     Sleep until the domain falls below a limit or one second passes.
  */
 static void
-bufspace_daemonwakeup(void)
+bufspace_daemon_wait(struct bufdomain *bd)
 {
-       rw_rlock(&nblock);
-       if (bufspace_request == 0) {
-               bufspace_request = 1;
-               wakeup(&bufspace_request);
+       /*
+        * Re-check our limits and sleep.  bd_running must be
+        * cleared prior to checking the limits to avoid missed
+        * wakeups.  The waker will adjust one of bufspace or
+        * freebuffers prior to checking bd_running.
+        */
+       BD_RUN_LOCK(bd);
+       atomic_store_int(&bd->bd_running, 0);
+       if (bd->bd_bufspace < bd->bd_bufspacethresh &&
+           bd->bd_freebuffers > bd->bd_lofreebuffers) {
+               msleep(&bd->bd_running, BD_RUN_LOCKPTR(bd), PRIBIO|PDROP,
+                   "-", hz);
+       } else {
+               /* Avoid spurious wakeups while running. */
+               atomic_store_int(&bd->bd_running, 1);
+               BD_RUN_UNLOCK(bd);
        }
-       rw_runlock(&nblock);
 }
 
 /*
@@ -533,20 +533,22 @@ bufspace_daemonwakeup(void)
 static void
 bufspace_adjust(struct buf *bp, int bufsize)
 {
+       struct bufdomain *bd;
        long space;
        int diff;
 
        KASSERT((bp->b_flags & B_MALLOC) == 0,
            ("bufspace_adjust: malloc buf %p", bp));
+       bd = &bdclean[bp->b_domain];
        diff = bufsize - bp->b_bufsize;
        if (diff < 0) {
-               atomic_subtract_long(&bufspace, -diff);
-               bufspace_wakeup();
+               atomic_subtract_long(&bd->bd_bufspace, -diff);
        } else {
-               space = atomic_fetchadd_long(&bufspace, diff);
+               space = atomic_fetchadd_long(&bd->bd_bufspace, diff);
                /* Wake up the daemon on the transition. */
-               if (space < bufspacethresh && space + diff >= bufspacethresh)
-                       bufspace_daemonwakeup();
+               if (space < bd->bd_bufspacethresh &&
+                   space + diff >= bd->bd_bufspacethresh)
+                       bufspace_daemon_wakeup(bd);
        }
        bp->b_bufsize = bufsize;
 }
@@ -558,24 +560,25 @@ bufspace_adjust(struct buf *bp, int bufsize)
  *     different space limit than data.
  */
 static int
-bufspace_reserve(int size, bool metadata)
+bufspace_reserve(struct bufdomain *bd, int size, bool metadata)
 {
-       long limit;
+       long limit, new;
        long space;
 
        if (metadata)
-               limit = maxbufspace;
+               limit = bd->bd_maxbufspace;
        else
-               limit = hibufspace;
-       do {
-               space = bufspace;
-               if (space + size > limit)
-                       return (ENOSPC);
-       } while (atomic_cmpset_long(&bufspace, space, space + size) == 0);
+               limit = bd->bd_hibufspace;
+       space = atomic_fetchadd_long(&bd->bd_bufspace, size);
+       new = space + size;
+       if (new > limit) {
+               atomic_subtract_long(&bd->bd_bufspace, size);
+               return (ENOSPC);
+       }
 
        /* Wake up the daemon on the transition. */
-       if (space < bufspacethresh && space + size >= bufspacethresh)
-               bufspace_daemonwakeup();
+       if (space < bd->bd_bufspacethresh && new >= bd->bd_bufspacethresh)
+               bufspace_daemon_wakeup(bd);
 
        return (0);
 }
@@ -586,21 +589,22 @@ bufspace_reserve(int size, bool metadata)
  *     Release reserved bufspace after bufspace_adjust() has consumed it.
  */
 static void
-bufspace_release(int size)
+bufspace_release(struct bufdomain *bd, int size)
 {
-       atomic_subtract_long(&bufspace, size);
-       bufspace_wakeup();
+
+       atomic_subtract_long(&bd->bd_bufspace, size);
 }
 
 /*
  *     bufspace_wait:
  *
  *     Wait for bufspace, acting as the buf daemon if a locked vnode is
- *     supplied.  needsbuffer must be set in a safe fashion prior to
- *     polling for space.  The operation must be re-tried on return.
+ *     supplied.  bd_wanted must be set prior to polling for space.  The
+ *     operation must be re-tried on return.
  */
 static void
-bufspace_wait(struct vnode *vp, int gbflags, int slpflag, int slptimeo)
+bufspace_wait(struct bufdomain *bd, struct vnode *vp, int gbflags,
+    int slpflag, int slptimeo)
 {
        struct thread *td;
        int error, fl, norunbuf;
@@ -609,11 +613,11 @@ bufspace_wait(struct vnode *vp, int gbflags, int slpfl
                return;
 
        td = curthread;
-       rw_wlock(&nblock);
-       while (needsbuffer != 0) {
+       BD_LOCK(bd);
+       while (bd->bd_wanted) {
                if (vp != NULL && vp->v_type != VCHR &&
                    (td->td_pflags & TDP_BUFNEED) == 0) {
-                       rw_wunlock(&nblock);
+                       BD_UNLOCK(bd);
                        /*
                         * getblk() is called with a vnode locked, and
                         * some majority of the dirty buffers may as
@@ -636,18 +640,18 @@ bufspace_wait(struct vnode *vp, int gbflags, int slpfl
                        td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF;
                        fl = buf_flush(vp, flushbufqtarget);
                        td->td_pflags &= norunbuf;
-                       rw_wlock(&nblock);
+                       BD_LOCK(bd);
                        if (fl != 0)
                                continue;
-                       if (needsbuffer == 0)
+                       if (bd->bd_wanted == 0)
                                break;
                }
-               error = rw_sleep(__DEVOLATILE(void *, &needsbuffer), &nblock,
+               error = msleep(&bd->bd_wanted, BD_LOCKPTR(bd),
                    (PRIBIO + 4) | slpflag, "newbuf", slptimeo);
                if (error != 0)
                        break;
        }
-       rw_wunlock(&nblock);
+       BD_UNLOCK(bd);
 }
 
 
@@ -659,10 +663,13 @@ bufspace_wait(struct vnode *vp, int gbflags, int slpfl
  *     block nor work to reclaim buffers.
  */
 static void
-bufspace_daemon(void)
+bufspace_daemon(void *arg)
 {
+       struct bufdomain *bd;
+
+       bd = arg;
        for (;;) {
-               kproc_suspend_check(bufspacedaemonproc);
+               kproc_suspend_check(curproc);
 
                /*
                 * Free buffers from the clean queue until we meet our
@@ -689,46 +696,25 @@ bufspace_daemon(void)
                 *      which will inefficiently trade bufs with bqrelse
                 *      until we return to condition 2.
                 */
-               while (bufspace > lobufspace ||
-                   numfreebuffers < hifreebuffers) {
-                       if (buf_recycle(false) != 0) {
-                               atomic_set_int(&needsbuffer, 1);
-                               if (buf_recycle(false) != 0) {
-                                       rw_wlock(&nblock);
-                                       if (needsbuffer)
-                                               rw_sleep(__DEVOLATILE(void *,
-                                                   &needsbuffer), &nblock,
-                                                   PRIBIO|PDROP, "bufspace",
-                                                   hz/10);
-                                       else
-                                               rw_wunlock(&nblock);
-                               }
+               do {
+                       if (buf_recycle(bd, false) != 0) {
+                               if (bd_flushall(bd))
+                                       continue;
+                               BD_LOCK(bd);
+                               if (bd->bd_wanted) {
+                                       msleep(&bd->bd_wanted, BD_LOCKPTR(bd),
+                                           PRIBIO|PDROP, "bufspace", hz/10);
+                               } else
+                                       BD_UNLOCK(bd);
                        }
                        maybe_yield();
-               }
+               } while (bd->bd_bufspace > bd->bd_lobufspace ||
+                   bd->bd_freebuffers < bd->bd_hifreebuffers);
 
-               /*
-                * Re-check our limits under the exclusive nblock.
-                */
-               rw_wlock(&nblock);
-               if (bufspace < bufspacethresh &&
-                   numfreebuffers > lofreebuffers) {
-                       bufspace_request = 0;
-                       rw_sleep(&bufspace_request, &nblock, PRIBIO|PDROP,
-                           "-", hz);
-               } else
-                       rw_wunlock(&nblock);
+               bufspace_daemon_wait(bd);
        }
 }
 
-static struct kproc_desc bufspace_kp = {
-       "bufspacedaemon",
-       bufspace_daemon,
-       &bufspacedaemonproc
-};
-SYSINIT(bufspacedaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start,
-    &bufspace_kp);
-
 /*
  *     bufmallocadjust:
  *
@@ -842,7 +828,7 @@ vfs_buf_test_cache(struct buf *bp, vm_ooffset_t foff, 
 }
 
 /* Wake up the buffer daemon if necessary */
-static __inline void
+static void
 bd_wakeup(void)
 {
 
@@ -1038,19 +1024,12 @@ bufinit(void)
        KASSERT(maxbcachebuf >= MAXBSIZE,
            ("maxbcachebuf (%d) must be >= MAXBSIZE (%d)\n", maxbcachebuf,
            MAXBSIZE));
-       mtx_init(&bqlocks[QUEUE_DIRTY], "bufq dirty lock", NULL, MTX_DEF);
-       mtx_init(&bqlocks[QUEUE_EMPTY], "bufq empty lock", NULL, MTX_DEF);
-       for (i = QUEUE_CLEAN; i < QUEUE_CLEAN + CLEAN_QUEUES; i++)
-               mtx_init(&bqlocks[i], "bufq clean lock", NULL, MTX_DEF);
+       bq_init(&bqempty, QUEUE_EMPTY, -1, "bufq empty lock");
+       bq_init(&bqdirty, QUEUE_DIRTY, -1, "bufq dirty lock");
        mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF);
-       rw_init(&nblock, "needsbuffer lock");
        mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF);
        mtx_init(&bdirtylock, "dirty buf lock", NULL, MTX_DEF);
 
-       /* next, make a null set of free lists */
-       for (i = 0; i < BUFFER_QUEUES; i++)
-               TAILQ_INIT(&bufqueues[i]);
-
        unmapped_buf = (caddr_t)kva_alloc(MAXPHYS);
 
        /* finally, initialize each buffer header and stick on empty q */
@@ -1060,15 +1039,14 @@ bufinit(void)
                bp->b_flags = B_INVAL;
                bp->b_rcred = NOCRED;
                bp->b_wcred = NOCRED;
-               bp->b_qindex = QUEUE_EMPTY;
+               bp->b_qindex = QUEUE_NONE;
+               bp->b_domain = -1;
+               bp->b_subqueue = mp_ncpus;
                bp->b_xflags = 0;
                bp->b_data = bp->b_kvabase = unmapped_buf;
                LIST_INIT(&bp->b_dep);
                BUF_LOCKINIT(bp);
-               TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
-#ifdef INVARIANTS
-               bq_len[QUEUE_EMPTY]++;
-#endif
+               bq_insert(&bqempty, bp, false);
        }
 
        /*
@@ -1150,8 +1128,31 @@ bufinit(void)
         * One queue per-256mb up to the max.  More queues gives better
         * concurrency but less accurate LRU.
         */
-       clean_queues = MIN(howmany(maxbufspace, 256*1024*1024), CLEAN_QUEUES);
+       clean_domains = MIN(howmany(maxbufspace, 256*1024*1024), CLEAN_DOMAINS);
+       for (i = 0 ; i < clean_domains; i++) {
+               struct bufdomain *bd;
 
+               bd = &bdclean[i];
+               bd_init(bd);
+               bd->bd_freebuffers = nbuf / clean_domains;
+               bd->bd_hifreebuffers = hifreebuffers / clean_domains;
+               bd->bd_lofreebuffers = lofreebuffers / clean_domains;
+               bd->bd_bufspace = 0;
+               bd->bd_maxbufspace = maxbufspace / clean_domains;
+               bd->bd_hibufspace = hibufspace / clean_domains;
+               bd->bd_lobufspace = lobufspace / clean_domains;
+               bd->bd_bufspacethresh = bufspacethresh / clean_domains;
+               /* Don't allow more than 2% of bufs in the per-cpu caches. */
+               bd->bd_lim = nbuf / clean_domains / 50 / mp_ncpus;
+       }
+       getnewbufcalls = counter_u64_alloc(M_WAITOK);
+       getnewbufrestarts = counter_u64_alloc(M_WAITOK);
+       mappingrestarts = counter_u64_alloc(M_WAITOK);
+       numbufallocfails = counter_u64_alloc(M_WAITOK);
+       notbufdflushes = counter_u64_alloc(M_WAITOK);
+       buffreekvacnt = counter_u64_alloc(M_WAITOK);
+       bufdefragcnt = counter_u64_alloc(M_WAITOK);
+       bufkvaspace = counter_u64_alloc(M_WAITOK);
 }
 
 #ifdef INVARIANTS
@@ -1326,58 +1327,92 @@ bpmap_qenter(struct buf *bp)
            (vm_offset_t)(bp->b_offset & PAGE_MASK));
 }
 
+static struct bufqueue *
+bufqueue(struct buf *bp)
+{
+
+       switch (bp->b_qindex) {
+       case QUEUE_NONE:
+               /* FALLTHROUGH */
+       case QUEUE_SENTINEL:
+               return (NULL);
+       case QUEUE_EMPTY:
+               return (&bqempty);
+       case QUEUE_DIRTY:
+               return (&bqdirty);
+       case QUEUE_CLEAN:
+               return (&bdclean[bp->b_domain].bd_subq[bp->b_subqueue]);
+       default:
+               break;
+       }
+       panic("bufqueue(%p): Unhandled type %d\n", bp, bp->b_qindex);
+}
+
 /*
+ * Return the locked bufqueue that bp is a member of.
+ */
+static struct bufqueue *
+bufqueue_acquire(struct buf *bp)
+{
+       struct bufqueue *bq, *nbq;
+
+       /*
+        * bp can be pushed from a per-cpu queue to the
+        * cleanq while we're waiting on the lock.  Retry
+        * if the queues don't match.
+        */
+       bq = bufqueue(bp);
+       BQ_LOCK(bq);
+       for (;;) {
+               nbq = bufqueue(bp);
+               if (bq == nbq)
+                       break;
+               BQ_UNLOCK(bq);
+               BQ_LOCK(nbq);
+               bq = nbq;
+       }
+       return (bq);
+}
+
+/*
  *     binsfree:
  *
- *     Insert the buffer into the appropriate free list.
+ *     Insert the buffer into the appropriate free list.  Requires a
+ *     locked buffer on entry and buffer is unlocked before return.
  */
 static void
 binsfree(struct buf *bp, int qindex)
 {
-       struct mtx *olock, *nlock;
+       struct bufdomain *bd;
+       struct bufqueue *bq;
 
-       if (qindex != QUEUE_EMPTY) {
-               BUF_ASSERT_XLOCKED(bp);
-       }
+       KASSERT(qindex == QUEUE_CLEAN || qindex == QUEUE_DIRTY,
+           ("binsfree: Invalid qindex %d", qindex));
+       BUF_ASSERT_XLOCKED(bp);
 
        /*
-        * Stick to the same clean queue for the lifetime of the buf to
-        * limit locking below.  Otherwise pick ont sequentially.
-        */
-       if (qindex == QUEUE_CLEAN) {
-               if (bqisclean(bp->b_qindex))
-                       qindex = bp->b_qindex;
-               else
-                       qindex = bqcleanq();
-       }
-
-       /*
         * Handle delayed bremfree() processing.
         */
-       nlock = bqlock(qindex);
        if (bp->b_flags & B_REMFREE) {
-               olock = bqlock(bp->b_qindex);
-               mtx_lock(olock);
-               bremfreel(bp);
-               if (olock != nlock) {
-                       mtx_unlock(olock);
-                       mtx_lock(nlock);
+               if (bp->b_qindex == qindex) {
+                       bp->b_flags |= B_REUSE;
+                       bp->b_flags &= ~B_REMFREE;
+                       BUF_UNLOCK(bp);
+                       return;
                }
+               bq = bufqueue_acquire(bp);
+               bq_remove(bq, bp);
+               BQ_UNLOCK(bq);
+       }
+       if (qindex == QUEUE_CLEAN) {
+               bd = &bdclean[bp->b_domain];
+               if (bd->bd_lim != 0)
+                       bq = &bd->bd_subq[PCPU_GET(cpuid)];
+               else
+                       bq = bd->bd_cleanq;
        } else
-               mtx_lock(nlock);
-
-       if (bp->b_qindex != QUEUE_NONE)
-               panic("binsfree: free buffer onto another queue???");
-
-       bp->b_qindex = qindex;
-       if (bp->b_flags & B_AGE)
-               TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist);
-       else
-               TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist);
-#ifdef INVARIANTS
-       bq_len[bp->b_qindex]++;
-#endif
-       mtx_unlock(nlock);
+               bq = &bqdirty;
+       bq_insert(bq, bp, true);
 }
 
 /*
@@ -1404,10 +1439,9 @@ buf_free(struct buf *bp)
        if (!LIST_EMPTY(&bp->b_dep))
                buf_deallocate(bp);
        bufkva_free(bp);
+       atomic_add_int(&bdclean[bp->b_domain].bd_freebuffers, 1);
        BUF_UNLOCK(bp);
        uma_zfree(buf_zone, bp);
-       atomic_add_int(&numfreebuffers, 1);
-       bufspace_wakeup();
 }
 
 /*
@@ -1424,15 +1458,15 @@ buf_import(void *arg, void **store, int cnt, int domai
        struct buf *bp;
        int i;
 
-       mtx_lock(&bqlocks[QUEUE_EMPTY]);
+       BQ_LOCK(&bqempty);
        for (i = 0; i < cnt; i++) {
-               bp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
+               bp = TAILQ_FIRST(&bqempty.bq_queue);
                if (bp == NULL)
                        break;
-               bremfreel(bp);
+               bq_remove(&bqempty, bp);
                store[i] = bp;
        }
-       mtx_unlock(&bqlocks[QUEUE_EMPTY]);
+       BQ_UNLOCK(&bqempty);
 
        return (i);
 }
@@ -1445,10 +1479,21 @@ buf_import(void *arg, void **store, int cnt, int domai
 static void
 buf_release(void *arg, void **store, int cnt)
 {
+       struct bufqueue *bq;
+       struct buf *bp;
         int i;
 
-        for (i = 0; i < cnt; i++)
-               binsfree(store[i], QUEUE_EMPTY);
+       bq = &bqempty;
+       BQ_LOCK(bq);
+        for (i = 0; i < cnt; i++) {
+               bp = store[i];
+               /* Inline bq_insert() to batch locking. */
+               TAILQ_INSERT_TAIL(&bq->bq_queue, bp, b_freelist);
+               bp->b_flags &= ~(B_AGE | B_REUSE);
+               bq->bq_len++;
+               bp->b_qindex = bq->bq_index;
+       }
+       BQ_UNLOCK(bq);
 }
 
 /*
@@ -1457,22 +1502,31 @@ buf_release(void *arg, void **store, int cnt)
  *     Allocate an empty buffer header.
  */
 static struct buf *
-buf_alloc(void)
+buf_alloc(struct bufdomain *bd)
 {
        struct buf *bp;
+       int freebufs;
 
-       bp = uma_zalloc(buf_zone, M_NOWAIT);
+       /*
+        * We can only run out of bufs in the buf zone if the average buf
+        * is less than BKVASIZE.  In this case the actual wait/block will
+        * come from buf_reycle() failing to flush one of these small bufs.
+        */
+       bp = NULL;
+       freebufs = atomic_fetchadd_int(&bd->bd_freebuffers, -1);
+       if (freebufs > 0)
+               bp = uma_zalloc(buf_zone, M_NOWAIT);
        if (bp == NULL) {
-               bufspace_daemonwakeup();
-               atomic_add_int(&numbufallocfails, 1);
+               atomic_fetchadd_int(&bd->bd_freebuffers, 1);
+               bufspace_daemon_wakeup(bd);
+               counter_u64_add(numbufallocfails, 1);
                return (NULL);
        }
-
        /*
-        * Wake-up the bufspace daemon on transition.
+        * Wake-up the bufspace daemon on transition below threshold.
         */
-       if (atomic_fetchadd_int(&numfreebuffers, -1) == lofreebuffers)
-               bufspace_daemonwakeup();
+       if (freebufs == bd->bd_lofreebuffers)
+               bufspace_daemon_wakeup(bd);
 
        if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
                panic("getnewbuf_empty: Locked buf %p on free queue.", bp);
@@ -1488,6 +1542,7 @@ buf_alloc(void)
        KASSERT(bp->b_kvasize == 0, ("bp: %p still has kva\n", bp));
        KASSERT(bp->b_bufsize == 0, ("bp: %p still has bufspace\n", bp));
 
+       bp->b_domain = BD_DOMAIN(bd);
        bp->b_flags = 0;
        bp->b_ioflags = 0;
        bp->b_xflags = 0;
@@ -1512,22 +1567,26 @@ buf_alloc(void)
 }
 
 /*
- *     buf_qrecycle:
+ *     buf_recycle:
  *
  *     Free a buffer from the given bufqueue.  kva controls whether the
  *     freed buf must own some kva resources.  This is used for
  *     defragmenting.
  */
 static int
-buf_qrecycle(int qindex, bool kva)
+buf_recycle(struct bufdomain *bd, bool kva)
 {
+       struct bufqueue *bq;
        struct buf *bp, *nbp;
 
        if (kva)
-               atomic_add_int(&bufdefragcnt, 1);
+               counter_u64_add(bufdefragcnt, 1);
        nbp = NULL;
-       mtx_lock(&bqlocks[qindex]);
-       nbp = TAILQ_FIRST(&bufqueues[qindex]);
+       bq = bd->bd_cleanq;
+       BQ_LOCK(bq);
+       KASSERT(BQ_LOCKPTR(bq) == BD_LOCKPTR(bd),
+           ("buf_recycle: Locks don't match"));
+       nbp = TAILQ_FIRST(&bq->bq_queue);
 
        /*
         * Run scan, possibly freeing data and/or kva mappings on the fly
@@ -1551,6 +1610,18 @@ buf_qrecycle(int qindex, bool kva)
                        continue;
 
                /*
+                * Implement a second chance algorithm for frequently
+                * accessed buffers.
+                */
+               if ((bp->b_flags & B_REUSE) != 0) {
+                       TAILQ_REMOVE(&bq->bq_queue, bp, b_freelist);
+                       TAILQ_INSERT_TAIL(&bq->bq_queue, bp, b_freelist);
+                       bp->b_flags &= ~B_REUSE;
+                       BUF_UNLOCK(bp);
+                       continue;
+               }
+
+               /*
                 * Skip buffers with background writes in progress.
                 */
                if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
@@ -1558,14 +1629,18 @@ buf_qrecycle(int qindex, bool kva)
                        continue;
                }
 
-               KASSERT(bp->b_qindex == qindex,
-                   ("getnewbuf: inconsistent queue %d bp %p", qindex, bp));
+               KASSERT(bp->b_qindex == QUEUE_CLEAN,
+                   ("buf_recycle: inconsistent queue %d bp %p",
+                   bp->b_qindex, bp));
+               KASSERT(bp->b_domain == BD_DOMAIN(bd),
+                   ("getnewbuf: queue domain %d doesn't match request %d",
+                   bp->b_domain, (int)BD_DOMAIN(bd)));
                /*
                 * NOTE:  nbp is now entirely invalid.  We can only restart
                 * the scan from this point on.
                 */
-               bremfreel(bp);
-               mtx_unlock(&bqlocks[qindex]);
+               bq_remove(bq, bp);
+               BQ_UNLOCK(bq);
 
                /*
                 * Requeue the background write buffer with error and
@@ -1573,70 +1648,21 @@ buf_qrecycle(int qindex, bool kva)
                 */
                if ((bp->b_vflags & BV_BKGRDERR) != 0) {
                        bqrelse(bp);
-                       mtx_lock(&bqlocks[qindex]);
-                       nbp = TAILQ_FIRST(&bufqueues[qindex]);
+                       BQ_LOCK(bq);
+                       nbp = TAILQ_FIRST(&bq->bq_queue);
                        continue;
                }
                bp->b_flags |= B_INVAL;
                brelse(bp);
                return (0);
        }
-       mtx_unlock(&bqlocks[qindex]);
+       bd->bd_wanted = 1;
+       BQ_UNLOCK(bq);
 
        return (ENOBUFS);
 }

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
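
The B_REUSE handling visible in the buf_recycle() hunk above is a
second-chance scan: a buffer referenced since the last pass is moved to the
tail of the clean queue and its flag cleared rather than reclaimed, so
frequently used buffers pay only for a flag set instead of a queue lock and
requeue on every access (in the binsfree() hunk, for instance, the flag is set
when a buffer with a pending bremfree() is headed back to the queue it already
occupies).  A self-contained sketch of that scan follows; the xbuf structure,
the XB_REUSE flag and the absence of any locking are simplifications for
illustration, not the kernel code, and it assumes FreeBSD's <sys/queue.h>.

/*
 * Second-chance scan sketch: skip and requeue buffers marked as
 * recently used, reclaim the first unmarked one.
 */
#include <sys/queue.h>
#include <stdio.h>

#define XB_REUSE        0x01            /* analogue of B_REUSE */

struct xbuf {
        TAILQ_ENTRY(xbuf) link;
        int             flags;
        int             id;
};

TAILQ_HEAD(xbufq, xbuf);

/* Mark a buffer as recently used without touching the queue. */
static void
xbuf_touch(struct xbuf *bp)
{

        bp->flags |= XB_REUSE;
}

/* Reclaim the first buffer not referenced since the previous scan. */
static struct xbuf *
xbuf_recycle(struct xbufq *q)
{
        struct xbuf *bp, *nbp;

        for (bp = TAILQ_FIRST(q); bp != NULL; bp = nbp) {
                nbp = TAILQ_NEXT(bp, link);
                if (bp->flags & XB_REUSE) {
                        /* Second chance: requeue at the tail, clear flag. */
                        TAILQ_REMOVE(q, bp, link);
                        TAILQ_INSERT_TAIL(q, bp, link);
                        bp->flags &= ~XB_REUSE;
                        continue;
                }
                TAILQ_REMOVE(q, bp, link);
                return (bp);
        }
        return (NULL);
}

int
main(void)
{
        struct xbufq q = TAILQ_HEAD_INITIALIZER(q);
        struct xbuf bufs[3] = { { .id = 0 }, { .id = 1 }, { .id = 2 } };
        struct xbuf *victim;
        int i;

        for (i = 0; i < 3; i++)
                TAILQ_INSERT_TAIL(&q, &bufs[i], link);
        xbuf_touch(&bufs[0]);           /* buffer 0 was used recently */
        victim = xbuf_recycle(&q);      /* skips 0, reclaims buffer 1 */
        printf("reclaimed buffer %d\n", victim != NULL ? victim->id : -1);
        return (0);
}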