Author: jeff
Date: Wed Jun  5 23:53:00 2013
New Revision: 251446
URL: http://svnweb.freebsd.org/changeset/base/251446

Log:
   - Consolidate duplicate code into support functions.
   - Split the bqlock into bqclean and bqdirty locks.
   - Only acquire the wakeup synchronization locks when we cross a
     threshold requiring them.
   - Restructure the way flushbufqueues() targets work so they are more
     smp friendly and sane.
  
  Reviewed by:  kib
  Discussed with:       mckusick, attilio
  Sponsored by: EMC / Isilon Storage Division
  
  M    vfs_bio.c

Modified:
  head/sys/kern/vfs_bio.c

Modified: head/sys/kern/vfs_bio.c
==============================================================================
--- head/sys/kern/vfs_bio.c     Wed Jun  5 23:28:29 2013        (r251445)
+++ head/sys/kern/vfs_bio.c     Wed Jun  5 23:53:00 2013        (r251446)
@@ -113,10 +113,11 @@ static void vfs_setdirty_locked_object(s
 static void vfs_vmio_release(struct buf *bp);
 static int vfs_bio_clcheck(struct vnode *vp, int size,
                daddr_t lblkno, daddr_t blkno);
-static int buf_do_flush(struct vnode *vp);
+static int buf_flush(struct vnode *vp, int);
 static int flushbufqueues(struct vnode *, int, int);
 static void buf_daemon(void);
 static void bremfreel(struct buf *bp);
+static __inline void bd_wakeup(void);
 #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
     defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
 static int sysctl_bufspace(SYSCTL_HANDLER_ARGS);
@@ -217,8 +218,8 @@ SYSCTL_INT(_vfs, OID_AUTO, mappingrestar
 static int flushbufqtarget = 100;
 SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0,
     "Amount of work to do in flushbufqueues when helping bufdaemon");
-static long notbufdflashes;
-SYSCTL_LONG(_vfs, OID_AUTO, notbufdflashes, CTLFLAG_RD, &notbufdflashes, 0,
+static long notbufdflushes;
+SYSCTL_LONG(_vfs, OID_AUTO, notbufdflushes, CTLFLAG_RD, &notbufdflushes, 0,
     "Number of dirty buffer flushes done by the bufdaemon helpers");
 static long barrierwrites;
 SYSCTL_LONG(_vfs, OID_AUTO, barrierwrites, CTLFLAG_RW, &barrierwrites, 0,
@@ -228,6 +229,37 @@ SYSCTL_INT(_vfs, OID_AUTO, unmapped_buf_
     "Permit the use of the unmapped i/o");
 
 /*
+ * Lock for the non-dirty bufqueues
+ */
+static struct mtx_padalign bqclean;
+
+/*
+ * Lock for the dirty queue.
+ */
+static struct mtx_padalign bqdirty;
+
+/*
+ * This lock synchronizes access to bd_request.
+ */
+static struct mtx_padalign bdlock;
+
+/*
+ * This lock protects the runningbufreq and synchronizes runningbufwakeup and
+ * waitrunningbufspace().
+ */
+static struct mtx_padalign rbreqlock;
+
+/*
+ * Lock that protects needsbuffer and the sleeps/wakeups surrounding it.
+ */
+static struct mtx_padalign nblock;
+
+/*
+ * Lock that protects bdirtywait.
+ */
+static struct mtx_padalign bdirtylock;
+
+/*
  * Wakeup point for bufdaemon, as well as indicator of whether it is already
  * active.  Set to 1 when the bufdaemon is already "on" the queue, 0 when it
  * is idling.
@@ -243,11 +275,6 @@ static int bd_request;
 static int bd_speedupreq;
 
 /*
- * This lock synchronizes access to bd_request.
- */
-static struct mtx bdlock;
-
-/*
  * bogus page -- for I/O to/from partially complete buffers
  * this is a temporary solution to the problem, but it is not
  * really that bad.  it would be better to split the buffer
@@ -263,25 +290,19 @@ vm_page_t bogus_page;
  */
 static int runningbufreq;
 
-/*
- * This lock protects the runningbufreq and synchronizes runningbufwakeup and
- * waitrunningbufspace().
- */
-static struct mtx rbreqlock;
-
 /* 
  * Synchronization (sleep/wakeup) variable for buffer requests.
  * Can contain the VFS_BIO_NEED flags defined below; setting/clearing is done
  * by and/or.
- * Used in numdirtywakeup(), bufspacewakeup(), bufcountwakeup(), bwillwrite(),
+ * Used in numdirtywakeup(), bufspacewakeup(), bufcountadd(), bwillwrite(),
  * getnewbuf(), and getblk().
  */
 static int needsbuffer;
 
 /*
- * Lock that protects needsbuffer and the sleeps/wakeups surrounding it.
+ * Synchronization for bwillwrite() waiters.
  */
-static struct mtx nblock;
+static int bdirtywait;
 
 /*
  * Definitions for the buffer free lists.
@@ -301,9 +322,6 @@ static TAILQ_HEAD(bqueues, buf) bufqueue
 static int bq_len[BUFFER_QUEUES];
 #endif
 
-/* Lock for the bufqueues */
-static struct mtx bqlock;
-
 /*
  * Single global constant for BUF_WMESG, to avoid getting multiple references.
  * buf_wmesg is referred from macros.
@@ -311,7 +329,6 @@ static struct mtx bqlock;
 const char *buf_wmesg = BUF_WMESG;
 
 #define VFS_BIO_NEED_ANY       0x01    /* any freeable buffer */
-#define VFS_BIO_NEED_DIRTYFLUSH        0x02    /* waiting for dirty buffer flush */
 #define VFS_BIO_NEED_FREE      0x04    /* wait for free bufs, hi hysteresis */
 #define VFS_BIO_NEED_BUFSPACE  0x08    /* wait for buf space, lo hysteresis */
 
@@ -337,25 +354,69 @@ sysctl_bufspace(SYSCTL_HANDLER_ARGS)
 #ifdef DIRECTIO
 extern void ffs_rawread_setup(void);
 #endif /* DIRECTIO */
+
 /*
- *     numdirtywakeup:
+ *     bqlock:
  *
- *     If someone is blocked due to there being too many dirty buffers,
- *     and numdirtybuffers is now reasonable, wake them up.
+ *     Return the appropriate queue lock based on the index.
  */
-
-static __inline void
-numdirtywakeup(int level)
+static inline struct mtx *
+bqlock(int qindex)
 {
 
-       if (numdirtybuffers <= level) {
-               mtx_lock(&nblock);
-               if (needsbuffer & VFS_BIO_NEED_DIRTYFLUSH) {
-                       needsbuffer &= ~VFS_BIO_NEED_DIRTYFLUSH;
-                       wakeup(&needsbuffer);
-               }
-               mtx_unlock(&nblock);
+       if (qindex == QUEUE_DIRTY)
+               return (struct mtx *)(&bqdirty);
+       return (struct mtx *)(&bqclean);
+}
+
+/*
+ *     bdirtywakeup:
+ *
+ *     Wakeup any bwillwrite() waiters.
+ */
+static void
+bdirtywakeup(void)
+{
+       mtx_lock(&bdirtylock);
+       if (bdirtywait) {
+               bdirtywait = 0;
+               wakeup(&bdirtywait);
        }
+       mtx_unlock(&bdirtylock);
+}
+
+/*
+ *     bdirtysub:
+ *
+ *     Decrement the numdirtybuffers count by one and wakeup any
+ *     threads blocked in bwillwrite().
+ */
+static void
+bdirtysub(void)
+{
+
+       if (atomic_fetchadd_int(&numdirtybuffers, -1) ==
+           (lodirtybuffers + hidirtybuffers) / 2)
+               bdirtywakeup();
+}
+
+/*
+ *     bdirtyadd:
+ *
+ *     Increment the numdirtybuffers count by one and wakeup the buf 
+ *     daemon if needed.
+ */
+static void
+bdirtyadd(void)
+{
+
+       /*
+        * Only do the wakeup once as we cross the boundary.  The
+        * buf daemon will keep running until the condition clears.
+        */
+       if (atomic_fetchadd_int(&numdirtybuffers, 1) ==
+           (lodirtybuffers + hidirtybuffers) / 2)
+               bd_wakeup();
 }
 
 /*
@@ -385,36 +446,59 @@ bufspacewakeup(void)
 }
 
 /*
- * runningbufwakeup() - in-progress I/O accounting.
+ *     runningwakeup:
  *
+ *     Wake up processes that are waiting on asynchronous writes to fall
+ *     below lorunningspace.
+ */
+static void
+runningwakeup(void)
+{
+
+       mtx_lock(&rbreqlock);
+       if (runningbufreq) {
+               runningbufreq = 0;
+               wakeup(&runningbufreq);
+       }
+       mtx_unlock(&rbreqlock);
+}
+
+/*
+ *     runningbufwakeup:
+ *
+ *     Decrement the outstanding write count accordingly.
  */
 void
 runningbufwakeup(struct buf *bp)
 {
+       long space, bspace;
 
-       if (bp->b_runningbufspace) {
-               atomic_subtract_long(&runningbufspace, bp->b_runningbufspace);
-               bp->b_runningbufspace = 0;
-               mtx_lock(&rbreqlock);
-               if (runningbufreq && runningbufspace <= lorunningspace) {
-                       runningbufreq = 0;
-                       wakeup(&runningbufreq);
-               }
-               mtx_unlock(&rbreqlock);
-       }
+       if (bp->b_runningbufspace == 0)
+               return;
+       space = atomic_fetchadd_long(&runningbufspace, -bp->b_runningbufspace);
+       bspace = bp->b_runningbufspace;
+       bp->b_runningbufspace = 0;
+       /*
+        * Only acquire the lock and wakeup on the transition from exceeding
+        * the threshold to falling below it.
+        */
+       if (space < lorunningspace)
+               return;
+       if (space - bspace > lorunningspace)
+               return;
+       runningwakeup();
 }
 
 /*
- *     bufcountwakeup:
+ *     bufcountadd:
  *
  *     Called when a buffer has been added to one of the free queues to
  *     account for the buffer and to wakeup anyone waiting for free buffers.
  *     This typically occurs when large amounts of metadata are being handled
  *     by the buffer cache ( else buffer space runs out first, usually ).
  */
-
 static __inline void
-bufcountwakeup(struct buf *bp) 
+bufcountadd(struct buf *bp)
 {
        int old;
 
@@ -435,6 +519,30 @@ bufcountwakeup(struct buf *bp) 
 }
 
 /*
+ *     bufcountsub:
+ *
+ *     Decrement the numfreebuffers count as needed.
+ */
+static void
+bufcountsub(struct buf *bp)
+{
+       int old;
+
+       /*
+        * Fixup numfreebuffers count.  If the buffer is invalid or not
+        * delayed-write, the buffer was free and we must decrement
+        * numfreebuffers.
+        */
+       if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) {
+               KASSERT((bp->b_flags & B_INFREECNT) != 0,
+                   ("buf %p not counted in numfreebuffers", bp));
+               bp->b_flags &= ~B_INFREECNT;
+               old = atomic_fetchadd_int(&numfreebuffers, -1);
+               KASSERT(old > 0, ("numfreebuffers dropped to %d", old - 1));
+       }
+}
+
+/*
  *     waitrunningbufspace()
  *
  *     runningbufspace is a measure of the amount of I/O currently
@@ -442,9 +550,6 @@ bufcountwakeup(struct buf *bp) 
  *     prevent creating huge backups of pending writes to a device.
  *     Only asynchronous writes are governed by this function.
  *
- *     Reads will adjust runningbufspace, but will not block based on it.
- *     The read load has a side effect of reducing the allowed write load.
- *
  *     This does NOT turn an async write into a sync write.  It waits  
  *     for earlier writes to complete and generally returns before the
  *     caller's write has reached the device.
@@ -485,13 +590,12 @@ vfs_buf_test_cache(struct buf *bp,
 }
 
 /* Wake up the buffer daemon if necessary */
-static __inline
-void
-bd_wakeup(int dirtybuflevel)
+static __inline void
+bd_wakeup(void)
 {
 
        mtx_lock(&bdlock);
-       if (bd_request == 0 && numdirtybuffers >= dirtybuflevel) {
+       if (bd_request == 0) {
                bd_request = 1;
                wakeup(&bd_request);
        }
@@ -501,7 +605,6 @@ bd_wakeup(int dirtybuflevel)
 /*
  * bd_speedup - speedup the buffer cache flushing code
  */
-
 void
 bd_speedup(void)
 {
@@ -656,10 +759,12 @@ bufinit(void)
        struct buf *bp;
        int i;
 
-       mtx_init(&bqlock, "buf queue lock", NULL, MTX_DEF);
+       mtx_init(&bqclean, "bufq clean lock", NULL, MTX_DEF);
+       mtx_init(&bqdirty, "bufq dirty lock", NULL, MTX_DEF);
        mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF);
        mtx_init(&nblock, "needsbuffer lock", NULL, MTX_DEF);
        mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF);
+       mtx_init(&bdirtylock, "dirty buf lock", NULL, MTX_DEF);
 
        /* next, make a null set of free lists */
        for (i = 0; i < BUFFER_QUEUES; i++)
@@ -831,15 +936,60 @@ bfreekva(struct buf *bp)
 }
 
 /*
+ *     binsfree:
+ *
+ *     Insert the buffer into the appropriate free list.
+ */
+static void
+binsfree(struct buf *bp, int qindex)
+{
+       struct mtx *olock, *nlock;
+
+       BUF_ASSERT_XLOCKED(bp);
+
+       olock = bqlock(bp->b_qindex);
+       nlock = bqlock(qindex);
+       mtx_lock(olock);
+       /* Handle delayed bremfree() processing. */
+       if (bp->b_flags & B_REMFREE)
+               bremfreel(bp);
+
+       if (bp->b_qindex != QUEUE_NONE)
+               panic("binsfree: free buffer onto another queue???");
+
+       bp->b_qindex = qindex;
+       if (olock != nlock) {
+               mtx_unlock(olock);
+               mtx_lock(nlock);
+       }
+       if (bp->b_flags & B_AGE)
+               TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist);
+       else
+               TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist);
+#ifdef INVARIANTS
+       bq_len[bp->b_qindex]++;
+#endif
+       mtx_unlock(nlock);
+
+       /*
+        * Something we can maybe free or reuse.
+        */
+       if (bp->b_bufsize && !(bp->b_flags & B_DELWRI))
+               bufspacewakeup();
+
+       if ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI))
+               bufcountadd(bp);
+}
+
+/*
  *     bremfree:
  *
- *     Mark the buffer for removal from the appropriate free list in brelse.
+ *     Mark the buffer for removal from the appropriate free list.
  *     
  */
 void
 bremfree(struct buf *bp)
 {
-       int old;
 
        CTR3(KTR_BUF, "bremfree(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
        KASSERT((bp->b_flags & B_REMFREE) == 0,
@@ -849,14 +999,7 @@ bremfree(struct buf *bp)
        BUF_ASSERT_XLOCKED(bp);
 
        bp->b_flags |= B_REMFREE;
-       /* Fixup numfreebuffers count.  */
-       if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) {
-               KASSERT((bp->b_flags & B_INFREECNT) != 0,
-                   ("buf %p not counted in numfreebuffers", bp));
-               bp->b_flags &= ~B_INFREECNT;
-               old = atomic_fetchadd_int(&numfreebuffers, -1);
-               KASSERT(old > 0, ("numfreebuffers dropped to %d", old - 1));
-       }
+       bufcountsub(bp);
 }
 
 /*
@@ -868,28 +1011,30 @@ bremfree(struct buf *bp)
 void
 bremfreef(struct buf *bp)
 {
-       mtx_lock(&bqlock);
+       struct mtx *qlock;
+
+       qlock = bqlock(bp->b_qindex);
+       mtx_lock(qlock);
        bremfreel(bp);
-       mtx_unlock(&bqlock);
+       mtx_unlock(qlock);
 }
 
 /*
  *     bremfreel:
  *
  *     Removes a buffer from the free list, must be called with the
- *     bqlock held.
+ *     correct qlock held.
  */
 static void
 bremfreel(struct buf *bp)
 {
-       int old;
 
        CTR3(KTR_BUF, "bremfreel(%p) vp %p flags %X",
            bp, bp->b_vp, bp->b_flags);
        KASSERT(bp->b_qindex != QUEUE_NONE,
            ("bremfreel: buffer %p not on a queue.", bp));
        BUF_ASSERT_XLOCKED(bp);
-       mtx_assert(&bqlock, MA_OWNED);
+       mtx_assert(bqlock(bp->b_qindex), MA_OWNED);
 
        TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
 #ifdef INVARIANTS
@@ -906,18 +1051,7 @@ bremfreel(struct buf *bp)
                bp->b_flags &= ~B_REMFREE;
                return;
        }
-       /*
-        * Fixup numfreebuffers count.  If the buffer is invalid or not
-        * delayed-write, the buffer was free and we must decrement
-        * numfreebuffers.
-        */
-       if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) {
-               KASSERT((bp->b_flags & B_INFREECNT) != 0,
-                   ("buf %p not counted in numfreebuffers", bp));
-               bp->b_flags &= ~B_INFREECNT;
-               old = atomic_fetchadd_int(&numfreebuffers, -1);
-               KASSERT(old > 0, ("numfreebuffers dropped to %d", old - 1));
-       }
+       bufcountsub(bp);
 }
 
 /*
@@ -1018,6 +1152,7 @@ bufwrite(struct buf *bp)
 {
        int oldflags;
        struct vnode *vp;
+       long space;
        int vp_md;
 
        CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
@@ -1065,7 +1200,7 @@ bufwrite(struct buf *bp)
         * Normal bwrites pipeline writes
         */
        bp->b_runningbufspace = bp->b_bufsize;
-       atomic_add_long(&runningbufspace, bp->b_runningbufspace);
+       space = atomic_fetchadd_long(&runningbufspace, bp->b_runningbufspace);
 
        if (!TD_IS_IDLETHREAD(curthread))
                curthread->td_ru.ru_oublock++;
@@ -1078,7 +1213,7 @@ bufwrite(struct buf *bp)
                int rtval = bufwait(bp);
                brelse(bp);
                return (rtval);
-       } else {
+       } else if (space > hirunningspace) {
                /*
                 * don't allow the async write to saturate the I/O
                 * system.  We will not deadlock here because
@@ -1213,13 +1348,6 @@ bdwrite(struct buf *bp)
        bqrelse(bp);
 
        /*
-        * Wakeup the buffer flushing daemon if we have a lot of dirty
-        * buffers (midpoint between our recovery point and our stall
-        * point).
-        */
-       bd_wakeup((lodirtybuffers + hidirtybuffers) / 2);
-
-       /*
         * note: we cannot initiate I/O from a bdwrite even if we wanted to,
         * due to the softdep code.
         */
@@ -1259,8 +1387,7 @@ bdirty(struct buf *bp)
        if ((bp->b_flags & B_DELWRI) == 0) {
                bp->b_flags |= /* XXX B_DONE | */ B_DELWRI;
                reassignbuf(bp);
-               atomic_add_int(&numdirtybuffers, 1);
-               bd_wakeup((lodirtybuffers + hidirtybuffers) / 2);
+               bdirtyadd();
        }
 }
 
@@ -1288,8 +1415,7 @@ bundirty(struct buf *bp)
        if (bp->b_flags & B_DELWRI) {
                bp->b_flags &= ~B_DELWRI;
                reassignbuf(bp);
-               atomic_subtract_int(&numdirtybuffers, 1);
-               numdirtywakeup(lodirtybuffers);
+               bdirtysub();
        }
        /*
         * Since it is now being written, we can clear its deferred write flag.
@@ -1357,20 +1483,18 @@ bbarrierwrite(struct buf *bp)
  *     of any vnodes we attempt to avoid the situation where a locked vnode
  *     prevents the various system daemons from flushing related buffers.
  */
-
 void
 bwillwrite(void)
 {
 
        if (numdirtybuffers >= hidirtybuffers) {
-               mtx_lock(&nblock);
+               mtx_lock(&bdirtylock);
                while (numdirtybuffers >= hidirtybuffers) {
-                       bd_wakeup(1);
-                       needsbuffer |= VFS_BIO_NEED_DIRTYFLUSH;
-                       msleep(&needsbuffer, &nblock,
-                           (PRIBIO + 4), "flswai", 0);
+                       bdirtywait = 1;
+                       msleep(&bdirtywait, &bdirtylock, (PRIBIO + 4),
+                           "flswai", 0);
                }
-               mtx_unlock(&nblock);
+               mtx_unlock(&bdirtylock);
        }
 }
 
@@ -1403,6 +1527,8 @@ buf_vm_page_count_severe(void)
 void
 brelse(struct buf *bp)
 {
+       int qindex;
+
        CTR3(KTR_BUF, "brelse(%p) vp %p flags %X",
            bp, bp->b_vp, bp->b_flags);
        KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
@@ -1441,10 +1567,8 @@ brelse(struct buf *bp)
                bp->b_flags |= B_INVAL;
                if (!LIST_EMPTY(&bp->b_dep))
                        buf_deallocate(bp);
-               if (bp->b_flags & B_DELWRI) {
-                       atomic_subtract_int(&numdirtybuffers, 1);
-                       numdirtywakeup(lodirtybuffers);
-               }
+               if (bp->b_flags & B_DELWRI)
+                       bdirtysub();
                bp->b_flags &= ~(B_DELWRI | B_CACHE);
                if ((bp->b_flags & B_VMIO) == 0) {
                        if (bp->b_bufsize)
@@ -1591,15 +1715,6 @@ brelse(struct buf *bp)
                        brelvp(bp);
        }
                        
-       /* enqueue */
-       mtx_lock(&bqlock);
-       /* Handle delayed bremfree() processing. */
-       if (bp->b_flags & B_REMFREE)
-               bremfreel(bp);
-
-       if (bp->b_qindex != QUEUE_NONE)
-               panic("brelse: free buffer onto another queue???");
-
        /*
         * If the buffer has junk contents signal it and eventually
         * clean up B_DELWRI and diassociate the vnode so that gbincore()
@@ -1620,54 +1735,26 @@ brelse(struct buf *bp)
                bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA);
                if (bp->b_vflags & BV_BKGRDINPROG)
                        panic("losing buffer 1");
-               if (bp->b_kvasize) {
-                       bp->b_qindex = QUEUE_EMPTYKVA;
-               } else {
-                       bp->b_qindex = QUEUE_EMPTY;
-               }
-               TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist);
+               if (bp->b_kvasize)
+                       qindex = QUEUE_EMPTYKVA;
+               else
+                       qindex = QUEUE_EMPTY;
+               bp->b_flags |= B_AGE;
        /* buffers with junk contents */
        } else if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) ||
            (bp->b_ioflags & BIO_ERROR)) {
                bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA);
                if (bp->b_vflags & BV_BKGRDINPROG)
                        panic("losing buffer 2");
-               bp->b_qindex = QUEUE_CLEAN;
-               TAILQ_INSERT_HEAD(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
+               qindex = QUEUE_CLEAN;
+               bp->b_flags |= B_AGE;
        /* remaining buffers */
-       } else {
-               if (bp->b_flags & B_DELWRI)
-                       bp->b_qindex = QUEUE_DIRTY;
-               else
-                       bp->b_qindex = QUEUE_CLEAN;
-               if (bp->b_flags & B_AGE) {
-                       TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp,
-                           b_freelist);
-               } else {
-                       TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp,
-                           b_freelist);
-               }
-       }
-#ifdef INVARIANTS
-       bq_len[bp->b_qindex]++;
-#endif
-       mtx_unlock(&bqlock);
-
-       /*
-        * Fixup numfreebuffers count.  The bp is on an appropriate queue
-        * unless locked.  We then bump numfreebuffers if it is not B_DELWRI.
-        * We've already handled the B_INVAL case ( B_DELWRI will be clear
-        * if B_INVAL is set ).
-        */
-
-       if (!(bp->b_flags & B_DELWRI))
-               bufcountwakeup(bp);
+       } else if (bp->b_flags & B_DELWRI)
+               qindex = QUEUE_DIRTY;
+       else
+               qindex = QUEUE_CLEAN;
 
-       /*
-        * Something we can maybe free or reuse
-        */
-       if (bp->b_bufsize || bp->b_kvasize)
-               bufspacewakeup();
+       binsfree(bp, qindex);
 
        bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF | B_DIRECT);
        if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY))
@@ -1690,7 +1777,7 @@ brelse(struct buf *bp)
 void
 bqrelse(struct buf *bp)
 {
-       struct bufobj *bo;
+       int qindex;
 
        CTR3(KTR_BUF, "bqrelse(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
        KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
@@ -1701,71 +1788,40 @@ bqrelse(struct buf *bp)
                BUF_UNLOCK(bp);
                return;
        }
+       bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
 
-       bo = bp->b_bufobj;
        if (bp->b_flags & B_MANAGED) {
-               if (bp->b_flags & B_REMFREE) {
-                       mtx_lock(&bqlock);
-                       bremfreel(bp);
-                       mtx_unlock(&bqlock);
-               }
-               bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
-               BUF_UNLOCK(bp);
-               return;
+               if (bp->b_flags & B_REMFREE)
+                       bremfreef(bp);
+               goto out;
        }
 
-       mtx_lock(&bqlock);
-       /* Handle delayed bremfree() processing. */
-       if (bp->b_flags & B_REMFREE)
-               bremfreel(bp);
-
-       if (bp->b_qindex != QUEUE_NONE)
-               panic("bqrelse: free buffer onto another queue???");
        /* buffers with stale but valid contents */
        if (bp->b_flags & B_DELWRI) {
-               bp->b_qindex = QUEUE_DIRTY;
-               TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist);
-#ifdef INVARIANTS
-               bq_len[bp->b_qindex]++;
-#endif
+               qindex = QUEUE_DIRTY;
        } else {
+               if ((bp->b_flags & B_DELWRI) == 0 &&
+                   (bp->b_xflags & BX_VNDIRTY))
+                       panic("bqrelse: not dirty");
                /*
                 * BKGRDINPROG can only be set with the buf and bufobj
                 * locks both held.  We tolerate a race to clear it here.
                 */
-               if (!buf_vm_page_count_severe() ||
-                   (bp->b_vflags & BV_BKGRDINPROG)) {
-                       bp->b_qindex = QUEUE_CLEAN;
-                       TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp,
-                           b_freelist);
-#ifdef INVARIANTS
-                       bq_len[QUEUE_CLEAN]++;
-#endif
-               } else {
+               if (buf_vm_page_count_severe() &&
+                   (bp->b_vflags & BV_BKGRDINPROG) == 0) {
                        /*
                         * We are too low on memory, we have to try to free
                         * the buffer (most importantly: the wired pages
                         * making up its backing store) *now*.
                         */
-                       mtx_unlock(&bqlock);
                        brelse(bp);
                        return;
                }
+               qindex = QUEUE_CLEAN;
        }
-       mtx_unlock(&bqlock);
-
-       if ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI))
-               bufcountwakeup(bp);
+       binsfree(bp, qindex);
 
-       /*
-        * Something we can maybe free or reuse.
-        */
-       if (bp->b_bufsize && !(bp->b_flags & B_DELWRI))
-               bufspacewakeup();
-
-       bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
-       if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY))
-               panic("bqrelse: not dirty");
+out:
        /* unlock */
        BUF_UNLOCK(bp);
 }
@@ -2000,7 +2056,7 @@ getnewbuf_bufd_help(struct vnode *vp, in
        char *waitmsg;
        int fl, flags, norunbuf;
 
-       mtx_assert(&bqlock, MA_OWNED);
+       mtx_assert(&bqclean, MA_OWNED);
 
        if (defrag) {
                flags = VFS_BIO_NEED_BUFSPACE;
@@ -2015,7 +2071,7 @@ getnewbuf_bufd_help(struct vnode *vp, in
        mtx_lock(&nblock);
        needsbuffer |= flags;
        mtx_unlock(&nblock);
-       mtx_unlock(&bqlock);
+       mtx_unlock(&bqclean);
 
        bd_speedup();   /* heeeelp */
        if ((gbflags & GB_NOWAIT_BD) != 0)
@@ -2038,7 +2094,7 @@ getnewbuf_bufd_help(struct vnode *vp, in
                            (td->td_pflags & TDP_NORUNNINGBUF);
                        /* play bufdaemon */
                        td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF;
-                       fl = buf_do_flush(vp);
+                       fl = buf_flush(vp, flushbufqtarget);
                        td->td_pflags &= norunbuf;
                        mtx_lock(&nblock);
                        if (fl != 0)
@@ -2060,7 +2116,7 @@ getnewbuf_reuse_bp(struct buf *bp, int q
        CTR6(KTR_BUF, "getnewbuf(%p) vp %p flags %X kvasize %d bufsize %d "
            "queue %d (recycling)", bp, bp->b_vp, bp->b_flags,
             bp->b_kvasize, bp->b_bufsize, qindex);
-       mtx_assert(&bqlock, MA_NOTOWNED);
+       mtx_assert(&bqclean, MA_NOTOWNED);
 
        /*
         * Note: we no longer distinguish between VMIO and non-VMIO
@@ -2156,7 +2212,7 @@ restart:
         * where we cannot backup.
         */
        nbp = NULL;
-       mtx_lock(&bqlock);
+       mtx_lock(&bqclean);
        if (!defrag && unmapped) {
                nqindex = QUEUE_EMPTY;
                nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
@@ -2267,14 +2323,14 @@ restart:
                    ("getnewbuf: inconsistent queue %d bp %p", qindex, bp));
 
                bremfreel(bp);
-               mtx_unlock(&bqlock);
+               mtx_unlock(&bqclean);
                /*
                 * NOTE:  nbp is now entirely invalid.  We can only restart
                 * the scan from this point on.
                 */
 
                getnewbuf_reuse_bp(bp, qindex);
-               mtx_assert(&bqlock, MA_NOTOWNED);
+               mtx_assert(&bqclean, MA_NOTOWNED);
 
                /*
                 * If we are defragging then free the buffer.
@@ -2335,10 +2391,6 @@ restart:
  *             We have insufficient buffer space
  *             buffer_map is too fragmented ( space reservation fails )
  *             If we have to flush dirty buffers ( but we try to avoid this )
- *
- *     To avoid VFS layer recursion we do not flush dirty buffers ourselves.
- *     Instead we ask the buf daemon to do it for us.  We attempt to
- *     avoid piecemeal wakeups of the pageout daemon.
  */
 static struct buf *
 getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int size, int maxsize,
@@ -2379,11 +2431,11 @@ restart:
         * Generally we are sleeping due to insufficient buffer space.
         */
        if (bp == NULL) {
-               mtx_assert(&bqlock, MA_OWNED);
+               mtx_assert(&bqclean, MA_OWNED);
                getnewbuf_bufd_help(vp, gbflags, slpflag, slptimeo, defrag);
-               mtx_assert(&bqlock, MA_NOTOWNED);
+               mtx_assert(&bqclean, MA_NOTOWNED);
        } else if ((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == GB_UNMAPPED) {
-               mtx_assert(&bqlock, MA_NOTOWNED);
+               mtx_assert(&bqclean, MA_NOTOWNED);
 
                bfreekva(bp);
                bp->b_flags |= B_UNMAPPED;
@@ -2393,7 +2445,7 @@ restart:
                atomic_add_long(&unmapped_bufspace, bp->b_kvasize);
                atomic_add_int(&bufreusecnt, 1);
        } else {
-               mtx_assert(&bqlock, MA_NOTOWNED);
+               mtx_assert(&bqclean, MA_NOTOWNED);
 
                /*
                 * We finally have a valid bp.  We aren't quite out of the
@@ -2464,18 +2516,20 @@ static struct kproc_desc buf_kp = {
 SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp);
 
 static int
-buf_do_flush(struct vnode *vp)
+buf_flush(struct vnode *vp, int target)
 {
        int flushed;
 
-       flushed = flushbufqueues(vp, QUEUE_DIRTY, 0);
+       flushed = flushbufqueues(vp, target, 0);
        if (flushed == 0) {
                /*
                 * Could not find any buffers without rollback
                 * dependencies, so just write the first one
                 * in the hopes of eventually making progress.
                 */
-               flushbufqueues(vp, QUEUE_DIRTY, 1);
+               if (vp != NULL && target > 2)
+                       target /= 2;
+               flushbufqueues(vp, target, 1);
        }
        return (flushed);
 }
@@ -2483,7 +2537,7 @@ buf_do_flush(struct vnode *vp)
 static void
 buf_daemon()
 {
-       int lodirtysave;
+       int lodirty;
 
        /*
         * This process needs to be suspended prior to shutdown sync.
@@ -2501,23 +2555,21 @@ buf_daemon()
                mtx_unlock(&bdlock);
 
                kproc_suspend_check(bufdaemonproc);
-               lodirtysave = lodirtybuffers;
+               lodirty = lodirtybuffers;
                if (bd_speedupreq) {
-                       lodirtybuffers = numdirtybuffers / 2;
+                       lodirty = numdirtybuffers / 2;
                        bd_speedupreq = 0;
                }
                /*
                 * Do the flush.  Limit the amount of in-transit I/O we
                 * allow to build up, otherwise we would completely saturate
-                * the I/O system.  Wakeup any waiting processes before we
-                * normally would so they can run in parallel with our drain.
+                * the I/O system.
                 */
-               while (numdirtybuffers > lodirtybuffers) {
-                       if (buf_do_flush(NULL) == 0)
+               while (numdirtybuffers > lodirty) {
+                       if (buf_flush(NULL, numdirtybuffers - lodirty) == 0)
                                break;
                        kern_yield(PRI_USER);
                }
-               lodirtybuffers = lodirtysave;
 
                /*
                 * Only clear bd_request if we have reached our low water
@@ -2526,8 +2578,8 @@ buf_daemon()
                 * built up, within reason.
                 *
                 * If we were unable to hit our low water mark and couldn't
-                * find any flushable buffers, we sleep half a second.
-                * Otherwise we loop immediately.
+                * find any flushable buffers, we sleep for a short period
+                * to avoid endless loops on unlockable buffers.
                 */
                mtx_lock(&bdlock);
                if (numdirtybuffers <= lodirtybuffers) {
@@ -2537,6 +2589,14 @@ buf_daemon()
                         * The sleep is just so the suspend code works.
                         */
                        bd_request = 0;
+                       /*
+                        * Do an extra wakeup in case dirty threshold
+                        * changed via sysctl and the explicit transition
+                        * out of shortfall was missed.
+                        */
+                       bdirtywakeup();
+                       if (runningbufspace <= lorunningspace)
+                               runningwakeup();
                        msleep(&bd_request, &bdlock, PVM, "psleep", hz);
                } else {
                        /*
@@ -2561,7 +2621,7 @@ SYSCTL_INT(_vfs, OID_AUTO, flushwithdeps
     0, "Number of buffers flushed with dependecies that require rollbacks");
 
 static int
-flushbufqueues(struct vnode *lvp, int queue, int flushdeps)
+flushbufqueues(struct vnode *lvp, int target, int flushdeps)
 {
        struct buf *sentinel;
        struct vnode *vp;
@@ -2569,19 +2629,14 @@ flushbufqueues(struct vnode *lvp, int qu
        struct buf *bp;
        int hasdeps;
        int flushed;
-       int target;
+       int queue;
 
-       if (lvp == NULL) {
-               target = numdirtybuffers - lodirtybuffers;
-               if (flushdeps && target > 2)
-                       target /= 2;
-       } else
-               target = flushbufqtarget;
        flushed = 0;
+       queue = QUEUE_DIRTY;
        bp = NULL;
        sentinel = malloc(sizeof(struct buf), M_TEMP, M_WAITOK | M_ZERO);
        sentinel->b_qindex = QUEUE_SENTINEL;
-       mtx_lock(&bqlock);
+       mtx_lock(&bqdirty);
        TAILQ_INSERT_HEAD(&bufqueues[queue], sentinel, b_freelist);
        while (flushed != target) {
                bp = TAILQ_NEXT(sentinel, b_freelist);
@@ -2620,11 +2675,10 @@ flushbufqueues(struct vnode *lvp, int qu
                }
                if (bp->b_flags & B_INVAL) {

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
_______________________________________________
svn-src-head@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"

Reply via email to