Greetings all, 

Here's an up-to-date version of the buffer flipper that applies to
post-hackathon -current. 

This diff (~beck/viagra.diff15) contains one important change from
the previous version. In the old cache, since buffers were never freed,
we would put B_INVAL buffers at the head of the clean LRU.
(B_INVAL buffers do not contain cacheable data - for example, when a
remove happens and a file's link count drops to 0, all of its buffers
are marked B_INVAL.) 

After some work with tedu at the end of the hackathon I noticed that
we kept a lot of data in the cache for removed files - this was why.
Moving such buffers to the head of the LRU (behaviour retained from
the old static buffer cache) does not make sense with the modern
dynamic one, so this diff changes brelse() to free B_INVAL buffers
right away instead of caching them. 
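
In brelse() terms the change amounts to roughly this (a simplified
sketch of the logic in the diff below, not the exact code):

	if (ISSET(bp->b_flags, B_INVAL)) {
		/* Invalid data: don't cache it, wake any waiter and free it now. */
		if (ISSET(bp->b_flags, B_WANTED)) {
			CLR(bp->b_flags, B_WANTED);
			wakeup(bp);
		}
		buf_put(bp);
	} else {
		/* Valid data: keep it, at the tail of the right clean/dirty queue. */
		binstailfree(bp, bufq);
	}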

I'm running this on multiple arches and on my NFS servers feeding them. 
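
If you want to push it, the knobs go in /etc/sysctl.conf as usual - for
example (90 is what my NFS server runs with, pick whatever suits your
machine):

	# buffer cache may use up to 90% of memory
	kern.bufcachepercent=90
	# make sure you can get into ddb on the console if it breaks
	ddb.console=1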

-Bob


On Mon, Jun 03, 2013 at 09:20:08AM -0600, Bob Beck wrote:
> 
> Here's a new version of the buffer flipper that fixes
> a problem found by krw@. All comments from before still apply:
> 
> > You too can have a GIANT buffer cache.... etc. etc... 
> > 
> > After much bug fighting over the last 6 months in a number of places
> > in the midlayer and now uvm, I think it's about time to shop this
> > around again. 
> > 
> > This will only make a difference on amd64, and only if you have 4 GB
> > or more of RAM. What it does is allow the high (non-DMA-reachable)
> > memory to be used for buffer cache pages. It will use your configured
> > buffer cache percentage of both DMA'able and above-DMA'able pages for
> > the cache, migrating the oldest cache pages into high memory. Pages
> > are flipped back into DMA'able memory if they are needed for I/O. 
> > 
> > Notwithstanding that it only "matters" on amd64, it does change how
> > the world works a bit, and therefore requires testing everywhere. It
> > has survived multiple make build/make release test cycles on my
> > machines (amd64, i386, zaurus, sparc, sparc64, hppa) with various
> > settings of bufcachepercent, and is running on my NFS server
> > (bufcachepercent=90) without any complaints throughout that - it has
> > been running on my laptop for a long time now. 
> > 
> > If you try it and have trouble (i.e. any new regressions), please
> > ensure your machine's console is accessible (check that you have
> > ddb.console=1 in /etc/sysctl.conf), and if you do hit problems please
> > try to get
> > 
> > trace
> > ps
> > show bcstats
> > show uvm
> > 
> > from ddb if at all possible. 
> > 
> > Please let me know how you do with it, and most importantly what
> > you try it on/with. 
> > 
> -Bob
> 
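
For anyone who wants the shape of the flip before wading into the diff:
buf_dma() (called from VOP_STRATEGY() on cache buffers) does roughly the
following. This is a simplified outline of the vfs_bio.c code below, with
the sleeping and bookkeeping details left out:

	/* Make sure bp is DMA reachable before I/O is started on it. */
	while (!ISSET(bp->b_flags, B_DMA) &&
	    bcstats.dmapages + atop(bp->b_bufsize) > b_dmamaxpages) {
		b = TAILQ_FIRST(&bufqueues[BQ_CLEANL]);
		bremfree(b);
		buf_acquire_nomap(b);
		if (buf_realloc_pages(b, &high_constraint, UVM_PLA_NOWAIT) == 0) {
			/* flipped an old clean buffer up into high memory */
			binstailfree(b, &bufqueues[BQ_CLEANH]);
			buf_release(b);
		} else {
			/* couldn't move it high, just free it instead */
			buf_release(b);
			buf_put(b);
		}
	}
	if (!ISSET(bp->b_flags, B_DMA))
		buf_realloc_pages(bp, &dma_constraint, UVM_PLA_WAITOK);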

Index: sys/kern/kern_sysctl.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_sysctl.c,v
retrieving revision 1.236
diff -u -p -r1.236 kern_sysctl.c
--- sys/kern/kern_sysctl.c      9 Jun 2013 13:10:19 -0000       1.236
+++ sys/kern/kern_sysctl.c      9 Jun 2013 15:27:04 -0000
@@ -110,6 +110,7 @@ extern struct disklist_head disklist;
 extern fixpt_t ccpu;
 extern  long numvnodes;
 extern u_int mcllivelocks;
+extern psize_t b_dmapages_total, b_highpages_total, b_dmamaxpages;
 
 extern void nmbclust_update(void);
 
@@ -566,8 +567,8 @@ kern_sysctl(int *name, u_int namelen, vo
                return (sysctl_cptime2(name + 1, namelen -1, oldp, oldlenp,
                    newp, newlen));
        case KERN_CACHEPCT: {
-               u_int64_t dmapages;
-               int opct, pgs;
+               psize_t pgs;
+               int opct;
                opct = bufcachepercent;
                error = sysctl_int(oldp, oldlenp, newp, newlen,
                    &bufcachepercent);
@@ -577,9 +578,11 @@ kern_sysctl(int *name, u_int namelen, vo
                        bufcachepercent = opct;
                        return (EINVAL);
                }
-               dmapages = uvm_pagecount(&dma_constraint);
                if (bufcachepercent != opct) {
-                       pgs = bufcachepercent * dmapages / 100;
+                       pgs = (b_highpages_total + b_dmapages_total)
+                           * bufcachepercent / 100;
+                       b_dmamaxpages = b_dmapages_total * bufcachepercent
+                           / 100;
                        bufadjust(pgs); /* adjust bufpages */
                        bufhighpages = bufpages; /* set high water mark */
                }
Index: sys/kern/spec_vnops.c
===================================================================
RCS file: /cvs/src/sys/kern/spec_vnops.c,v
retrieving revision 1.71
diff -u -p -r1.71 spec_vnops.c
--- sys/kern/spec_vnops.c       28 Mar 2013 03:29:44 -0000      1.71
+++ sys/kern/spec_vnops.c       3 Jun 2013 14:51:14 -0000
@@ -457,7 +457,9 @@ spec_strategy(void *v)
        struct vop_strategy_args *ap = v;
        struct buf *bp = ap->a_bp;
        int maj = major(bp->b_dev);
-       
+
+       if (!ISSET(bp->b_flags, B_DMA) && ISSET(bp->b_flags, B_BC))
+               panic("bogus buf %p passed to spec_strategy", bp);
        if (LIST_FIRST(&bp->b_dep) != NULL)
                buf_start(bp);
 
Index: sys/kern/vfs_bio.c
===================================================================
RCS file: /cvs/src/sys/kern/vfs_bio.c,v
retrieving revision 1.146
diff -u -p -r1.146 vfs_bio.c
--- sys/kern/vfs_bio.c  17 Feb 2013 17:39:29 -0000      1.146
+++ sys/kern/vfs_bio.c  9 Jun 2013 16:14:52 -0000
@@ -2,6 +2,7 @@
 /*     $NetBSD: vfs_bio.c,v 1.44 1996/06/11 11:15:36 pk Exp $  */
 
 /*
+ * Copyright (c) 2012,2013 Bob Beck <b...@openbsd.org>
  * Copyright (c) 1994 Christopher G. Demetriou
  * Copyright (c) 1982, 1986, 1989, 1993
  *     The Regents of the University of California.  All rights reserved.
@@ -63,12 +64,17 @@
 /*
  * Definitions for the buffer free lists.
  */
-#define        BQUEUES         2               /* number of free buffer queues */
+#define        BQUEUES         3               /* number of free buffer queues */
 
 #define        BQ_DIRTY        0               /* LRU queue with dirty buffers */
-#define        BQ_CLEAN        1               /* LRU queue with clean buffers */
+#define        BQ_CLEANL       1               /* LRU queue with clean low buffers */
+#define        BQ_CLEANH       2               /* LRU queue with clean high buffers */
 
 TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES];
+int    bfreeclean(int, struct bqueues *);
+struct uvm_constraint_range high_constraint;
+psize_t b_dmapages_total, b_highpages_total, b_dmamaxpages;
+int needda;
 int nobuffers;
 int needbuffer;
 struct bio_ops bioops;
@@ -110,30 +116,49 @@ bremfree(struct buf *bp)
        struct bqueues *dp = NULL;
 
        splassert(IPL_BIO);
+       KASSERT(ISSET(bp->b_flags, B_BC));
+       KASSERT(!ISSET(bp->b_flags, B_BUSY));
+       if (bp->b_freelist.tqe_next == NOLIST ||
+           bp->b_freelist.tqe_next == (void *)-1)
+               panic("bremfree: - buf %p not on a free list!", bp);
 
-       /*
-        * We only calculate the head of the freelist when removing
-        * the last element of the list as that is the only time that
-        * it is needed (e.g. to reset the tail pointer).
-        *
-        * NB: This makes an assumption about how tailq's are implemented.
-        */
-       if (TAILQ_NEXT(bp, b_freelist) == NULL) {
-               for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
-                       if (dp->tqh_last == &TAILQ_NEXT(bp, b_freelist))
-                               break;
-               if (dp == &bufqueues[BQUEUES])
-                       panic("bremfree: lost tail");
-       }
        if (!ISSET(bp->b_flags, B_DELWRI)) {
+               if (ISSET(bp->b_flags, B_DMA))
+                       dp = &bufqueues[BQ_CLEANL];
+               else
+                       dp = &bufqueues[BQ_CLEANH];
                bcstats.numcleanpages -= atop(bp->b_bufsize);
        } else {
+               dp = &bufqueues[BQ_DIRTY];
                bcstats.numdirtypages -= atop(bp->b_bufsize);
                bcstats.delwribufs--;
        }
        TAILQ_REMOVE(dp, bp, b_freelist);
 }
 
+int
+bfreeclean(int npages, struct bqueues *dp)
+{
+       struct buf *bp;
+       int i = 0;
+
+       splassert(IPL_BIO);
+       while (i < npages) {
+               bp = TAILQ_FIRST(dp);
+               if (bp == NULL)
+                       return(-1);
+               i += atop(bp->b_bufsize);
+               bremfree(bp);
+               if (bp->b_vp) {
+                       RB_REMOVE(buf_rb_bufs,
+                           &bp->b_vp->v_bufs_tree, bp);
+                       brelvp(bp);
+               }
+               buf_put(bp);
+       }
+       return(0);
+}
+
 void
 buf_put(struct buf *bp)
 {
@@ -158,7 +183,7 @@ buf_put(struct buf *bp)
        bcstats.numbufs--;
 
        if (buf_dealloc_mem(bp) != 0)
-               return;
+                return;
        pool_put(&bufpool, bp);
 }
 
@@ -168,12 +193,21 @@ buf_put(struct buf *bp)
 void
 bufinit(void)
 {
-       u_int64_t dmapages;
        struct bqueues *dp;
 
-       dmapages = uvm_pagecount(&dma_constraint);
-       /* take away a guess at how much of this the kernel will consume */
-       dmapages -= (atop(physmem) - atop(uvmexp.free));
+       /* How much DMA accessible memory will we consider? */
+       b_dmapages_total = uvm_pagecount(&dma_constraint);
+       /* Take away a guess at how much of this the kernel will consume. */
+       b_dmapages_total -= (atop(physmem) - atop(uvmexp.free));
+
+       /* See if we have memory above the dma accessible region. */
+       high_constraint.ucr_low = dma_constraint.ucr_high;
+       high_constraint.ucr_high = no_constraint.ucr_high;
+       if (high_constraint.ucr_low != high_constraint.ucr_high) {
+               high_constraint.ucr_low++;
+               b_highpages_total = uvm_pagecount(&high_constraint);
+       } else
+               b_highpages_total = 0;
 
        /*
         * If MD code doesn't say otherwise, use up to 10% of DMA'able
@@ -189,18 +223,18 @@ bufinit(void)
        KASSERT(bufcachepercent <= 90);
        KASSERT(bufcachepercent >= 5);
        if (bufpages == 0)
-               bufpages = dmapages * bufcachepercent / 100;
+               bufpages = (b_dmapages_total + b_highpages_total)
+                   * bufcachepercent / 100;
        if (bufpages < BCACHE_MIN)
                bufpages = BCACHE_MIN;
-       KASSERT(bufpages < dmapages);
 
        bufhighpages = bufpages;
-
+       b_dmamaxpages = b_dmapages_total * bufcachepercent / 100;
        /*
         * Set the base backoff level for the buffer cache.  We will
         * not allow uvm to steal back more than this number of pages.
         */
-       buflowpages = dmapages * 5 / 100;
+       buflowpages = b_dmapages_total * 5 / 100;
        if (buflowpages < BCACHE_MIN)
                buflowpages = BCACHE_MIN;
 
@@ -267,7 +301,6 @@ bufinit(void)
 void
 bufadjust(int newbufpages)
 {
-       struct buf *bp;
        int s, growing = 0;
 
        if (newbufpages < buflowpages)
@@ -290,15 +323,11 @@ bufadjust(int newbufpages)
         * If we have more buffers allocated than our new low water mark,
         * immediately free them.
         */
-       while (!growing && (bp = TAILQ_FIRST(&bufqueues[BQ_CLEAN])) &&
-           (bcstats.numbufpages > lopages)) {
-               bremfree(bp);
-               if (bp->b_vp) {
-                       RB_REMOVE(buf_rb_bufs,
-                           &bp->b_vp->v_bufs_tree, bp);
-                       brelvp(bp);
-               }
-               buf_put(bp);
+       if (!growing && (bcstats.numbufpages > lopages)) {
+               if (bfreeclean(bcstats.numbufpages - lopages,
+                       &bufqueues[BQ_CLEANH]) != 0)
+                       (void) bfreeclean(bcstats.numbufpages - lopages,
+                           &bufqueues[BQ_CLEANL]);
        }
 
        /*
@@ -321,8 +350,10 @@ bufbackoff(struct uvm_constraint_range *
        /*
         * Back off "size" buffer cache pages. Called by the page
         * daemon to consume buffer cache pages rather than scanning.
+        * Also called by the buffer cache to back off if memory
+        * allocation in a particular range fails.
         *
-        * It returns 0 to the pagedaemon to indicate that it has
+        * It returns 0 to the caller to indicate that it has
         * succeeded in freeing enough pages. It returns -1 to
         * indicate that it could not and the pagedaemon should take
         * other measures.
@@ -340,8 +371,23 @@ bufbackoff(struct uvm_constraint_range *
                return(-1);
        if (bufpages - pdelta < buflowpages)
                pdelta = bufpages - buflowpages;
+
        oldbufpages = bufpages;
-       bufadjust(bufpages - pdelta);
+       if (b_highpages_total
+           && (range->ucr_high <= dma_constraint.ucr_high)) {
+               /*
+                * Free up DMA accessible memory by moving pages to
+                * the high range.
+                */
+               if (bufhigh(pdelta) == 0)
+                       return(0); /* we moved enough pages up high */
+               else {
+                       bufadjust(bufpages - pdelta); /* shrink the cache. */
+               }
+       } else {
+               /* Free memory by shrinking the cache. */
+               bufadjust(bufpages - pdelta);
+       }
        if (oldbufpages - bufpages < size)
                return (-1); /* we did not free what we were asked */
        else
@@ -526,12 +572,18 @@ bread_cluster(struct vnode *vp, daddr64_
        for (i = 1; i < howmany; i++) {
                bcstats.pendingreads++;
                bcstats.numreads++;
-               SET(xbpp[i]->b_flags, B_READ | B_ASYNC);
+               /*
+                * We set B_DMA here because bp above will be B_DMA,
+                * and we are playing buffer slice-n-dice games from
+                * the memory allocated in bp.
+                */
+               SET(xbpp[i]->b_flags, B_DMA | B_READ | B_ASYNC);
                xbpp[i]->b_blkno = sblkno + (i * inc);
                xbpp[i]->b_bufsize = xbpp[i]->b_bcount = size;
                xbpp[i]->b_data = NULL;
                xbpp[i]->b_pobj = bp->b_pobj;
                xbpp[i]->b_poffs = bp->b_poffs + (i * size);
+               buf_dma(xbpp[i]);
        }
 
        KASSERT(bp->b_lblkno == blkno + 1);
@@ -760,8 +812,11 @@ brelse(struct buf *bp)
 
        if (ISSET(bp->b_flags, B_INVAL)) {
                /*
-                * If the buffer is invalid, place it in the clean queue, so it
-                * can be reused.
+                * If the buffer is invalid, free it now rather than
+                * putting it on any queue and wasting cache space.
+                *
+                * XXX we could queue it here for a later TRIM operation.
+                *
                 */
                if (LIST_FIRST(&bp->b_dep) != NULL)
                        buf_deallocate(bp);
@@ -778,44 +833,35 @@ brelse(struct buf *bp)
                bp->b_vp = NULL;
 
                /*
-                * If the buffer has no associated data, place it back in the
-                * pool.
+                * Wake up any processes waiting for _this_ buffer to
+                * become free. They are not allowed to grab it
+                * since it will be freed. But the only sleeper is
+                * getblk and it's restarting the operation after
+                * sleep.
                 */
-               if (bp->b_data == NULL && bp->b_pobj == NULL) {
-                       /*
-                        * Wake up any processes waiting for _this_ buffer to
-                        * become free. They are not allowed to grab it
-                        * since it will be freed. But the only sleeper is
-                        * getblk and it's restarting the operation after
-                        * sleep.
-                        */
-                       if (ISSET(bp->b_flags, B_WANTED)) {
-                               CLR(bp->b_flags, B_WANTED);
-                               wakeup(bp);
-                       }
-                       if (bp->b_vp != NULL)
-                               RB_REMOVE(buf_rb_bufs,
-                                   &bp->b_vp->v_bufs_tree, bp);
-                       buf_put(bp);
-                       splx(s);
-                       return;
+               if (ISSET(bp->b_flags, B_WANTED)) {
+                       CLR(bp->b_flags, B_WANTED);
+                       wakeup(bp);
                }
-
-               bcstats.numcleanpages += atop(bp->b_bufsize);
-               binsheadfree(bp, &bufqueues[BQ_CLEAN]);
+               if (ISSET(bp->b_flags, B_DMA) && needda)
+                       wakeup(&needda);
+               buf_put(bp);
        } else {
                /*
                 * It has valid data.  Put it on the end of the appropriate
                 * queue, so that it'll stick around for as long as possible.
                 */
 
-               if (!ISSET(bp->b_flags, B_DELWRI)) {
-                       bcstats.numcleanpages += atop(bp->b_bufsize);
-                       bufq = &bufqueues[BQ_CLEAN];
-               } else {
+               if (ISSET(bp->b_flags, B_DELWRI)) {
                        bcstats.numdirtypages += atop(bp->b_bufsize);
                        bcstats.delwribufs++;
                        bufq = &bufqueues[BQ_DIRTY];
+               } else {
+                       bcstats.numcleanpages += atop(bp->b_bufsize);
+                       if (ISSET(bp->b_flags, B_DMA))
+                               bufq = &bufqueues[BQ_CLEANL];
+                       else
+                               bufq = &bufqueues[BQ_CLEANH];
                }
                if (ISSET(bp->b_flags, B_AGE)) {
                        binsheadfree(bp, bufq);
@@ -824,12 +870,20 @@ brelse(struct buf *bp)
                        binstailfree(bp, bufq);
                        bp->b_synctime = time_uptime + 300;
                }
-       }
-
-       /* Unlock the buffer. */
-       CLR(bp->b_flags, (B_AGE | B_ASYNC | B_NOCACHE | B_DEFERRED));
-       buf_release(bp);
+               /* Unlock the buffer. */
+               CLR(bp->b_flags, (B_AGE | B_ASYNC | B_NOCACHE | B_DEFERRED));
+               buf_release(bp);
 
+               if (ISSET(bp->b_flags, B_DMA) && needda) {
+                       wakeup(&needda);
+               }
+               /* Wake up any processes waiting for _this_ buffer to
+                * become free. */
+               if (ISSET(bp->b_flags, B_WANTED)) {
+                       CLR(bp->b_flags, B_WANTED);
+                       wakeup(bp);
+               }
+       }
        /* Wake up syncer and cleaner processes waiting for buffers. */
        if (nobuffers) {
                nobuffers = 0;
@@ -843,12 +897,6 @@ brelse(struct buf *bp)
                wakeup(&needbuffer);
        }
 
-       /* Wake up any processes waiting for _this_ buffer to become free. */
-       if (ISSET(bp->b_flags, B_WANTED)) {
-               CLR(bp->b_flags, B_WANTED);
-               wakeup(bp);
-       }
-
        splx(s);
 }
 
@@ -890,16 +938,6 @@ getblk(struct vnode *vp, daddr64_t blkno
        struct buf b;
        int s, error;
 
-       /*
-        * XXX
-        * The following is an inlined version of 'incore()', but with
-        * the 'invalid' test moved to after the 'busy' test.  It's
-        * necessary because there are some cases in which the NFS
-        * code sets B_INVAL prior to writing data to the server, but
-        * in which the buffers actually contain valid data.  In this
-        * case, we can't allow the system to allocate a new buffer for
-        * the block until the write is finished.
-        */
 start:
        s = splbio();
        b.b_lblkno = blkno;
@@ -987,18 +1025,17 @@ buf_get(struct vnode *vp, daddr64_t blkn
                 * free down to the low water mark.
                 */
                if (bcstats.numbufpages + npages > hipages) {
-                       while ((bcstats.numbufpages > lopages) &&
-                           (bp = TAILQ_FIRST(&bufqueues[BQ_CLEAN]))) {
-                               bremfree(bp);
-                               if (bp->b_vp) {
-                                       RB_REMOVE(buf_rb_bufs,
-                                           &bp->b_vp->v_bufs_tree, bp);
-                                       brelvp(bp);
-                               }
-                               buf_put(bp);
-                       }
+                       if (bfreeclean(bcstats.numbufpages - lopages,
+                               &bufqueues[BQ_CLEANH]) != 0)
+                               (void) bfreeclean(bcstats.numbufpages
+                                   - lopages, &bufqueues[BQ_CLEANL]);
                }
 
+
+               if (b_highpages_total && bcstats.dmapages + npages >
+                   b_dmamaxpages)
+                       bufhigh(bcstats.dmapages + npages - b_dmamaxpages);
+
                /*
                 * If we get here, we tried to free the world down
                 * above, and couldn't get down - Wake the cleaner
@@ -1029,6 +1066,8 @@ buf_get(struct vnode *vp, daddr64_t blkn
                return (NULL);
        }
 
+       /* Mark buffer as the cache's */
+       SET(bp->b_flags, B_BC);
        bp->b_freelist.tqe_next = NOLIST;
        bp->b_synctime = time_uptime + 300;
        bp->b_dev = NODEV;
@@ -1068,6 +1107,7 @@ buf_get(struct vnode *vp, daddr64_t blkn
        if (size) {
                buf_alloc_pages(bp, round_page(size));
                buf_map(bp);
+               buf_dma(bp);
        }
 
        splx(s);
@@ -1238,6 +1278,128 @@ biodone(struct buf *bp)
        }
 }
 
+/*
+ * Ensure buffer is DMA reachable
+ */
+void
+buf_dma(struct buf *buf)
+{
+       struct buf *b;
+       int s;
+
+start:
+       KASSERT(ISSET(buf->b_flags, B_BC));
+       KASSERT(ISSET(buf->b_flags, B_BUSY));
+       KASSERT(buf->b_pobj != NULL);
+       s = splbio();
+       /*
+        * If we are adding to the queue, and we are not the cleaner or
+        * the syncer, ensure we free down below the max
+        */
+       while (b_highpages_total &&
+           curproc != syncerproc && curproc != cleanerproc &&
+           (!ISSET(buf->b_flags, B_DMA)) &&
+           (bcstats.dmapages > (b_dmamaxpages - atop(buf->b_bufsize)))) {
+               b = TAILQ_FIRST(&bufqueues[BQ_CLEANL]);
+               if (b == NULL) {
+                       /* no non-busy buffers. */
+                       needda++;
+                       tsleep(&needda, PRIBIO, "needda", 0);
+                       needda--;
+                       splx(s);
+                       goto start;
+               } else {
+                       KASSERT(!ISSET(b->b_flags, B_BUSY));
+                       bremfree(b);
+                       buf_acquire_nomap(b);
+                       if (buf_realloc_pages(b, &high_constraint,
+                           UVM_PLA_NOWAIT) == 0) {
+                               /* move the buffer to high memory if we can */
+                               if (ISSET(b->b_flags, B_DMA))
+                                       panic("B_DMA after high flip %p", b);
+                               binstailfree(b, &bufqueues[BQ_CLEANH]);
+                               buf_release(b);
+                       } else {
+                               /* otherwise just free the buffer */
+                               buf_release(b);
+                               if (b->b_vp) {
+                                       RB_REMOVE(buf_rb_bufs,
+                                           &b->b_vp->v_bufs_tree, b);
+                                       brelvp(b);
+                               }
+                               buf_put(b);
+                       }
+               }
+       }
+       if (!ISSET(buf->b_flags, B_DMA)) {
+               /* move buf to dma reachable memory */
+               (void) buf_realloc_pages(buf, &dma_constraint, UVM_PLA_WAITOK);
+               if (!ISSET(buf->b_flags, B_DMA))
+                       panic("non-dma buffer after dma move %p\n", buf);
+       }
+       splx(s);
+       return;
+}
+
+/*
+ * Attempt to flip "delta" dma reachable cache pages high. return 0 if we can,
+ * -1 otherwise.
+ */
+int
+bufhigh(int delta)
+{
+       psize_t newdmapages;
+       struct buf *b, *bn;
+       int s;
+       if (!b_highpages_total)
+               return(-1);
+       s = splbio();
+       newdmapages = bcstats.dmapages - delta;
+       b = TAILQ_FIRST(&bufqueues[BQ_CLEANL]);
+       while ((bcstats.dmapages > newdmapages) && (b != NULL)) {
+               while (b != NULL && ISSET(b->b_flags, B_BUSY)) {
+                       b = TAILQ_NEXT(b, b_freelist);
+               }
+               if (b != NULL) {
+                       bn = TAILQ_NEXT(b, b_freelist);
+                       bremfree(b);
+                       buf_acquire_nomap(b);
+               moveit:
+                       if (buf_realloc_pages(b, &high_constraint,
+                           UVM_PLA_NOWAIT) == 0) {
+                               /* move the buffer to high memory if we can */
+                               if (ISSET(b->b_flags, B_DMA))
+                                       panic("B_DMA after high flip %p", b);
+                               binstailfree(b, &bufqueues[BQ_CLEANH]);
+                               buf_release(b);
+                       } else {
+                               /* free up some high memory and try again. */
+                               if (bfreeclean(delta, &bufqueues[BQ_CLEANH])
+                                   == 0)
+                                       goto moveit;
+                               else {
+                                       /* otherwise just free the buffer */
+                                       buf_release(b);
+                                       if (b->b_vp) {
+                                               RB_REMOVE(buf_rb_bufs,
+                                                   &b->b_vp->v_bufs_tree, b);
+                                               brelvp(b);
+                                       }
+                                       buf_put(b);
+                               }
+                       }
+                       b = bn;
+               }
+       }
+       wakeup(&needda);
+       splx(s);
+       if (bcstats.dmapages > newdmapages)
+               return(-1);
+       else
+               return(0);
+}
+
+
 #ifdef DDB
 void   bcstats_print(int (*)(const char *, ...) /* __attribute__((__format__(__kprintf__,1,2))) */);
 /*
@@ -1252,8 +1414,8 @@ bcstats_print(
            bcstats.numbufs, bcstats.busymapped, bcstats.delwribufs);
        (*pr)("kvaslots %lld avail kva slots %lld\n",
            bcstats.kvaslots, bcstats.kvaslots_avail);
-       (*pr)("bufpages %lld, dirtypages %lld\n",
-           bcstats.numbufpages,  bcstats.numdirtypages);
+       (*pr)("total bufpages %lld, dmapages %lld, dirtypages %lld\n",
+           bcstats.numbufpages, bcstats.dmapages, bcstats.numdirtypages);
        (*pr)("pendingreads %lld, pendingwrites %lld\n",
            bcstats.pendingreads, bcstats.pendingwrites);
 }
Index: sys/kern/vfs_biomem.c
===================================================================
RCS file: /cvs/src/sys/kern/vfs_biomem.c,v
retrieving revision 1.23
diff -u -p -r1.23 vfs_biomem.c
--- sys/kern/vfs_biomem.c       18 Jan 2013 10:07:37 -0000      1.23
+++ sys/kern/vfs_biomem.c       3 Jun 2013 14:51:14 -0000
@@ -1,6 +1,7 @@
 /*     $OpenBSD: vfs_biomem.c,v 1.23 2013/01/18 10:07:37 beck Exp $ */
 /*
  * Copyright (c) 2007 Artur Grabowski <a...@openbsd.org>
+ * Copyright (c) 2012,2013 Bob Beck <b...@openbsd.org>
  *
  * Permission to use, copy, modify, and distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
@@ -267,6 +268,7 @@ void
 buf_alloc_pages(struct buf *bp, vsize_t size)
 {
        voff_t offs;
+       int i;
 
        KASSERT(size == round_page(size));
        KASSERT(bp->b_pobj == NULL);
@@ -278,8 +280,18 @@ buf_alloc_pages(struct buf *bp, vsize_t 
 
        KASSERT(buf_page_offset > 0);
 
-       uvm_pagealloc_multi(buf_object, offs, size, UVM_PLA_WAITOK);
+       do {
+               i = uvm_pagealloc_multi(buf_object, offs, size,
+                   UVM_PLA_NOWAIT);
+               if (i == 0)
+                       break;
+       } while (bufbackoff(&dma_constraint, 100) == 0);
+       if (i != 0)
+               i = uvm_pagealloc_multi(buf_object, offs, size,
+                   UVM_PLA_WAITOK);
        bcstats.numbufpages += atop(size);
+       bcstats.dmapages += atop(size);
+       SET(bp->b_flags, B_DMA);
        bp->b_pobj = buf_object;
        bp->b_poffs = offs;
        bp->b_bufsize = size;
@@ -307,10 +319,68 @@ buf_free_pages(struct buf *bp)
                pg->wire_count = 0;
                uvm_pagefree(pg);
                bcstats.numbufpages--;
+               if (ISSET(bp->b_flags, B_DMA))
+                       bcstats.dmapages--;
        }
+       CLR(bp->b_flags, B_DMA);
 }
 
-/*
- * XXX - it might make sense to make a buf_realloc_pages to avoid
- *       bouncing through the free list all the time.
- */
+/* Reallocate a buf into a particular pmem range specified by "where". */
+int
+buf_realloc_pages(struct buf *bp, struct uvm_constraint_range *where,
+    int flags)
+{
+       vaddr_t va;
+       int dma;
+       int i, r;
+       KASSERT(!(flags & UVM_PLA_WAITOK) ^ !(flags & UVM_PLA_NOWAIT));
+
+       splassert(IPL_BIO);
+       KASSERT(ISSET(bp->b_flags, B_BUSY));
+       dma = ISSET(bp->b_flags, B_DMA);
+
+       /* if the original buf is mapped, unmap it */
+       if (bp->b_data != NULL) {
+               va = (vaddr_t)bp->b_data;
+               pmap_kremove(va, bp->b_bufsize);
+               pmap_update(pmap_kernel());
+       }
+
+       r = 0;
+       do {
+               r = uvm_pagerealloc_multi(bp->b_pobj, bp->b_poffs,
+                   bp->b_bufsize, UVM_PLA_NOWAIT, where);
+               if (r == 0)
+                       break;
+       } while ((bufbackoff(where, 100) == 0) && (flags & UVM_PLA_WAITOK));
+       if (r != 0 && !(flags & UVM_PLA_NOWAIT))
+               r = uvm_pagerealloc_multi(bp->b_pobj, bp->b_poffs,
+                   bp->b_bufsize, flags, where);
+
+       /*
+        * do this now, and put it back later when we know where we are
+        */
+       if (dma)
+               bcstats.dmapages -= atop(bp->b_bufsize);
+
+       dma = 1;
+       /* if the original buf was mapped, re-map it */
+       for (i = 0; i < atop(bp->b_bufsize); i++) {
+               struct vm_page *pg = uvm_pagelookup(bp->b_pobj,
+                   bp->b_poffs + ptoa(i));
+               KASSERT(pg != NULL);
+               if  (!PADDR_IS_DMA_REACHABLE(VM_PAGE_TO_PHYS(pg)))
+                       dma = 0;
+               if (bp->b_data != NULL) {
+                       pmap_kenter_pa(va + ptoa(i), VM_PAGE_TO_PHYS(pg),
+                           VM_PROT_READ|VM_PROT_WRITE);
+                       pmap_update(pmap_kernel());
+               }
+       }
+       if (dma) {
+               SET(bp->b_flags, B_DMA);
+               bcstats.dmapages += atop(bp->b_bufsize);
+       } else
+               CLR(bp->b_flags, B_DMA);
+       return(r);
+}
Index: sys/kern/vfs_vops.c
===================================================================
RCS file: /cvs/src/sys/kern/vfs_vops.c,v
retrieving revision 1.5
diff -u -p -r1.5 vfs_vops.c
--- sys/kern/vfs_vops.c 28 Mar 2013 02:08:39 -0000      1.5
+++ sys/kern/vfs_vops.c 3 Jun 2013 14:51:14 -0000
@@ -633,6 +633,11 @@ VOP_STRATEGY(struct buf *bp)
 
        if (bp->b_vp->v_op->vop_strategy == NULL)
                return (EOPNOTSUPP);
+       /*
+        * Flip buffer to dma reachable memory if necessary.
+        */
+       if (ISSET(bp->b_flags, B_BC))
+               buf_dma(bp);
 
        return ((bp->b_vp->v_op->vop_strategy)(&a));
 }
Index: sys/sys/buf.h
===================================================================
RCS file: /cvs/src/sys/sys/buf.h,v
retrieving revision 1.84
diff -u -p -r1.84 buf.h
--- sys/sys/buf.h       24 Mar 2013 17:42:43 -0000      1.84
+++ sys/sys/buf.h       3 Jun 2013 14:51:14 -0000
@@ -234,12 +234,14 @@ struct buf {
 #define        B_SCANNED       0x00100000      /* Block already pushed during sync */
 #define        B_PDAEMON       0x00200000      /* I/O started by pagedaemon */
 #define        B_RELEASED      0x00400000      /* free this buffer after its kvm */
+#define B_BC           0x00800000      /* Managed by the Buffer Cache. */
+#define B_DMA          0x01000000      /* DMA reachable. */
 
 #define        B_BITS  "\20\001AGE\002NEEDCOMMIT\003ASYNC\004BAD\005BUSY" \
     "\006CACHE\007CALL\010DELWRI\011DONE\012EINTR\013ERROR" \
     "\014INVAL\015NOCACHE\016PHYS\017RAW\020READ" \
     "\021WANTED\022WRITEINPROG\023XXX(FORMAT)\024DEFERRED" \
-    "\025SCANNED\026DAEMON\027RELEASED"
+    "\025SCANNED\026DAEMON\027RELEASED\030BC\031DMA"
 
 /*
  * This structure describes a clustered I/O.  It is stored in the b_saveaddr
@@ -305,6 +307,7 @@ void        bremfree(struct buf *);
 void   bufinit(void);
 void   buf_dirty(struct buf *);
 void    buf_undirty(struct buf *);
+void   buf_dma(struct buf *);
 int    bwrite(struct buf *);
 struct buf *getblk(struct vnode *, daddr64_t, int, int, int);
 struct buf *geteblk(int);
@@ -328,7 +331,8 @@ int buf_dealloc_mem(struct buf *);
 void   buf_fix_mapping(struct buf *, vsize_t);
 void   buf_alloc_pages(struct buf *, vsize_t);
 void   buf_free_pages(struct buf *);
-
+struct uvm_constraint_range;
+int    buf_realloc_pages(struct buf *, struct uvm_constraint_range *, int);
 
 void   minphys(struct buf *bp);
 int    physio(void (*strategy)(struct buf *), dev_t dev, int flags,
Index: sys/sys/mount.h
===================================================================
RCS file: /cvs/src/sys/sys/mount.h,v
retrieving revision 1.111
diff -u -p -r1.111 mount.h
--- sys/sys/mount.h     3 Jun 2013 15:56:01 -0000       1.111
+++ sys/sys/mount.h     8 Jun 2013 17:35:56 -0000
@@ -528,6 +528,7 @@ extern long buflowpages, bufhighpages, b
 #define BUFPAGES_INACT (((bcstats.numcleanpages - buflowpages) < 0) ? 0 \
     : bcstats.numcleanpages - buflowpages)
 extern int bufcachepercent;
+extern int bufhigh(int);
 extern void bufadjust(int);
 struct uvm_constraint_range;
 extern int bufbackoff(struct uvm_constraint_range*, long);
Index: sys/uvm/uvm_extern.h
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_extern.h,v
retrieving revision 1.107
diff -u -p -r1.107 uvm_extern.h
--- sys/uvm/uvm_extern.h        23 May 2013 01:42:59 -0000      1.107
+++ sys/uvm/uvm_extern.h        8 Jun 2013 17:35:57 -0000
@@ -680,11 +680,11 @@ struct vm_page            *uvm_pagealloc(struct uv
                                voff_t, struct vm_anon *, int);
 vaddr_t                        uvm_pagealloc_contig(vaddr_t, vaddr_t,
                                vaddr_t, vaddr_t);
-void                   uvm_pagealloc_multi(struct uvm_object *, voff_t,
+int                    uvm_pagealloc_multi(struct uvm_object *, voff_t,
                            vsize_t, int);
 void                   uvm_pagerealloc(struct vm_page *, 
                                             struct uvm_object *, voff_t);
-void                   uvm_pagerealloc_multi(struct uvm_object *, voff_t,
+int                    uvm_pagerealloc_multi(struct uvm_object *, voff_t,
                            vsize_t, int, struct uvm_constraint_range *);
 /* Actually, uvm_page_physload takes PF#s which need their own type */
 void                   uvm_page_physload(paddr_t, paddr_t, paddr_t,
Index: sys/uvm/uvm_page.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_page.c,v
retrieving revision 1.125
diff -u -p -r1.125 uvm_page.c
--- sys/uvm/uvm_page.c  30 May 2013 16:29:46 -0000      1.125
+++ sys/uvm/uvm_page.c  8 Jun 2013 17:35:57 -0000
@@ -876,19 +876,21 @@ uvm_pglistfree(struct pglist *list)
  * interface used by the buffer cache to allocate a buffer at a time.
  * The pages are allocated wired in DMA accessible memory
  */
-void
+int
 uvm_pagealloc_multi(struct uvm_object *obj, voff_t off, vsize_t size,
     int flags)
 {
        struct pglist    plist;
        struct vm_page  *pg;
-       int              i;
+       int              i, r;
 
 
        TAILQ_INIT(&plist);
-       (void) uvm_pglistalloc(size, dma_constraint.ucr_low,
+       r = uvm_pglistalloc(size, dma_constraint.ucr_low,
            dma_constraint.ucr_high, 0, 0, &plist, atop(round_page(size)),
-           UVM_PLA_WAITOK);
+           flags);
+       if (r != 0)
+               return(r);
        i = 0;
        while ((pg = TAILQ_FIRST(&plist)) != NULL) {
                pg->wire_count = 1;
@@ -897,6 +899,7 @@ uvm_pagealloc_multi(struct uvm_object *o
                TAILQ_REMOVE(&plist, pg, pageq);
                uvm_pagealloc_pg(pg, obj, off + ptoa(i++), NULL);
        }
+       return(0);
 }
 
 /*
@@ -904,21 +907,23 @@ uvm_pagealloc_multi(struct uvm_object *o
  * The pages are reallocated wired outside the DMA accessible region.
  *
  */
-void
+int
 uvm_pagerealloc_multi(struct uvm_object *obj, voff_t off, vsize_t size,
     int flags, struct uvm_constraint_range *where)
 {
        struct pglist    plist;
        struct vm_page  *pg, *tpg;
-       int              i;
+       int              i,r;
        voff_t          offset;
 
 
        TAILQ_INIT(&plist);
        if (size == 0)
                panic("size 0 uvm_pagerealloc");
-       (void) uvm_pglistalloc(size, where->ucr_low, where->ucr_high, 0,
-           0, &plist, atop(round_page(size)), UVM_PLA_WAITOK);
+       r = uvm_pglistalloc(size, where->ucr_low, where->ucr_high, 0,
+           0, &plist, atop(round_page(size)), flags);
+       if (r != 0)
+               return(r);
        i = 0;
        while((pg = TAILQ_FIRST(&plist)) != NULL) {
                offset = off + ptoa(i++);
@@ -931,6 +936,7 @@ uvm_pagerealloc_multi(struct uvm_object 
                uvm_pagefree(tpg);
                uvm_pagealloc_pg(pg, obj, offset, NULL);
        }
+       return(0);
 }
 
 /*
Index: usr.bin/systat/iostat.c
===================================================================
RCS file: /cvs/src/usr.bin/systat/iostat.c,v
retrieving revision 1.40
diff -u -p -r1.40 iostat.c
--- usr.bin/systat/iostat.c     19 Sep 2011 14:48:04 -0000      1.40
+++ usr.bin/systat/iostat.c     18 Mar 2013 22:29:29 -0000
@@ -222,6 +222,10 @@ showbcache(void)
        print_fld_ssize(FLD_IO_SVAL, bccur.numbufpages);
        end_line();
 
+       print_fld_str(FLD_IO_SSTR, "dma pages");
+       print_fld_ssize(FLD_IO_SVAL, bccur.dmapages);
+       end_line();
+
        print_fld_str(FLD_IO_SSTR, "dirty pages");
        print_fld_ssize(FLD_IO_SVAL, bccur.numdirtypages);
        end_line();
