Here's a new version of the buffer flipper that fixes a problem found by
krw@. All comments from before still apply (plus a short note for testers
after the diff):
> You too can have a GIANT buffer cache.... etc. etc...
>
> After much bug fighting in the midlayer and now uvm over the last 6
> months in a number of places, I think it's about time to shop this
> around again.
>
> This will only make a difference on amd64 - if you have 4 GB or more
> of RAM. What it does is allow the high (non-DMA reachable) memory to
> be used for buffer cache pages. It will use your configured buffer
> cache percentage of both dma'able and above-dma'able pages for the
> cache, migrating the oldest cache pages into high memory. Pages
> are flipped back into dma'able memory if they are needed for IO.
>
> Notwithstanding that it only "matters" on amd64, it does change how
> the world works a bit, and therefore requires testing everywhere. It
> has survived multiple make build/make release test cycles now on my
> machines (amd64, i386, zaurus, sparc, sparc64, hppa) (with various
> settings of bufcachepercent) and is running on my NFS server
> (bufcachepercent=90) without any complaints throughout that - it's
> been running on my laptop for a long time now.
>
> If you try it and have troubles (i.e. any new regressions), please
> ensure you have your machine's console accessible (check to see if you
> have ddb.console=1 in /etc/sysctl.conf) and if you have problems
> please try to get
>
> trace
> ps
> show bcstats
> show uvm
>
> from ddb if at all possible.
>
> Please let me know how you do with it, and most importantly what
> you try it on/with.
> -Bob

(diff also in ~beck/viagra.diff14 on cvs)

Index: sys/kern/kern_sysctl.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_sysctl.c,v
retrieving revision 1.234
diff -u -p -r1.234 kern_sysctl.c
--- sys/kern/kern_sysctl.c	6 Apr 2013 03:44:34 -0000	1.234
+++ sys/kern/kern_sysctl.c	3 Jun 2013 14:51:14 -0000
@@ -110,6 +110,7 @@ extern struct disklist_head disklist;
 extern fixpt_t ccpu;
 extern long numvnodes;
 extern u_int mcllivelocks;
+extern psize_t b_dmapages_total, b_highpages_total, b_dmamaxpages;
 
 extern void nmbclust_update(void);
 
@@ -564,8 +565,8 @@ kern_sysctl(int *name, u_int namelen, vo
 		return (sysctl_cptime2(name + 1, namelen -1, oldp, oldlenp,
 		    newp, newlen));
 	case KERN_CACHEPCT: {
-		u_int64_t dmapages;
-		int opct, pgs;
+		psize_t pgs;
+		int opct;
 
 		opct = bufcachepercent;
 		error = sysctl_int(oldp, oldlenp, newp, newlen,
 		    &bufcachepercent);
@@ -575,9 +576,11 @@ kern_sysctl(int *name, u_int namelen, vo
 			bufcachepercent = opct;
 			return (EINVAL);
 		}
-		dmapages = uvm_pagecount(&dma_constraint);
 		if (bufcachepercent != opct) {
-			pgs = bufcachepercent * dmapages / 100;
+			pgs = (b_highpages_total + b_dmapages_total)
+			    * bufcachepercent / 100;
+			b_dmamaxpages = b_dmapages_total * bufcachepercent
+			    / 100;
 			bufadjust(pgs); /* adjust bufpages */
 			bufhighpages = bufpages; /* set high water mark */
 		}
Index: sys/kern/spec_vnops.c
===================================================================
RCS file: /cvs/src/sys/kern/spec_vnops.c,v
retrieving revision 1.71
diff -u -p -r1.71 spec_vnops.c
--- sys/kern/spec_vnops.c	28 Mar 2013 03:29:44 -0000	1.71
+++ sys/kern/spec_vnops.c	3 Jun 2013 14:51:14 -0000
@@ -457,7 +457,9 @@ spec_strategy(void *v)
 	struct vop_strategy_args *ap = v;
 	struct buf *bp = ap->a_bp;
 	int maj = major(bp->b_dev);
-
+
+	if (!ISSET(bp->b_flags, B_DMA) && ISSET(bp->b_flags, B_BC))
+		panic("bogus buf %p passed to spec_strategy", bp);
 	if (LIST_FIRST(&bp->b_dep) != NULL)
 		buf_start(bp);
Index: sys/kern/vfs_bio.c
===================================================================
RCS file: /cvs/src/sys/kern/vfs_bio.c,v
retrieving revision 1.146
diff -u -p -r1.146 vfs_bio.c
--- sys/kern/vfs_bio.c	17 Feb 2013 17:39:29 -0000	1.146
+++ sys/kern/vfs_bio.c	3 Jun 2013 14:59:18 -0000
@@ -63,12 +63,17 @@
 /*
  * Definitions for the buffer free lists.
  */
-#define	BQUEUES		2		/* number of free buffer queues */
+#define	BQUEUES		3		/* number of free buffer queues */
 
 #define	BQ_DIRTY	0		/* LRU queue with dirty buffers */
-#define	BQ_CLEAN	1		/* LRU queue with clean buffers */
+#define	BQ_CLEANL	1		/* LRU queue with clean low buffers */
+#define	BQ_CLEANH	2		/* LRU queue with clean high buffers */
 
 TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES];
+int bfreeclean(int, struct bqueues *);
+struct uvm_constraint_range high_constraint;
+psize_t b_dmapages_total, b_highpages_total, b_dmamaxpages;
+int needda;
 int nobuffers;
 int needbuffer;
 struct bio_ops bioops;
@@ -110,30 +115,49 @@ bremfree(struct buf *bp)
 	struct bqueues *dp = NULL;
 
 	splassert(IPL_BIO);
 
+	KASSERT(ISSET(bp->b_flags, B_BC));
+	KASSERT(!ISSET(bp->b_flags, B_BUSY));
+	if (bp->b_freelist.tqe_next == NOLIST ||
+	    bp->b_freelist.tqe_next == (void *)-1)
+		panic("bremfree: buf %p not on a free list!", bp);
-	/*
-	 * We only calculate the head of the freelist when removing
-	 * the last element of the list as that is the only time that
-	 * it is needed (e.g. to reset the tail pointer).
-	 *
-	 * NB: This makes an assumption about how tailq's are implemented.
-	 */
-	if (TAILQ_NEXT(bp, b_freelist) == NULL) {
-		for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
-			if (dp->tqh_last == &TAILQ_NEXT(bp, b_freelist))
-				break;
-		if (dp == &bufqueues[BQUEUES])
-			panic("bremfree: lost tail");
-	}
 	if (!ISSET(bp->b_flags, B_DELWRI)) {
+		if (ISSET(bp->b_flags, B_DMA))
+			dp = &bufqueues[BQ_CLEANL];
+		else
+			dp = &bufqueues[BQ_CLEANH];
 		bcstats.numcleanpages -= atop(bp->b_bufsize);
 	} else {
+		dp = &bufqueues[BQ_DIRTY];
 		bcstats.numdirtypages -= atop(bp->b_bufsize);
 		bcstats.delwribufs--;
 	}
 	TAILQ_REMOVE(dp, bp, b_freelist);
 }
 
+int
+bfreeclean(int npages, struct bqueues *dp)
+{
+	struct buf *bp;
+	int i = 0;
+
+	splassert(IPL_BIO);
+	while (i < npages) {
+		bp = TAILQ_FIRST(dp);
+		if (bp == NULL)
+			return(-1);
+		i += atop(bp->b_bufsize);
+		bremfree(bp);
+		if (bp->b_vp) {
+			RB_REMOVE(buf_rb_bufs,
+			    &bp->b_vp->v_bufs_tree, bp);
+			brelvp(bp);
+		}
+		buf_put(bp);
+	}
+	return(0);
+}
+
 void
 buf_put(struct buf *bp)
 {
@@ -158,7 +182,7 @@ buf_put(struct buf *bp)
 
 	bcstats.numbufs--;
 
 	if (buf_dealloc_mem(bp) != 0)
-	 	return;
+		return;
 	pool_put(&bufpool, bp);
 }
 
@@ -168,12 +192,21 @@ buf_put(struct buf *bp)
 void
 bufinit(void)
 {
-	u_int64_t dmapages;
 	struct bqueues *dp;
 
-	dmapages = uvm_pagecount(&dma_constraint);
-	/* take away a guess at how much of this the kernel will consume */
-	dmapages -= (atop(physmem) - atop(uvmexp.free));
+	/* How much DMA accessible memory will we consider? */
+	b_dmapages_total = uvm_pagecount(&dma_constraint);
+	/* Take away a guess at how much of this the kernel will consume. */
+	b_dmapages_total -= (atop(physmem) - atop(uvmexp.free));
+
+	/* See if we have memory above the dma accessible region. */
+	high_constraint.ucr_low = dma_constraint.ucr_high;
+	high_constraint.ucr_high = no_constraint.ucr_high;
+	if (high_constraint.ucr_low != high_constraint.ucr_high) {
+		high_constraint.ucr_low++;
+		b_highpages_total = uvm_pagecount(&high_constraint);
+	} else
+		b_highpages_total = 0;
 
 	/*
 	 * If MD code doesn't say otherwise, use up to 10% of DMA'able
@@ -189,18 +222,18 @@ bufinit(void)
 	KASSERT(bufcachepercent <= 90);
 	KASSERT(bufcachepercent >= 5);
 	if (bufpages == 0)
-		bufpages = dmapages * bufcachepercent / 100;
+		bufpages = (b_dmapages_total + b_highpages_total)
+		    * bufcachepercent / 100;
 	if (bufpages < BCACHE_MIN)
 		bufpages = BCACHE_MIN;
-	KASSERT(bufpages < dmapages);
 
 	bufhighpages = bufpages;
-
+	b_dmamaxpages = b_dmapages_total * bufcachepercent / 100;
 
 	/*
 	 * Set the base backoff level for the buffer cache. We will
 	 * not allow uvm to steal back more than this number of pages.
 	 */
-	buflowpages = dmapages * 5 / 100;
+	buflowpages = b_dmapages_total * 5 / 100;
 	if (buflowpages < BCACHE_MIN)
 		buflowpages = BCACHE_MIN;
 
@@ -267,7 +300,6 @@ bufinit(void)
 void
 bufadjust(int newbufpages)
 {
-	struct buf *bp;
 	int s, growing = 0;
 
 	if (newbufpages < buflowpages)
@@ -290,15 +322,11 @@ bufadjust(int newbufpages)
 	 * If we have more buffers allocated than our new low water mark,
 	 * immediately free them.
 	 */
-	while (!growing && (bp = TAILQ_FIRST(&bufqueues[BQ_CLEAN])) &&
-	    (bcstats.numbufpages > lopages)) {
-		bremfree(bp);
-		if (bp->b_vp) {
-			RB_REMOVE(buf_rb_bufs,
-			    &bp->b_vp->v_bufs_tree, bp);
-			brelvp(bp);
-		}
-		buf_put(bp);
+	if (!growing && (bcstats.numbufpages > lopages)) {
+		if (bfreeclean(bcstats.numbufpages - lopages,
+		    &bufqueues[BQ_CLEANH]) != 0)
+			(void) bfreeclean(bcstats.numbufpages - lopages,
+			    &bufqueues[BQ_CLEANL]);
 	}
 
 	/*
@@ -321,8 +349,10 @@ bufbackoff(struct uvm_constraint_range *
 	/*
	 * Back off "size" buffer cache pages. Called by the page
 	 * daemon to consume buffer cache pages rather than scanning.
+	 * Also called by the buffer cache to back off if memory
+	 * allocation in a particular range fails.
 	 *
-	 * It returns 0 to the pagedaemon to indicate that it has
+	 * It returns 0 to the caller to indicate that it has
 	 * succeeded in freeing enough pages. It returns -1 to
 	 * indicate that it could not and the pagedaemon should take
 	 * other measures.
@@ -340,8 +370,23 @@ bufbackoff(struct uvm_constraint_range *
 		return(-1);
 	if (bufpages - pdelta < buflowpages)
 		pdelta = bufpages - buflowpages;
+	oldbufpages = bufpages;
-	bufadjust(bufpages - pdelta);
+	if (b_highpages_total
+	    && (range->ucr_high <= dma_constraint.ucr_high)) {
+		/*
+		 * Free up DMA accessible memory by moving pages to
+		 * the high range.
+		 */
+		if (bufhigh(pdelta) == 0)
+			return(0); /* we moved enough pages up high */
+		else {
+			bufadjust(bufpages - pdelta); /* shrink the cache. */
+		}
+	} else {
+		/* Free memory by shrinking the cache. */
+		bufadjust(bufpages - pdelta);
+	}
 	if (oldbufpages - bufpages < size)
 		return (-1); /* we did not free what we were asked */
 	else
@@ -526,12 +571,18 @@ bread_cluster(struct vnode *vp, daddr64_
 	for (i = 1; i < howmany; i++) {
 		bcstats.pendingreads++;
 		bcstats.numreads++;
-		SET(xbpp[i]->b_flags, B_READ | B_ASYNC);
+		/*
+		 * We set B_DMA here because bp above will be B_DMA,
+		 * and we are playing buffer slice-n-dice games from
+		 * the memory allocated in bp.
+		 */
+		SET(xbpp[i]->b_flags, B_DMA | B_READ | B_ASYNC);
 		xbpp[i]->b_blkno = sblkno + (i * inc);
 		xbpp[i]->b_bufsize = xbpp[i]->b_bcount = size;
 		xbpp[i]->b_data = NULL;
 		xbpp[i]->b_pobj = bp->b_pobj;
 		xbpp[i]->b_poffs = bp->b_poffs + (i * size);
+		buf_dma(xbpp[i]);
 	}
 
 	KASSERT(bp->b_lblkno == blkno + 1);
@@ -793,6 +844,8 @@ brelse(struct buf *bp)
 		CLR(bp->b_flags, B_WANTED);
 		wakeup(bp);
 	}
+	if (ISSET(bp->b_flags, B_DMA) && needda)
+		wakeup(&needda);
 	if (bp->b_vp != NULL)
 		RB_REMOVE(buf_rb_bufs, &bp->b_vp->v_bufs_tree,
 		    bp);
@@ -802,20 +855,26 @@ brelse(struct buf *bp)
 		}
 
 		bcstats.numcleanpages += atop(bp->b_bufsize);
-		binsheadfree(bp, &bufqueues[BQ_CLEAN]);
+		if (ISSET(bp->b_flags, B_DMA))
+			binsheadfree(bp, &bufqueues[BQ_CLEANL]);
+		else
+			binsheadfree(bp, &bufqueues[BQ_CLEANH]);
 	} else {
 		/*
 		 * It has valid data.  Put it on the end of the appropriate
 		 * queue, so that it'll stick around for as long as possible.
 		 */
-		if (!ISSET(bp->b_flags, B_DELWRI)) {
-			bcstats.numcleanpages += atop(bp->b_bufsize);
-			bufq = &bufqueues[BQ_CLEAN];
-		} else {
+		if (ISSET(bp->b_flags, B_DELWRI)) {
 			bcstats.numdirtypages += atop(bp->b_bufsize);
 			bcstats.delwribufs++;
 			bufq = &bufqueues[BQ_DIRTY];
+		} else {
+			bcstats.numcleanpages += atop(bp->b_bufsize);
+			if (ISSET(bp->b_flags, B_DMA))
+				bufq = &bufqueues[BQ_CLEANL];
+			else
+				bufq = &bufqueues[BQ_CLEANH];
 		}
 		if (ISSET(bp->b_flags, B_AGE)) {
 			binsheadfree(bp, bufq);
@@ -836,6 +895,10 @@ brelse(struct buf *bp)
 		wakeup(&nobuffers);
 	}
 
+	if (ISSET(bp->b_flags, B_DMA) && needda) {
+		wakeup(&needda);
+	}
+
 	/* Wake up any processes waiting for any buffer to become free. */
 	if (needbuffer && bcstats.numbufpages < hipages &&
 	    bcstats.kvaslots_avail > RESERVE_SLOTS) {
@@ -987,18 +1050,17 @@ buf_get(struct vnode *vp, daddr64_t blkn
 		 * free down to the low water mark.
 		 */
 		if (bcstats.numbufpages + npages > hipages) {
-			while ((bcstats.numbufpages > lopages) &&
-			    (bp = TAILQ_FIRST(&bufqueues[BQ_CLEAN]))) {
-				bremfree(bp);
-				if (bp->b_vp) {
-					RB_REMOVE(buf_rb_bufs,
-					    &bp->b_vp->v_bufs_tree, bp);
-					brelvp(bp);
-				}
-				buf_put(bp);
-			}
+			if (bfreeclean(bcstats.numbufpages - lopages,
+			    &bufqueues[BQ_CLEANH]) != 0)
+				(void) bfreeclean(bcstats.numbufpages
+				    - lopages, &bufqueues[BQ_CLEANL]);
 		}
+
+		if (b_highpages_total && bcstats.dmapages + npages >
+		    b_dmamaxpages)
+			bufhigh(bcstats.dmapages + npages - b_dmamaxpages);
+
 		/*
 		 * If we get here, we tried to free the world down
 		 * above, and couldn't get down - Wake the cleaner
@@ -1029,6 +1091,8 @@ buf_get(struct vnode *vp, daddr64_t blkn
 		return (NULL);
 	}
 
+	/* Mark buffer as the cache's */
+	SET(bp->b_flags, B_BC);
 	bp->b_freelist.tqe_next = NOLIST;
 	bp->b_synctime = time_uptime + 300;
 	bp->b_dev = NODEV;
@@ -1068,6 +1132,7 @@ buf_get(struct vnode *vp, daddr64_t blkn
 	if (size) {
 		buf_alloc_pages(bp, round_page(size));
 		buf_map(bp);
+		buf_dma(bp);
 	}
 
 	splx(s);
@@ -1238,6 +1303,128 @@ biodone(struct buf *bp)
 	}
 }
 
+/*
+ * Ensure buffer is DMA reachable
+ */
+void
+buf_dma(struct buf *buf)
+{
+	struct buf *b;
+	int s;
+
+start:
+	KASSERT(ISSET(buf->b_flags, B_BC));
+	KASSERT(ISSET(buf->b_flags, B_BUSY));
+	KASSERT(buf->b_pobj != NULL);
+	s = splbio();
+	/*
+	 * If we are adding to the queue, and we are not the cleaner or
+	 * the syncer, ensure we free down below the max
+	 */
+	while (b_highpages_total &&
+	    curproc != syncerproc && curproc != cleanerproc &&
+	    (!ISSET(buf->b_flags, B_DMA)) &&
+	    (bcstats.dmapages > (b_dmamaxpages - atop(buf->b_bufsize)))) {
+		b = TAILQ_FIRST(&bufqueues[BQ_CLEANL]);
+		if (b == NULL) {
+			/* no non-busy buffers. */
+			needda++;
+			tsleep(&needda, PRIBIO, "needda", 0);
+			needda--;
+			splx(s);
+			goto start;
+		} else {
+			KASSERT(!ISSET(b->b_flags, B_BUSY));
+			bremfree(b);
+			buf_acquire_nomap(b);
+			if (buf_realloc_pages(b, &high_constraint,
+			    UVM_PLA_NOWAIT) == 0) {
+				/* move the buffer to high memory if we can */
+				if (ISSET(b->b_flags, B_DMA))
+					panic("B_DMA after high flip %p", b);
+				binstailfree(b, &bufqueues[BQ_CLEANH]);
+				buf_release(b);
+			} else {
+				/* otherwise just free the buffer */
+				buf_release(b);
+				if (b->b_vp) {
+					RB_REMOVE(buf_rb_bufs,
+					    &b->b_vp->v_bufs_tree, b);
+					brelvp(b);
+				}
+				buf_put(b);
+			}
+		}
+	}
+	if (!ISSET(buf->b_flags, B_DMA)) {
+		/* move buf to dma reachable memory */
+		(void) buf_realloc_pages(buf, &dma_constraint, UVM_PLA_WAITOK);
+		if (!ISSET(buf->b_flags, B_DMA))
+			panic("non-dma buffer after dma move %p\n", buf);
+	}
+	splx(s);
+	return;
+}
+
+/*
+ * Attempt to flip "delta" dma reachable cache pages high. Return 0 if we can,
+ * -1 otherwise.
+ */
+int
+bufhigh(int delta)
+{
+	psize_t newdmapages;
+	struct buf *b, *bn;
+	int s;
+
+	if (!b_highpages_total)
+		return(-1);
+	s = splbio();
+	newdmapages = bcstats.dmapages - delta;
+	b = TAILQ_FIRST(&bufqueues[BQ_CLEANL]);
+	while ((bcstats.dmapages > newdmapages) && (b != NULL)) {
+		while (b != NULL && ISSET(b->b_flags, B_BUSY)) {
+			b = TAILQ_NEXT(b, b_freelist);
+		}
+		if (b != NULL) {
+			bn = TAILQ_NEXT(b, b_freelist);
+			bremfree(b);
+			buf_acquire_nomap(b);
+moveit:
+			if (buf_realloc_pages(b, &high_constraint,
+			    UVM_PLA_NOWAIT) == 0) {
+				/* move the buffer to high memory if we can */
+				if (ISSET(b->b_flags, B_DMA))
+					panic("B_DMA after high flip %p", b);
+				binstailfree(b, &bufqueues[BQ_CLEANH]);
+				buf_release(b);
+			} else {
+				/* free up some high memory and try again. */
+				if (bfreeclean(delta, &bufqueues[BQ_CLEANH])
+				    == 0)
+					goto moveit;
+				else {
+					/* otherwise just free the buffer */
+					buf_release(b);
+					if (b->b_vp) {
+						RB_REMOVE(buf_rb_bufs,
+						    &b->b_vp->v_bufs_tree, b);
+						brelvp(b);
+					}
+					buf_put(b);
+				}
+			}
+			b = bn;
+		}
+	}
+	wakeup(&needda);
+	splx(s);
+	if (bcstats.dmapages > newdmapages)
+		return(-1);
+	else
+		return(0);
+}
+
+
 #ifdef DDB
 void	bcstats_print(int (*)(const char *, ...)
     /* __attribute__((__format__(__kprintf__,1,2))) */);
 /*
@@ -1252,8 +1439,8 @@ bcstats_print(
 	    bcstats.numbufs, bcstats.busymapped, bcstats.delwribufs);
 	(*pr)("kvaslots %lld avail kva slots %lld\n",
 	    bcstats.kvaslots, bcstats.kvaslots_avail);
-	(*pr)("bufpages %lld, dirtypages %lld\n",
-	    bcstats.numbufpages, bcstats.numdirtypages);
+	(*pr)("total bufpages %lld, dmapages %lld, dirtypages %lld\n",
+	    bcstats.numbufpages, bcstats.dmapages, bcstats.numdirtypages);
 	(*pr)("pendingreads %lld, pendingwrites %lld\n",
 	    bcstats.pendingreads, bcstats.pendingwrites);
 }
Index: sys/kern/vfs_biomem.c
===================================================================
RCS file: /cvs/src/sys/kern/vfs_biomem.c,v
retrieving revision 1.23
diff -u -p -r1.23 vfs_biomem.c
--- sys/kern/vfs_biomem.c	18 Jan 2013 10:07:37 -0000	1.23
+++ sys/kern/vfs_biomem.c	3 Jun 2013 14:51:14 -0000
@@ -1,6 +1,7 @@
 /*	$OpenBSD: vfs_biomem.c,v 1.23 2013/01/18 10:07:37 beck Exp $ */
 /*
  * Copyright (c) 2007 Artur Grabowski <a...@openbsd.org>
+ * Copyright (c) 2012,2013 Bob Beck <b...@openbsd.org>
  *
  * Permission to use, copy, modify, and distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
@@ -267,6 +268,7 @@ void
 buf_alloc_pages(struct buf *bp, vsize_t size)
 {
 	voff_t offs;
+	int i;
 
 	KASSERT(size == round_page(size));
 	KASSERT(bp->b_pobj == NULL);
@@ -278,8 +280,18 @@ buf_alloc_pages(struct buf *bp, vsize_t
 	KASSERT(buf_page_offset > 0);
 
-	uvm_pagealloc_multi(buf_object, offs, size, UVM_PLA_WAITOK);
+	do {
+		i = uvm_pagealloc_multi(buf_object, offs, size,
+		    UVM_PLA_NOWAIT);
+		if (i == 0)
+			break;
+	} while (bufbackoff(&dma_constraint, 100) == 0);
+	if (i != 0)
+		i = uvm_pagealloc_multi(buf_object, offs, size,
+		    UVM_PLA_WAITOK);
 	bcstats.numbufpages += atop(size);
+	bcstats.dmapages += atop(size);
+	SET(bp->b_flags, B_DMA);
 	bp->b_pobj = buf_object;
 	bp->b_poffs = offs;
 	bp->b_bufsize = size;
@@ -307,10 +319,68 @@ buf_free_pages(struct buf *bp)
 		pg->wire_count = 0;
 		uvm_pagefree(pg);
 		bcstats.numbufpages--;
+		if (ISSET(bp->b_flags, B_DMA))
+			bcstats.dmapages--;
 	}
+	CLR(bp->b_flags, B_DMA);
 }
 
-/*
- * XXX - it might make sense to make a buf_realloc_pages to avoid
- * bouncing through the free list all the time.
- */
+/* Reallocate a buf into a particular pmem range specified by "where". */
+int
+buf_realloc_pages(struct buf *bp, struct uvm_constraint_range *where,
+    int flags)
+{
+	vaddr_t va;
+	int dma;
+	int i, r;
+
+	KASSERT(!(flags & UVM_PLA_WAITOK) ^ !(flags & UVM_PLA_NOWAIT));
+	splassert(IPL_BIO);
+	KASSERT(ISSET(bp->b_flags, B_BUSY));
+	dma = ISSET(bp->b_flags, B_DMA);
+
+	/* if the original buf is mapped, unmap it */
+	if (bp->b_data != NULL) {
+		va = (vaddr_t)bp->b_data;
+		pmap_kremove(va, bp->b_bufsize);
+		pmap_update(pmap_kernel());
+	}
+
+	r = 0;
+	do {
+		r = uvm_pagerealloc_multi(bp->b_pobj, bp->b_poffs,
+		    bp->b_bufsize, UVM_PLA_NOWAIT, where);
+		if (r == 0)
+			break;
+	} while ((bufbackoff(where, 100) == 0) && (flags & UVM_PLA_WAITOK));
+	if (r != 0 && !(flags & UVM_PLA_NOWAIT))
+		r = uvm_pagerealloc_multi(bp->b_pobj, bp->b_poffs,
+		    bp->b_bufsize, flags, where);
+
+	/*
+	 * do this now, and put it back later when we know where we are
+	 */
+	if (dma)
+		bcstats.dmapages -= atop(bp->b_bufsize);
+
+	dma = 1;
+	/* if the original buf was mapped, re-map it */
+	for (i = 0; i < atop(bp->b_bufsize); i++) {
+		struct vm_page *pg = uvm_pagelookup(bp->b_pobj,
+		    bp->b_poffs + ptoa(i));
+		KASSERT(pg != NULL);
+		if (!PADDR_IS_DMA_REACHABLE(VM_PAGE_TO_PHYS(pg)))
+			dma = 0;
+		if (bp->b_data != NULL) {
+			pmap_kenter_pa(va + ptoa(i), VM_PAGE_TO_PHYS(pg),
+			    VM_PROT_READ|VM_PROT_WRITE);
+			pmap_update(pmap_kernel());
+		}
+	}
+	if (dma) {
+		SET(bp->b_flags, B_DMA);
+		bcstats.dmapages += atop(bp->b_bufsize);
+	} else
+		CLR(bp->b_flags, B_DMA);
+	return(r);
+}
Index: sys/kern/vfs_vops.c
===================================================================
RCS file: /cvs/src/sys/kern/vfs_vops.c,v
retrieving revision 1.5
diff -u -p -r1.5 vfs_vops.c
--- sys/kern/vfs_vops.c	28 Mar 2013 02:08:39 -0000	1.5
+++ sys/kern/vfs_vops.c	3 Jun 2013 14:51:14 -0000
@@ -633,6 +633,11 @@ VOP_STRATEGY(struct buf *bp)
 	if (bp->b_vp->v_op->vop_strategy == NULL)
 		return (EOPNOTSUPP);
 
+	/*
+	 * Flip buffer to dma reachable memory if necessary.
+	 */
+	if (ISSET(bp->b_flags, B_BC))
+		buf_dma(bp);
 	return ((bp->b_vp->v_op->vop_strategy)(&a));
 }
Index: sys/sys/buf.h
===================================================================
RCS file: /cvs/src/sys/sys/buf.h,v
retrieving revision 1.84
diff -u -p -r1.84 buf.h
--- sys/sys/buf.h	24 Mar 2013 17:42:43 -0000	1.84
+++ sys/sys/buf.h	3 Jun 2013 14:51:14 -0000
@@ -234,12 +234,14 @@ struct buf {
 #define	B_SCANNED	0x00100000	/* Block already pushed during sync */
 #define	B_PDAEMON	0x00200000	/* I/O started by pagedaemon */
 #define	B_RELEASED	0x00400000	/* free this buffer after its kvm */
+#define	B_BC		0x00800000	/* Managed by the Buffer Cache. */
+#define	B_DMA		0x01000000	/* DMA reachable. */
 
 #define	B_BITS	"\20\001AGE\002NEEDCOMMIT\003ASYNC\004BAD\005BUSY" \
     "\006CACHE\007CALL\010DELWRI\011DONE\012EINTR\013ERROR" \
     "\014INVAL\015NOCACHE\016PHYS\017RAW\020READ" \
     "\021WANTED\022WRITEINPROG\023XXX(FORMAT)\024DEFERRED" \
-    "\025SCANNED\026DAEMON\027RELEASED"
+    "\025SCANNED\026DAEMON\027RELEASED\030BC\031DMA"
 
 /*
  * This structure describes a clustered I/O.  It is stored in the b_saveaddr
@@ -305,6 +307,7 @@ void	bremfree(struct buf *);
 void	bufinit(void);
 void	buf_dirty(struct buf *);
 void	buf_undirty(struct buf *);
+void	buf_dma(struct buf *);
 int	bwrite(struct buf *);
 struct buf *getblk(struct vnode *, daddr64_t, int, int, int);
 struct buf *geteblk(int);
@@ -328,7 +331,8 @@ int	buf_dealloc_mem(struct buf *);
 void	buf_fix_mapping(struct buf *, vsize_t);
 void	buf_alloc_pages(struct buf *, vsize_t);
 void	buf_free_pages(struct buf *);
-
+struct	uvm_constraint_range;
+int	buf_realloc_pages(struct buf *, struct uvm_constraint_range *, int);
 void	minphys(struct buf *bp);
 int	physio(void (*strategy)(struct buf *), dev_t dev, int flags,
Index: sys/sys/mount.h
===================================================================
RCS file: /cvs/src/sys/sys/mount.h,v
retrieving revision 1.109
diff -u -p -r1.109 mount.h
--- sys/sys/mount.h	15 Apr 2013 15:32:19 -0000	1.109
+++ sys/sys/mount.h	3 Jun 2013 14:51:14 -0000
@@ -518,6 +518,7 @@ extern long buflowpages, bufhighpages, b
 #define BUFPAGES_INACT (((bcstats.numcleanpages - buflowpages) < 0) ? 0 \
     : bcstats.numcleanpages - buflowpages)
 extern int bufcachepercent;
+extern int bufhigh(int);
 extern void bufadjust(int);
 struct uvm_constraint_range;
 extern int bufbackoff(struct uvm_constraint_range*, long);
Index: sys/uvm/uvm_extern.h
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_extern.h,v
retrieving revision 1.104
diff -u -p -r1.104 uvm_extern.h
--- sys/uvm/uvm_extern.h	9 Mar 2012 13:01:29 -0000	1.104
+++ sys/uvm/uvm_extern.h	3 Jun 2013 14:51:14 -0000
@@ -681,11 +681,11 @@ struct vm_page		*uvm_pagealloc(struct uv
 				voff_t, struct vm_anon *, int);
 vaddr_t			uvm_pagealloc_contig(vaddr_t, vaddr_t,
 				vaddr_t, vaddr_t);
-void			uvm_pagealloc_multi(struct uvm_object *, voff_t,
+int			uvm_pagealloc_multi(struct uvm_object *, voff_t,
 				vsize_t, int);
 void			uvm_pagerealloc(struct vm_page *,
 				struct uvm_object *, voff_t);
-void			uvm_pagerealloc_multi(struct uvm_object *, voff_t,
+int			uvm_pagerealloc_multi(struct uvm_object *, voff_t,
 				vsize_t, int, struct uvm_constraint_range *);
 /* Actually, uvm_page_physload takes PF#s which need their own type */
 void			uvm_page_physload(paddr_t, paddr_t, paddr_t,
Index: sys/uvm/uvm_page.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_page.c,v
retrieving revision 1.123
diff -u -p -r1.123 uvm_page.c
--- sys/uvm/uvm_page.c	27 Mar 2013 02:02:23 -0000	1.123
+++ sys/uvm/uvm_page.c	3 Jun 2013 14:51:14 -0000
@@ -879,19 +879,21 @@ uvm_pglistfree(struct pglist *list)
  * interface used by the buffer cache to allocate a buffer at a time.
  * The pages are allocated wired in DMA accessible memory
  */
-void
+int
 uvm_pagealloc_multi(struct uvm_object *obj, voff_t off, vsize_t size,
     int flags)
 {
 	struct pglist    plist;
 	struct vm_page  *pg;
-	int              i;
+	int              i, r;
 
 	TAILQ_INIT(&plist);
-	(void) uvm_pglistalloc(size, dma_constraint.ucr_low,
+	r = uvm_pglistalloc(size, dma_constraint.ucr_low,
 	    dma_constraint.ucr_high, 0, 0, &plist, atop(round_page(size)),
-	    UVM_PLA_WAITOK);
+	    flags);
+	if (r != 0)
+		return(r);
 	i = 0;
 	while ((pg = TAILQ_FIRST(&plist)) != NULL) {
 		pg->wire_count = 1;
@@ -900,6 +902,7 @@ uvm_pagealloc_multi(struct uvm_object *o
 		TAILQ_REMOVE(&plist, pg, pageq);
 		uvm_pagealloc_pg(pg, obj, off + ptoa(i++), NULL);
 	}
+	return(0);
 }
 
 /*
@@ -907,21 +910,23 @@ uvm_pagealloc_multi(struct uvm_object *o
  * The pages are reallocated wired outside the DMA accessible region.
* */ -void +int uvm_pagerealloc_multi(struct uvm_object *obj, voff_t off, vsize_t size, int flags, struct uvm_constraint_range *where) { struct pglist plist; struct vm_page *pg, *tpg; - int i; + int i,r; voff_t offset; TAILQ_INIT(&plist); if (size == 0) panic("size 0 uvm_pagerealloc"); - (void) uvm_pglistalloc(size, where->ucr_low, where->ucr_high, 0, - 0, &plist, atop(round_page(size)), UVM_PLA_WAITOK); + r = uvm_pglistalloc(size, where->ucr_low, where->ucr_high, 0, + 0, &plist, atop(round_page(size)), flags); + if (r != 0) + return(r); i = 0; while((pg = TAILQ_FIRST(&plist)) != NULL) { offset = off + ptoa(i++); @@ -934,6 +939,7 @@ uvm_pagerealloc_multi(struct uvm_object uvm_pagefree(tpg); uvm_pagealloc_pg(pg, obj, offset, NULL); } + return(0); } /* Index: usr.bin/systat/iostat.c =================================================================== RCS file: /cvs/src/usr.bin/systat/iostat.c,v retrieving revision 1.40 diff -u -p -r1.40 iostat.c --- usr.bin/systat/iostat.c 19 Sep 2011 14:48:04 -0000 1.40 +++ usr.bin/systat/iostat.c 18 Mar 2013 22:29:29 -0000 @@ -222,6 +222,10 @@ showbcache(void) print_fld_ssize(FLD_IO_SVAL, bccur.numbufpages); end_line(); + print_fld_str(FLD_IO_SSTR, "dma pages"); + print_fld_ssize(FLD_IO_SVAL, bccur.dmapages); + end_line(); + print_fld_str(FLD_IO_SSTR, "dirty pages"); print_fld_ssize(FLD_IO_SVAL, bccur.numdirtypages); end_line();