Re: Adding support for WC (write-combining) memory to bus_dma

2012-07-13 Thread John Baldwin
On Thursday, July 12, 2012 1:51:20 pm John Baldwin wrote:
> On Thursday, July 12, 2012 12:37:13 pm Scott Long wrote:
> > Yup, this is a problem, and I like your fix; this kind of state is exactly 
> > what
> > belongs in the map.
> 
> Why don't I break out the fix first as a separate patch.

Here is a patch to just fix this.  I also mirrored the changes into powerpc
since it had copied the same code from x86 (albeit disabled).  If you feel
the malloc stats via M_DEVBUF are important, I can restore those (probably
by adding a contigmalloc_attr() wrapper).

Index: powerpc/powerpc/busdma_machdep.c
===
--- powerpc/powerpc/busdma_machdep.c(revision 238365)
+++ powerpc/powerpc/busdma_machdep.c(working copy)
@@ -46,6 +46,8 @@
 #include 
 
 #include 
+#include 
+#include 
 #include 
 #include 
 
@@ -130,6 +132,7 @@
bus_dmamap_callback_t *callback;
void  *callback_arg;
STAILQ_ENTRY(bus_dmamap) links;
+   intcontigalloc;
 };
 
 static STAILQ_HEAD(, bus_dmamap) bounce_map_waitinglist;
@@ -489,6 +492,7 @@
 bus_dmamem_alloc(bus_dma_tag_t dmat, void** vaddr, int flags,
 bus_dmamap_t *mapp)
 {
+   vm_memattr_t attr;
int mflags;
 
if (flags & BUS_DMA_NOWAIT)
@@ -500,6 +504,12 @@
 
if (flags & BUS_DMA_ZERO)
mflags |= M_ZERO;
+#ifdef NOTYET
+   if (flags & BUS_DMA_NOCACHE)
+   attr = VM_MEMATTR_UNCACHEABLE;
+   else
+#endif
+   attr = VM_MEMATTR_DEFAULT;
 
/* 
 * XXX:
@@ -511,7 +521,8 @@
 */
if ((dmat->maxsize <= PAGE_SIZE) &&
   (dmat->alignment < dmat->maxsize) &&
-   dmat->lowaddr >= ptoa((vm_paddr_t)Maxmem)) {
+   dmat->lowaddr >= ptoa((vm_paddr_t)Maxmem) &&
+   attr == VM_MEMATTR_DEFAULT) {
*vaddr = malloc(dmat->maxsize, M_DEVBUF, mflags);
} else {
/*
@@ -520,9 +531,10 @@
 * multi-seg allocations yet though.
 * XXX Certain AGP hardware does.
 */
-   *vaddr = contigmalloc(dmat->maxsize, M_DEVBUF, mflags,
-   0ul, dmat->lowaddr, dmat->alignment? dmat->alignment : 1ul,
-   dmat->boundary);
+   *vaddr = (void *)kmem_alloc_contig(kernel_map, dmat->maxsize,
+   mflags, 0ul, dmat->lowaddr, dmat->alignment ?
+   dmat->alignment : 1ul, dmat->boundary, attr);
+   (*mapp)->contigalloc = 1;
}
if (*vaddr == NULL) {
CTR4(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d",
@@ -531,11 +543,6 @@
} else if (vtophys(*vaddr) & (dmat->alignment - 1)) {
printf("bus_dmamem_alloc failed to align memory properly.\n");
}
-#ifdef NOTYET
-   if (flags & BUS_DMA_NOCACHE)
-   pmap_change_attr((vm_offset_t)*vaddr, dmat->maxsize,
-   VM_MEMATTR_UNCACHEABLE);
-#endif
CTR4(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d",
__func__, dmat, dmat->flags, 0);
return (0);
@@ -548,18 +555,12 @@
 void
 bus_dmamem_free(bus_dma_tag_t dmat, void *vaddr, bus_dmamap_t map)
 {
-   bus_dmamap_destroy(dmat, map);
 
-#ifdef NOTYET
-   pmap_change_attr((vm_offset_t)vaddr, dmat->maxsize, VM_MEMATTR_DEFAULT);
-#endif
-   if ((dmat->maxsize <= PAGE_SIZE) &&
-  (dmat->alignment < dmat->maxsize) &&
-   dmat->lowaddr >= ptoa((vm_paddr_t)Maxmem))
+   if (!map->contigalloc)
free(vaddr, M_DEVBUF);
-   else {
-   contigfree(vaddr, dmat->maxsize, M_DEVBUF);
-   }
+   else
+   kmem_free(kernel_map, (vm_offset_t)vaddr, dmat->maxsize);
+   bus_dmamap_destroy(dmat, map);
CTR3(KTR_BUSDMA, "%s: tag %p flags 0x%x", __func__, dmat, dmat->flags);
 }
 
Index: x86/x86/busdma_machdep.c
===
--- x86/x86/busdma_machdep.c(revision 238365)
+++ x86/x86/busdma_machdep.c(working copy)
@@ -42,6 +42,8 @@
 #include 
 
 #include 
+#include 
+#include 
 #include 
 #include 
 
@@ -131,7 +133,7 @@
 
 static STAILQ_HEAD(, bus_dmamap) bounce_map_waitinglist;
 static STAILQ_HEAD(, bus_dmamap) bounce_map_callbacklist;
-static struct bus_dmamap nobounce_dmamap;
+static struct bus_dmamap nobounce_dmamap, contig_dmamap;
 
 static void init_bounce_pages(void *dummy);
 static int alloc_bounce_zone(bus_dma_tag_t dmat);
@@ -465,7 +467,7 @@
 int
 bus_dmamap_destroy(bus_dma_tag_t dmat, bus_dmamap_t map)
 {
-   if (map != NULL && map != &nobounce_dmamap) {
+   if (map != NULL && map != &nobounce_dmamap && map != &contig_dmamap) {
if (STAILQ_FIRST(&map->bpages) != NULL) {
CTR3(KTR_BUSDMA, "%s: tag %p error %d",
__func__, dmat, EBUSY);
@@ -490,6 +492,7 @@
 bus_dmamem_alloc(bus_dma_tag_t dm

Re: Adding support for WC (write-combining) memory to bus_dma

2012-07-12 Thread Peter Jeremy
On 2012-Jul-12 10:40:27 -0400, John Baldwin  wrote:
>contigmalloc().  In fact, even better is to call kmem_alloc_contig() directly
>rather than using contigmalloc().
...
>Peter, this is somewhat orthognal (but related) to your bus_dma patch which is
>what prompted me to post this.

Overall, the change seems good to me.  My sole thought on the API was
whether the actual attribute should be passed, rather than having a
couple of new BUS_DMA_ flags but you've addressed that in a followup.

One change is that previously allocated memory was all charged to
M_DEVBUF via the malloc_type_allocated() call in contigmalloc()
whereas now only small allocations are counted.  This would seem to
indicate that large bus_dmamem_alloc() allocations won't be visible in
(eg) "vmstat -m".

-- 
Peter Jeremy


pgpZoejmmJeAW.pgp
Description: PGP signature


Re: Adding support for WC (write-combining) memory to bus_dma

2012-07-12 Thread John Baldwin
On Thursday, July 12, 2012 12:37:13 pm Scott Long wrote:
> 
> - Original Message -
> > From: John Baldwin 
> > To: curr...@freebsd.org
> > Cc: sco...@freebsd.org; Peter Jeremy 
> > Sent: Thursday, July 12, 2012 7:40 AM
> > Subject: Adding support for WC (write-combining) memory to bus_dma
> > 
> > I have a need to allocate static DMA memory via bus_dmamem_alloc() that is 
> > also WC (for a PCI-e device so it can use "nosnoop" transactions).  
> > This is 
> > similar to what the nvidia driver needs, but in my case it is much cleaner 
> > to 
> > allocate the memory via bus dma since the existing code I am extending all 
> > uses busdma.
> > 
> > I have a patch to implement this on 8.x for amd64 that I can port to HEAD 
> > if 
> > folks don't object.  What I would really like to do is add a new paramter 
> > to 
> > 
> > bus_dmamem_alloc() to specify the memory attribute to use, but I am 
> > hesitant 
> > to break that API.  Instead, I added a new flag similar to the existing 
> > BUS_DMA_NOCACHE used to allocate UC memory.
> > 
> 
> Please don't add new parameters.  Now that I'm carefully documenting the
> evolution of the APIs, it's becoming glaringly apparent how sloppy we are
> with API design and interface compatibility.  I'm just as guilty of it as 
> anyone,
> but I'd really like to see less instances of call signature changes in 
> existing
> functions; they make driver maintenance tedious and are hard to effectively
> document.  Some options I can think of:
> 
> 1.  create bus_dmamem_alloc_attr().  I don't really like leafy API growth like
> this either, but it's not a horrible solution.

I would actually not oppose this either.  In this particular case it is a bit
useful as I think the user shouldn't have to explicitly state a default if
they don't need a non-standard attribute.

Side-band comment: if I were going to change the API of bus_dmamem_alloc(), my
biggest request would be a bus_dmamem_alloc_size() that takes the size to 
allocate as a parameter rather than pulling the size out of the tag.  I always
think of a tag as describing a DMA engine's capabilities and restrictions,
whereas the size of, say, a descriptor ring is often a software-configurable
knob (e.g. hw.igb.nrxd) and thus the allocation size isn't really a property
of the DMA engine.  I also find this to be the least intuitive behavior in the
bus DMA API.  But that is an entirely different ball of wax.

> 2.  There are existing placeholder flags, BUS_DMA_BUS[1234] that could be
> aliased and repurposed to hold 4 bits of attribute information for this 
> function.
> The 3 and 4 variants are already in use, but I haven't looked closely to see
> their scope.

Humm, I could do that.  In practice WC is only really applicable on x86.
Also, to be honest, I doubt anyone will ever use special attributes besides
UC and WC for DMA on x86.  That is the main reason why I just added a new flag
and didn't try to add a generic scheme for specifying the memory attribute.

I could easily just move BUS_DMA_WRITE_COMBINING into x86's 
and have it use one of the free placeholder flags.  That is probably the
simplest short term solution.

> 3.  Reserve the top 16 bits of the flags for attribute information.
> 4.  Move the attribute information into the tag and create new setter/getter
> accessors for attribute information.  This would probably be the cleanest,
> though it breaks the existing sloppiness of allowing different 
> pseudo-attributes
> for different allocations under the same tag.  I've wanted to break down the
> existing bus_dma_tag_create() into finer-grained setter/getters for a while in
> any case.
> 5.  Move the attribute information into the map and force everyone to start
> creating maps for static memory allocations.  This would actually add some
> missing uniformity to the API and might actually be cleaner that option 4.
> 
> > While doing this, I ran into an old bug, which is that if you were to call 
> > bus_dmamem_alloc() with BUS_DMA_NOCACHE but a tag that otherwise fell 
> > through 
> > to using malloc() instead of contigmalloc(), bus_dmamem_alloc() would 
> > actually
> > change the state of the entire page.  This seems wrong.  Instead, I think 
> > that 
> > any request for a non-default memory attribute should always use 
> > contigmalloc().  In fact, even better is to call kmem_alloc_contig() 
> > directly
> > rather than using contigmalloc().  However, if you change this, then 
> > bus_dmamem_free() won't always DTRT as it doesn't have enough state to 
> > know if
> > a small allocation shou

Re: Adding support for WC (write-combining) memory to bus_dma

2012-07-12 Thread Scott Long




- Original Message -
> From: John Baldwin 
> To: curr...@freebsd.org
> Cc: sco...@freebsd.org; Peter Jeremy 
> Sent: Thursday, July 12, 2012 7:40 AM
> Subject: Adding support for WC (write-combining) memory to bus_dma
> 
> I have a need to allocate static DMA memory via bus_dmamem_alloc() that is 
> also WC (for a PCI-e device so it can use "nosnoop" transactions).  
> This is 
> similar to what the nvidia driver needs, but in my case it is much cleaner to 
> allocate the memory via bus dma since the existing code I am extending all 
> uses busdma.
> 
> I have a patch to implement this on 8.x for amd64 that I can port to HEAD if 
> folks don't object.  What I would really like to do is add a new paramter to 
> 
> bus_dmamem_alloc() to specify the memory attribute to use, but I am hesitant 
> to break that API.  Instead, I added a new flag similar to the existing 
> BUS_DMA_NOCACHE used to allocate UC memory.
> 

Please don't add new parameters.  Now that I'm carefully documenting the
evolution of the APIs, it's becoming glaringly apparent how sloppy we are
with API design and interface compatibility.  I'm just as guilty of it as 
anyone,
but I'd really like to see less instances of call signature changes in existing
functions; they make driver maintenance tedious and are hard to effectively
document.  Some options I can think of:

1.  create bus_dmamem_alloc_attr().  I don't really like leafy API growth like
this either, but it's not a horrible solution.
2.  There are existing placeholder flags, BUS_DMA_BUS[1234] that could be
aliased and repurposed to hold 4 bits of attribute information for this 
function.
The 3 and 4 variants are already in use, but I haven't looked closely to see
their scope.
3.  Reserve the top 16 bits of the flags for attribute information.
4.  Move the attribute information into the tag and create new setter/getter
accessors for attribute information.  This would probably be the cleanest,
though it breaks the existing sloppiness of allowing different pseudo-attributes
for different allocations under the same tag.  I've wanted to break down the
existing bus_dma_tag_create() into finer-grained setter/getters for a while in
any case.
5.  Move the attribute information into the map and force everyone to start
creating maps for static memory allocations.  This would actually add some
missing uniformity to the API and might actually be cleaner that option 4.

> While doing this, I ran into an old bug, which is that if you were to call 
> bus_dmamem_alloc() with BUS_DMA_NOCACHE but a tag that otherwise fell through 
> to using malloc() instead of contigmalloc(), bus_dmamem_alloc() would actually
> change the state of the entire page.  This seems wrong.  Instead, I think 
> that 
> any request for a non-default memory attribute should always use 
> contigmalloc().  In fact, even better is to call kmem_alloc_contig() directly
> rather than using contigmalloc().  However, if you change this, then 
> bus_dmamem_free() won't always DTRT as it doesn't have enough state to 
> know if
> a small allocation should be free'd via free() or contigfree() (the latter 
> would be required if it used a non-default memory attribute).  The fix I used 
> for this was to create a new dummy dmamap that is returned by 
> bus_dmamem_alloc 
> if it uses contigmalloc().  bus_dmamem_free() then checks the passed in map 
> pointer to decide which type of free to perform.  Once this is fixed, the 
> actual WC support is rather trivial as it merely consists of passing a 
> different argument to kmem_alloc_contig().

Yup, this is a problem, and I like your fix; this kind of state is exactly what
belongs in the map.

> 
> Oh, and using kmem_alloc_contig() instead of the pmap_change_attr() hack is
> required if you want to be able to export the same pages to userland via
> mmap (e.g. using an OBJT_SG object). :)
> 
> Peter, this is somewhat orthognal (but related) to your bus_dma patch which is
> what prompted me to post this.
> 
> Patch for 8 is below.  Porting it to HEAD should be fairly trivial and direct.
> 
> Index: amd64/amd64/busdma_machdep.c

Thanks,
Scott

___
freebsd-current@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/freebsd-current
To unsubscribe, send any mail to "freebsd-current-unsubscr...@freebsd.org"


Re: Adding support for WC (write-combining) memory to bus_dma

2012-07-12 Thread Robert Watson


On Thu, 12 Jul 2012, Ian Lepore wrote:

To be clear, I'm not objecting to your proposed changes, I'm more just 
musing that similar problems exist in non-x86 architectures and maybe an MI 
solution is possible (or at least the groundwork could be laid)?


I was likewise going to comment that there are known defficiencies in handling 
page table attributes on non-x86 -- for example, they are ignored on MIPS.  I 
encountered this when trying to add memory mapping support for a device driver 
and discovered that all entries were being installed cached instead of the 
requested uncached.  I have local patches that propagate the same non-solution 
used by FreeBSD/MIPS within the kernel to the d_mmap interface -- namely, if 
you try to export something that isn't memory, then it knows it has to be 
uncached.  I plan to commit these fixes to the MIPS TLB code pretty soon, but 
have been preoccupied with other things for the last few months.  However, 
some heavier lifting is required to add attribute support, and it's quite 
desirable to do so.


Robert
___
freebsd-current@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/freebsd-current
To unsubscribe, send any mail to "freebsd-current-unsubscr...@freebsd.org"


Re: Adding support for WC (write-combining) memory to bus_dma

2012-07-12 Thread John Baldwin
On Thursday, July 12, 2012 11:02:07 am Ian Lepore wrote:
> On Thu, 2012-07-12 at 10:40 -0400, John Baldwin wrote:
> > I have a need to allocate static DMA memory via bus_dmamem_alloc() that is 
> > also WC (for a PCI-e device so it can use "nosnoop" transactions).  This is 
> > similar to what the nvidia driver needs, but in my case it is much cleaner 
> > to 
> > allocate the memory via bus dma since the existing code I am extending all 
> > uses busdma.
> > 
> > I have a patch to implement this on 8.x for amd64 that I can port to HEAD 
> > if 
> > folks don't object.  What I would really like to do is add a new paramter 
> > to 
> > bus_dmamem_alloc() to specify the memory attribute to use, but I am 
> > hesitant 
> > to break that API.  Instead, I added a new flag similar to the existing 
> > BUS_DMA_NOCACHE used to allocate UC memory.
> > 
> > While doing this, I ran into an old bug, which is that if you were to call 
> > bus_dmamem_alloc() with BUS_DMA_NOCACHE but a tag that otherwise fell 
> > through 
> > to using malloc() instead of contigmalloc(), bus_dmamem_alloc() would 
> > actually
> > change the state of the entire page.  This seems wrong.  Instead, I think 
> > that 
> > any request for a non-default memory attribute should always use 
> > contigmalloc().  
> 
> The problem I have with this (already, even before your proposed
> changes) is that contigmalloc() is only able to allocate pages.  In the
> ARM world we have a need to allocate BUS_DMA_COHERENT memory (same
> effect as BUS_DMA_NOCACHE; we should consolidate these names) that is
> aligned to a 32-byte boundary (cacheline-aligned) but usually the buffer
> is far smaller than a page, often smaller than 1k, and sometimes we need
> lots of them (allocating 128 pages for ethernet buffers, with only half
> of each page used, is unreasonably expensive on a platform with only
> 64mb to begin with).
> 
> I keep thinking what's needed is a busdma allocation helper routine,
> something MI that can be used by the various MD busdma implementations,
> that can manage a pool of pages that are flagged as uncachable and can
> subdivide those pages to provide small blocks of memory that fit various
> alignment and boundary restrictions.
> 
> To be clear, I'm not objecting to your proposed changes, I'm more just
> musing that similar problems exist in non-x86 architectures and maybe an
> MI solution is possible (or at least the groundwork could be laid)?

The traditional argument I've heard against this is that the relevant driver
should allocate a big block and manage suballocations on its own rather than
pushing that work into bus_dma.  How are you allocating Ethernet buffers
btw?  Are you not using mbuf clusters to receive packets, but allocate
mbufs in your RX interrupt handler and copying data out of static buffers
into the mbufs to send up the stack?

Also, I do not think BUS_DMA_COHERENT and BUS_DMA_NOCACHE are quite the same.
I see UC as a way to implement COHERENT semantics, but it also seems to me that
a COHERENT mapping can't use bounce pages either.  OTOH, NOCACHE (and my new
flag), are specifically requesting a certain mapping behavior not necessarily
to avoid bus_dmamap_sync() operations, but due to a hardware requirement (e.g.
the WC mapping is to enable use of "nosnoop" PCI-e transactions).

That is, I interpret COHERENT as meaning "this map doesn't require
bus_dmamap_sync(), do whatever it takes to make that true", where as NOCACHE
and WC have other meanings (though in practice NOCACHE and WC both imply
COHERENT).  For example, on x86 with caches that snoop DMA transactions,
COHERENT doesn't require NOCACHE at all, it simply requires avoiding the use
of bounce pages.

-- 
John Baldwin
___
freebsd-current@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/freebsd-current
To unsubscribe, send any mail to "freebsd-current-unsubscr...@freebsd.org"


Re: Adding support for WC (write-combining) memory to bus_dma

2012-07-12 Thread Ian Lepore
On Thu, 2012-07-12 at 10:40 -0400, John Baldwin wrote:
> I have a need to allocate static DMA memory via bus_dmamem_alloc() that is 
> also WC (for a PCI-e device so it can use "nosnoop" transactions).  This is 
> similar to what the nvidia driver needs, but in my case it is much cleaner to 
> allocate the memory via bus dma since the existing code I am extending all 
> uses busdma.
> 
> I have a patch to implement this on 8.x for amd64 that I can port to HEAD if 
> folks don't object.  What I would really like to do is add a new paramter to 
> bus_dmamem_alloc() to specify the memory attribute to use, but I am hesitant 
> to break that API.  Instead, I added a new flag similar to the existing 
> BUS_DMA_NOCACHE used to allocate UC memory.
> 
> While doing this, I ran into an old bug, which is that if you were to call 
> bus_dmamem_alloc() with BUS_DMA_NOCACHE but a tag that otherwise fell through 
> to using malloc() instead of contigmalloc(), bus_dmamem_alloc() would actually
> change the state of the entire page.  This seems wrong.  Instead, I think 
> that 
> any request for a non-default memory attribute should always use 
> contigmalloc().  

The problem I have with this (already, even before your proposed
changes) is that contigmalloc() is only able to allocate pages.  In the
ARM world we have a need to allocate BUS_DMA_COHERENT memory (same
effect as BUS_DMA_NOCACHE; we should consolidate these names) that is
aligned to a 32-byte boundary (cacheline-aligned) but usually the buffer
is far smaller than a page, often smaller than 1k, and sometimes we need
lots of them (allocating 128 pages for ethernet buffers, with only half
of each page used, is unreasonably expensive on a platform with only
64mb to begin with).

I keep thinking what's needed is a busdma allocation helper routine,
something MI that can be used by the various MD busdma implementations,
that can manage a pool of pages that are flagged as uncachable and can
subdivide those pages to provide small blocks of memory that fit various
alignment and boundary restrictions.

To be clear, I'm not objecting to your proposed changes, I'm more just
musing that similar problems exist in non-x86 architectures and maybe an
MI solution is possible (or at least the groundwork could be laid)?

> In fact, even better is to call kmem_alloc_contig() directly
> rather than using contigmalloc().  However, if you change this, then 
> bus_dmamem_free() won't always DTRT as it doesn't have enough state to know if
> a small allocation should be free'd via free() or contigfree() (the latter 
> would be required if it used a non-default memory attribute).  The fix I used 
> for this was to create a new dummy dmamap that is returned by 
> bus_dmamem_alloc 
> if it uses contigmalloc().  bus_dmamem_free() then checks the passed in map 
> pointer to decide which type of free to perform.  Once this is fixed, the 
> actual WC support is rather trivial as it merely consists of passing a 
> different argument to kmem_alloc_contig().
> 
> Oh, and using kmem_alloc_contig() instead of the pmap_change_attr() hack is
> required if you want to be able to export the same pages to userland via
> mmap (e.g. using an OBJT_SG object). :)
> 
> Peter, this is somewhat orthognal (but related) to your bus_dma patch which is
> what prompted me to post this.
> 
> Patch for 8 is below.  Porting it to HEAD should be fairly trivial and direct.
> [patch removed]
> 

-- Ian

___
freebsd-current@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/freebsd-current
To unsubscribe, send any mail to "freebsd-current-unsubscr...@freebsd.org"


Adding support for WC (write-combining) memory to bus_dma

2012-07-12 Thread John Baldwin
I have a need to allocate static DMA memory via bus_dmamem_alloc() that is 
also WC (for a PCI-e device so it can use "nosnoop" transactions).  This is 
similar to what the nvidia driver needs, but in my case it is much cleaner to 
allocate the memory via bus dma since the existing code I am extending all 
uses busdma.

I have a patch to implement this on 8.x for amd64 that I can port to HEAD if 
folks don't object.  What I would really like to do is add a new paramter to 
bus_dmamem_alloc() to specify the memory attribute to use, but I am hesitant 
to break that API.  Instead, I added a new flag similar to the existing 
BUS_DMA_NOCACHE used to allocate UC memory.

While doing this, I ran into an old bug, which is that if you were to call 
bus_dmamem_alloc() with BUS_DMA_NOCACHE but a tag that otherwise fell through 
to using malloc() instead of contigmalloc(), bus_dmamem_alloc() would actually
change the state of the entire page.  This seems wrong.  Instead, I think that 
any request for a non-default memory attribute should always use 
contigmalloc().  In fact, even better is to call kmem_alloc_contig() directly
rather than using contigmalloc().  However, if you change this, then 
bus_dmamem_free() won't always DTRT as it doesn't have enough state to know if
a small allocation should be free'd via free() or contigfree() (the latter 
would be required if it used a non-default memory attribute).  The fix I used 
for this was to create a new dummy dmamap that is returned by bus_dmamem_alloc 
if it uses contigmalloc().  bus_dmamem_free() then checks the passed in map 
pointer to decide which type of free to perform.  Once this is fixed, the 
actual WC support is rather trivial as it merely consists of passing a 
different argument to kmem_alloc_contig().

Oh, and using kmem_alloc_contig() instead of the pmap_change_attr() hack is
required if you want to be able to export the same pages to userland via
mmap (e.g. using an OBJT_SG object). :)

Peter, this is somewhat orthognal (but related) to your bus_dma patch which is
what prompted me to post this.

Patch for 8 is below.  Porting it to HEAD should be fairly trivial and direct.

Index: amd64/amd64/busdma_machdep.c
===
--- amd64/amd64/busdma_machdep.c(revision 225604)
+++ amd64/amd64/busdma_machdep.c(working copy)
@@ -42,6 +42,8 @@
 #include 
 
 #include 
+#include 
+#include 
 #include 
 #include 
 
@@ -125,7 +127,7 @@
 
 static STAILQ_HEAD(, bus_dmamap) bounce_map_waitinglist;
 static STAILQ_HEAD(, bus_dmamap) bounce_map_callbacklist;
-static struct bus_dmamap nobounce_dmamap;
+static struct bus_dmamap nobounce_dmamap, contig_dmamap;
 
 static void init_bounce_pages(void *dummy);
 static int alloc_bounce_zone(bus_dma_tag_t dmat);
@@ -455,7 +457,7 @@
 int
 bus_dmamap_destroy(bus_dma_tag_t dmat, bus_dmamap_t map)
 {
-   if (map != NULL && map != &nobounce_dmamap) {
+   if (map != NULL && map != &nobounce_dmamap && map != &contig_dmamap) {
if (STAILQ_FIRST(&map->bpages) != NULL) {
CTR3(KTR_BUSDMA, "%s: tag %p error %d",
__func__, dmat, EBUSY);
@@ -480,6 +482,7 @@
 bus_dmamem_alloc(bus_dma_tag_t dmat, void** vaddr, int flags,
 bus_dmamap_t *mapp)
 {
+   vm_memattr_t attr;
int mflags;
 
if (flags & BUS_DMA_NOWAIT)
@@ -502,6 +505,12 @@
}
if (flags & BUS_DMA_ZERO)
mflags |= M_ZERO;
+   if (flags & BUS_DMA_WRITE_COMBINING)
+   attr = VM_MEMATTR_WRITE_COMBINING;
+   else if (flags & BUS_DMA_NOCACHE)
+   attr = VM_MEMATTR_UNCACHEABLE;
+   else
+   attr = VM_MEMATTR_DEFAULT;
 
/* 
 * XXX:
@@ -513,7 +522,8 @@
 */
if ((dmat->maxsize <= PAGE_SIZE) &&
   (dmat->alignment < dmat->maxsize) &&
-   dmat->lowaddr >= ptoa((vm_paddr_t)Maxmem)) {
+   dmat->lowaddr >= ptoa((vm_paddr_t)Maxmem) &&
+   attr == VM_MEMATTR_DEFAULT) {
*vaddr = malloc(dmat->maxsize, M_DEVBUF, mflags);
} else {
/*
@@ -522,9 +532,10 @@
 * multi-seg allocations yet though.
 * XXX Certain AGP hardware does.
 */
-   *vaddr = contigmalloc(dmat->maxsize, M_DEVBUF, mflags,
-   0ul, dmat->lowaddr, dmat->alignment? dmat->alignment : 1ul,
-   dmat->boundary);
+   *vaddr = (void *)kmem_alloc_contig(kernel_map, dmat->maxsize,
+   mflags, 0ul, dmat->lowaddr, dmat->alignment ?
+   dmat->alignment : 1ul, dmat->boundary, attr);
+   *mapp = &contig_dmamap;
}
if (*vaddr == NULL) {
CTR4(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d",
@@ -533,9 +544,6 @@
} else if ((uintptr_t)*vaddr & (dmat->alignment - 1)) {
printf("bus_dmamem_alloc failed to align