On Wed, Mar 18, 2020 at 11:22:40AM +0100, Patrick Wildt wrote:
> Hi,
>
> I've spent a few days investigating why USB ethernet adapters are so
> horribly slow on my ARMs. Using dt(4) I realized that it was spending
> most of its time in memcpy. But, why? As it turns out, all USB data
> buffers are mapped COHERENT, which on some/most ARMs means uncached.
> Using cached data buffers makes the performance rise from 20 mbit/s to
> 200 mbit/s. Quite a difference.
>
> sys/dev/usb/usb_mem.c:
> error = bus_dmamem_map(tag, p->segs, p->nsegs, p->size,
> &p->kaddr, BUS_DMA_NOWAIT|BUS_DMA_COHERENT);
>
> On x86, COHERENT is essentially a no-op. On ARM, it depends on the SoC.
> Some SoCs have cache-coherent USB controllers, some don't. Mine does
> not, so mapping it COHERENT means uncached and thus slow.
>
> Why do we do that? Well, when the code was imported in 99, it was
> already there. Since then we have gained infrastructure for DMA
> syncs in the USB stack, which I think are proper.
>
> sys/dev/usb/usbdi.c - usbd_transfer() (before transfer)
>
> if (!usbd_xfer_isread(xfer)) {
> if ((xfer->flags & USBD_NO_COPY) == 0)
> memcpy(KERNADDR(&xfer->dmabuf, 0), xfer->buffer,
> xfer->length);
> usb_syncmem(&xfer->dmabuf, 0, xfer->length,
> BUS_DMASYNC_PREWRITE);
> } else
> usb_syncmem(&xfer->dmabuf, 0, xfer->length,
> BUS_DMASYNC_PREREAD);
> err = pipe->methods->transfer(xfer);
>
> sys/dev/usb/usbdi.c - usb_transfer_complete() (after transfer)
>
> if (xfer->actlen != 0) {
> if (usbd_xfer_isread(xfer)) {
> usb_syncmem(&xfer->dmabuf, 0, xfer->actlen,
> BUS_DMASYNC_POSTREAD);
> if (!(xfer->flags & USBD_NO_COPY))
> memcpy(xfer->buffer, KERNADDR(&xfer->dmabuf, 0),
> xfer->actlen);
> } else
> usb_syncmem(&xfer->dmabuf, 0, xfer->actlen,
> BUS_DMASYNC_POSTWRITE);
> }
>
> We cannot just remove COHERENT, since some drivers, like ehci(4), use
> the same backend to allocate their rings. And I can't vouch for those
> drivers' sanity.
>
> As a first step, I would like to go ahead with another solution, which
> is based on a diff from Marius Strobl, who added those syncs in the
> first place. Essentially it splits the memory handling into cacheable
> and non-cacheable blocks. The USB data transfers and everyone who uses
> usbd_alloc_buffer() then use cacheable buffers, while code like ehci(4)
> still don't. This is a bit of a safer approach imho, since we don't
> hurt the controller drivers, but speed up the data buffers.
>
> Once we have verified that there are no regressions, we can adjust
> ehci(4) and the like, add proper syncs, make sure they still work as
> well as before, and maybe then back this out again.
>
> Keep note that this is all a no-op on X86, but all the other archs will
> profit from this.
>
> ok?
>
> Patrick
Update diff with inverted logic. kettenis@ argues that we should
invert the logic, and those who need COHERENT memory should ask
for that explicitly, since for bus_dmamem_map() it also needs to
be passed explicitly. This also points out all those users that
use usb_allocmem() internally, where we might want to have a look
if COHERENT is actually needed or not, or if it can be refactored
in another way.
diff --git a/sys/dev/usb/dwc2/dwc2.c b/sys/dev/usb/dwc2/dwc2.c
index 6f035467213..099dfa26da1 100644
--- a/sys/dev/usb/dwc2/dwc2.c
+++ b/sys/dev/usb/dwc2/dwc2.c
@@ -473,6 +473,7 @@ dwc2_open(struct usbd_pipe *pipe)
switch (xfertype) {
case UE_CONTROL:
pipe->methods = &dwc2_device_ctrl_methods;
+ dpipe->req_dma.flags |= USB_DMA_COHERENT;
err = usb_allocmem(&sc->sc_bus, sizeof(usb_device_request_t),
0, &dpipe->req_dma);
if (err)
diff --git a/sys/dev/usb/dwc2/dwc2_hcd.c b/sys/dev/usb/dwc2/dwc2_hcd.c
index 7e5c91481d5..d44e3196e61 100644
--- a/sys/dev/usb/dwc2/dwc2_hcd.c
+++ b/sys/dev/usb/dwc2/dwc2_hcd.c
@@ -679,6 +679,7 @@ STATIC int dwc2_hc_setup_align_buf(struct dwc2_hsotg
*hsotg, struct dwc2_qh *qh,
qh->dw_align_buf = NULL;
qh->dw_align_buf_dma = 0;
+ qh->dw_align_buf_usbdma.flags |= USB_DMA_COHERENT;
err = usb_allocmem(&hsotg->hsotg_sc->sc_bus, buf_size, buf_size,
&qh->dw_align_buf_usbdma);
if (!err) {
@@ -2267,6 +2268,7 @@ int dwc2_hcd_init(struct dwc2_hsotg *hsotg,
*/
hsotg->status_buf = NULL;
if (hsotg->core_params->dma_enable > 0) {
+ hsotg->status_buf_usbdma.flags |= USB_DMA_COHERENT;
retval = usb_allocmem(&hsotg->hsotg_sc->sc_bus,
DWC2_HCD_STATUS_BUF_SIZE, 0,
&hsotg->status_buf_usbdma);
diff --git a/sys/dev/usb/dwc2/dwc2_hcdddma.c b/sys/dev/usb/dwc2/dwc2_hcdddma.c
index d8584eed50d..72da68e5667 100644
--- a/sys/dev/usb/dwc2/dwc2_hcdddma.c
+++ b/sys/dev/usb/dwc2/dwc2_hcdddma.c
@@ -102,6 +102,7 @@ STATIC int dwc2_desc_list_alloc(struct dwc2_hsotg *hsotg,
struct dwc2_qh *qh,
//KASSERT(!cpu_intr_p() && !cpu_softintr_p());
qh->desc_list = NULL;
+ qh->desc_list_usbdma.flags |= USB_DMA_COHERENT;
err = usb_allocmem(&hsotg->hsotg_sc->sc_bus,
sizeof(struct dwc2_hcd_dma_desc) * dwc2_max_desc_num(qh), 0,
&qh->desc_list_usbdma);
@@ -143,6 +144,7 @@ STATIC int dwc2_frame_list_alloc(struct dwc2_hsotg *hsotg,
gfp_t mem_flags)
/* XXXNH - struct pool */
hsotg->frame_list = NULL;
+ hsotg->frame_list_usbdma.flags |= USB_DMA_COHERENT;
err = usb_allocmem(&hsotg->hsotg_sc->sc_bus, 4 * FRLISTEN_64_SIZE,
0, &hsotg->frame_list_usbdma);
diff --git a/sys/dev/usb/ehci.c b/sys/dev/usb/ehci.c
index ddde2796409..1bb6698cbcc 100644
--- a/sys/dev/usb/ehci.c
+++ b/sys/dev/usb/ehci.c
@@ -355,6 +355,7 @@ ehci_init(struct ehci_softc *sc)
case 3:
return (USBD_IOERROR);
}
+ sc->sc_fldma.flags |= USB_DMA_COHERENT;
err = usb_allocmem(&sc->sc_bus, sc->sc_flsize * sizeof(ehci_link_t),
EHCI_FLALIGN_ALIGN, &sc->sc_fldma);
if (err)
@@ -1469,6 +1470,7 @@ ehci_open(struct usbd_pipe *pipe)
switch (xfertype) {
case UE_CONTROL:
+ epipe->u.ctl.reqdma.flags |= USB_DMA_COHERENT;
err = usb_allocmem(&sc->sc_bus, sizeof(usb_device_request_t),
0, &epipe->u.ctl.reqdma);
if (err) {
@@ -2257,6 +2259,7 @@ ehci_alloc_sqh(struct ehci_softc *sc)
s = splusb();
if (sc->sc_freeqhs == NULL) {
DPRINTFN(2, ("ehci_alloc_sqh: allocating chunk\n"));
+ dma.flags |= USB_DMA_COHERENT;
err = usb_allocmem(&sc->sc_bus, EHCI_SQH_SIZE * EHCI_SQH_CHUNK,
EHCI_PAGE_SIZE, &dma);
if (err)
@@ -2305,6 +2308,7 @@ ehci_alloc_sqtd(struct ehci_softc *sc)
s = splusb();
if (sc->sc_freeqtds == NULL) {
DPRINTFN(2, ("ehci_alloc_sqtd: allocating chunk\n"));
+ dma.flags |= USB_DMA_COHERENT;
err = usb_allocmem(&sc->sc_bus, EHCI_SQTD_SIZE*EHCI_SQTD_CHUNK,
EHCI_PAGE_SIZE, &dma);
if (err)
@@ -2533,6 +2537,7 @@ ehci_alloc_itd(struct ehci_softc *sc)
}
if (freeitd == NULL) {
+ dma.flags |= USB_DMA_COHERENT;
err = usb_allocmem(&sc->sc_bus, EHCI_ITD_SIZE * EHCI_ITD_CHUNK,
EHCI_PAGE_SIZE, &dma);
if (err) {
diff --git a/sys/dev/usb/ohci.c b/sys/dev/usb/ohci.c
index ad3a9811a9c..022286380dc 100644
--- a/sys/dev/usb/ohci.c
+++ b/sys/dev/usb/ohci.c
@@ -394,6 +394,7 @@ ohci_alloc_sed(struct ohci_softc *sc)
s = splusb();
if (sc->sc_freeeds == NULL) {
DPRINTFN(2, ("ohci_alloc_sed: allocating chunk\n"));
+ dma.flags |= USB_DMA_COHERENT;
err = usb_allocmem(&sc->sc_bus, OHCI_SED_SIZE * OHCI_SED_CHUNK,
OHCI_ED_ALIGN, &dma);
if (err)
@@ -439,6 +440,7 @@ ohci_alloc_std(struct ohci_softc *sc)
s = splusb();
if (sc->sc_freetds == NULL) {
DPRINTFN(2, ("ohci_alloc_std: allocating chunk\n"));
+ dma.flags |= USB_DMA_COHERENT;
err = usb_allocmem(&sc->sc_bus, OHCI_STD_SIZE * OHCI_STD_CHUNK,
OHCI_TD_ALIGN, &dma);
if (err)
@@ -597,6 +599,7 @@ ohci_alloc_sitd(struct ohci_softc *sc)
if (sc->sc_freeitds == NULL) {
DPRINTFN(2, ("ohci_alloc_sitd: allocating chunk\n"));
+ dma.flags |= USB_DMA_COHERENT;
err = usb_allocmem(&sc->sc_bus, OHCI_SITD_SIZE *
OHCI_SITD_CHUNK,
OHCI_ITD_ALIGN, &dma);
if (err)
@@ -728,6 +731,7 @@ ohci_init(struct ohci_softc *sc)
/* XXX determine alignment by R/W */
/* Allocate the HCCA area. */
+ sc->sc_hccadma.flags |= USB_DMA_COHERENT;
err = usb_allocmem(&sc->sc_bus, OHCI_HCCA_SIZE,
OHCI_HCCA_ALIGN, &sc->sc_hccadma);
if (err)
@@ -1931,6 +1935,7 @@ ohci_open(struct usbd_pipe *pipe)
switch (xfertype) {
case UE_CONTROL:
pipe->methods = &ohci_device_ctrl_methods;
+ opipe->u.ctl.reqdma.flags |= USB_DMA_COHERENT;
err = usb_allocmem(&sc->sc_bus,
sizeof(usb_device_request_t),
0, &opipe->u.ctl.reqdma);
diff --git a/sys/dev/usb/uhci.c b/sys/dev/usb/uhci.c
index 47e6d1678c0..c3e0e15856a 100644
--- a/sys/dev/usb/uhci.c
+++ b/sys/dev/usb/uhci.c
@@ -377,6 +377,7 @@ uhci_init(struct uhci_softc *sc)
UWRITE1(sc, UHCI_SOF, sc->sc_saved_sof);
/* Allocate and initialize real frame array. */
+ sc->sc_dma.flags |= USB_DMA_COHERENT;
err = usb_allocmem(&sc->sc_bus,
UHCI_FRAMELIST_COUNT * sizeof(uhci_physaddr_t),
UHCI_FRAMELIST_ALIGN, &sc->sc_dma);
@@ -1414,6 +1415,7 @@ uhci_alloc_std(struct uhci_softc *sc)
s = splusb();
if (sc->sc_freetds == NULL) {
DPRINTFN(2,("uhci_alloc_std: allocating chunk\n"));
+ dma.flags |= USB_DMA_COHERENT;
err = usb_allocmem(&sc->sc_bus, UHCI_STD_SIZE * UHCI_STD_CHUNK,
UHCI_TD_ALIGN, &dma);
if (err)
@@ -1468,6 +1470,7 @@ uhci_alloc_sqh(struct uhci_softc *sc)
s = splusb();
if (sc->sc_freeqhs == NULL) {
DPRINTFN(2, ("uhci_alloc_sqh: allocating chunk\n"));
+ dma.flags |= USB_DMA_COHERENT;
err = usb_allocmem(&sc->sc_bus, UHCI_SQH_SIZE * UHCI_SQH_CHUNK,
UHCI_QH_ALIGN, &dma);
if (err)
@@ -2650,6 +2653,7 @@ uhci_open(struct usbd_pipe *pipe)
uhci_free_std(sc, upipe->u.ctl.setup);
goto bad;
}
+ upipe->u.ctl.reqdma.flags |= USB_DMA_COHERENT;
err = usb_allocmem(&sc->sc_bus,
sizeof(usb_device_request_t),
0, &upipe->u.ctl.reqdma);
diff --git a/sys/dev/usb/usb_mem.c b/sys/dev/usb/usb_mem.c
index c65906b43f4..a2b8a3c10be 100644
--- a/sys/dev/usb/usb_mem.c
+++ b/sys/dev/usb/usb_mem.c
@@ -72,7 +72,7 @@ struct usb_frag_dma {
};
usbd_status usb_block_allocmem(bus_dma_tag_t, size_t, size_t,
- struct usb_dma_block **);
+ struct usb_dma_block **, int);
void usb_block_freemem(struct usb_dma_block *);
LIST_HEAD(, usb_dma_block) usb_blk_freelist =
@@ -84,7 +84,7 @@ LIST_HEAD(, usb_frag_dma) usb_frag_freelist =
usbd_status
usb_block_allocmem(bus_dma_tag_t tag, size_t size, size_t align,
- struct usb_dma_block **dmap)
+ struct usb_dma_block **dmap, int coherent)
{
int error;
struct usb_dma_block *p;
@@ -96,7 +96,8 @@ usb_block_allocmem(bus_dma_tag_t tag, size_t size, size_t
align,
s = splusb();
/* First check the free list. */
for (p = LIST_FIRST(&usb_blk_freelist); p; p = LIST_NEXT(p, next)) {
- if (p->tag == tag && p->size >= size && p->align >= align) {
+ if (p->tag == tag && p->size >= size && p->align >= align &&
+ p->coherent == coherent) {
LIST_REMOVE(p, next);
usb_blk_nfree--;
splx(s);
@@ -116,6 +117,7 @@ usb_block_allocmem(bus_dma_tag_t tag, size_t size, size_t
align,
p->tag = tag;
p->size = size;
p->align = align;
+ p->coherent = coherent;
error = bus_dmamem_alloc(tag, p->size, align, 0,
p->segs, nitems(p->segs),
&p->nsegs, BUS_DMA_NOWAIT);
@@ -123,7 +125,8 @@ usb_block_allocmem(bus_dma_tag_t tag, size_t size, size_t
align,
goto free0;
error = bus_dmamem_map(tag, p->segs, p->nsegs, p->size,
- &p->kaddr, BUS_DMA_NOWAIT|BUS_DMA_COHERENT);
+ &p->kaddr, BUS_DMA_NOWAIT | (coherent ?
+ BUS_DMA_COHERENT : 0));
if (error)
goto free1;
@@ -187,14 +190,18 @@ usb_allocmem(struct usbd_bus *bus, size_t size, size_t
align, struct usb_dma *p)
usbd_status err;
struct usb_frag_dma *f;
struct usb_dma_block *b;
+ int coherent;
int i;
int s;
+ coherent = !!(p->flags & USB_DMA_COHERENT);
+
/* If the request is large then just use a full block. */
if (size > USB_MEM_SMALL || align > USB_MEM_SMALL) {
DPRINTFN(1, ("%s: large alloc %d\n", __func__, (int)size));
size = (size + USB_MEM_BLOCK - 1) & ~(USB_MEM_BLOCK - 1);
- err = usb_block_allocmem(tag, size, align, &p->block);
+ err = usb_block_allocmem(tag, size, align, &p->block,
+ coherent);
if (!err) {
p->block->frags = NULL;
p->offs = 0;
@@ -205,11 +212,12 @@ usb_allocmem(struct usbd_bus *bus, size_t size, size_t
align, struct usb_dma *p)
s = splusb();
/* Check for free fragments. */
for (f = LIST_FIRST(&usb_frag_freelist); f; f = LIST_NEXT(f, next))
- if (f->block->tag == tag)
+ if (f->block->tag == tag && f->block->coherent == coherent)
break;
if (f == NULL) {
DPRINTFN(1, ("usb_allocmem: adding fragments\n"));
- err = usb_block_allocmem(tag, USB_MEM_BLOCK, USB_MEM_SMALL,&b);
+ err = usb_block_allocmem(tag, USB_MEM_BLOCK, USB_MEM_SMALL, &b,
+ coherent);
if (err) {
splx(s);
return (err);
diff --git a/sys/dev/usb/usb_mem.h b/sys/dev/usb/usb_mem.h
index 1ae933f4bd9..7d490dd4833 100644
--- a/sys/dev/usb/usb_mem.h
+++ b/sys/dev/usb/usb_mem.h
@@ -40,6 +40,7 @@ struct usb_dma_block {
caddr_t kaddr;
bus_dma_segment_t segs[1];
int nsegs;
+ int coherent;
size_t size;
size_t align;
struct usb_frag_dma *frags;
diff --git a/sys/dev/usb/usbdivar.h b/sys/dev/usb/usbdivar.h
index 6600c402979..102ec2f1af4 100644
--- a/sys/dev/usb/usbdivar.h
+++ b/sys/dev/usb/usbdivar.h
@@ -47,6 +47,8 @@ struct usb_dma_block;
struct usb_dma {
struct usb_dma_block *block;
u_int offs;
+ int flags;
+#define USB_DMA_COHERENT (1 << 0)
};
struct usbd_xfer;