On Wed, Mar 18, 2020 at 11:22:40AM +0100, Patrick Wildt wrote:
> Hi,
> 
> I've spent a few days investigating why USB ethernet adapters are so
> horribly slow on my ARMs.  Using dt(4) I realized that it was spending
> most of its time in memcpy.  But, why?  As it turns out, all USB data
> buffers are mapped COHERENT, which on some/most ARMs means uncached.
> Using cached data buffers makes the performance rise from 20 mbit/s to
> 200 mbit/s.  Quite a difference.
> 
> sys/dev/usb/usb_mem.c:
>       error = bus_dmamem_map(tag, p->segs, p->nsegs, p->size,
>                              &p->kaddr, BUS_DMA_NOWAIT|BUS_DMA_COHERENT);
> 
> On x86, COHERENT is essentially a no-op.  On ARM, it depends on the SoC.
> Some SoCs have cache-coherent USB controllers, some don't.  Mine does
> not, so mapping it COHERENT means uncached and thus slow.
> 
> Why do we do that?  Well, when the code was imported in 99, it was
> already there.  Since then we have gained infrastructure for DMA
> syncs in the USB stack, which I think are proper.
> 
> sys/dev/usb/usbdi.c - usbd_transfer() (before transfer)
> 
>       if (!usbd_xfer_isread(xfer)) {
>               if ((xfer->flags & USBD_NO_COPY) == 0)
>                       memcpy(KERNADDR(&xfer->dmabuf, 0), xfer->buffer,
>                           xfer->length);
>               usb_syncmem(&xfer->dmabuf, 0, xfer->length,
>                   BUS_DMASYNC_PREWRITE);
>       } else
>               usb_syncmem(&xfer->dmabuf, 0, xfer->length,
>                   BUS_DMASYNC_PREREAD);
>       err = pipe->methods->transfer(xfer);
> 
> sys/dev/usb/usbdi.c - usb_transfer_complete() (after transfer)
> 
>       if (xfer->actlen != 0) {
>               if (usbd_xfer_isread(xfer)) {
>                       usb_syncmem(&xfer->dmabuf, 0, xfer->actlen,
>                           BUS_DMASYNC_POSTREAD);
>                       if (!(xfer->flags & USBD_NO_COPY))
>                               memcpy(xfer->buffer, KERNADDR(&xfer->dmabuf, 0),
>                                   xfer->actlen);
>               } else
>                       usb_syncmem(&xfer->dmabuf, 0, xfer->actlen,
>                           BUS_DMASYNC_POSTWRITE);
>       }
> 
> We cannot just remove COHERENT, since some drivers, like ehci(4), use
> the same backend to allocate their rings.  And I can't vouch for those
> drivers' sanity.
> 
> As a first step, I would like to go ahead with another solution, which
> is based on a diff from Marius Strobl, who added those syncs in the
> first place.  Essentially it splits the memory handling into cacheable
> and non-cacheable blocks.  The USB data transfers and everyone who uses
> usbd_alloc_buffer() then use cacheable buffers, while code like ehci(4)
> still don't.  This is a bit of a safer approach imho, since we don't
> hurt the controller drivers, but speed up the data buffers.
> 
> Once we have verified that there are no regressions, we can adjust
> ehci(4) and the like, add proper syncs, make sure they still work as
> well as before, and maybe then back this out again.
> 
> Keep note that this is all a no-op on X86, but all the other archs will
> profit from this.
> 
> ok?
> 
> Patrick

Update diff with inverted logic.  kettenis@ argues that we should
invert the logic, and those who need COHERENT memory should ask
for that explicitly, since for bus_dmamem_map() it also needs to
be passed explicitly.  This also points out all those users that
use usb_allocmem() internally, where we might want to have a look
if COHERENT is actually needed or not, or if it can be refactored
in another way.

diff --git a/sys/dev/usb/dwc2/dwc2.c b/sys/dev/usb/dwc2/dwc2.c
index 6f035467213..099dfa26da1 100644
--- a/sys/dev/usb/dwc2/dwc2.c
+++ b/sys/dev/usb/dwc2/dwc2.c
@@ -473,6 +473,7 @@ dwc2_open(struct usbd_pipe *pipe)
        switch (xfertype) {
        case UE_CONTROL:
                pipe->methods = &dwc2_device_ctrl_methods;
+               dpipe->req_dma.flags |= USB_DMA_COHERENT;
                err = usb_allocmem(&sc->sc_bus, sizeof(usb_device_request_t),
                    0, &dpipe->req_dma);
                if (err)
diff --git a/sys/dev/usb/dwc2/dwc2_hcd.c b/sys/dev/usb/dwc2/dwc2_hcd.c
index 7e5c91481d5..d44e3196e61 100644
--- a/sys/dev/usb/dwc2/dwc2_hcd.c
+++ b/sys/dev/usb/dwc2/dwc2_hcd.c
@@ -679,6 +679,7 @@ STATIC int dwc2_hc_setup_align_buf(struct dwc2_hsotg 
*hsotg, struct dwc2_qh *qh,
 
                qh->dw_align_buf = NULL;
                qh->dw_align_buf_dma = 0;
+               qh->dw_align_buf_usbdma.flags |= USB_DMA_COHERENT;
                err = usb_allocmem(&hsotg->hsotg_sc->sc_bus, buf_size, buf_size,
                                   &qh->dw_align_buf_usbdma);
                if (!err) {
@@ -2267,6 +2268,7 @@ int dwc2_hcd_init(struct dwc2_hsotg *hsotg,
         */
        hsotg->status_buf = NULL;
        if (hsotg->core_params->dma_enable > 0) {
+               hsotg->status_buf_usbdma.flags |= USB_DMA_COHERENT;
                retval = usb_allocmem(&hsotg->hsotg_sc->sc_bus,
                                      DWC2_HCD_STATUS_BUF_SIZE, 0,
                                      &hsotg->status_buf_usbdma);
diff --git a/sys/dev/usb/dwc2/dwc2_hcdddma.c b/sys/dev/usb/dwc2/dwc2_hcdddma.c
index d8584eed50d..72da68e5667 100644
--- a/sys/dev/usb/dwc2/dwc2_hcdddma.c
+++ b/sys/dev/usb/dwc2/dwc2_hcdddma.c
@@ -102,6 +102,7 @@ STATIC int dwc2_desc_list_alloc(struct dwc2_hsotg *hsotg, 
struct dwc2_qh *qh,
        //KASSERT(!cpu_intr_p() && !cpu_softintr_p());
 
        qh->desc_list = NULL;
+       qh->desc_list_usbdma.flags |= USB_DMA_COHERENT;
        err = usb_allocmem(&hsotg->hsotg_sc->sc_bus,
            sizeof(struct dwc2_hcd_dma_desc) * dwc2_max_desc_num(qh), 0,
            &qh->desc_list_usbdma);
@@ -143,6 +144,7 @@ STATIC int dwc2_frame_list_alloc(struct dwc2_hsotg *hsotg, 
gfp_t mem_flags)
 
        /* XXXNH - struct pool */
        hsotg->frame_list = NULL;
+       hsotg->frame_list_usbdma.flags |= USB_DMA_COHERENT;
        err = usb_allocmem(&hsotg->hsotg_sc->sc_bus, 4 * FRLISTEN_64_SIZE,
            0, &hsotg->frame_list_usbdma);
 
diff --git a/sys/dev/usb/ehci.c b/sys/dev/usb/ehci.c
index ddde2796409..1bb6698cbcc 100644
--- a/sys/dev/usb/ehci.c
+++ b/sys/dev/usb/ehci.c
@@ -355,6 +355,7 @@ ehci_init(struct ehci_softc *sc)
        case 3:
                return (USBD_IOERROR);
        }
+       sc->sc_fldma.flags |= USB_DMA_COHERENT;
        err = usb_allocmem(&sc->sc_bus, sc->sc_flsize * sizeof(ehci_link_t),
            EHCI_FLALIGN_ALIGN, &sc->sc_fldma);
        if (err)
@@ -1469,6 +1470,7 @@ ehci_open(struct usbd_pipe *pipe)
 
        switch (xfertype) {
        case UE_CONTROL:
+               epipe->u.ctl.reqdma.flags |= USB_DMA_COHERENT;
                err = usb_allocmem(&sc->sc_bus, sizeof(usb_device_request_t),
                    0, &epipe->u.ctl.reqdma);
                if (err) {
@@ -2257,6 +2259,7 @@ ehci_alloc_sqh(struct ehci_softc *sc)
        s = splusb();
        if (sc->sc_freeqhs == NULL) {
                DPRINTFN(2, ("ehci_alloc_sqh: allocating chunk\n"));
+               dma.flags |= USB_DMA_COHERENT;
                err = usb_allocmem(&sc->sc_bus, EHCI_SQH_SIZE * EHCI_SQH_CHUNK,
                    EHCI_PAGE_SIZE, &dma);
                if (err)
@@ -2305,6 +2308,7 @@ ehci_alloc_sqtd(struct ehci_softc *sc)
        s = splusb();
        if (sc->sc_freeqtds == NULL) {
                DPRINTFN(2, ("ehci_alloc_sqtd: allocating chunk\n"));
+               dma.flags |= USB_DMA_COHERENT;
                err = usb_allocmem(&sc->sc_bus, EHCI_SQTD_SIZE*EHCI_SQTD_CHUNK,
                    EHCI_PAGE_SIZE, &dma);
                if (err)
@@ -2533,6 +2537,7 @@ ehci_alloc_itd(struct ehci_softc *sc)
        }
 
        if (freeitd == NULL) {
+               dma.flags |= USB_DMA_COHERENT;
                err = usb_allocmem(&sc->sc_bus, EHCI_ITD_SIZE * EHCI_ITD_CHUNK,
                    EHCI_PAGE_SIZE, &dma);
                if (err) {
diff --git a/sys/dev/usb/ohci.c b/sys/dev/usb/ohci.c
index ad3a9811a9c..022286380dc 100644
--- a/sys/dev/usb/ohci.c
+++ b/sys/dev/usb/ohci.c
@@ -394,6 +394,7 @@ ohci_alloc_sed(struct ohci_softc *sc)
        s = splusb();
        if (sc->sc_freeeds == NULL) {
                DPRINTFN(2, ("ohci_alloc_sed: allocating chunk\n"));
+               dma.flags |= USB_DMA_COHERENT;
                err = usb_allocmem(&sc->sc_bus, OHCI_SED_SIZE * OHCI_SED_CHUNK,
                          OHCI_ED_ALIGN, &dma);
                if (err)
@@ -439,6 +440,7 @@ ohci_alloc_std(struct ohci_softc *sc)
        s = splusb();
        if (sc->sc_freetds == NULL) {
                DPRINTFN(2, ("ohci_alloc_std: allocating chunk\n"));
+               dma.flags |= USB_DMA_COHERENT;
                err = usb_allocmem(&sc->sc_bus, OHCI_STD_SIZE * OHCI_STD_CHUNK,
                          OHCI_TD_ALIGN, &dma);
                if (err)
@@ -597,6 +599,7 @@ ohci_alloc_sitd(struct ohci_softc *sc)
 
        if (sc->sc_freeitds == NULL) {
                DPRINTFN(2, ("ohci_alloc_sitd: allocating chunk\n"));
+               dma.flags |= USB_DMA_COHERENT;
                err = usb_allocmem(&sc->sc_bus, OHCI_SITD_SIZE * 
OHCI_SITD_CHUNK,
                          OHCI_ITD_ALIGN, &dma);
                if (err)
@@ -728,6 +731,7 @@ ohci_init(struct ohci_softc *sc)
 
        /* XXX determine alignment by R/W */
        /* Allocate the HCCA area. */
+       sc->sc_hccadma.flags |= USB_DMA_COHERENT;
        err = usb_allocmem(&sc->sc_bus, OHCI_HCCA_SIZE,
                         OHCI_HCCA_ALIGN, &sc->sc_hccadma);
        if (err)
@@ -1931,6 +1935,7 @@ ohci_open(struct usbd_pipe *pipe)
                switch (xfertype) {
                case UE_CONTROL:
                        pipe->methods = &ohci_device_ctrl_methods;
+                       opipe->u.ctl.reqdma.flags |= USB_DMA_COHERENT;
                        err = usb_allocmem(&sc->sc_bus,
                                  sizeof(usb_device_request_t),
                                  0, &opipe->u.ctl.reqdma);
diff --git a/sys/dev/usb/uhci.c b/sys/dev/usb/uhci.c
index 47e6d1678c0..c3e0e15856a 100644
--- a/sys/dev/usb/uhci.c
+++ b/sys/dev/usb/uhci.c
@@ -377,6 +377,7 @@ uhci_init(struct uhci_softc *sc)
        UWRITE1(sc, UHCI_SOF, sc->sc_saved_sof);
 
        /* Allocate and initialize real frame array. */
+       sc->sc_dma.flags |= USB_DMA_COHERENT;
        err = usb_allocmem(&sc->sc_bus,
                  UHCI_FRAMELIST_COUNT * sizeof(uhci_physaddr_t),
                  UHCI_FRAMELIST_ALIGN, &sc->sc_dma);
@@ -1414,6 +1415,7 @@ uhci_alloc_std(struct uhci_softc *sc)
        s = splusb();
        if (sc->sc_freetds == NULL) {
                DPRINTFN(2,("uhci_alloc_std: allocating chunk\n"));
+               dma.flags |= USB_DMA_COHERENT;
                err = usb_allocmem(&sc->sc_bus, UHCI_STD_SIZE * UHCI_STD_CHUNK,
                          UHCI_TD_ALIGN, &dma);
                if (err)
@@ -1468,6 +1470,7 @@ uhci_alloc_sqh(struct uhci_softc *sc)
        s = splusb();
        if (sc->sc_freeqhs == NULL) {
                DPRINTFN(2, ("uhci_alloc_sqh: allocating chunk\n"));
+               dma.flags |= USB_DMA_COHERENT;
                err = usb_allocmem(&sc->sc_bus, UHCI_SQH_SIZE * UHCI_SQH_CHUNK,
                          UHCI_QH_ALIGN, &dma);
                if (err)
@@ -2650,6 +2653,7 @@ uhci_open(struct usbd_pipe *pipe)
                                uhci_free_std(sc, upipe->u.ctl.setup);
                                goto bad;
                        }
+                       upipe->u.ctl.reqdma.flags |= USB_DMA_COHERENT;
                        err = usb_allocmem(&sc->sc_bus,
                                  sizeof(usb_device_request_t),
                                  0, &upipe->u.ctl.reqdma);
diff --git a/sys/dev/usb/usb_mem.c b/sys/dev/usb/usb_mem.c
index c65906b43f4..a2b8a3c10be 100644
--- a/sys/dev/usb/usb_mem.c
+++ b/sys/dev/usb/usb_mem.c
@@ -72,7 +72,7 @@ struct usb_frag_dma {
 };
 
 usbd_status    usb_block_allocmem(bus_dma_tag_t, size_t, size_t,
-                   struct usb_dma_block **);
+                   struct usb_dma_block **, int);
 void           usb_block_freemem(struct usb_dma_block *);
 
 LIST_HEAD(, usb_dma_block) usb_blk_freelist =
@@ -84,7 +84,7 @@ LIST_HEAD(, usb_frag_dma) usb_frag_freelist =
 
 usbd_status
 usb_block_allocmem(bus_dma_tag_t tag, size_t size, size_t align,
-    struct usb_dma_block **dmap)
+    struct usb_dma_block **dmap, int coherent)
 {
        int error;
         struct usb_dma_block *p;
@@ -96,7 +96,8 @@ usb_block_allocmem(bus_dma_tag_t tag, size_t size, size_t 
align,
        s = splusb();
        /* First check the free list. */
        for (p = LIST_FIRST(&usb_blk_freelist); p; p = LIST_NEXT(p, next)) {
-               if (p->tag == tag && p->size >= size && p->align >= align) {
+               if (p->tag == tag && p->size >= size && p->align >= align &&
+                   p->coherent == coherent) {
                        LIST_REMOVE(p, next);
                        usb_blk_nfree--;
                        splx(s);
@@ -116,6 +117,7 @@ usb_block_allocmem(bus_dma_tag_t tag, size_t size, size_t 
align,
        p->tag = tag;
        p->size = size;
        p->align = align;
+       p->coherent = coherent;
        error = bus_dmamem_alloc(tag, p->size, align, 0,
                                 p->segs, nitems(p->segs),
                                 &p->nsegs, BUS_DMA_NOWAIT);
@@ -123,7 +125,8 @@ usb_block_allocmem(bus_dma_tag_t tag, size_t size, size_t 
align,
                goto free0;
 
        error = bus_dmamem_map(tag, p->segs, p->nsegs, p->size,
-                              &p->kaddr, BUS_DMA_NOWAIT|BUS_DMA_COHERENT);
+                              &p->kaddr, BUS_DMA_NOWAIT | (coherent ?
+                              BUS_DMA_COHERENT : 0));
        if (error)
                goto free1;
 
@@ -187,14 +190,18 @@ usb_allocmem(struct usbd_bus *bus, size_t size, size_t 
align, struct usb_dma *p)
        usbd_status err;
        struct usb_frag_dma *f;
        struct usb_dma_block *b;
+       int coherent;
        int i;
        int s;
 
+       coherent = !!(p->flags & USB_DMA_COHERENT);
+
        /* If the request is large then just use a full block. */
        if (size > USB_MEM_SMALL || align > USB_MEM_SMALL) {
                DPRINTFN(1, ("%s: large alloc %d\n", __func__, (int)size));
                size = (size + USB_MEM_BLOCK - 1) & ~(USB_MEM_BLOCK - 1);
-               err = usb_block_allocmem(tag, size, align, &p->block);
+               err = usb_block_allocmem(tag, size, align, &p->block,
+                   coherent);
                if (!err) {
                        p->block->frags = NULL;
                        p->offs = 0;
@@ -205,11 +212,12 @@ usb_allocmem(struct usbd_bus *bus, size_t size, size_t 
align, struct usb_dma *p)
        s = splusb();
        /* Check for free fragments. */
        for (f = LIST_FIRST(&usb_frag_freelist); f; f = LIST_NEXT(f, next))
-               if (f->block->tag == tag)
+               if (f->block->tag == tag && f->block->coherent == coherent)
                        break;
        if (f == NULL) {
                DPRINTFN(1, ("usb_allocmem: adding fragments\n"));
-               err = usb_block_allocmem(tag, USB_MEM_BLOCK, USB_MEM_SMALL,&b);
+               err = usb_block_allocmem(tag, USB_MEM_BLOCK, USB_MEM_SMALL, &b,
+                   coherent);
                if (err) {
                        splx(s);
                        return (err);
diff --git a/sys/dev/usb/usb_mem.h b/sys/dev/usb/usb_mem.h
index 1ae933f4bd9..7d490dd4833 100644
--- a/sys/dev/usb/usb_mem.h
+++ b/sys/dev/usb/usb_mem.h
@@ -40,6 +40,7 @@ struct usb_dma_block {
         caddr_t kaddr;
         bus_dma_segment_t segs[1];
         int nsegs;
+       int coherent;
         size_t size;
         size_t align;
        struct usb_frag_dma *frags;
diff --git a/sys/dev/usb/usbdivar.h b/sys/dev/usb/usbdivar.h
index 6600c402979..102ec2f1af4 100644
--- a/sys/dev/usb/usbdivar.h
+++ b/sys/dev/usb/usbdivar.h
@@ -47,6 +47,8 @@ struct usb_dma_block;
 struct usb_dma {
        struct usb_dma_block    *block;
        u_int                    offs;
+       int                      flags;
+#define USB_DMA_COHERENT               (1 << 0)
 };
 
 struct usbd_xfer;

Reply via email to