Re: usb(4): use cacheable buffers for data transfers (massive speedup)

2020-04-02 Thread Patrick Wildt
On Wed, Apr 01, 2020 at 12:23:53PM +0200, Patrick Wildt wrote:
> On Wed, Apr 01, 2020 at 12:04:25PM +0200, Patrick Wildt wrote:
> > On Wed, Apr 01, 2020 at 09:40:06AM +0200, Patrick Wildt wrote:
> > > On Wed, Apr 01, 2020 at 09:22:07AM +0200, Patrick Wildt wrote:
> > > > On Wed, Apr 01, 2020 at 04:47:10PM +1100, Jonathan Gray wrote:
> > > > > On Wed, Apr 01, 2020 at 12:58:23PM +1100, Jonathan Gray wrote:
> > > > > > On Wed, Mar 18, 2020 at 01:41:06PM +0100, Patrick Wildt wrote:
> > > > > > > On Wed, Mar 18, 2020 at 11:22:40AM +0100, Patrick Wildt wrote:
> > > > > > > > Hi,
> > > > > > > > 
> > > > > > > > I've spent a few days investigating why USB ethernet adapters 
> > > > > > > > are so
> > > > > > > > horribly slow on my ARMs.  Using dt(4) I realized that it was 
> > > > > > > > spending
> > > > > > > > most of its time in memcpy.  But, why?  As it turns out, all 
> > > > > > > > USB data
> > > > > > > > buffers are mapped COHERENT, which on some/most ARMs means 
> > > > > > > > uncached.
> > > > > > > > Using cached data buffers makes the performance rise from 20 
> > > > > > > > mbit/s to
> > > > > > > > 200 mbit/s.  Quite a difference.
> > > > > > > > 
> > > > > > > > sys/dev/usb/usb_mem.c:
> > > > > > > > error = bus_dmamem_map(tag, p->segs, p->nsegs, p->size,
> > > > > > > >>kaddr, 
> > > > > > > > BUS_DMA_NOWAIT|BUS_DMA_COHERENT);
> > > > > > > > 
> > > > > > > > On x86, COHERENT is essentially a no-op.  On ARM, it depends on 
> > > > > > > > the SoC.
> > > > > > > > Some SoCs have cache-coherent USB controllers, some don't.  
> > > > > > > > Mine does
> > > > > > > > not, so mapping it COHERENT means uncached and thus slow.
> > > > > > > > 
> > > > > > > > Why do we do that?  Well, when the code was imported in 99, it 
> > > > > > > > was
> > > > > > > > already there.  Since then we have gained infrastructure for DMA
> > > > > > > > syncs in the USB stack, which I think are proper.
> > > > > > > > 
> > > > > > > > sys/dev/usb/usbdi.c - usbd_transfer() (before transfer)
> > > > > > > > 
> > > > > > > > if (!usbd_xfer_isread(xfer)) {
> > > > > > > > if ((xfer->flags & USBD_NO_COPY) == 0)
> > > > > > > > memcpy(KERNADDR(>dmabuf, 0), 
> > > > > > > > xfer->buffer,
> > > > > > > > xfer->length);
> > > > > > > > usb_syncmem(>dmabuf, 0, xfer->length,
> > > > > > > > BUS_DMASYNC_PREWRITE);
> > > > > > > > } else
> > > > > > > > usb_syncmem(>dmabuf, 0, xfer->length,
> > > > > > > > BUS_DMASYNC_PREREAD);
> > > > > > > > err = pipe->methods->transfer(xfer);
> > > > > > > > 
> > > > > > > > sys/dev/usb/usbdi.c - usb_transfer_complete() (after transfer)
> > > > > > > > 
> > > > > > > > if (xfer->actlen != 0) {
> > > > > > > > if (usbd_xfer_isread(xfer)) {
> > > > > > > > usb_syncmem(>dmabuf, 0, 
> > > > > > > > xfer->actlen,
> > > > > > > > BUS_DMASYNC_POSTREAD);
> > > > > > > > if (!(xfer->flags & USBD_NO_COPY))
> > > > > > > > memcpy(xfer->buffer, 
> > > > > > > > KERNADDR(>dmabuf, 0),
> > > > > > > > xfer->actlen);
> > > > > > > > } else
> > > > > > > > usb_syncmem(>dmabuf, 0, 
> > > > > > > > xfer->actlen,
> > > > > > > > BUS_DMASYNC_POSTWRITE);
> > > > > > > > }
> > > > > > > > 
> > > > > > > > We cannot just remove COHERENT, since some drivers, like 
> > > > > > > > ehci(4), use
> > > > > > > > the same backend to allocate their rings.  And I can't vouch 
> > > > > > > > for those
> > > > > > > > drivers' sanity.
> > > > > > > > 
> > > > > > > > As a first step, I would like to go ahead with another 
> > > > > > > > solution, which
> > > > > > > > is based on a diff from Marius Strobl, who added those syncs in 
> > > > > > > > the
> > > > > > > > first place.  Essentially it splits the memory handling into 
> > > > > > > > cacheable
> > > > > > > > and non-cacheable blocks.  The USB data transfers and everyone 
> > > > > > > > who uses
> > > > > > > > usbd_alloc_buffer() then use cacheable buffers, while code like 
> > > > > > > > ehci(4)
> > > > > > > > still don't.  This is a bit of a safer approach imho, since we 
> > > > > > > > don't
> > > > > > > > hurt the controller drivers, but speed up the data buffers.
> > > > > > > > 
> > > > > > > > Once we have verified that there are no regressions, we can 
> > > > > > > > adjust
> > > > > > > > ehci(4) and the like, add proper syncs, make sure they still 
> > > > > > > > work as
> > > > > > > > well as before, and maybe then back this out again.
> > > > > > > > 
> > > > > > > > Keep note that this is all a no-op on X86, but all the other 
> > > > > > > > archs will
> > > > > > > > profit from this.
> > > > > > > 

Re: usb(4): use cacheable buffers for data transfers (massive speedup)

2020-04-01 Thread Patrick Wildt
On Wed, Apr 01, 2020 at 12:04:25PM +0200, Patrick Wildt wrote:
> On Wed, Apr 01, 2020 at 09:40:06AM +0200, Patrick Wildt wrote:
> > On Wed, Apr 01, 2020 at 09:22:07AM +0200, Patrick Wildt wrote:
> > > On Wed, Apr 01, 2020 at 04:47:10PM +1100, Jonathan Gray wrote:
> > > > On Wed, Apr 01, 2020 at 12:58:23PM +1100, Jonathan Gray wrote:
> > > > > On Wed, Mar 18, 2020 at 01:41:06PM +0100, Patrick Wildt wrote:
> > > > > > On Wed, Mar 18, 2020 at 11:22:40AM +0100, Patrick Wildt wrote:
> > > > > > > Hi,
> > > > > > > 
> > > > > > > I've spent a few days investigating why USB ethernet adapters are 
> > > > > > > so
> > > > > > > horribly slow on my ARMs.  Using dt(4) I realized that it was 
> > > > > > > spending
> > > > > > > most of its time in memcpy.  But, why?  As it turns out, all USB 
> > > > > > > data
> > > > > > > buffers are mapped COHERENT, which on some/most ARMs means 
> > > > > > > uncached.
> > > > > > > Using cached data buffers makes the performance rise from 20 
> > > > > > > mbit/s to
> > > > > > > 200 mbit/s.  Quite a difference.
> > > > > > > 
> > > > > > > sys/dev/usb/usb_mem.c:
> > > > > > >   error = bus_dmamem_map(tag, p->segs, p->nsegs, p->size,
> > > > > > >  >kaddr, 
> > > > > > > BUS_DMA_NOWAIT|BUS_DMA_COHERENT);
> > > > > > > 
> > > > > > > On x86, COHERENT is essentially a no-op.  On ARM, it depends on 
> > > > > > > the SoC.
> > > > > > > Some SoCs have cache-coherent USB controllers, some don't.  Mine 
> > > > > > > does
> > > > > > > not, so mapping it COHERENT means uncached and thus slow.
> > > > > > > 
> > > > > > > Why do we do that?  Well, when the code was imported in 99, it was
> > > > > > > already there.  Since then we have gained infrastructure for DMA
> > > > > > > syncs in the USB stack, which I think are proper.
> > > > > > > 
> > > > > > > sys/dev/usb/usbdi.c - usbd_transfer() (before transfer)
> > > > > > > 
> > > > > > >   if (!usbd_xfer_isread(xfer)) {
> > > > > > >   if ((xfer->flags & USBD_NO_COPY) == 0)
> > > > > > >   memcpy(KERNADDR(>dmabuf, 0), xfer->buffer,
> > > > > > >   xfer->length);
> > > > > > >   usb_syncmem(>dmabuf, 0, xfer->length,
> > > > > > >   BUS_DMASYNC_PREWRITE);
> > > > > > >   } else
> > > > > > >   usb_syncmem(>dmabuf, 0, xfer->length,
> > > > > > >   BUS_DMASYNC_PREREAD);
> > > > > > >   err = pipe->methods->transfer(xfer);
> > > > > > > 
> > > > > > > sys/dev/usb/usbdi.c - usb_transfer_complete() (after transfer)
> > > > > > > 
> > > > > > >   if (xfer->actlen != 0) {
> > > > > > >   if (usbd_xfer_isread(xfer)) {
> > > > > > >   usb_syncmem(>dmabuf, 0, xfer->actlen,
> > > > > > >   BUS_DMASYNC_POSTREAD);
> > > > > > >   if (!(xfer->flags & USBD_NO_COPY))
> > > > > > >   memcpy(xfer->buffer, 
> > > > > > > KERNADDR(>dmabuf, 0),
> > > > > > >   xfer->actlen);
> > > > > > >   } else
> > > > > > >   usb_syncmem(>dmabuf, 0, xfer->actlen,
> > > > > > >   BUS_DMASYNC_POSTWRITE);
> > > > > > >   }
> > > > > > > 
> > > > > > > We cannot just remove COHERENT, since some drivers, like ehci(4), 
> > > > > > > use
> > > > > > > the same backend to allocate their rings.  And I can't vouch for 
> > > > > > > those
> > > > > > > drivers' sanity.
> > > > > > > 
> > > > > > > As a first step, I would like to go ahead with another solution, 
> > > > > > > which
> > > > > > > is based on a diff from Marius Strobl, who added those syncs in 
> > > > > > > the
> > > > > > > first place.  Essentially it splits the memory handling into 
> > > > > > > cacheable
> > > > > > > and non-cacheable blocks.  The USB data transfers and everyone 
> > > > > > > who uses
> > > > > > > usbd_alloc_buffer() then use cacheable buffers, while code like 
> > > > > > > ehci(4)
> > > > > > > still don't.  This is a bit of a safer approach imho, since we 
> > > > > > > don't
> > > > > > > hurt the controller drivers, but speed up the data buffers.
> > > > > > > 
> > > > > > > Once we have verified that there are no regressions, we can adjust
> > > > > > > ehci(4) and the like, add proper syncs, make sure they still work 
> > > > > > > as
> > > > > > > well as before, and maybe then back this out again.
> > > > > > > 
> > > > > > > Keep note that this is all a no-op on X86, but all the other 
> > > > > > > archs will
> > > > > > > profit from this.
> > > > > > > 
> > > > > > > ok?
> > > > > > > 
> > > > > > > Patrick
> > > > > > 
> > > > > > Update diff with inverted logic.  kettenis@ argues that we should
> > > > > > invert the logic, and those who need COHERENT memory should ask
> > > > > > for that explicitly, since for bus_dmamem_map() it also needs to
> > > > > > be passed explicitly.  This also points out all those users that
> > > > > > use usb_allocmem() internally, where we might want to have a look
> > > > > > if 

Re: usb(4): use cacheable buffers for data transfers (massive speedup)

2020-04-01 Thread Patrick Wildt
On Wed, Apr 01, 2020 at 09:40:06AM +0200, Patrick Wildt wrote:
> On Wed, Apr 01, 2020 at 09:22:07AM +0200, Patrick Wildt wrote:
> > On Wed, Apr 01, 2020 at 04:47:10PM +1100, Jonathan Gray wrote:
> > > On Wed, Apr 01, 2020 at 12:58:23PM +1100, Jonathan Gray wrote:
> > > > On Wed, Mar 18, 2020 at 01:41:06PM +0100, Patrick Wildt wrote:
> > > > > On Wed, Mar 18, 2020 at 11:22:40AM +0100, Patrick Wildt wrote:
> > > > > > Hi,
> > > > > > 
> > > > > > I've spent a few days investigating why USB ethernet adapters are so
> > > > > > horribly slow on my ARMs.  Using dt(4) I realized that it was 
> > > > > > spending
> > > > > > most of its time in memcpy.  But, why?  As it turns out, all USB 
> > > > > > data
> > > > > > buffers are mapped COHERENT, which on some/most ARMs means uncached.
> > > > > > Using cached data buffers makes the performance rise from 20 mbit/s 
> > > > > > to
> > > > > > 200 mbit/s.  Quite a difference.
> > > > > > 
> > > > > > sys/dev/usb/usb_mem.c:
> > > > > > error = bus_dmamem_map(tag, p->segs, p->nsegs, p->size,
> > > > > >>kaddr, 
> > > > > > BUS_DMA_NOWAIT|BUS_DMA_COHERENT);
> > > > > > 
> > > > > > On x86, COHERENT is essentially a no-op.  On ARM, it depends on the 
> > > > > > SoC.
> > > > > > Some SoCs have cache-coherent USB controllers, some don't.  Mine 
> > > > > > does
> > > > > > not, so mapping it COHERENT means uncached and thus slow.
> > > > > > 
> > > > > > Why do we do that?  Well, when the code was imported in 99, it was
> > > > > > already there.  Since then we have gained infrastructure for DMA
> > > > > > syncs in the USB stack, which I think are proper.
> > > > > > 
> > > > > > sys/dev/usb/usbdi.c - usbd_transfer() (before transfer)
> > > > > > 
> > > > > > if (!usbd_xfer_isread(xfer)) {
> > > > > > if ((xfer->flags & USBD_NO_COPY) == 0)
> > > > > > memcpy(KERNADDR(>dmabuf, 0), xfer->buffer,
> > > > > > xfer->length);
> > > > > > usb_syncmem(>dmabuf, 0, xfer->length,
> > > > > > BUS_DMASYNC_PREWRITE);
> > > > > > } else
> > > > > > usb_syncmem(>dmabuf, 0, xfer->length,
> > > > > > BUS_DMASYNC_PREREAD);
> > > > > > err = pipe->methods->transfer(xfer);
> > > > > > 
> > > > > > sys/dev/usb/usbdi.c - usb_transfer_complete() (after transfer)
> > > > > > 
> > > > > > if (xfer->actlen != 0) {
> > > > > > if (usbd_xfer_isread(xfer)) {
> > > > > > usb_syncmem(>dmabuf, 0, xfer->actlen,
> > > > > > BUS_DMASYNC_POSTREAD);
> > > > > > if (!(xfer->flags & USBD_NO_COPY))
> > > > > > memcpy(xfer->buffer, 
> > > > > > KERNADDR(>dmabuf, 0),
> > > > > > xfer->actlen);
> > > > > > } else
> > > > > > usb_syncmem(>dmabuf, 0, xfer->actlen,
> > > > > > BUS_DMASYNC_POSTWRITE);
> > > > > > }
> > > > > > 
> > > > > > We cannot just remove COHERENT, since some drivers, like ehci(4), 
> > > > > > use
> > > > > > the same backend to allocate their rings.  And I can't vouch for 
> > > > > > those
> > > > > > drivers' sanity.
> > > > > > 
> > > > > > As a first step, I would like to go ahead with another solution, 
> > > > > > which
> > > > > > is based on a diff from Marius Strobl, who added those syncs in the
> > > > > > first place.  Essentially it splits the memory handling into 
> > > > > > cacheable
> > > > > > and non-cacheable blocks.  The USB data transfers and everyone who 
> > > > > > uses
> > > > > > usbd_alloc_buffer() then use cacheable buffers, while code like 
> > > > > > ehci(4)
> > > > > > still don't.  This is a bit of a safer approach imho, since we don't
> > > > > > hurt the controller drivers, but speed up the data buffers.
> > > > > > 
> > > > > > Once we have verified that there are no regressions, we can adjust
> > > > > > ehci(4) and the like, add proper syncs, make sure they still work as
> > > > > > well as before, and maybe then back this out again.
> > > > > > 
> > > > > > Keep note that this is all a no-op on X86, but all the other archs 
> > > > > > will
> > > > > > profit from this.
> > > > > > 
> > > > > > ok?
> > > > > > 
> > > > > > Patrick
> > > > > 
> > > > > Update diff with inverted logic.  kettenis@ argues that we should
> > > > > invert the logic, and those who need COHERENT memory should ask
> > > > > for that explicitly, since for bus_dmamem_map() it also needs to
> > > > > be passed explicitly.  This also points out all those users that
> > > > > use usb_allocmem() internally, where we might want to have a look
> > > > > if COHERENT is actually needed or not, or if it can be refactored
> > > > > in another way.
> > > > 
> > > > These commits broke usb on imx.6 with cubox:
> > > > 
> > > > imxehci0 at simplebus3
> > > > usb0 at imxehci0: USB revision 2.0
> > > > usb0: root hub problem
> > > > imxehci1 at 

Re: usb(4): use cacheable buffers for data transfers (massive speedup)

2020-04-01 Thread Martin Pieuchot
On 01/04/20(Wed) 10:06, Mark Kettenis wrote:
> > Date: Wed, 1 Apr 2020 09:40:05 +0200
> > From: Patrick Wildt 
> > 
> > On Wed, Apr 01, 2020 at 09:22:07AM +0200, Patrick Wildt wrote:
> > > On Wed, Apr 01, 2020 at 04:47:10PM +1100, Jonathan Gray wrote:
> > > > On Wed, Apr 01, 2020 at 12:58:23PM +1100, Jonathan Gray wrote:
> > > > > On Wed, Mar 18, 2020 at 01:41:06PM +0100, Patrick Wildt wrote:
> > > > > > On Wed, Mar 18, 2020 at 11:22:40AM +0100, Patrick Wildt wrote:
> > > > > > > Hi,
> > > > > > > 
> > > > > > > I've spent a few days investigating why USB ethernet adapters are 
> > > > > > > so
> > > > > > > horribly slow on my ARMs.  Using dt(4) I realized that it was 
> > > > > > > spending
> > > > > > > most of its time in memcpy.  But, why?  As it turns out, all USB 
> > > > > > > data
> > > > > > > buffers are mapped COHERENT, which on some/most ARMs means 
> > > > > > > uncached.
> > > > > > > Using cached data buffers makes the performance rise from 20 
> > > > > > > mbit/s to
> > > > > > > 200 mbit/s.  Quite a difference.
> > > > > > > 
> > > > > > > sys/dev/usb/usb_mem.c:
> > > > > > >   error = bus_dmamem_map(tag, p->segs, p->nsegs, p->size,
> > > > > > >  >kaddr, 
> > > > > > > BUS_DMA_NOWAIT|BUS_DMA_COHERENT);
> > > > > > > 
> > > > > > > On x86, COHERENT is essentially a no-op.  On ARM, it depends on 
> > > > > > > the SoC.
> > > > > > > Some SoCs have cache-coherent USB controllers, some don't.  Mine 
> > > > > > > does
> > > > > > > not, so mapping it COHERENT means uncached and thus slow.
> > > > > > > 
> > > > > > > Why do we do that?  Well, when the code was imported in 99, it was
> > > > > > > already there.  Since then we have gained infrastructure for DMA
> > > > > > > syncs in the USB stack, which I think are proper.
> > > > > > > 
> > > > > > > sys/dev/usb/usbdi.c - usbd_transfer() (before transfer)
> > > > > > > 
> > > > > > >   if (!usbd_xfer_isread(xfer)) {
> > > > > > >   if ((xfer->flags & USBD_NO_COPY) == 0)
> > > > > > >   memcpy(KERNADDR(>dmabuf, 0), xfer->buffer,
> > > > > > >   xfer->length);
> > > > > > >   usb_syncmem(>dmabuf, 0, xfer->length,
> > > > > > >   BUS_DMASYNC_PREWRITE);
> > > > > > >   } else
> > > > > > >   usb_syncmem(>dmabuf, 0, xfer->length,
> > > > > > >   BUS_DMASYNC_PREREAD);
> > > > > > >   err = pipe->methods->transfer(xfer);
> > > > > > > 
> > > > > > > sys/dev/usb/usbdi.c - usb_transfer_complete() (after transfer)
> > > > > > > 
> > > > > > >   if (xfer->actlen != 0) {
> > > > > > >   if (usbd_xfer_isread(xfer)) {
> > > > > > >   usb_syncmem(>dmabuf, 0, xfer->actlen,
> > > > > > >   BUS_DMASYNC_POSTREAD);
> > > > > > >   if (!(xfer->flags & USBD_NO_COPY))
> > > > > > >   memcpy(xfer->buffer, 
> > > > > > > KERNADDR(>dmabuf, 0),
> > > > > > >   xfer->actlen);
> > > > > > >   } else
> > > > > > >   usb_syncmem(>dmabuf, 0, xfer->actlen,
> > > > > > >   BUS_DMASYNC_POSTWRITE);
> > > > > > >   }
> > > > > > > 
> > > > > > > We cannot just remove COHERENT, since some drivers, like ehci(4), 
> > > > > > > use
> > > > > > > the same backend to allocate their rings.  And I can't vouch for 
> > > > > > > those
> > > > > > > drivers' sanity.
> > > > > > > 
> > > > > > > As a first step, I would like to go ahead with another solution, 
> > > > > > > which
> > > > > > > is based on a diff from Marius Strobl, who added those syncs in 
> > > > > > > the
> > > > > > > first place.  Essentially it splits the memory handling into 
> > > > > > > cacheable
> > > > > > > and non-cacheable blocks.  The USB data transfers and everyone 
> > > > > > > who uses
> > > > > > > usbd_alloc_buffer() then use cacheable buffers, while code like 
> > > > > > > ehci(4)
> > > > > > > still don't.  This is a bit of a safer approach imho, since we 
> > > > > > > don't
> > > > > > > hurt the controller drivers, but speed up the data buffers.
> > > > > > > 
> > > > > > > Once we have verified that there are no regressions, we can adjust
> > > > > > > ehci(4) and the like, add proper syncs, make sure they still work 
> > > > > > > as
> > > > > > > well as before, and maybe then back this out again.
> > > > > > > 
> > > > > > > Keep note that this is all a no-op on X86, but all the other 
> > > > > > > archs will
> > > > > > > profit from this.
> > > > > > > 
> > > > > > > ok?
> > > > > > > 
> > > > > > > Patrick
> > > > > > 
> > > > > > Update diff with inverted logic.  kettenis@ argues that we should
> > > > > > invert the logic, and those who need COHERENT memory should ask
> > > > > > for that explicitly, since for bus_dmamem_map() it also needs to
> > > > > > be passed explicitly.  This also points out all those users that
> > > > > > use usb_allocmem() internally, where we might want to have a look
> > > > > > if COHERENT is 

Re: usb(4): use cacheable buffers for data transfers (massive speedup)

2020-04-01 Thread Mark Kettenis
> Date: Wed, 1 Apr 2020 09:40:05 +0200
> From: Patrick Wildt 
> 
> On Wed, Apr 01, 2020 at 09:22:07AM +0200, Patrick Wildt wrote:
> > On Wed, Apr 01, 2020 at 04:47:10PM +1100, Jonathan Gray wrote:
> > > On Wed, Apr 01, 2020 at 12:58:23PM +1100, Jonathan Gray wrote:
> > > > On Wed, Mar 18, 2020 at 01:41:06PM +0100, Patrick Wildt wrote:
> > > > > On Wed, Mar 18, 2020 at 11:22:40AM +0100, Patrick Wildt wrote:
> > > > > > Hi,
> > > > > > 
> > > > > > I've spent a few days investigating why USB ethernet adapters are so
> > > > > > horribly slow on my ARMs.  Using dt(4) I realized that it was 
> > > > > > spending
> > > > > > most of its time in memcpy.  But, why?  As it turns out, all USB 
> > > > > > data
> > > > > > buffers are mapped COHERENT, which on some/most ARMs means uncached.
> > > > > > Using cached data buffers makes the performance rise from 20 mbit/s 
> > > > > > to
> > > > > > 200 mbit/s.  Quite a difference.
> > > > > > 
> > > > > > sys/dev/usb/usb_mem.c:
> > > > > > error = bus_dmamem_map(tag, p->segs, p->nsegs, p->size,
> > > > > >>kaddr, 
> > > > > > BUS_DMA_NOWAIT|BUS_DMA_COHERENT);
> > > > > > 
> > > > > > On x86, COHERENT is essentially a no-op.  On ARM, it depends on the 
> > > > > > SoC.
> > > > > > Some SoCs have cache-coherent USB controllers, some don't.  Mine 
> > > > > > does
> > > > > > not, so mapping it COHERENT means uncached and thus slow.
> > > > > > 
> > > > > > Why do we do that?  Well, when the code was imported in 99, it was
> > > > > > already there.  Since then we have gained infrastructure for DMA
> > > > > > syncs in the USB stack, which I think are proper.
> > > > > > 
> > > > > > sys/dev/usb/usbdi.c - usbd_transfer() (before transfer)
> > > > > > 
> > > > > > if (!usbd_xfer_isread(xfer)) {
> > > > > > if ((xfer->flags & USBD_NO_COPY) == 0)
> > > > > > memcpy(KERNADDR(>dmabuf, 0), xfer->buffer,
> > > > > > xfer->length);
> > > > > > usb_syncmem(>dmabuf, 0, xfer->length,
> > > > > > BUS_DMASYNC_PREWRITE);
> > > > > > } else
> > > > > > usb_syncmem(>dmabuf, 0, xfer->length,
> > > > > > BUS_DMASYNC_PREREAD);
> > > > > > err = pipe->methods->transfer(xfer);
> > > > > > 
> > > > > > sys/dev/usb/usbdi.c - usb_transfer_complete() (after transfer)
> > > > > > 
> > > > > > if (xfer->actlen != 0) {
> > > > > > if (usbd_xfer_isread(xfer)) {
> > > > > > usb_syncmem(>dmabuf, 0, xfer->actlen,
> > > > > > BUS_DMASYNC_POSTREAD);
> > > > > > if (!(xfer->flags & USBD_NO_COPY))
> > > > > > memcpy(xfer->buffer, 
> > > > > > KERNADDR(>dmabuf, 0),
> > > > > > xfer->actlen);
> > > > > > } else
> > > > > > usb_syncmem(>dmabuf, 0, xfer->actlen,
> > > > > > BUS_DMASYNC_POSTWRITE);
> > > > > > }
> > > > > > 
> > > > > > We cannot just remove COHERENT, since some drivers, like ehci(4), 
> > > > > > use
> > > > > > the same backend to allocate their rings.  And I can't vouch for 
> > > > > > those
> > > > > > drivers' sanity.
> > > > > > 
> > > > > > As a first step, I would like to go ahead with another solution, 
> > > > > > which
> > > > > > is based on a diff from Marius Strobl, who added those syncs in the
> > > > > > first place.  Essentially it splits the memory handling into 
> > > > > > cacheable
> > > > > > and non-cacheable blocks.  The USB data transfers and everyone who 
> > > > > > uses
> > > > > > usbd_alloc_buffer() then use cacheable buffers, while code like 
> > > > > > ehci(4)
> > > > > > still don't.  This is a bit of a safer approach imho, since we don't
> > > > > > hurt the controller drivers, but speed up the data buffers.
> > > > > > 
> > > > > > Once we have verified that there are no regressions, we can adjust
> > > > > > ehci(4) and the like, add proper syncs, make sure they still work as
> > > > > > well as before, and maybe then back this out again.
> > > > > > 
> > > > > > Keep note that this is all a no-op on X86, but all the other archs 
> > > > > > will
> > > > > > profit from this.
> > > > > > 
> > > > > > ok?
> > > > > > 
> > > > > > Patrick
> > > > > 
> > > > > Update diff with inverted logic.  kettenis@ argues that we should
> > > > > invert the logic, and those who need COHERENT memory should ask
> > > > > for that explicitly, since for bus_dmamem_map() it also needs to
> > > > > be passed explicitly.  This also points out all those users that
> > > > > use usb_allocmem() internally, where we might want to have a look
> > > > > if COHERENT is actually needed or not, or if it can be refactored
> > > > > in another way.
> > > > 
> > > > These commits broke usb on imx.6 with cubox:
> > > > 
> > > > imxehci0 at simplebus3
> > > > usb0 at imxehci0: USB revision 2.0
> > > > usb0: root hub problem
> > > > imxehci1 

Re: usb(4): use cacheable buffers for data transfers (massive speedup)

2020-04-01 Thread Patrick Wildt
On Wed, Apr 01, 2020 at 09:22:07AM +0200, Patrick Wildt wrote:
> On Wed, Apr 01, 2020 at 04:47:10PM +1100, Jonathan Gray wrote:
> > On Wed, Apr 01, 2020 at 12:58:23PM +1100, Jonathan Gray wrote:
> > > On Wed, Mar 18, 2020 at 01:41:06PM +0100, Patrick Wildt wrote:
> > > > On Wed, Mar 18, 2020 at 11:22:40AM +0100, Patrick Wildt wrote:
> > > > > Hi,
> > > > > 
> > > > > I've spent a few days investigating why USB ethernet adapters are so
> > > > > horribly slow on my ARMs.  Using dt(4) I realized that it was spending
> > > > > most of its time in memcpy.  But, why?  As it turns out, all USB data
> > > > > buffers are mapped COHERENT, which on some/most ARMs means uncached.
> > > > > Using cached data buffers makes the performance rise from 20 mbit/s to
> > > > > 200 mbit/s.  Quite a difference.
> > > > > 
> > > > > sys/dev/usb/usb_mem.c:
> > > > >   error = bus_dmamem_map(tag, p->segs, p->nsegs, p->size,
> > > > >  >kaddr, 
> > > > > BUS_DMA_NOWAIT|BUS_DMA_COHERENT);
> > > > > 
> > > > > On x86, COHERENT is essentially a no-op.  On ARM, it depends on the 
> > > > > SoC.
> > > > > Some SoCs have cache-coherent USB controllers, some don't.  Mine does
> > > > > not, so mapping it COHERENT means uncached and thus slow.
> > > > > 
> > > > > Why do we do that?  Well, when the code was imported in 99, it was
> > > > > already there.  Since then we have gained infrastructure for DMA
> > > > > syncs in the USB stack, which I think are proper.
> > > > > 
> > > > > sys/dev/usb/usbdi.c - usbd_transfer() (before transfer)
> > > > > 
> > > > >   if (!usbd_xfer_isread(xfer)) {
> > > > >   if ((xfer->flags & USBD_NO_COPY) == 0)
> > > > >   memcpy(KERNADDR(>dmabuf, 0), xfer->buffer,
> > > > >   xfer->length);
> > > > >   usb_syncmem(>dmabuf, 0, xfer->length,
> > > > >   BUS_DMASYNC_PREWRITE);
> > > > >   } else
> > > > >   usb_syncmem(>dmabuf, 0, xfer->length,
> > > > >   BUS_DMASYNC_PREREAD);
> > > > >   err = pipe->methods->transfer(xfer);
> > > > > 
> > > > > sys/dev/usb/usbdi.c - usb_transfer_complete() (after transfer)
> > > > > 
> > > > >   if (xfer->actlen != 0) {
> > > > >   if (usbd_xfer_isread(xfer)) {
> > > > >   usb_syncmem(>dmabuf, 0, xfer->actlen,
> > > > >   BUS_DMASYNC_POSTREAD);
> > > > >   if (!(xfer->flags & USBD_NO_COPY))
> > > > >   memcpy(xfer->buffer, 
> > > > > KERNADDR(>dmabuf, 0),
> > > > >   xfer->actlen);
> > > > >   } else
> > > > >   usb_syncmem(>dmabuf, 0, xfer->actlen,
> > > > >   BUS_DMASYNC_POSTWRITE);
> > > > >   }
> > > > > 
> > > > > We cannot just remove COHERENT, since some drivers, like ehci(4), use
> > > > > the same backend to allocate their rings.  And I can't vouch for those
> > > > > drivers' sanity.
> > > > > 
> > > > > As a first step, I would like to go ahead with another solution, which
> > > > > is based on a diff from Marius Strobl, who added those syncs in the
> > > > > first place.  Essentially it splits the memory handling into cacheable
> > > > > and non-cacheable blocks.  The USB data transfers and everyone who 
> > > > > uses
> > > > > usbd_alloc_buffer() then use cacheable buffers, while code like 
> > > > > ehci(4)
> > > > > still don't.  This is a bit of a safer approach imho, since we don't
> > > > > hurt the controller drivers, but speed up the data buffers.
> > > > > 
> > > > > Once we have verified that there are no regressions, we can adjust
> > > > > ehci(4) and the like, add proper syncs, make sure they still work as
> > > > > well as before, and maybe then back this out again.
> > > > > 
> > > > > Keep note that this is all a no-op on X86, but all the other archs 
> > > > > will
> > > > > profit from this.
> > > > > 
> > > > > ok?
> > > > > 
> > > > > Patrick
> > > > 
> > > > Update diff with inverted logic.  kettenis@ argues that we should
> > > > invert the logic, and those who need COHERENT memory should ask
> > > > for that explicitly, since for bus_dmamem_map() it also needs to
> > > > be passed explicitly.  This also points out all those users that
> > > > use usb_allocmem() internally, where we might want to have a look
> > > > if COHERENT is actually needed or not, or if it can be refactored
> > > > in another way.
> > > 
> > > These commits broke usb on imx.6 with cubox:
> > > 
> > > imxehci0 at simplebus3
> > > usb0 at imxehci0: USB revision 2.0
> > > usb0: root hub problem
> > > imxehci1 at simplebus3
> > > usb1 at imxehci1: USB revision 2.0
> > > usb1: root hub problem
> > > "usbmisc" at simplebus3 not configured
> > 
> > pandaboard with omap4 (another cortex a9) also has broken usb with
> > the latest snapshot:
> > 
> > omehci0 at simplebus0
> > usb0 at omehci0: USB revision 2.0
> > usb0: root 

Re: usb(4): use cacheable buffers for data transfers (massive speedup)

2020-04-01 Thread Patrick Wildt
On Wed, Apr 01, 2020 at 04:47:10PM +1100, Jonathan Gray wrote:
> On Wed, Apr 01, 2020 at 12:58:23PM +1100, Jonathan Gray wrote:
> > On Wed, Mar 18, 2020 at 01:41:06PM +0100, Patrick Wildt wrote:
> > > On Wed, Mar 18, 2020 at 11:22:40AM +0100, Patrick Wildt wrote:
> > > > Hi,
> > > > 
> > > > I've spent a few days investigating why USB ethernet adapters are so
> > > > horribly slow on my ARMs.  Using dt(4) I realized that it was spending
> > > > most of its time in memcpy.  But, why?  As it turns out, all USB data
> > > > buffers are mapped COHERENT, which on some/most ARMs means uncached.
> > > > Using cached data buffers makes the performance rise from 20 mbit/s to
> > > > 200 mbit/s.  Quite a difference.
> > > > 
> > > > sys/dev/usb/usb_mem.c:
> > > > error = bus_dmamem_map(tag, p->segs, p->nsegs, p->size,
> > > >>kaddr, 
> > > > BUS_DMA_NOWAIT|BUS_DMA_COHERENT);
> > > > 
> > > > On x86, COHERENT is essentially a no-op.  On ARM, it depends on the SoC.
> > > > Some SoCs have cache-coherent USB controllers, some don't.  Mine does
> > > > not, so mapping it COHERENT means uncached and thus slow.
> > > > 
> > > > Why do we do that?  Well, when the code was imported in 99, it was
> > > > already there.  Since then we have gained infrastructure for DMA
> > > > syncs in the USB stack, which I think are proper.
> > > > 
> > > > sys/dev/usb/usbdi.c - usbd_transfer() (before transfer)
> > > > 
> > > > if (!usbd_xfer_isread(xfer)) {
> > > > if ((xfer->flags & USBD_NO_COPY) == 0)
> > > > memcpy(KERNADDR(>dmabuf, 0), xfer->buffer,
> > > > xfer->length);
> > > > usb_syncmem(>dmabuf, 0, xfer->length,
> > > > BUS_DMASYNC_PREWRITE);
> > > > } else
> > > > usb_syncmem(>dmabuf, 0, xfer->length,
> > > > BUS_DMASYNC_PREREAD);
> > > > err = pipe->methods->transfer(xfer);
> > > > 
> > > > sys/dev/usb/usbdi.c - usb_transfer_complete() (after transfer)
> > > > 
> > > > if (xfer->actlen != 0) {
> > > > if (usbd_xfer_isread(xfer)) {
> > > > usb_syncmem(>dmabuf, 0, xfer->actlen,
> > > > BUS_DMASYNC_POSTREAD);
> > > > if (!(xfer->flags & USBD_NO_COPY))
> > > > memcpy(xfer->buffer, 
> > > > KERNADDR(>dmabuf, 0),
> > > > xfer->actlen);
> > > > } else
> > > > usb_syncmem(>dmabuf, 0, xfer->actlen,
> > > > BUS_DMASYNC_POSTWRITE);
> > > > }
> > > > 
> > > > We cannot just remove COHERENT, since some drivers, like ehci(4), use
> > > > the same backend to allocate their rings.  And I can't vouch for those
> > > > drivers' sanity.
> > > > 
> > > > As a first step, I would like to go ahead with another solution, which
> > > > is based on a diff from Marius Strobl, who added those syncs in the
> > > > first place.  Essentially it splits the memory handling into cacheable
> > > > and non-cacheable blocks.  The USB data transfers and everyone who uses
> > > > usbd_alloc_buffer() then use cacheable buffers, while code like ehci(4)
> > > > still don't.  This is a bit of a safer approach imho, since we don't
> > > > hurt the controller drivers, but speed up the data buffers.
> > > > 
> > > > Once we have verified that there are no regressions, we can adjust
> > > > ehci(4) and the like, add proper syncs, make sure they still work as
> > > > well as before, and maybe then back this out again.
> > > > 
> > > > Keep note that this is all a no-op on X86, but all the other archs will
> > > > profit from this.
> > > > 
> > > > ok?
> > > > 
> > > > Patrick
> > > 
> > > Update diff with inverted logic.  kettenis@ argues that we should
> > > invert the logic, and those who need COHERENT memory should ask
> > > for that explicitly, since for bus_dmamem_map() it also needs to
> > > be passed explicitly.  This also points out all those users that
> > > use usb_allocmem() internally, where we might want to have a look
> > > if COHERENT is actually needed or not, or if it can be refactored
> > > in another way.
> > 
> > These commits broke usb on imx.6 with cubox:
> > 
> > imxehci0 at simplebus3
> > usb0 at imxehci0: USB revision 2.0
> > usb0: root hub problem
> > imxehci1 at simplebus3
> > usb1 at imxehci1: USB revision 2.0
> > usb1: root hub problem
> > "usbmisc" at simplebus3 not configured
> 
> pandaboard with omap4 (another cortex a9) also has broken usb with
> the latest snapshot:
> 
> omehci0 at simplebus0
> usb0 at omehci0: USB revision 2.0
> usb0: root hub problem

I think I know what it is.  When we enqueue a request for the root
hub, the buffer, for which the USB subsystem allocates a DMA buffer,
is filled not by an external device, but by our driver.

Now on completion of that request, since it's doing a READ 

Re: usb(4): use cacheable buffers for data transfers (massive speedup)

2020-03-31 Thread Jonathan Gray
On Wed, Apr 01, 2020 at 12:58:23PM +1100, Jonathan Gray wrote:
> On Wed, Mar 18, 2020 at 01:41:06PM +0100, Patrick Wildt wrote:
> > On Wed, Mar 18, 2020 at 11:22:40AM +0100, Patrick Wildt wrote:
> > > Hi,
> > > 
> > > I've spent a few days investigating why USB ethernet adapters are so
> > > horribly slow on my ARMs.  Using dt(4) I realized that it was spending
> > > most of its time in memcpy.  But, why?  As it turns out, all USB data
> > > buffers are mapped COHERENT, which on some/most ARMs means uncached.
> > > Using cached data buffers makes the performance rise from 20 mbit/s to
> > > 200 mbit/s.  Quite a difference.
> > > 
> > > sys/dev/usb/usb_mem.c:
> > >   error = bus_dmamem_map(tag, p->segs, p->nsegs, p->size,
> > >  >kaddr, BUS_DMA_NOWAIT|BUS_DMA_COHERENT);
> > > 
> > > On x86, COHERENT is essentially a no-op.  On ARM, it depends on the SoC.
> > > Some SoCs have cache-coherent USB controllers, some don't.  Mine does
> > > not, so mapping it COHERENT means uncached and thus slow.
> > > 
> > > Why do we do that?  Well, when the code was imported in 99, it was
> > > already there.  Since then we have gained infrastructure for DMA
> > > syncs in the USB stack, which I think are proper.
> > > 
> > > sys/dev/usb/usbdi.c - usbd_transfer() (before transfer)
> > > 
> > >   if (!usbd_xfer_isread(xfer)) {
> > >   if ((xfer->flags & USBD_NO_COPY) == 0)
> > >   memcpy(KERNADDR(>dmabuf, 0), xfer->buffer,
> > >   xfer->length);
> > >   usb_syncmem(>dmabuf, 0, xfer->length,
> > >   BUS_DMASYNC_PREWRITE);
> > >   } else
> > >   usb_syncmem(>dmabuf, 0, xfer->length,
> > >   BUS_DMASYNC_PREREAD);
> > >   err = pipe->methods->transfer(xfer);
> > > 
> > > sys/dev/usb/usbdi.c - usb_transfer_complete() (after transfer)
> > > 
> > >   if (xfer->actlen != 0) {
> > >   if (usbd_xfer_isread(xfer)) {
> > >   usb_syncmem(>dmabuf, 0, xfer->actlen,
> > >   BUS_DMASYNC_POSTREAD);
> > >   if (!(xfer->flags & USBD_NO_COPY))
> > >   memcpy(xfer->buffer, KERNADDR(>dmabuf, 0),
> > >   xfer->actlen);
> > >   } else
> > >   usb_syncmem(>dmabuf, 0, xfer->actlen,
> > >   BUS_DMASYNC_POSTWRITE);
> > >   }
> > > 
> > > We cannot just remove COHERENT, since some drivers, like ehci(4), use
> > > the same backend to allocate their rings.  And I can't vouch for those
> > > drivers' sanity.
> > > 
> > > As a first step, I would like to go ahead with another solution, which
> > > is based on a diff from Marius Strobl, who added those syncs in the
> > > first place.  Essentially it splits the memory handling into cacheable
> > > and non-cacheable blocks.  The USB data transfers and everyone who uses
> > > usbd_alloc_buffer() then use cacheable buffers, while code like ehci(4)
> > > still don't.  This is a bit of a safer approach imho, since we don't
> > > hurt the controller drivers, but speed up the data buffers.
> > > 
> > > Once we have verified that there are no regressions, we can adjust
> > > ehci(4) and the like, add proper syncs, make sure they still work as
> > > well as before, and maybe then back this out again.
> > > 
> > > Keep note that this is all a no-op on X86, but all the other archs will
> > > profit from this.
> > > 
> > > ok?
> > > 
> > > Patrick
> > 
> > Update diff with inverted logic.  kettenis@ argues that we should
> > invert the logic, and those who need COHERENT memory should ask
> > for that explicitly, since for bus_dmamem_map() it also needs to
> > be passed explicitly.  This also points out all those users that
> > use usb_allocmem() internally, where we might want to have a look
> > if COHERENT is actually needed or not, or if it can be refactored
> > in another way.
> 
> These commits broke usb on imx.6 with cubox:
> 
> imxehci0 at simplebus3
> usb0 at imxehci0: USB revision 2.0
> usb0: root hub problem
> imxehci1 at simplebus3
> usb1 at imxehci1: USB revision 2.0
> usb1: root hub problem
> "usbmisc" at simplebus3 not configured

pandaboard with omap4 (another cortex a9) also has broken usb with
the latest snapshot:

omehci0 at simplebus0
usb0 at omehci0: USB revision 2.0
usb0: root hub problem



Re: usb(4): use cacheable buffers for data transfers (massive speedup)

2020-03-31 Thread Jonathan Gray
On Wed, Mar 18, 2020 at 01:41:06PM +0100, Patrick Wildt wrote:
> On Wed, Mar 18, 2020 at 11:22:40AM +0100, Patrick Wildt wrote:
> > Hi,
> > 
> > I've spent a few days investigating why USB ethernet adapters are so
> > horribly slow on my ARMs.  Using dt(4) I realized that it was spending
> > most of its time in memcpy.  But, why?  As it turns out, all USB data
> > buffers are mapped COHERENT, which on some/most ARMs means uncached.
> > Using cached data buffers makes the performance rise from 20 mbit/s to
> > 200 mbit/s.  Quite a difference.
> > 
> > sys/dev/usb/usb_mem.c:
> > error = bus_dmamem_map(tag, p->segs, p->nsegs, p->size,
> >>kaddr, BUS_DMA_NOWAIT|BUS_DMA_COHERENT);
> > 
> > On x86, COHERENT is essentially a no-op.  On ARM, it depends on the SoC.
> > Some SoCs have cache-coherent USB controllers, some don't.  Mine does
> > not, so mapping it COHERENT means uncached and thus slow.
> > 
> > Why do we do that?  Well, when the code was imported in 99, it was
> > already there.  Since then we have gained infrastructure for DMA
> > syncs in the USB stack, which I think are proper.
> > 
> > sys/dev/usb/usbdi.c - usbd_transfer() (before transfer)
> > 
> > if (!usbd_xfer_isread(xfer)) {
> > if ((xfer->flags & USBD_NO_COPY) == 0)
> > memcpy(KERNADDR(>dmabuf, 0), xfer->buffer,
> > xfer->length);
> > usb_syncmem(>dmabuf, 0, xfer->length,
> > BUS_DMASYNC_PREWRITE);
> > } else
> > usb_syncmem(>dmabuf, 0, xfer->length,
> > BUS_DMASYNC_PREREAD);
> > err = pipe->methods->transfer(xfer);
> > 
> > sys/dev/usb/usbdi.c - usb_transfer_complete() (after transfer)
> > 
> > if (xfer->actlen != 0) {
> > if (usbd_xfer_isread(xfer)) {
> > usb_syncmem(>dmabuf, 0, xfer->actlen,
> > BUS_DMASYNC_POSTREAD);
> > if (!(xfer->flags & USBD_NO_COPY))
> > memcpy(xfer->buffer, KERNADDR(>dmabuf, 0),
> > xfer->actlen);
> > } else
> > usb_syncmem(>dmabuf, 0, xfer->actlen,
> > BUS_DMASYNC_POSTWRITE);
> > }
> > 
> > We cannot just remove COHERENT, since some drivers, like ehci(4), use
> > the same backend to allocate their rings.  And I can't vouch for those
> > drivers' sanity.
> > 
> > As a first step, I would like to go ahead with another solution, which
> > is based on a diff from Marius Strobl, who added those syncs in the
> > first place.  Essentially it splits the memory handling into cacheable
> > and non-cacheable blocks.  The USB data transfers and everyone who uses
> > usbd_alloc_buffer() then use cacheable buffers, while code like ehci(4)
> > still don't.  This is a bit of a safer approach imho, since we don't
> > hurt the controller drivers, but speed up the data buffers.
> > 
> > Once we have verified that there are no regressions, we can adjust
> > ehci(4) and the like, add proper syncs, make sure they still work as
> > well as before, and maybe then back this out again.
> > 
> > Keep note that this is all a no-op on X86, but all the other archs will
> > profit from this.
> > 
> > ok?
> > 
> > Patrick
> 
> Update diff with inverted logic.  kettenis@ argues that we should
> invert the logic, and those who need COHERENT memory should ask
> for that explicitly, since for bus_dmamem_map() it also needs to
> be passed explicitly.  This also points out all those users that
> use usb_allocmem() internally, where we might want to have a look
> if COHERENT is actually needed or not, or if it can be refactored
> in another way.

These commits broke usb on imx.6 with cubox:

imxehci0 at simplebus3
usb0 at imxehci0: USB revision 2.0
usb0: root hub problem
imxehci1 at simplebus3
usb1 at imxehci1: USB revision 2.0
usb1: root hub problem
"usbmisc" at simplebus3 not configured

After reverting them I can use filesystems on usb again.

diff --git sys/dev/usb/dwc2/dwc2.c sys/dev/usb/dwc2/dwc2.c
index 6ca3cc658e5..6f035467213 100644
--- sys/dev/usb/dwc2/dwc2.c
+++ sys/dev/usb/dwc2/dwc2.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: dwc2.c,v 1.51 2020/03/21 12:08:31 patrick Exp $   */
+/* $OpenBSD: dwc2.c,v 1.49 2019/11/27 11:16:59 mpi Exp $   */
 /* $NetBSD: dwc2.c,v 1.32 2014/09/02 23:26:20 macallan Exp $   */
 
 /*-
@@ -474,7 +474,7 @@ dwc2_open(struct usbd_pipe *pipe)
case UE_CONTROL:
pipe->methods = _device_ctrl_methods;
err = usb_allocmem(>sc_bus, sizeof(usb_device_request_t),
-   0, USB_DMA_COHERENT, >req_dma);
+   0, >req_dma);
if (err)
return err;
break;
diff --git sys/dev/usb/dwc2/dwc2_hcd.c sys/dev/usb/dwc2/dwc2_hcd.c
index ab76de7d766..7e5c91481d5 100644
--- sys/dev/usb/dwc2/dwc2_hcd.c
+++ sys/dev/usb/dwc2/dwc2_hcd.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: 

Re: usb(4): use cacheable buffers for data transfers (massive speedup)

2020-03-18 Thread Patrick Wildt
On Wed, Mar 18, 2020 at 11:22:40AM +0100, Patrick Wildt wrote:
> Hi,
> 
> I've spent a few days investigating why USB ethernet adapters are so
> horribly slow on my ARMs.  Using dt(4) I realized that it was spending
> most of its time in memcpy.  But, why?  As it turns out, all USB data
> buffers are mapped COHERENT, which on some/most ARMs means uncached.
> Using cached data buffers makes the performance rise from 20 mbit/s to
> 200 mbit/s.  Quite a difference.
> 
> sys/dev/usb/usb_mem.c:
>   error = bus_dmamem_map(tag, p->segs, p->nsegs, p->size,
>  >kaddr, BUS_DMA_NOWAIT|BUS_DMA_COHERENT);
> 
> On x86, COHERENT is essentially a no-op.  On ARM, it depends on the SoC.
> Some SoCs have cache-coherent USB controllers, some don't.  Mine does
> not, so mapping it COHERENT means uncached and thus slow.
> 
> Why do we do that?  Well, when the code was imported in 99, it was
> already there.  Since then we have gained infrastructure for DMA
> syncs in the USB stack, which I think are proper.
> 
> sys/dev/usb/usbdi.c - usbd_transfer() (before transfer)
> 
>   if (!usbd_xfer_isread(xfer)) {
>   if ((xfer->flags & USBD_NO_COPY) == 0)
>   memcpy(KERNADDR(>dmabuf, 0), xfer->buffer,
>   xfer->length);
>   usb_syncmem(>dmabuf, 0, xfer->length,
>   BUS_DMASYNC_PREWRITE);
>   } else
>   usb_syncmem(>dmabuf, 0, xfer->length,
>   BUS_DMASYNC_PREREAD);
>   err = pipe->methods->transfer(xfer);
> 
> sys/dev/usb/usbdi.c - usb_transfer_complete() (after transfer)
> 
>   if (xfer->actlen != 0) {
>   if (usbd_xfer_isread(xfer)) {
>   usb_syncmem(>dmabuf, 0, xfer->actlen,
>   BUS_DMASYNC_POSTREAD);
>   if (!(xfer->flags & USBD_NO_COPY))
>   memcpy(xfer->buffer, KERNADDR(>dmabuf, 0),
>   xfer->actlen);
>   } else
>   usb_syncmem(>dmabuf, 0, xfer->actlen,
>   BUS_DMASYNC_POSTWRITE);
>   }
> 
> We cannot just remove COHERENT, since some drivers, like ehci(4), use
> the same backend to allocate their rings.  And I can't vouch for those
> drivers' sanity.
> 
> As a first step, I would like to go ahead with another solution, which
> is based on a diff from Marius Strobl, who added those syncs in the
> first place.  Essentially it splits the memory handling into cacheable
> and non-cacheable blocks.  The USB data transfers and everyone who uses
> usbd_alloc_buffer() then use cacheable buffers, while code like ehci(4)
> still don't.  This is a bit of a safer approach imho, since we don't
> hurt the controller drivers, but speed up the data buffers.
> 
> Once we have verified that there are no regressions, we can adjust
> ehci(4) and the like, add proper syncs, make sure they still work as
> well as before, and maybe then back this out again.
> 
> Keep note that this is all a no-op on X86, but all the other archs will
> profit from this.
> 
> ok?
> 
> Patrick

Update diff with inverted logic.  kettenis@ argues that we should
invert the logic, and those who need COHERENT memory should ask
for that explicitly, since for bus_dmamem_map() it also needs to
be passed explicitly.  This also points out all those users that
use usb_allocmem() internally, where we might want to have a look
if COHERENT is actually needed or not, or if it can be refactored
in another way.

diff --git a/sys/dev/usb/dwc2/dwc2.c b/sys/dev/usb/dwc2/dwc2.c
index 6f035467213..099dfa26da1 100644
--- a/sys/dev/usb/dwc2/dwc2.c
+++ b/sys/dev/usb/dwc2/dwc2.c
@@ -473,6 +473,7 @@ dwc2_open(struct usbd_pipe *pipe)
switch (xfertype) {
case UE_CONTROL:
pipe->methods = _device_ctrl_methods;
+   dpipe->req_dma.flags |= USB_DMA_COHERENT;
err = usb_allocmem(>sc_bus, sizeof(usb_device_request_t),
0, >req_dma);
if (err)
diff --git a/sys/dev/usb/dwc2/dwc2_hcd.c b/sys/dev/usb/dwc2/dwc2_hcd.c
index 7e5c91481d5..d44e3196e61 100644
--- a/sys/dev/usb/dwc2/dwc2_hcd.c
+++ b/sys/dev/usb/dwc2/dwc2_hcd.c
@@ -679,6 +679,7 @@ STATIC int dwc2_hc_setup_align_buf(struct dwc2_hsotg 
*hsotg, struct dwc2_qh *qh,
 
qh->dw_align_buf = NULL;
qh->dw_align_buf_dma = 0;
+   qh->dw_align_buf_usbdma.flags |= USB_DMA_COHERENT;
err = usb_allocmem(>hsotg_sc->sc_bus, buf_size, buf_size,
   >dw_align_buf_usbdma);
if (!err) {
@@ -2267,6 +2268,7 @@ int dwc2_hcd_init(struct dwc2_hsotg *hsotg,
 */
hsotg->status_buf = NULL;
if (hsotg->core_params->dma_enable > 0) {
+   hsotg->status_buf_usbdma.flags |= USB_DMA_COHERENT;
retval = usb_allocmem(>hsotg_sc->sc_bus,
  DWC2_HCD_STATUS_BUF_SIZE, 

usb(4): use cacheable buffers for data transfers (massive speedup)

2020-03-18 Thread Patrick Wildt
Hi,

I've spent a few days investigating why USB ethernet adapters are so
horribly slow on my ARMs.  Using dt(4) I realized that it was spending
most of its time in memcpy.  But, why?  As it turns out, all USB data
buffers are mapped COHERENT, which on some/most ARMs means uncached.
Using cached data buffers makes the performance rise from 20 mbit/s to
200 mbit/s.  Quite a difference.

sys/dev/usb/usb_mem.c:
error = bus_dmamem_map(tag, p->segs, p->nsegs, p->size,
   >kaddr, BUS_DMA_NOWAIT|BUS_DMA_COHERENT);

On x86, COHERENT is essentially a no-op.  On ARM, it depends on the SoC.
Some SoCs have cache-coherent USB controllers, some don't.  Mine does
not, so mapping it COHERENT means uncached and thus slow.

Why do we do that?  Well, when the code was imported in 99, it was
already there.  Since then we have gained infrastructure for DMA
syncs in the USB stack, which I think are proper.

sys/dev/usb/usbdi.c - usbd_transfer() (before transfer)

if (!usbd_xfer_isread(xfer)) {
if ((xfer->flags & USBD_NO_COPY) == 0)
memcpy(KERNADDR(>dmabuf, 0), xfer->buffer,
xfer->length);
usb_syncmem(>dmabuf, 0, xfer->length,
BUS_DMASYNC_PREWRITE);
} else
usb_syncmem(>dmabuf, 0, xfer->length,
BUS_DMASYNC_PREREAD);
err = pipe->methods->transfer(xfer);

sys/dev/usb/usbdi.c - usb_transfer_complete() (after transfer)

if (xfer->actlen != 0) {
if (usbd_xfer_isread(xfer)) {
usb_syncmem(>dmabuf, 0, xfer->actlen,
BUS_DMASYNC_POSTREAD);
if (!(xfer->flags & USBD_NO_COPY))
memcpy(xfer->buffer, KERNADDR(>dmabuf, 0),
xfer->actlen);
} else
usb_syncmem(>dmabuf, 0, xfer->actlen,
BUS_DMASYNC_POSTWRITE);
}

We cannot just remove COHERENT, since some drivers, like ehci(4), use
the same backend to allocate their rings.  And I can't vouch for those
drivers' sanity.

As a first step, I would like to go ahead with another solution, which
is based on a diff from Marius Strobl, who added those syncs in the
first place.  Essentially it splits the memory handling into cacheable
and non-cacheable blocks.  The USB data transfers and everyone who uses
usbd_alloc_buffer() then use cacheable buffers, while code like ehci(4)
still don't.  This is a bit of a safer approach imho, since we don't
hurt the controller drivers, but speed up the data buffers.

Once we have verified that there are no regressions, we can adjust
ehci(4) and the like, add proper syncs, make sure they still work as
well as before, and maybe then back this out again.

Keep note that this is all a no-op on X86, but all the other archs will
profit from this.

ok?

Patrick

diff --git a/sys/dev/usb/usb_mem.c b/sys/dev/usb/usb_mem.c
index c65906b43f4..95993093b5a 100644
--- a/sys/dev/usb/usb_mem.c
+++ b/sys/dev/usb/usb_mem.c
@@ -72,7 +72,7 @@ struct usb_frag_dma {
 };
 
 usbd_statususb_block_allocmem(bus_dma_tag_t, size_t, size_t,
-   struct usb_dma_block **);
+   struct usb_dma_block **, int);
 void   usb_block_freemem(struct usb_dma_block *);
 
 LIST_HEAD(, usb_dma_block) usb_blk_freelist =
@@ -84,7 +84,7 @@ LIST_HEAD(, usb_frag_dma) usb_frag_freelist =
 
 usbd_status
 usb_block_allocmem(bus_dma_tag_t tag, size_t size, size_t align,
-struct usb_dma_block **dmap)
+struct usb_dma_block **dmap, int cacheable)
 {
int error;
 struct usb_dma_block *p;
@@ -96,7 +96,8 @@ usb_block_allocmem(bus_dma_tag_t tag, size_t size, size_t 
align,
s = splusb();
/* First check the free list. */
for (p = LIST_FIRST(_blk_freelist); p; p = LIST_NEXT(p, next)) {
-   if (p->tag == tag && p->size >= size && p->align >= align) {
+   if (p->tag == tag && p->size >= size && p->align >= align &&
+   p->cacheable == cacheable) {
LIST_REMOVE(p, next);
usb_blk_nfree--;
splx(s);
@@ -116,6 +117,7 @@ usb_block_allocmem(bus_dma_tag_t tag, size_t size, size_t 
align,
p->tag = tag;
p->size = size;
p->align = align;
+   p->cacheable = cacheable;
error = bus_dmamem_alloc(tag, p->size, align, 0,
 p->segs, nitems(p->segs),
 >nsegs, BUS_DMA_NOWAIT);
@@ -123,7 +125,8 @@ usb_block_allocmem(bus_dma_tag_t tag, size_t size, size_t 
align,
goto free0;
 
error = bus_dmamem_map(tag, p->segs, p->nsegs, p->size,
-  >kaddr, BUS_DMA_NOWAIT|BUS_DMA_COHERENT);
+  >kaddr, BUS_DMA_NOWAIT | (cacheable ? 0 :
+