Re: [PATCH 09/15] NTB: Use DMA Engine to Transmit and Receive

2013-08-19 Thread Dan Williams


On 8/19/13 5:07 PM, "Jon Mason"  wrote:

>> >> Is this for the case where we are bouncing back and forth between
>> >> sync/async?  Otherwise I do not see how transactions could get out of
>> >> order given you allocate a channel once per queue.  Is this comment
>> >> saying that the iowrite32 is somehow a fix, or is this comment a
>> >> FIXME?
>> >
>> >There is a case for a mix: the "copy_bytes" variable above switches to
>> >CPU for small transfers (which greatly increases throughput on small
>> >transfers).  The caveat to it is the need to flush the DMA engine to
>> >prevent out-of-order completion.  This comment is mainly a reminder of this
>>issue.
>> 
>> So this is going forward with the stall as a known issue?  The next
>>patch
>> should just do the sync to prevent the re-ordering, right?
>
>There is already a dma_sync_wait in the error path of ntb_async_rx to
>enforce the ordering.  Do I need to change the comment (or move it) to
>make it more obvious what is happening?

Yeah, I think it just needs to move to the dma_sync_wait(), otherwise it
seems like it's an open issue that needs fixing.
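
For illustration, a minimal sketch (not the patch itself) of the ordering rule
being discussed: before the CPU-copy path runs, anything already submitted on
the queue's channel is flushed with dma_sync_wait().  The helper name is
hypothetical; qp->dma_chan and qp->last_cookie are the fields this patch adds.

static void ntb_rx_copy_ordered(struct ntb_transport_qp *qp,
                                struct ntb_queue_entry *entry,
                                void *offset, unsigned int len)
{
        if (qp->dma_chan) {
                /* start anything still sitting in the pending queue */
                dma_async_issue_pending(qp->dma_chan);
                /* and wait for the last submitted copy to finish */
                dma_sync_wait(qp->dma_chan, qp->last_cookie);
        }

        memcpy(entry->buf, offset, len);
}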

>> >> > +   txd->callback = ntb_rx_copy_callback;
>> >> > +   txd->callback_param = entry;
>> >> > +
>> >> > +   cookie = dmaengine_submit(txd);
>> >> > +   if (dma_submit_error(cookie))
>> >> > +   goto err3;
>> >> > +
>> >> > +   qp->last_cookie = cookie;
>> >> > +
>> >> > +   dma_async_issue_pending(chan);
>> >> 
>> >> hmm... can this go in ntb_process_rx() so that the submission is
>> >> batched?  Cuts down on mmio.
>> >
>> >I moved it down to ntb_transport_rx (after the calls to
>> >ntb_process_rxc), and the performance seems to be roughly the same.
>> 
>> Yeah, not expecting it to be noticeable, but conceptually
>> 
>> submit
>> submit
>> submit
>> submit
>> issue
>> 
>> 
>> Is nicer than:
>> 
>> submit
>> issue
>> submit
>> issue
>> 
>> 
>
>I agree, but I liked having all the dma engine awareness
>compartmentalized in the ntb_async_* and callbacks.

Ok, makes sense.
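
For reference, a rough sketch of the batched pattern above (a hypothetical
helper, not code from the patch): descriptors are queued with
dmaengine_submit() and the hardware is kicked once per batch with
dma_async_issue_pending().

static int ntb_submit_batch(struct dma_chan *chan,
                            struct dma_async_tx_descriptor **txd, int n)
{
        dma_cookie_t cookie;
        int i;

        for (i = 0; i < n; i++) {
                cookie = dmaengine_submit(txd[i]);      /* queue only */
                if (dma_submit_error(cookie))
                        return -EIO;
        }

        dma_async_issue_pending(chan);                  /* one MMIO kick */
        return 0;
}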

--
Dan

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 09/15] NTB: Use DMA Engine to Transmit and Receive

2013-08-19 Thread Jon Mason
On Mon, Aug 19, 2013 at 11:36:13PM +0000, Dan Williams wrote:
> 
> 
> On 8/19/13 1:37 PM, "Jon Mason"  wrote:
> 
> >On Mon, Aug 19, 2013 at 03:01:54AM -0700, Dan Williams wrote:
> >> On Fri, Aug 2, 2013 at 10:35 AM, Jon Mason  wrote:
> >> > Allocate and use a DMA engine channel to transmit and receive data
> >>over
> >> > NTB.  If none is allocated, fall back to using the CPU to transfer
> >>data.
> >> >
> >> > Cc: Dan Williams 
> >> > Cc: Vinod Koul 
> >> > Cc: Dave Jiang 
> >> > Signed-off-by: Jon Mason 
> >> > ---
> >> >  drivers/ntb/ntb_hw.c|   17 +++
> >> >  drivers/ntb/ntb_hw.h|1 +
> >> >  drivers/ntb/ntb_transport.c |  285
> >>---
> >> >  3 files changed, 258 insertions(+), 45 deletions(-)
> >> >
> >> > diff --git a/drivers/ntb/ntb_hw.c b/drivers/ntb/ntb_hw.c
> >> > index 1d8e551..014222c 100644
> >> > --- a/drivers/ntb/ntb_hw.c
> >> > +++ b/drivers/ntb/ntb_hw.c
> >> > @@ -350,6 +350,23 @@ int ntb_read_remote_spad(struct ntb_device
> >>*ndev, unsigned int idx, u32 *val)
> >> >  }
> >> >
> >> >  /**
> >> > + * ntb_get_mw_base() - get addr for the NTB memory window
> >> > + * @ndev: pointer to ntb_device instance
> >> > + * @mw: memory window number
> >> > + *
> >> > + * This function provides the base address of the memory window
> >>specified.
> >> > + *
> >> > + * RETURNS: address, or NULL on error.
> >> > + */
> >> > +resource_size_t ntb_get_mw_base(struct ntb_device *ndev, unsigned
> >>int mw)
> >> > +{
> >> > +   if (mw >= ntb_max_mw(ndev))
> >> > +   return 0;
> >> > +
> >> > +   return pci_resource_start(ndev->pdev, MW_TO_BAR(mw));
> >> > +}
> 
> Nothing does error checking on this return value.  I think the code should
> either be sure that 'mw' is valid (mw_num is passed to the
> ntb_get_mw_vbase helper too) and delete the check, or at least make it a
> WARN_ONCE.  The former seems a tad cleaner to me.

Ugh!  Thanks.

> 
> 
> >> > +
> >> > +/**
> >> >   * ntb_get_mw_vbase() - get virtual addr for the NTB memory window
> >> >   * @ndev: pointer to ntb_device instance
> >> >   * @mw: memory window number
> >> > diff --git a/drivers/ntb/ntb_hw.h b/drivers/ntb/ntb_hw.h
> >> > index b03de80..ab5f768 100644
> >> > --- a/drivers/ntb/ntb_hw.h
> >> > +++ b/drivers/ntb/ntb_hw.h
> >> > @@ -240,6 +240,7 @@ int ntb_write_local_spad(struct ntb_device *ndev,
> >>unsigned int idx, u32 val);
> >> >  int ntb_read_local_spad(struct ntb_device *ndev, unsigned int idx,
> >>u32 *val);
> >> >  int ntb_write_remote_spad(struct ntb_device *ndev, unsigned int idx,
> >>u32 val);
> >> >  int ntb_read_remote_spad(struct ntb_device *ndev, unsigned int idx,
> >>u32 *val);
> >> > +resource_size_t ntb_get_mw_base(struct ntb_device *ndev, unsigned
> >>int mw);
> >> >  void __iomem *ntb_get_mw_vbase(struct ntb_device *ndev, unsigned int
> >>mw);
> >> >  u64 ntb_get_mw_size(struct ntb_device *ndev, unsigned int mw);
> >> >  void ntb_ring_sdb(struct ntb_device *ndev, unsigned int idx);
> >> > diff --git a/drivers/ntb/ntb_transport.c b/drivers/ntb/ntb_transport.c
> >> > index f7380e9..73a35e4 100644
> >> > --- a/drivers/ntb/ntb_transport.c
> >> > +++ b/drivers/ntb/ntb_transport.c
> >> > @@ -47,6 +47,7 @@
> >> >   */
> >> >  #include <linux/debugfs.h>
> >> >  #include <linux/delay.h>
> >> > +#include <linux/dmaengine.h>
> >> >  #include <linux/dma-mapping.h>
> >> >  #include <linux/errno.h>
> >> >  #include <linux/export.h>
> >> > @@ -68,6 +69,10 @@ static unsigned char max_num_clients;
> >> >  module_param(max_num_clients, byte, 0644);
> >> >  MODULE_PARM_DESC(max_num_clients, "Maximum number of NTB transport
> >>clients");
> >> >
> >> > +static unsigned int copy_bytes = 1024;
> >> > +module_param(copy_bytes, uint, 0644);
> >> > +MODULE_PARM_DESC(copy_bytes, "Threshold under which NTB will use the
> >>CPU to copy instead of DMA");
> >> > +
> >> >  struct ntb_queue_entry {
> >> > /* ntb_queue list reference */
> >> > struct list_head entry;
> >> > @@ -76,6 +81,13 @@ struct ntb_queue_entry {
> >> > void *buf;
> >> > unsigned int len;
> >> > unsigned int flags;
> >> > +
> >> > +   struct ntb_transport_qp *qp;
> >> > +   union {
> >> > +   struct ntb_payload_header __iomem *tx_hdr;
> >> > +   struct ntb_payload_header *rx_hdr;
> >> > +   };
> >> > +   unsigned int index;
> >> >  };
> >> >
> >> >  struct ntb_rx_info {
> >> > @@ -86,6 +98,7 @@ struct ntb_transport_qp {
> >> > struct ntb_transport *transport;
> >> > struct ntb_device *ndev;
> >> > void *cb_data;
> >> > +   struct dma_chan *dma_chan;
> >> >
> >> > bool client_ready;
> >> > bool qp_link;
> >> > @@ -99,6 +112,7 @@ struct ntb_transport_qp {
> >> > struct list_head tx_free_q;
> >> > spinlock_t ntb_tx_free_q_lock;
> >> > void __iomem *tx_mw;
> >> > +   dma_addr_t tx_mw_raw;
> >> > unsigned int tx_index;
> >> > unsigned int tx_max_entry;
> >> > unsigned int tx_max_frame;
> >> > @@ -114,6 +128,7 @@ struct ntb_transport_qp {
> 

Re: [PATCH 09/15] NTB: Use DMA Engine to Transmit and Receive

2013-08-19 Thread Dan Williams


On 8/19/13 1:37 PM, "Jon Mason"  wrote:

>On Mon, Aug 19, 2013 at 03:01:54AM -0700, Dan Williams wrote:
>> On Fri, Aug 2, 2013 at 10:35 AM, Jon Mason  wrote:
>> > Allocate and use a DMA engine channel to transmit and receive data
>>over
>> > NTB.  If none is allocated, fall back to using the CPU to transfer
>>data.
>> >
>> > Cc: Dan Williams 
>> > Cc: Vinod Koul 
>> > Cc: Dave Jiang 
>> > Signed-off-by: Jon Mason 
>> > ---
>> >  drivers/ntb/ntb_hw.c|   17 +++
>> >  drivers/ntb/ntb_hw.h|1 +
>> >  drivers/ntb/ntb_transport.c |  285
>>---
>> >  3 files changed, 258 insertions(+), 45 deletions(-)
>> >
>> > diff --git a/drivers/ntb/ntb_hw.c b/drivers/ntb/ntb_hw.c
>> > index 1d8e551..014222c 100644
>> > --- a/drivers/ntb/ntb_hw.c
>> > +++ b/drivers/ntb/ntb_hw.c
>> > @@ -350,6 +350,23 @@ int ntb_read_remote_spad(struct ntb_device
>>*ndev, unsigned int idx, u32 *val)
>> >  }
>> >
>> >  /**
>> > + * ntb_get_mw_base() - get addr for the NTB memory window
>> > + * @ndev: pointer to ntb_device instance
>> > + * @mw: memory window number
>> > + *
>> > + * This function provides the base address of the memory window
>>specified.
>> > + *
>> > + * RETURNS: address, or NULL on error.
>> > + */
>> > +resource_size_t ntb_get_mw_base(struct ntb_device *ndev, unsigned
>>int mw)
>> > +{
>> > +   if (mw >= ntb_max_mw(ndev))
>> > +   return 0;
>> > +
>> > +   return pci_resource_start(ndev->pdev, MW_TO_BAR(mw));
>> > +}

Nothing does error checking on this return value.  I think the code should
either be sure that 'mw' is valid (mw_num is passed to the
ntb_get_mw_vbase helper too) and delete the check, or at least make it a
WARN_ONCE.  The former seems a tad cleaner to me.
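
As a hypothetical sketch of the WARN_ONCE option (the other option is simply
dropping the bounds check and trusting the caller to pass a valid window):

resource_size_t ntb_get_mw_base(struct ntb_device *ndev, unsigned int mw)
{
        if (WARN_ONCE(mw >= ntb_max_mw(ndev), "invalid mw %u\n", mw))
                return 0;

        return pci_resource_start(ndev->pdev, MW_TO_BAR(mw));
}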


>> > +
>> > +/**
>> >   * ntb_get_mw_vbase() - get virtual addr for the NTB memory window
>> >   * @ndev: pointer to ntb_device instance
>> >   * @mw: memory window number
>> > diff --git a/drivers/ntb/ntb_hw.h b/drivers/ntb/ntb_hw.h
>> > index b03de80..ab5f768 100644
>> > --- a/drivers/ntb/ntb_hw.h
>> > +++ b/drivers/ntb/ntb_hw.h
>> > @@ -240,6 +240,7 @@ int ntb_write_local_spad(struct ntb_device *ndev,
>>unsigned int idx, u32 val);
>> >  int ntb_read_local_spad(struct ntb_device *ndev, unsigned int idx,
>>u32 *val);
>> >  int ntb_write_remote_spad(struct ntb_device *ndev, unsigned int idx,
>>u32 val);
>> >  int ntb_read_remote_spad(struct ntb_device *ndev, unsigned int idx,
>>u32 *val);
>> > +resource_size_t ntb_get_mw_base(struct ntb_device *ndev, unsigned
>>int mw);
>> >  void __iomem *ntb_get_mw_vbase(struct ntb_device *ndev, unsigned int
>>mw);
>> >  u64 ntb_get_mw_size(struct ntb_device *ndev, unsigned int mw);
>> >  void ntb_ring_sdb(struct ntb_device *ndev, unsigned int idx);
>> > diff --git a/drivers/ntb/ntb_transport.c b/drivers/ntb/ntb_transport.c
>> > index f7380e9..73a35e4 100644
>> > --- a/drivers/ntb/ntb_transport.c
>> > +++ b/drivers/ntb/ntb_transport.c
>> > @@ -47,6 +47,7 @@
>> >   */
>> >  #include <linux/debugfs.h>
>> >  #include <linux/delay.h>
>> > +#include <linux/dmaengine.h>
>> >  #include <linux/dma-mapping.h>
>> >  #include <linux/errno.h>
>> >  #include <linux/export.h>
>> > @@ -68,6 +69,10 @@ static unsigned char max_num_clients;
>> >  module_param(max_num_clients, byte, 0644);
>> >  MODULE_PARM_DESC(max_num_clients, "Maximum number of NTB transport
>>clients");
>> >
>> > +static unsigned int copy_bytes = 1024;
>> > +module_param(copy_bytes, uint, 0644);
>> > +MODULE_PARM_DESC(copy_bytes, "Threshold under which NTB will use the
>>CPU to copy instead of DMA");
>> > +
>> >  struct ntb_queue_entry {
>> > /* ntb_queue list reference */
>> > struct list_head entry;
>> > @@ -76,6 +81,13 @@ struct ntb_queue_entry {
>> > void *buf;
>> > unsigned int len;
>> > unsigned int flags;
>> > +
>> > +   struct ntb_transport_qp *qp;
>> > +   union {
>> > +   struct ntb_payload_header __iomem *tx_hdr;
>> > +   struct ntb_payload_header *rx_hdr;
>> > +   };
>> > +   unsigned int index;
>> >  };
>> >
>> >  struct ntb_rx_info {
>> > @@ -86,6 +98,7 @@ struct ntb_transport_qp {
>> > struct ntb_transport *transport;
>> > struct ntb_device *ndev;
>> > void *cb_data;
>> > +   struct dma_chan *dma_chan;
>> >
>> > bool client_ready;
>> > bool qp_link;
>> > @@ -99,6 +112,7 @@ struct ntb_transport_qp {
>> > struct list_head tx_free_q;
>> > spinlock_t ntb_tx_free_q_lock;
>> > void __iomem *tx_mw;
>> > +   dma_addr_t tx_mw_raw;
>> > unsigned int tx_index;
>> > unsigned int tx_max_entry;
>> > unsigned int tx_max_frame;
>> > @@ -114,6 +128,7 @@ struct ntb_transport_qp {
>> > unsigned int rx_index;
>> > unsigned int rx_max_entry;
>> > unsigned int rx_max_frame;
>> > +   dma_cookie_t last_cookie;
>> >
>> > void (*event_handler) (void *data, int status);
>> > struct delayed_work link_work;
>> > @@ -129,9 +144,14 @@ struct ntb_transport_qp {
>> > u64 

Re: [PATCH 09/15] NTB: Use DMA Engine to Transmit and Receive

2013-08-19 Thread Jon Mason
On Mon, Aug 19, 2013 at 03:01:54AM -0700, Dan Williams wrote:
> On Fri, Aug 2, 2013 at 10:35 AM, Jon Mason  wrote:
> > Allocate and use a DMA engine channel to transmit and receive data over
> > NTB.  If none is allocated, fall back to using the CPU to transfer data.
> >
> > Cc: Dan Williams 
> > Cc: Vinod Koul 
> > Cc: Dave Jiang 
> > Signed-off-by: Jon Mason 
> > ---
> >  drivers/ntb/ntb_hw.c|   17 +++
> >  drivers/ntb/ntb_hw.h|1 +
> >  drivers/ntb/ntb_transport.c |  285 
> > ---
> >  3 files changed, 258 insertions(+), 45 deletions(-)
> >
> > diff --git a/drivers/ntb/ntb_hw.c b/drivers/ntb/ntb_hw.c
> > index 1d8e551..014222c 100644
> > --- a/drivers/ntb/ntb_hw.c
> > +++ b/drivers/ntb/ntb_hw.c
> > @@ -350,6 +350,23 @@ int ntb_read_remote_spad(struct ntb_device *ndev, 
> > unsigned int idx, u32 *val)
> >  }
> >
> >  /**
> > + * ntb_get_mw_base() - get addr for the NTB memory window
> > + * @ndev: pointer to ntb_device instance
> > + * @mw: memory window number
> > + *
> > + * This function provides the base address of the memory window specified.
> > + *
> > + * RETURNS: address, or NULL on error.
> > + */
> > +resource_size_t ntb_get_mw_base(struct ntb_device *ndev, unsigned int mw)
> > +{
> > +   if (mw >= ntb_max_mw(ndev))
> > +   return 0;
> > +
> > +   return pci_resource_start(ndev->pdev, MW_TO_BAR(mw));
> > +}
> > +
> > +/**
> >   * ntb_get_mw_vbase() - get virtual addr for the NTB memory window
> >   * @ndev: pointer to ntb_device instance
> >   * @mw: memory window number
> > diff --git a/drivers/ntb/ntb_hw.h b/drivers/ntb/ntb_hw.h
> > index b03de80..ab5f768 100644
> > --- a/drivers/ntb/ntb_hw.h
> > +++ b/drivers/ntb/ntb_hw.h
> > @@ -240,6 +240,7 @@ int ntb_write_local_spad(struct ntb_device *ndev, 
> > unsigned int idx, u32 val);
> >  int ntb_read_local_spad(struct ntb_device *ndev, unsigned int idx, u32 
> > *val);
> >  int ntb_write_remote_spad(struct ntb_device *ndev, unsigned int idx, u32 
> > val);
> >  int ntb_read_remote_spad(struct ntb_device *ndev, unsigned int idx, u32 
> > *val);
> > +resource_size_t ntb_get_mw_base(struct ntb_device *ndev, unsigned int mw);
> >  void __iomem *ntb_get_mw_vbase(struct ntb_device *ndev, unsigned int mw);
> >  u64 ntb_get_mw_size(struct ntb_device *ndev, unsigned int mw);
> >  void ntb_ring_sdb(struct ntb_device *ndev, unsigned int idx);
> > diff --git a/drivers/ntb/ntb_transport.c b/drivers/ntb/ntb_transport.c
> > index f7380e9..73a35e4 100644
> > --- a/drivers/ntb/ntb_transport.c
> > +++ b/drivers/ntb/ntb_transport.c
> > @@ -47,6 +47,7 @@
> >   */
> >  #include <linux/debugfs.h>
> >  #include <linux/delay.h>
> > +#include <linux/dmaengine.h>
> >  #include <linux/dma-mapping.h>
> >  #include <linux/errno.h>
> >  #include <linux/export.h>
> > @@ -68,6 +69,10 @@ static unsigned char max_num_clients;
> >  module_param(max_num_clients, byte, 0644);
> >  MODULE_PARM_DESC(max_num_clients, "Maximum number of NTB transport 
> > clients");
> >
> > +static unsigned int copy_bytes = 1024;
> > +module_param(copy_bytes, uint, 0644);
> > +MODULE_PARM_DESC(copy_bytes, "Threshold under which NTB will use the CPU 
> > to copy instead of DMA");
> > +
> >  struct ntb_queue_entry {
> > /* ntb_queue list reference */
> > struct list_head entry;
> > @@ -76,6 +81,13 @@ struct ntb_queue_entry {
> > void *buf;
> > unsigned int len;
> > unsigned int flags;
> > +
> > +   struct ntb_transport_qp *qp;
> > +   union {
> > +   struct ntb_payload_header __iomem *tx_hdr;
> > +   struct ntb_payload_header *rx_hdr;
> > +   };
> > +   unsigned int index;
> >  };
> >
> >  struct ntb_rx_info {
> > @@ -86,6 +98,7 @@ struct ntb_transport_qp {
> > struct ntb_transport *transport;
> > struct ntb_device *ndev;
> > void *cb_data;
> > +   struct dma_chan *dma_chan;
> >
> > bool client_ready;
> > bool qp_link;
> > @@ -99,6 +112,7 @@ struct ntb_transport_qp {
> > struct list_head tx_free_q;
> > spinlock_t ntb_tx_free_q_lock;
> > void __iomem *tx_mw;
> > +   dma_addr_t tx_mw_raw;
> > unsigned int tx_index;
> > unsigned int tx_max_entry;
> > unsigned int tx_max_frame;
> > @@ -114,6 +128,7 @@ struct ntb_transport_qp {
> > unsigned int rx_index;
> > unsigned int rx_max_entry;
> > unsigned int rx_max_frame;
> > +   dma_cookie_t last_cookie;
> >
> > void (*event_handler) (void *data, int status);
> > struct delayed_work link_work;
> > @@ -129,9 +144,14 @@ struct ntb_transport_qp {
> > u64 rx_err_no_buf;
> > u64 rx_err_oflow;
> > u64 rx_err_ver;
> > +   u64 rx_memcpy;
> > +   u64 rx_async;
> > u64 tx_bytes;
> > u64 tx_pkts;
> > u64 tx_ring_full;
> > +   u64 tx_err_no_buf;
> > +   u64 tx_memcpy;
> > +   u64 tx_async;
> >  };
> >
> >  struct ntb_transport_mw {
> > @@ -381,7 +401,7 @@ static ssize_t debugfs_read(struct file *filp, char 
> > __user 

Re: [PATCH 09/15] NTB: Use DMA Engine to Transmit and Receive

2013-08-19 Thread Dan Williams
On Fri, Aug 2, 2013 at 10:35 AM, Jon Mason  wrote:
> Allocate and use a DMA engine channel to transmit and receive data over
> NTB.  If none is allocated, fall back to using the CPU to transfer data.
>
> Cc: Dan Williams 
> Cc: Vinod Koul 
> Cc: Dave Jiang 
> Signed-off-by: Jon Mason 
> ---
>  drivers/ntb/ntb_hw.c|   17 +++
>  drivers/ntb/ntb_hw.h|1 +
>  drivers/ntb/ntb_transport.c |  285 
> ---
>  3 files changed, 258 insertions(+), 45 deletions(-)
>
> diff --git a/drivers/ntb/ntb_hw.c b/drivers/ntb/ntb_hw.c
> index 1d8e551..014222c 100644
> --- a/drivers/ntb/ntb_hw.c
> +++ b/drivers/ntb/ntb_hw.c
> @@ -350,6 +350,23 @@ int ntb_read_remote_spad(struct ntb_device *ndev, 
> unsigned int idx, u32 *val)
>  }
>
>  /**
> + * ntb_get_mw_base() - get addr for the NTB memory window
> + * @ndev: pointer to ntb_device instance
> + * @mw: memory window number
> + *
> + * This function provides the base address of the memory window specified.
> + *
> + * RETURNS: address, or NULL on error.
> + */
> +resource_size_t ntb_get_mw_base(struct ntb_device *ndev, unsigned int mw)
> +{
> +   if (mw >= ntb_max_mw(ndev))
> +   return 0;
> +
> +   return pci_resource_start(ndev->pdev, MW_TO_BAR(mw));
> +}
> +
> +/**
>   * ntb_get_mw_vbase() - get virtual addr for the NTB memory window
>   * @ndev: pointer to ntb_device instance
>   * @mw: memory window number
> diff --git a/drivers/ntb/ntb_hw.h b/drivers/ntb/ntb_hw.h
> index b03de80..ab5f768 100644
> --- a/drivers/ntb/ntb_hw.h
> +++ b/drivers/ntb/ntb_hw.h
> @@ -240,6 +240,7 @@ int ntb_write_local_spad(struct ntb_device *ndev, 
> unsigned int idx, u32 val);
>  int ntb_read_local_spad(struct ntb_device *ndev, unsigned int idx, u32 *val);
>  int ntb_write_remote_spad(struct ntb_device *ndev, unsigned int idx, u32 
> val);
>  int ntb_read_remote_spad(struct ntb_device *ndev, unsigned int idx, u32 
> *val);
> +resource_size_t ntb_get_mw_base(struct ntb_device *ndev, unsigned int mw);
>  void __iomem *ntb_get_mw_vbase(struct ntb_device *ndev, unsigned int mw);
>  u64 ntb_get_mw_size(struct ntb_device *ndev, unsigned int mw);
>  void ntb_ring_sdb(struct ntb_device *ndev, unsigned int idx);
> diff --git a/drivers/ntb/ntb_transport.c b/drivers/ntb/ntb_transport.c
> index f7380e9..73a35e4 100644
> --- a/drivers/ntb/ntb_transport.c
> +++ b/drivers/ntb/ntb_transport.c
> @@ -47,6 +47,7 @@
>   */
>  #include <linux/debugfs.h>
>  #include <linux/delay.h>
> +#include <linux/dmaengine.h>
>  #include <linux/dma-mapping.h>
>  #include <linux/errno.h>
>  #include <linux/export.h>
> @@ -68,6 +69,10 @@ static unsigned char max_num_clients;
>  module_param(max_num_clients, byte, 0644);
>  MODULE_PARM_DESC(max_num_clients, "Maximum number of NTB transport clients");
>
> +static unsigned int copy_bytes = 1024;
> +module_param(copy_bytes, uint, 0644);
> +MODULE_PARM_DESC(copy_bytes, "Threshold under which NTB will use the CPU to 
> copy instead of DMA");
> +
>  struct ntb_queue_entry {
> /* ntb_queue list reference */
> struct list_head entry;
> @@ -76,6 +81,13 @@ struct ntb_queue_entry {
> void *buf;
> unsigned int len;
> unsigned int flags;
> +
> +   struct ntb_transport_qp *qp;
> +   union {
> +   struct ntb_payload_header __iomem *tx_hdr;
> +   struct ntb_payload_header *rx_hdr;
> +   };
> +   unsigned int index;
>  };
>
>  struct ntb_rx_info {
> @@ -86,6 +98,7 @@ struct ntb_transport_qp {
> struct ntb_transport *transport;
> struct ntb_device *ndev;
> void *cb_data;
> +   struct dma_chan *dma_chan;
>
> bool client_ready;
> bool qp_link;
> @@ -99,6 +112,7 @@ struct ntb_transport_qp {
> struct list_head tx_free_q;
> spinlock_t ntb_tx_free_q_lock;
> void __iomem *tx_mw;
> +   dma_addr_t tx_mw_raw;
> unsigned int tx_index;
> unsigned int tx_max_entry;
> unsigned int tx_max_frame;
> @@ -114,6 +128,7 @@ struct ntb_transport_qp {
> unsigned int rx_index;
> unsigned int rx_max_entry;
> unsigned int rx_max_frame;
> +   dma_cookie_t last_cookie;
>
> void (*event_handler) (void *data, int status);
> struct delayed_work link_work;
> @@ -129,9 +144,14 @@ struct ntb_transport_qp {
> u64 rx_err_no_buf;
> u64 rx_err_oflow;
> u64 rx_err_ver;
> +   u64 rx_memcpy;
> +   u64 rx_async;
> u64 tx_bytes;
> u64 tx_pkts;
> u64 tx_ring_full;
> +   u64 tx_err_no_buf;
> +   u64 tx_memcpy;
> +   u64 tx_async;
>  };
>
>  struct ntb_transport_mw {
> @@ -381,7 +401,7 @@ static ssize_t debugfs_read(struct file *filp, char 
> __user *ubuf, size_t count,
> char *buf;
> ssize_t ret, out_offset, out_count;
>
> -   out_count = 600;
> +   out_count = 1000;
>
> buf = kmalloc(out_count, GFP_KERNEL);
> if (!buf)
> @@ -396,6 +416,10 @@ static ssize_t debugfs_read(struct file *filp, char 
> __user *ubuf, size_t count,
> out_offset += 

[PATCH 09/15] NTB: Use DMA Engine to Transmit and Receive

2013-08-02 Thread Jon Mason
Allocate and use a DMA engine channel to transmit and receive data over
NTB.  If none is allocated, fall back to using the CPU to transfer data.

Cc: Dan Williams 
Cc: Vinod Koul 
Cc: Dave Jiang 
Signed-off-by: Jon Mason 
---
 drivers/ntb/ntb_hw.c|   17 +++
 drivers/ntb/ntb_hw.h|1 +
 drivers/ntb/ntb_transport.c |  285 ---
 3 files changed, 258 insertions(+), 45 deletions(-)

diff --git a/drivers/ntb/ntb_hw.c b/drivers/ntb/ntb_hw.c
index 1d8e551..014222c 100644
--- a/drivers/ntb/ntb_hw.c
+++ b/drivers/ntb/ntb_hw.c
@@ -350,6 +350,23 @@ int ntb_read_remote_spad(struct ntb_device *ndev, unsigned 
int idx, u32 *val)
 }
 
 /**
+ * ntb_get_mw_base() - get addr for the NTB memory window
+ * @ndev: pointer to ntb_device instance
+ * @mw: memory window number
+ *
+ * This function provides the base address of the memory window specified.
+ *
+ * RETURNS: address, or NULL on error.
+ */
+resource_size_t ntb_get_mw_base(struct ntb_device *ndev, unsigned int mw)
+{
+   if (mw >= ntb_max_mw(ndev))
+   return 0;
+
+   return pci_resource_start(ndev->pdev, MW_TO_BAR(mw));
+}
+
+/**
  * ntb_get_mw_vbase() - get virtual addr for the NTB memory window
  * @ndev: pointer to ntb_device instance
  * @mw: memory window number
diff --git a/drivers/ntb/ntb_hw.h b/drivers/ntb/ntb_hw.h
index b03de80..ab5f768 100644
--- a/drivers/ntb/ntb_hw.h
+++ b/drivers/ntb/ntb_hw.h
@@ -240,6 +240,7 @@ int ntb_write_local_spad(struct ntb_device *ndev, unsigned 
int idx, u32 val);
 int ntb_read_local_spad(struct ntb_device *ndev, unsigned int idx, u32 *val);
 int ntb_write_remote_spad(struct ntb_device *ndev, unsigned int idx, u32 val);
 int ntb_read_remote_spad(struct ntb_device *ndev, unsigned int idx, u32 *val);
+resource_size_t ntb_get_mw_base(struct ntb_device *ndev, unsigned int mw);
 void __iomem *ntb_get_mw_vbase(struct ntb_device *ndev, unsigned int mw);
 u64 ntb_get_mw_size(struct ntb_device *ndev, unsigned int mw);
 void ntb_ring_sdb(struct ntb_device *ndev, unsigned int idx);
diff --git a/drivers/ntb/ntb_transport.c b/drivers/ntb/ntb_transport.c
index f7380e9..73a35e4 100644
--- a/drivers/ntb/ntb_transport.c
+++ b/drivers/ntb/ntb_transport.c
@@ -47,6 +47,7 @@
  */
 #include <linux/debugfs.h>
 #include <linux/delay.h>
+#include <linux/dmaengine.h>
 #include <linux/dma-mapping.h>
 #include <linux/errno.h>
 #include <linux/export.h>
@@ -68,6 +69,10 @@ static unsigned char max_num_clients;
 module_param(max_num_clients, byte, 0644);
 MODULE_PARM_DESC(max_num_clients, "Maximum number of NTB transport clients");
 
+static unsigned int copy_bytes = 1024;
+module_param(copy_bytes, uint, 0644);
+MODULE_PARM_DESC(copy_bytes, "Threshold under which NTB will use the CPU to 
copy instead of DMA");
+
 struct ntb_queue_entry {
/* ntb_queue list reference */
struct list_head entry;
@@ -76,6 +81,13 @@ struct ntb_queue_entry {
void *buf;
unsigned int len;
unsigned int flags;
+
+   struct ntb_transport_qp *qp;
+   union {
+   struct ntb_payload_header __iomem *tx_hdr;
+   struct ntb_payload_header *rx_hdr;
+   };
+   unsigned int index;
 };
 
 struct ntb_rx_info {
@@ -86,6 +98,7 @@ struct ntb_transport_qp {
struct ntb_transport *transport;
struct ntb_device *ndev;
void *cb_data;
+   struct dma_chan *dma_chan;
 
bool client_ready;
bool qp_link;
@@ -99,6 +112,7 @@ struct ntb_transport_qp {
struct list_head tx_free_q;
spinlock_t ntb_tx_free_q_lock;
void __iomem *tx_mw;
+   dma_addr_t tx_mw_raw;
unsigned int tx_index;
unsigned int tx_max_entry;
unsigned int tx_max_frame;
@@ -114,6 +128,7 @@ struct ntb_transport_qp {
unsigned int rx_index;
unsigned int rx_max_entry;
unsigned int rx_max_frame;
+   dma_cookie_t last_cookie;
 
void (*event_handler) (void *data, int status);
struct delayed_work link_work;
@@ -129,9 +144,14 @@ struct ntb_transport_qp {
u64 rx_err_no_buf;
u64 rx_err_oflow;
u64 rx_err_ver;
+   u64 rx_memcpy;
+   u64 rx_async;
u64 tx_bytes;
u64 tx_pkts;
u64 tx_ring_full;
+   u64 tx_err_no_buf;
+   u64 tx_memcpy;
+   u64 tx_async;
 };
 
 struct ntb_transport_mw {
@@ -381,7 +401,7 @@ static ssize_t debugfs_read(struct file *filp, char __user 
*ubuf, size_t count,
char *buf;
ssize_t ret, out_offset, out_count;
 
-   out_count = 600;
+   out_count = 1000;
 
buf = kmalloc(out_count, GFP_KERNEL);
if (!buf)
@@ -396,6 +416,10 @@ static ssize_t debugfs_read(struct file *filp, char __user 
*ubuf, size_t count,
out_offset += snprintf(buf + out_offset, out_count - out_offset,
   "rx_pkts - \t%llu\n", qp->rx_pkts);
out_offset += snprintf(buf + out_offset, out_count - out_offset,
+  "rx_memcpy - \t%llu\n", qp->rx_memcpy);
+   out_offset += snprintf(buf + out_offset, out_count - out_offset,
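
A hedged sketch of the allocate-or-fall-back policy the changelog above
describes (assumed shape only; the helper names are hypothetical): request any
memcpy-capable channel when the queue is created, and fall back to the CPU
when no channel exists or the payload is below the copy_bytes threshold.

static void ntb_setup_dma(struct ntb_transport_qp *qp)
{
        dma_cap_mask_t dma_mask;

        dma_cap_zero(dma_mask);
        dma_cap_set(DMA_MEMCPY, dma_mask);

        qp->dma_chan = dma_request_channel(dma_mask, NULL, NULL);
        if (!qp->dma_chan)
                pr_info("Unable to allocate a DMA channel, using the CPU\n");
}

static bool ntb_use_cpu_copy(struct ntb_transport_qp *qp, unsigned int len)
{
        return !qp->dma_chan || len < copy_bytes;
}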
