Re: [PATCH 02/13] dmaengine: edma: Optimize memcpy operation

2015-10-14 Thread Vinod Koul
On Wed, Oct 14, 2015 at 04:12:13PM +0300, Peter Ujfalusi wrote:
> @@ -1320,41 +1317,92 @@ static struct dma_async_tx_descriptor 
> *edma_prep_dma_memcpy(
>   struct dma_chan *chan, dma_addr_t dest, dma_addr_t src,
>   size_t len, unsigned long tx_flags)
>  {
> - int ret;
> + int ret, nslots;
>   struct edma_desc *edesc;
>   struct device *dev = chan->device->dev;
>   struct edma_chan *echan = to_edma_chan(chan);
> - unsigned int width;
> + unsigned int width, pset_len;
>  
>   if (unlikely(!echan || !len))
>   return NULL;
>  
> - edesc = kzalloc(sizeof(*edesc) + sizeof(edesc->pset[0]), GFP_ATOMIC);
> + if (len < SZ_64K) {
> + /*
> +  * Transfer size less than 64K can be handled with one paRAM
> +  * slot. ACNT = length
> +  */
> + width = len;
> + pset_len = len;
> + nslots = 1;
> + } else {
> + /*
> +  * Transfer size bigger than 64K will be handled with maximum of
> +  * two paRAM slots.
> +  * slot1: ACNT = 32767, length1: (length / 32767)
> +  * slot2: the remaining amount of data.
> +  */
> + width = SZ_32K - 1;
> + pset_len = rounddown(len, width);
> + /* One slot is enough for lengths multiple of (SZ_32K -1) */

Hmm so does this mean if I have 140K transfer, it will do two 64K for 1st
slot and 12K in second slot ?

Is there a limit on 'blocks' of 64K we can do here?

-- 
~Vinod
--
To unsubscribe from this list: send the line "unsubscribe linux-omap" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 02/13] dmaengine: edma: Optimize memcpy operation

2015-10-14 Thread Peter Ujfalusi
On 10/14/2015 05:41 PM, Vinod Koul wrote:
> On Wed, Oct 14, 2015 at 04:12:13PM +0300, Peter Ujfalusi wrote:
>> @@ -1320,41 +1317,92 @@ static struct dma_async_tx_descriptor 
>> *edma_prep_dma_memcpy(
>>  struct dma_chan *chan, dma_addr_t dest, dma_addr_t src,
>>  size_t len, unsigned long tx_flags)
>>  {
>> -int ret;
>> +int ret, nslots;
>>  struct edma_desc *edesc;
>>  struct device *dev = chan->device->dev;
>>  struct edma_chan *echan = to_edma_chan(chan);
>> -unsigned int width;
>> +unsigned int width, pset_len;
>>  
>>  if (unlikely(!echan || !len))
>>  return NULL;
>>  
>> -edesc = kzalloc(sizeof(*edesc) + sizeof(edesc->pset[0]), GFP_ATOMIC);
>> +if (len < SZ_64K) {
>> +/*
>> + * Transfer size less than 64K can be handled with one paRAM
>> + * slot. ACNT = length
>> + */
>> +width = len;
>> +pset_len = len;
>> +nslots = 1;
>> +} else {
>> +/*
>> + * Transfer size bigger than 64K will be handled with maximum of
>> + * two paRAM slots.
>> + * slot1: ACNT = 32767, length1: (length / 32767)
>> + * slot2: the remaining amount of data.
>> + */
>> +width = SZ_32K - 1;
>> +pset_len = rounddown(len, width);
>> +/* One slot is enough for lengths multiple of (SZ_32K -1) */
> 
> Hmm so does this mean if I have 140K transfer, it will do two 64K for 1st
> slot and 12K in second slot ?

Not exactly. If the size is less than 64K it can be done with one 'burst' but
if it is bigger we need to have two sets of transfer:
1. 32K blocks
2. the remaining data

so in case of 140K:
4 x 32K followed by 12K

> 
> Is there a limit on 'blocks' of 64K we can do here?

32767 32K blocks is the limit.

The 64K burst is only possible if the whole transfer is less than 64K.
With the ACNT counter we can transfer 64K - 1 bytes, but if this is not enough
we need to use the BCNT counter, and for that to work the distance between
the start of 'slot n' and the start of 'slot n+1' needs to be less than 32K;
this is the reason why we have 32K 'blocks' to transfer first, followed by the
remaining data.

-- 
Péter
--
To unsubscribe from this list: send the line "unsubscribe linux-omap" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 02/13] dmaengine: edma: Optimize memcpy operation

2015-10-14 Thread Vinod Koul
On Wed, Oct 14, 2015 at 06:02:18PM +0300, Peter Ujfalusi wrote:
> On 10/14/2015 05:41 PM, Vinod Koul wrote:
> > On Wed, Oct 14, 2015 at 04:12:13PM +0300, Peter Ujfalusi wrote:
> >> @@ -1320,41 +1317,92 @@ static struct dma_async_tx_descriptor 
> >> *edma_prep_dma_memcpy(
> >>struct dma_chan *chan, dma_addr_t dest, dma_addr_t src,
> >>size_t len, unsigned long tx_flags)
> >>  {
> >> -  int ret;
> >> +  int ret, nslots;
> >>struct edma_desc *edesc;
> >>struct device *dev = chan->device->dev;
> >>struct edma_chan *echan = to_edma_chan(chan);
> >> -  unsigned int width;
> >> +  unsigned int width, pset_len;
> >>  
> >>if (unlikely(!echan || !len))
> >>return NULL;
> >>  
> >> -  edesc = kzalloc(sizeof(*edesc) + sizeof(edesc->pset[0]), GFP_ATOMIC);
> >> +  if (len < SZ_64K) {
> >> +  /*
> >> +   * Transfer size less than 64K can be handled with one paRAM
> >> +   * slot. ACNT = length
> >> +   */
> >> +  width = len;
> >> +  pset_len = len;
> >> +  nslots = 1;
> >> +  } else {
> >> +  /*
> >> +   * Transfer size bigger than 64K will be handled with maximum of
> >> +   * two paRAM slots.
> >> +   * slot1: ACNT = 32767, length1: (length / 32767)
> >> +   * slot2: the remaining amount of data.
> >> +   */
> >> +  width = SZ_32K - 1;
> >> +  pset_len = rounddown(len, width);
> >> +  /* One slot is enough for lengths multiple of (SZ_32K -1) */
> > 
> > Hmm so does this mean if I have 140K transfer, it will do two 64K for 1st
> > slot and 12K in second slot ?
> 
> Not exactly. If the size is less than 64K it can be done with one 'burst' but
> if it is bigger we need to have two sets of transfer:
> 1. 32K blocks
> 2. the remaining data
> 
> so in case of 140K:
> 4 x 32K followed by 12K

Okay, this part wasn't very clear to me. Can you please add a comment
explaining this bit?

> 
> > 
> > Is there a limit on 'blocks' of 64K we can do here?
> 
> 32767 32K blocks is the limit.
> 
> The 64K burst is only possible if the whole transfer is less than 64K.
> With the ACNT counter we can transfer 64K - 1 bytes, but if this is not enough
> we need to use the BCNT counter, and for that to work the distance between
> the start of 'slot n' and the start of 'slot n+1' needs to be less than 32K;
> this is the reason why we have 32K 'blocks' to transfer first, followed by the
> remaining data.

Okay, IIUC we have the option to do a single burst using one slot if it's
less than 64K; otherwise we split into 32K chunks with 2 slots, or would it
be N slots in that case?

Really need more documentation here :)
-- 
~Vinod
--
To unsubscribe from this list: send the line "unsubscribe linux-omap" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 02/13] dmaengine: edma: Optimize memcpy operation

2015-10-14 Thread Peter Ujfalusi
If the transfer is shorter than 64K we can complete it with one ACNT burst
by configuring ACNT to the length of the copy; this requires one paRAM slot.
Otherwise we use two paRAM slots for the copy:
slot1: will copy (length / 32767) number of 32767 byte long blocks
slot2: will be configured to copy the remaining data.

According to tests this patch increases the throughput of memcpy from
~3MB/s to 15MB/s

Signed-off-by: Peter Ujfalusi 
---
 drivers/dma/edma.c | 90 +-
 1 file changed, 69 insertions(+), 21 deletions(-)

diff --git a/drivers/dma/edma.c b/drivers/dma/edma.c
index b36dfa5458cb..6de571f4aa0f 100644
--- a/drivers/dma/edma.c
+++ b/drivers/dma/edma.c
@@ -1107,19 +1107,16 @@ static int edma_dma_resume(struct dma_chan *chan)
  */
 static int edma_config_pset(struct dma_chan *chan, struct edma_pset *epset,
dma_addr_t src_addr, dma_addr_t dst_addr, u32 burst,
-   enum dma_slave_buswidth dev_width,
-   unsigned int dma_length,
+   unsigned int acnt, unsigned int dma_length,
enum dma_transfer_direction direction)
 {
struct edma_chan *echan = to_edma_chan(chan);
struct device *dev = chan->device->dev;
struct edmacc_param *param = >param;
-   int acnt, bcnt, ccnt, cidx;
+   int bcnt, ccnt, cidx;
int src_bidx, dst_bidx, src_cidx, dst_cidx;
int absync;
 
-   acnt = dev_width;
-
/* src/dst_maxburst == 0 is the same case as src/dst_maxburst == 1 */
if (!burst)
burst = 1;
@@ -1320,41 +1317,92 @@ static struct dma_async_tx_descriptor 
*edma_prep_dma_memcpy(
struct dma_chan *chan, dma_addr_t dest, dma_addr_t src,
size_t len, unsigned long tx_flags)
 {
-   int ret;
+   int ret, nslots;
struct edma_desc *edesc;
struct device *dev = chan->device->dev;
struct edma_chan *echan = to_edma_chan(chan);
-   unsigned int width;
+   unsigned int width, pset_len;
 
if (unlikely(!echan || !len))
return NULL;
 
-   edesc = kzalloc(sizeof(*edesc) + sizeof(edesc->pset[0]), GFP_ATOMIC);
+   if (len < SZ_64K) {
+   /*
+* Transfer size less than 64K can be handled with one paRAM
+* slot. ACNT = length
+*/
+   width = len;
+   pset_len = len;
+   nslots = 1;
+   } else {
+   /*
+* Transfer size bigger than 64K will be handled with maximum of
+* two paRAM slots.
+* slot1: ACNT = 32767, length1: (length / 32767)
+* slot2: the remaining amount of data.
+*/
+   width = SZ_32K - 1;
+   pset_len = rounddown(len, width);
+   /* One slot is enough for lengths multiple of (SZ_32K -1) */
+   if (unlikely(pset_len == len))
+   nslots = 1;
+   else
+   nslots = 2;
+   }
+
+   edesc = kzalloc(sizeof(*edesc) + nslots * sizeof(edesc->pset[0]),
+   GFP_ATOMIC);
if (!edesc) {
dev_dbg(dev, "Failed to allocate a descriptor\n");
return NULL;
}
 
-   edesc->pset_nr = 1;
-
-   width = 1 << __ffs((src | dest | len));
-   if (width > DMA_SLAVE_BUSWIDTH_64_BYTES)
-   width = DMA_SLAVE_BUSWIDTH_64_BYTES;
+   edesc->pset_nr = nslots;
+   edesc->residue = edesc->residue_stat = len;
+   edesc->direction = DMA_MEM_TO_MEM;
+   edesc->echan = echan;
 
ret = edma_config_pset(chan, >pset[0], src, dest, 1,
-  width, len, DMA_MEM_TO_MEM);
-   if (ret < 0)
+  width, pset_len, DMA_MEM_TO_MEM);
+   if (ret < 0) {
+   kfree(edesc);
return NULL;
+   }
 
edesc->absync = ret;
 
-   /*
-* Enable intermediate transfer chaining to re-trigger channel
-* on completion of every TR, and enable transfer-completion
-* interrupt on completion of the whole transfer.
-*/
edesc->pset[0].param.opt |= ITCCHEN;
-   edesc->pset[0].param.opt |= TCINTEN;
+   if (nslots == 1) {
+   /* Enable transfer complete interrupt */
+   edesc->pset[0].param.opt |= TCINTEN;
+   } else {
+   /* Enable transfer complete chaining for the first slot */
+   edesc->pset[0].param.opt |= TCCHEN;
+
+   if (echan->slot[1] < 0) {
+   echan->slot[1] = edma_alloc_slot(echan->ecc,
+EDMA_SLOT_ANY);
+   if (echan->slot[1] < 0) {
+   kfree(edesc);
+   dev_err(dev, "%s: Failed to