Re: [PATCH 5/9] iommu: add qcom_iommu

2017-03-30 Thread Archit Taneja



On 3/30/2017 7:16 PM, Rob Clark wrote:

On Thu, Mar 30, 2017 at 2:19 AM, Archit Taneja  wrote:

Hi,

On 03/14/2017 08:48 PM, Rob Clark wrote:


An iommu driver for Qualcomm "B" family devices which do not completely
implement the ARM SMMU spec.  These devices have context-bank register
layout that is similar to ARM SMMU, but no global register space (or at
least not one that is accessible).

Signed-off-by: Rob Clark 
Signed-off-by: Stanimir Varbanov 
---
 drivers/iommu/Kconfig |  10 +
 drivers/iommu/Makefile|   1 +
 drivers/iommu/arm-smmu-regs.h |   2 +
 drivers/iommu/qcom_iommu.c| 818
++
 4 files changed, 831 insertions(+)
 create mode 100644 drivers/iommu/qcom_iommu.c






+
+static int qcom_iommu_add_device(struct device *dev)
+{
+   struct qcom_iommu_dev *qcom_iommu = __to_iommu(dev->iommu_fwspec);



__to_iommu() has a WARN_ON() that gets triggered here for all devices on
the platform bus that aren't backed by our iommu. We should return -ENODEV
for all of them without throwing a warning.


+   struct iommu_group *group;
+   struct device_link *link;
+



We could do something like:

if (fwspec && fwspec->ops == _iommu_ops)
qcom_iommu = __to_iommu(fwspec);
else
qcom_iommu = NULL;


thanks.. I wonder how I wasn't hitting that?


There's an additional commit in your branch which unintentionally
avoids calling add_device() being called for all devs.

https://github.com/freedreno/kernel-msm/commit/e2444266e37a85ad5188a4511ab26cc4b0f85641

Thanks,
Archit


I'll incorporate this (plus small dt bindings doc update) into next

version.. probably won't have time to send until the weekend or next
week

BR,
-R



Thanks,
Archit



+   if (!qcom_iommu)
+   return -ENODEV;
+
+   /*
+* Establish the link between iommu and master, so that the
+* iommu gets runtime enabled/disabled as per the master's
+* needs.
+*/
+   link = device_link_add(dev, qcom_iommu->dev, DL_FLAG_PM_RUNTIME);
+   if (!link) {
+   dev_err(qcom_iommu->dev, "Unable to create device link
between %s and %s\n",
+   dev_name(qcom_iommu->dev), dev_name(dev));
+   return -ENODEV;
+   }
+
+   group = iommu_group_get_for_dev(dev);
+   if (IS_ERR_OR_NULL(group))
+   return PTR_ERR_OR_ZERO(group);
+
+   iommu_group_put(group);
+   iommu_device_link(_iommu->iommu, dev);
+
+   return 0;
+}
+
+static void qcom_iommu_remove_device(struct device *dev)
+{
+   struct qcom_iommu_dev *qcom_iommu = __to_iommu(dev->iommu_fwspec);
+
+   if (!qcom_iommu)
+   return;
+
+   iommu_group_remove_device(dev);
+   iommu_device_unlink(_iommu->iommu, dev);
+   iommu_fwspec_free(dev);
+}
+
+static struct iommu_group *qcom_iommu_device_group(struct device *dev)
+{
+   struct iommu_fwspec *fwspec = dev->iommu_fwspec;
+   struct iommu_group *group = NULL;
+   unsigned i;
+
+   for (i = 0; i < fwspec->num_ids; i++) {
+   struct qcom_iommu_ctx *ctx = __to_ctx(fwspec,
fwspec->ids[i]);
+
+   if (group && ctx->group && group != ctx->group)
+   return ERR_PTR(-EINVAL);
+
+   group = ctx->group;
+   }
+
+   if (group)
+   return iommu_group_ref_get(group);
+
+   group = generic_device_group(dev);
+
+   for (i = 0; i < fwspec->num_ids; i++) {
+   struct qcom_iommu_ctx *ctx = __to_ctx(fwspec,
fwspec->ids[i]);
+   ctx->group = iommu_group_ref_get(group);
+   }
+
+   return group;
+}
+
+static int qcom_iommu_of_xlate(struct device *dev, struct of_phandle_args
*args)
+{
+   struct platform_device *iommu_pdev;
+
+   if (args->args_count != 1) {
+   dev_err(dev, "incorrect number of iommu params found for
%s "
+   "(found %d, expected 1)\n",
+   args->np->full_name, args->args_count);
+   return -EINVAL;
+   }
+
+   if (!dev->iommu_fwspec->iommu_priv) {
+   iommu_pdev = of_find_device_by_node(args->np);
+   if (WARN_ON(!iommu_pdev))
+   return -EINVAL;
+
+   dev->iommu_fwspec->iommu_priv =
platform_get_drvdata(iommu_pdev);
+   }
+
+   return iommu_fwspec_add_ids(dev, >args[0], 1);
+}
+
+static const struct iommu_ops qcom_iommu_ops = {
+   .capable= qcom_iommu_capable,
+   .domain_alloc   = qcom_iommu_domain_alloc,
+   .domain_free= qcom_iommu_domain_free,
+   .attach_dev = qcom_iommu_attach_dev,
+   .detach_dev = qcom_iommu_detach_dev,
+   .map= qcom_iommu_map,
+   .unmap  = qcom_iommu_unmap,
+   .map_sg = default_iommu_map_sg,
+   .iova_to_phys   = qcom_iommu_iova_to_phys,
+   .add_device = 

Re: [PATCH 2/7] iommu/iova: cut down judgement times

2017-03-30 Thread Leizhen (ThunderTown)


On 2017/3/23 20:11, Robin Murphy wrote:
> On 22/03/17 06:27, Zhen Lei wrote:
>> Below judgement can only be satisfied at the last time, which produced 2N
>> judgements(suppose N times failed, 0 or 1 time successed) in vain.
>>
>> if ((pfn >= iova->pfn_lo) && (pfn <= iova->pfn_hi)) {
>>  return iova;
>> }
> 
> For me, GCC (6.2.1 AArch64) seems to do a pretty good job of this
> function already, so this change only saves two instructions in total
> (pfn is compared against pfn_lo only once instead of twice), which I
> wouldn't expect to see a noticeable performance effect from.
OK, thanks for your careful analysis.

Although only two instructions saved in each loop iteration, but it's also an 
improvment and no harm.

> 
> Given the improvement in readability, though, I don't even care about
> any codegen differences :)
> 
> Reviewed-by: Robin Murphy 
> 
>> Signed-off-by: Zhen Lei 
>> ---
>>  drivers/iommu/iova.c | 9 +++--
>>  1 file changed, 3 insertions(+), 6 deletions(-)
>>
>> diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
>> index 8ba8b496..1c49969 100644
>> --- a/drivers/iommu/iova.c
>> +++ b/drivers/iommu/iova.c
>> @@ -312,15 +312,12 @@ private_find_iova(struct iova_domain *iovad, unsigned 
>> long pfn)
>>  while (node) {
>>  struct iova *iova = rb_entry(node, struct iova, node);
>>  
>> -/* If pfn falls within iova's range, return iova */
>> -if ((pfn >= iova->pfn_lo) && (pfn <= iova->pfn_hi)) {
>> -return iova;
>> -}
>> -
>>  if (pfn < iova->pfn_lo)
>>  node = node->rb_left;
>> -else if (pfn > iova->pfn_lo)
>> +else if (pfn > iova->pfn_hi)
>>  node = node->rb_right;
>> +else
>> +return iova;/* pfn falls within iova's range */
>>  }
>>  
>>  return NULL;
>>
> 
> 
> .
> 

-- 
Thanks!
BestRegards

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [PATCH 3/7] iommu/iova: insert start_pfn boundary of dma32

2017-03-30 Thread Leizhen (ThunderTown)
Because the problem of my email-server, all patches sent to Joerg Roedel 
 failed.
So I repost this email again.


On 2017/3/24 11:43, Leizhen (ThunderTown) wrote:
> 
> 
> On 2017/3/23 21:01, Robin Murphy wrote:
>> On 22/03/17 06:27, Zhen Lei wrote:
>>> Reserve the first granule size memory(start at start_pfn) as boundary
>>> iova, to make sure that iovad->cached32_node can not be NULL in future.
>>> Meanwhile, changed the assignment of iovad->cached32_node from rb_next to
>>> rb_prev of >node in function __cached_rbnode_delete_update.
>>
>> I'm not sure I follow this. It's a top-down allocator, so cached32_node
>> points to the last node allocated (or the node above the last one freed)
>> on the assumption that there is likely free space directly below there,
>> thus it's a pretty good place for the next allocation to start searching
>> from. On the other hand, start_pfn is a hard "do not go below this line"
>> limit, so it doesn't seem to make any sense to ever point the former at
>> the latter.
> This patch just prepares for dma64. Because we really need to add the boundary
> between dma32 and dma64, there are two main purposes:
> 1. to make dma32 iova allocation faster, because the last node which dma32 
> can be
> seen is the boundary. So dma32 iova allocation will only try within dma32 
> iova space.
> Meanwhile, we hope dma64 allocation try dma64 iova space(iova>=4G) first, 
> because the
> maxium dma32 iova space is 4GB, dma64 iova space is almost richer than dma32.
> 
> 2. to prevent a allocated iova cross dma32 and dma64 space. Otherwise, this 
> special
> case should be considered when allocate and free iova.
> 
> After the above boundary added, it's better to add start_pfn of dma32 
> boundary also,
> to make them to be considered in one model.
> 
> After the two boundaries added, adjust cached32/64_node point to the free 
> iova node can
> simplified programming.
> 
> 
>>
>> I could understand slightly more if we were reserving the PFN *above*
>> the cached range, but as-is I don't see what we gain from the change
>> here, nor what benefit the cached32_node != NULL assumption gives
>> (AFAICS it would be more useful to simply restrict the cases where it
>> may be NULL to when the address space is either completely full or
>> completely empty, or perhaps both).
>>
>> Robin.
>>
>>> Signed-off-by: Zhen Lei 
>>> ---
>>>  drivers/iommu/iova.c | 63 
>>> ++--
>>>  1 file changed, 37 insertions(+), 26 deletions(-)
>>>
>>> diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
>>> index 1c49969..b5a148e 100644
>>> --- a/drivers/iommu/iova.c
>>> +++ b/drivers/iommu/iova.c
>>> @@ -32,6 +32,17 @@ static unsigned long iova_rcache_get(struct iova_domain 
>>> *iovad,
>>>  static void init_iova_rcaches(struct iova_domain *iovad);
>>>  static void free_iova_rcaches(struct iova_domain *iovad);
>>>  
>>> +static void
>>> +insert_iova_boundary(struct iova_domain *iovad)
>>> +{
>>> +   struct iova *iova;
>>> +   unsigned long start_pfn_32bit = iovad->start_pfn;
>>> +
>>> +   iova = reserve_iova(iovad, start_pfn_32bit, start_pfn_32bit);
>>> +   BUG_ON(!iova);
>>> +   iovad->cached32_node = >node;
>>> +}
>>> +
>>>  void
>>>  init_iova_domain(struct iova_domain *iovad, unsigned long granule,
>>> unsigned long start_pfn, unsigned long pfn_32bit)
>>> @@ -45,27 +56,38 @@ init_iova_domain(struct iova_domain *iovad, unsigned 
>>> long granule,
>>>  
>>> spin_lock_init(>iova_rbtree_lock);
>>> iovad->rbroot = RB_ROOT;
>>> -   iovad->cached32_node = NULL;
>>> iovad->granule = granule;
>>> iovad->start_pfn = start_pfn;
>>> iovad->dma_32bit_pfn = pfn_32bit;
>>> init_iova_rcaches(iovad);
>>> +
>>> +   /*
>>> +* Insert boundary nodes for dma32. So cached32_node can not be NULL in
>>> +* future.
>>> +*/
>>> +   insert_iova_boundary(iovad);
>>>  }
>>>  EXPORT_SYMBOL_GPL(init_iova_domain);
>>>  
>>>  static struct rb_node *
>>>  __get_cached_rbnode(struct iova_domain *iovad, unsigned long *limit_pfn)
>>>  {
>>> -   if ((*limit_pfn > iovad->dma_32bit_pfn) ||
>>> -   (iovad->cached32_node == NULL))
>>> +   struct rb_node *cached_node;
>>> +   struct rb_node *next_node;
>>> +
>>> +   if (*limit_pfn > iovad->dma_32bit_pfn)
>>> return rb_last(>rbroot);
>>> -   else {
>>> -   struct rb_node *prev_node = rb_prev(iovad->cached32_node);
>>> -   struct iova *curr_iova =
>>> -   rb_entry(iovad->cached32_node, struct iova, node);
>>> -   *limit_pfn = curr_iova->pfn_lo - 1;
>>> -   return prev_node;
>>> +   else
>>> +   cached_node = iovad->cached32_node;
>>> +
>>> +   next_node = rb_next(cached_node);
>>> +   if (next_node) {
>>> +   struct iova *next_iova = rb_entry(next_node, struct iova, node);
>>> +
>>> +   *limit_pfn = min(*limit_pfn, next_iova->pfn_lo - 1);
>>> }
>>> +
>>> +   return cached_node;
>>>  }
>>>  

Re: [PATCH 1/7] iommu/iova: fix incorrect variable types

2017-03-30 Thread Leizhen (ThunderTown)


On 2017/3/24 10:27, Leizhen (ThunderTown) wrote:
> 
> 
> On 2017/3/23 19:42, Robin Murphy wrote:
>> On 22/03/17 06:27, Zhen Lei wrote:
>>> Keep these four variables type consistent with the paramters of function
>>> __alloc_and_insert_iova_range and the members of struct iova:
>>>
>>> 1. static int __alloc_and_insert_iova_range(struct iova_domain *iovad,
>>> unsigned long size, unsigned long limit_pfn,
>>>
>>> 2. struct iova {
>>> unsigned long   pfn_hi;
>>> unsigned long   pfn_lo;
>>>
>>> In fact, limit_pfn is most likely larger than 32 bits on DMA64.
>>
>> FWIW if pad_size manages to overflow an int something's probably gone
>> horribly wrong, but there's no harm in making it consistent with
>> everything else here. However, given that patch #6 makes this irrelevant
>> anyway, do we really need to bother?
> 
> Because I'm not sure whether patch #6 can be applied or not.
So if Patch #6 can be applied, I can merge this patch and patch #6 into one.

> 
>>
>> Robin.
>>
>>> Signed-off-by: Zhen Lei 
>>> ---
>>>  drivers/iommu/iova.c | 6 +++---
>>>  1 file changed, 3 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
>>> index b7268a1..8ba8b496 100644
>>> --- a/drivers/iommu/iova.c
>>> +++ b/drivers/iommu/iova.c
>>> @@ -104,8 +104,8 @@ __cached_rbnode_delete_update(struct iova_domain 
>>> *iovad, struct iova *free)
>>>   * Computes the padding size required, to make the start address
>>>   * naturally aligned on the power-of-two order of its size
>>>   */
>>> -static unsigned int
>>> -iova_get_pad_size(unsigned int size, unsigned int limit_pfn)
>>> +static unsigned long
>>> +iova_get_pad_size(unsigned long size, unsigned long limit_pfn)
>>>  {
>>> return (limit_pfn + 1 - size) & (__roundup_pow_of_two(size) - 1);
>>>  }
>>> @@ -117,7 +117,7 @@ static int __alloc_and_insert_iova_range(struct 
>>> iova_domain *iovad,
>>> struct rb_node *prev, *curr = NULL;
>>> unsigned long flags;
>>> unsigned long saved_pfn;
>>> -   unsigned int pad_size = 0;
>>> +   unsigned long pad_size = 0;
>>>  
>>> /* Walk the tree backwards */
>>> spin_lock_irqsave(>iova_rbtree_lock, flags);
>>>
>>
>>
>> .
>>
> 

-- 
Thanks!
BestRegards

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


[PATCH v2 6/7] iommu/iova: move the caculation of pad mask out of loop

2017-03-30 Thread Zhen Lei
I'm not sure whether the compiler can optimize it, but move it out will
be better. At least, it does not require lock protection.

Signed-off-by: Zhen Lei 
---
 drivers/iommu/iova.c | 22 ++
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index 23abe84..68754e4 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -127,23 +127,16 @@ __cached_rbnode_delete_update(struct iova_domain *iovad, 
struct iova *free)
*cached_node = rb_prev(>node);
 }
 
-/*
- * Computes the padding size required, to make the start address
- * naturally aligned on the power-of-two order of its size
- */
-static unsigned long
-iova_get_pad_size(unsigned long size, unsigned long limit_pfn)
-{
-   return (limit_pfn + 1 - size) & (__roundup_pow_of_two(size) - 1);
-}
-
 static int __alloc_and_insert_iova_range(struct iova_domain *iovad,
unsigned long size, unsigned long limit_pfn,
struct iova *new, bool size_aligned)
 {
struct rb_node *prev, *curr;
unsigned long flags;
-   unsigned long pad_size = 0;
+   unsigned long pad_mask, pad_size = 0;
+
+   if (size_aligned)
+   pad_mask = __roundup_pow_of_two(size) - 1;
 
/* Walk the tree backwards */
spin_lock_irqsave(>iova_rbtree_lock, flags);
@@ -157,8 +150,13 @@ static int __alloc_and_insert_iova_range(struct 
iova_domain *iovad,
else if (limit_pfn < curr_iova->pfn_hi)
goto adjust_limit_pfn;
else {
+   /*
+* Computes the padding size required, to make the start
+* address naturally aligned on the power-of-two order
+* of its size
+*/
if (size_aligned)
-   pad_size = iova_get_pad_size(size, limit_pfn);
+   pad_size = (limit_pfn + 1 - size) & pad_mask;
if ((curr_iova->pfn_hi + size + pad_size) <= limit_pfn)
break;  /* found a free slot */
}
-- 
2.5.0


___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


[PATCH v2 2/7] iommu/iova: cut down judgement times

2017-03-30 Thread Zhen Lei
Below judgement can only be satisfied at the last time, which produced 2N
judgements(suppose N times failed, 0 or 1 time successed) in vain.

if ((pfn >= iova->pfn_lo) && (pfn <= iova->pfn_hi)) {
return iova;
}

Signed-off-by: Zhen Lei 
---
 drivers/iommu/iova.c | 9 +++--
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index 8ba8b496..1c49969 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -312,15 +312,12 @@ private_find_iova(struct iova_domain *iovad, unsigned 
long pfn)
while (node) {
struct iova *iova = rb_entry(node, struct iova, node);
 
-   /* If pfn falls within iova's range, return iova */
-   if ((pfn >= iova->pfn_lo) && (pfn <= iova->pfn_hi)) {
-   return iova;
-   }
-
if (pfn < iova->pfn_lo)
node = node->rb_left;
-   else if (pfn > iova->pfn_lo)
+   else if (pfn > iova->pfn_hi)
node = node->rb_right;
+   else
+   return iova;/* pfn falls within iova's range */
}
 
return NULL;
-- 
2.5.0


___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


[PATCH v2 1/7] iommu/iova: fix incorrect variable types

2017-03-30 Thread Zhen Lei
Keep these four variables type consistent with the paramters of function
__alloc_and_insert_iova_range and the members of struct iova:

1. static int __alloc_and_insert_iova_range(struct iova_domain *iovad,
unsigned long size, unsigned long limit_pfn,

2. struct iova {
unsigned long   pfn_hi;
unsigned long   pfn_lo;

In fact, limit_pfn is most likely larger than 32 bits on DMA64.

Signed-off-by: Zhen Lei 
---
 drivers/iommu/iova.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index b7268a1..8ba8b496 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -104,8 +104,8 @@ __cached_rbnode_delete_update(struct iova_domain *iovad, 
struct iova *free)
  * Computes the padding size required, to make the start address
  * naturally aligned on the power-of-two order of its size
  */
-static unsigned int
-iova_get_pad_size(unsigned int size, unsigned int limit_pfn)
+static unsigned long
+iova_get_pad_size(unsigned long size, unsigned long limit_pfn)
 {
return (limit_pfn + 1 - size) & (__roundup_pow_of_two(size) - 1);
 }
@@ -117,7 +117,7 @@ static int __alloc_and_insert_iova_range(struct iova_domain 
*iovad,
struct rb_node *prev, *curr = NULL;
unsigned long flags;
unsigned long saved_pfn;
-   unsigned int pad_size = 0;
+   unsigned long pad_size = 0;
 
/* Walk the tree backwards */
spin_lock_irqsave(>iova_rbtree_lock, flags);
-- 
2.5.0


___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


[PATCH v2 7/7] iommu/iova: fix iovad->dma_32bit_pfn as the last pfn of dma32

2017-03-30 Thread Zhen Lei
To make sure iovad->cached32_node and iovad->cached64_node can exactly
control dma32 and dma64 area. It also help us to remove the parameter
pfn_32bit of init_iova_domain.

Signed-off-by: Zhen Lei 
---
 drivers/iommu/amd_iommu.c|  7 ++-
 drivers/iommu/dma-iommu.c| 22 +-
 drivers/iommu/intel-iommu.c  | 11 +++
 drivers/iommu/iova.c |  4 ++--
 drivers/misc/mic/scif/scif_rma.c |  3 +--
 include/linux/iova.h |  2 +-
 6 files changed, 14 insertions(+), 35 deletions(-)

diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index 98940d1..78c8b93 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -61,7 +61,6 @@
 /* IO virtual address start page frame number */
 #define IOVA_START_PFN (1)
 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
-#define DMA_32BIT_PFN  IOVA_PFN(DMA_BIT_MASK(32))
 
 /* Reserved IOVA ranges */
 #define MSI_RANGE_START(0xfee0)
@@ -1776,8 +1775,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(void)
if (!dma_dom->domain.pt_root)
goto free_dma_dom;
 
-   init_iova_domain(_dom->iovad, PAGE_SIZE,
-IOVA_START_PFN, DMA_32BIT_PFN);
+   init_iova_domain(_dom->iovad, PAGE_SIZE, IOVA_START_PFN);
 
/* Initialize reserved ranges */
copy_reserved_iova(_iova_ranges, _dom->iovad);
@@ -2747,8 +2745,7 @@ static int init_reserved_iova_ranges(void)
struct pci_dev *pdev = NULL;
struct iova *val;
 
-   init_iova_domain(_iova_ranges, PAGE_SIZE,
-IOVA_START_PFN, DMA_32BIT_PFN);
+   init_iova_domain(_iova_ranges, PAGE_SIZE, IOVA_START_PFN);
 
lockdep_set_class(_iova_ranges.iova_rbtree_lock,
  _rbtree_key);
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 48d36ce..7064d32 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -223,18 +223,7 @@ int iommu_dma_init_domain(struct iommu_domain *domain, 
dma_addr_t base,
/* ...then finally give it a kicking to make sure it fits */
base_pfn = max_t(unsigned long, base_pfn,
domain->geometry.aperture_start >> order);
-   end_pfn = min_t(unsigned long, end_pfn,
-   domain->geometry.aperture_end >> order);
}
-   /*
-* PCI devices may have larger DMA masks, but still prefer allocating
-* within a 32-bit mask to avoid DAC addressing. Such limitations don't
-* apply to the typical platform device, so for those we may as well
-* leave the cache limit at the top of their range to save an rb_last()
-* traversal on every allocation.
-*/
-   if (pci)
-   end_pfn &= DMA_BIT_MASK(32) >> order;
 
/* start_pfn is always nonzero for an already-initialised domain */
if (iovad->start_pfn) {
@@ -243,16 +232,15 @@ int iommu_dma_init_domain(struct iommu_domain *domain, 
dma_addr_t base,
pr_warn("Incompatible range for DMA domain\n");
return -EFAULT;
}
-   /*
-* If we have devices with different DMA masks, move the free
-* area cache limit down for the benefit of the smaller one.
-*/
-   iovad->dma_32bit_pfn = min(end_pfn, iovad->dma_32bit_pfn);
} else {
-   init_iova_domain(iovad, 1UL << order, base_pfn, end_pfn);
+   init_iova_domain(iovad, 1UL << order, base_pfn);
if (pci)
iova_reserve_pci_windows(to_pci_dev(dev), iovad);
}
+
+   if (end_pfn < iovad->dma_32bit_pfn)
+   dev_dbg(dev, "ancient device or dma range missed some bits?");
+
return 0;
 }
 EXPORT_SYMBOL(iommu_dma_init_domain);
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 238ad34..de467c1 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -82,8 +82,6 @@
 #define IOVA_START_PFN (1)
 
 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
-#define DMA_32BIT_PFN  IOVA_PFN(DMA_BIT_MASK(32))
-#define DMA_64BIT_PFN  IOVA_PFN(DMA_BIT_MASK(64))
 
 /* page table handling */
 #define LEVEL_STRIDE   (9)
@@ -1869,8 +1867,7 @@ static int dmar_init_reserved_ranges(void)
struct iova *iova;
int i;
 
-   init_iova_domain(_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN,
-   DMA_32BIT_PFN);
+   init_iova_domain(_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
 
lockdep_set_class(_iova_list.iova_rbtree_lock,
_rbtree_key);
@@ -1928,8 +1925,7 @@ static int domain_init(struct dmar_domain *domain, struct 
intel_iommu *iommu,
int adjust_width, agaw;
unsigned long sagaw;
 
-   

[PATCH v2 0/7] iommu/iova: improve the allocation performance of dma64

2017-03-30 Thread Zhen Lei
v1 -> v2:
Because the problem of my email-server, all patches sent to Joerg Roedel 
 failed.
So I repost all these patches again, there is no changes.

v1:
64 bits devices is very common now. But currently we only defined a 
cached32_node
to optimize the allocation performance of dma32, and I saw some dma64 drivers 
chose
to allocate iova from dma32 space first, maybe becuase of current dma64 
performance
problem or some other reasons.

For example:(in drivers/iommu/amd_iommu.c)
static unsigned long dma_ops_alloc_iova(..
{
..
if (dma_mask > DMA_BIT_MASK(32))
pfn = alloc_iova_fast(_dom->iovad, pages,
  IOVA_PFN(DMA_BIT_MASK(32)));
if (!pfn)
pfn = alloc_iova_fast(_dom->iovad, pages, 
IOVA_PFN(dma_mask));

For the details of why dma64 iova allocation performance is very bad, please 
refer the
description of patch-5.

In this patch series, I added a cached64_node to manage the dma64 iova 
space(iova>=4G), it
takes the same effect as cached32_node(iova<4G).

Below it's the performance data before and after my patch series:
(before)$ iperf -s

Server listening on TCP port 5001
TCP window size: 85.3 KByte (default)

[  4] local 192.168.1.106 port 5001 connected with 192.168.1.198 port 35898
[ ID] Interval   Transfer Bandwidth
[  4]  0.0-10.2 sec  7.88 MBytes  6.48 Mbits/sec
[  5] local 192.168.1.106 port 5001 connected with 192.168.1.198 port 35900
[  5]  0.0-10.3 sec  7.88 MBytes  6.43 Mbits/sec
[  4] local 192.168.1.106 port 5001 connected with 192.168.1.198 port 35902
[  4]  0.0-10.3 sec  7.88 MBytes  6.43 Mbits/sec

(after)$ iperf -s

Server listening on TCP port 5001
TCP window size: 85.3 KByte (default)

[  4] local 192.168.1.106 port 5001 connected with 192.168.1.198 port 36330
[ ID] Interval   Transfer Bandwidth
[  4]  0.0-10.0 sec  1.09 GBytes   933 Mbits/sec
[  5] local 192.168.1.106 port 5001 connected with 192.168.1.198 port 36332
[  5]  0.0-10.0 sec  1.10 GBytes   939 Mbits/sec
[  4] local 192.168.1.106 port 5001 connected with 192.168.1.198 port 36334
[  4]  0.0-10.0 sec  1.10 GBytes   938 Mbits/sec


Zhen Lei (7):
  iommu/iova: fix incorrect variable types
  iommu/iova: cut down judgement times
  iommu/iova: insert start_pfn boundary of dma32
  iommu/iova: adjust __cached_rbnode_insert_update
  iommu/iova: to optimize the allocation performance of dma64
  iommu/iova: move the caculation of pad mask out of loop
  iommu/iova: fix iovad->dma_32bit_pfn as the last pfn of dma32

 drivers/iommu/amd_iommu.c|   7 +-
 drivers/iommu/dma-iommu.c|  22 ++
 drivers/iommu/intel-iommu.c  |  11 +--
 drivers/iommu/iova.c | 143 +--
 drivers/misc/mic/scif/scif_rma.c |   3 +-
 include/linux/iova.h |   7 +-
 6 files changed, 94 insertions(+), 99 deletions(-)

-- 
2.5.0


___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


[PATCH v2 3/7] iommu/iova: insert start_pfn boundary of dma32

2017-03-30 Thread Zhen Lei
Reserve the first granule size memory(start at start_pfn) as boundary
iova, to make sure that iovad->cached32_node can not be NULL in future.
Meanwhile, changed the assignment of iovad->cached32_node from rb_next to
rb_prev of >node in function __cached_rbnode_delete_update.

Signed-off-by: Zhen Lei 
---
 drivers/iommu/iova.c | 63 ++--
 1 file changed, 37 insertions(+), 26 deletions(-)

diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index 1c49969..b5a148e 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -32,6 +32,17 @@ static unsigned long iova_rcache_get(struct iova_domain 
*iovad,
 static void init_iova_rcaches(struct iova_domain *iovad);
 static void free_iova_rcaches(struct iova_domain *iovad);
 
+static void
+insert_iova_boundary(struct iova_domain *iovad)
+{
+   struct iova *iova;
+   unsigned long start_pfn_32bit = iovad->start_pfn;
+
+   iova = reserve_iova(iovad, start_pfn_32bit, start_pfn_32bit);
+   BUG_ON(!iova);
+   iovad->cached32_node = >node;
+}
+
 void
 init_iova_domain(struct iova_domain *iovad, unsigned long granule,
unsigned long start_pfn, unsigned long pfn_32bit)
@@ -45,27 +56,38 @@ init_iova_domain(struct iova_domain *iovad, unsigned long 
granule,
 
spin_lock_init(>iova_rbtree_lock);
iovad->rbroot = RB_ROOT;
-   iovad->cached32_node = NULL;
iovad->granule = granule;
iovad->start_pfn = start_pfn;
iovad->dma_32bit_pfn = pfn_32bit;
init_iova_rcaches(iovad);
+
+   /*
+* Insert boundary nodes for dma32. So cached32_node can not be NULL in
+* future.
+*/
+   insert_iova_boundary(iovad);
 }
 EXPORT_SYMBOL_GPL(init_iova_domain);
 
 static struct rb_node *
 __get_cached_rbnode(struct iova_domain *iovad, unsigned long *limit_pfn)
 {
-   if ((*limit_pfn > iovad->dma_32bit_pfn) ||
-   (iovad->cached32_node == NULL))
+   struct rb_node *cached_node;
+   struct rb_node *next_node;
+
+   if (*limit_pfn > iovad->dma_32bit_pfn)
return rb_last(>rbroot);
-   else {
-   struct rb_node *prev_node = rb_prev(iovad->cached32_node);
-   struct iova *curr_iova =
-   rb_entry(iovad->cached32_node, struct iova, node);
-   *limit_pfn = curr_iova->pfn_lo - 1;
-   return prev_node;
+   else
+   cached_node = iovad->cached32_node;
+
+   next_node = rb_next(cached_node);
+   if (next_node) {
+   struct iova *next_iova = rb_entry(next_node, struct iova, node);
+
+   *limit_pfn = min(*limit_pfn, next_iova->pfn_lo - 1);
}
+
+   return cached_node;
 }
 
 static void
@@ -83,20 +105,13 @@ __cached_rbnode_delete_update(struct iova_domain *iovad, 
struct iova *free)
struct iova *cached_iova;
struct rb_node *curr;
 
-   if (!iovad->cached32_node)
-   return;
curr = iovad->cached32_node;
cached_iova = rb_entry(curr, struct iova, node);
 
if (free->pfn_lo >= cached_iova->pfn_lo) {
-   struct rb_node *node = rb_next(>node);
-   struct iova *iova = rb_entry(node, struct iova, node);
-
/* only cache if it's below 32bit pfn */
-   if (node && iova->pfn_lo < iovad->dma_32bit_pfn)
-   iovad->cached32_node = node;
-   else
-   iovad->cached32_node = NULL;
+   if (free->pfn_hi <= iovad->dma_32bit_pfn)
+   iovad->cached32_node = rb_prev(>node);
}
 }
 
@@ -114,7 +129,7 @@ static int __alloc_and_insert_iova_range(struct iova_domain 
*iovad,
unsigned long size, unsigned long limit_pfn,
struct iova *new, bool size_aligned)
 {
-   struct rb_node *prev, *curr = NULL;
+   struct rb_node *prev, *curr;
unsigned long flags;
unsigned long saved_pfn;
unsigned long pad_size = 0;
@@ -144,13 +159,9 @@ static int __alloc_and_insert_iova_range(struct 
iova_domain *iovad,
curr = rb_prev(curr);
}
 
-   if (!curr) {
-   if (size_aligned)
-   pad_size = iova_get_pad_size(size, limit_pfn);
-   if ((iovad->start_pfn + size + pad_size) > limit_pfn) {
-   spin_unlock_irqrestore(>iova_rbtree_lock, flags);
-   return -ENOMEM;
-   }
+   if (unlikely(!curr)) {
+   spin_unlock_irqrestore(>iova_rbtree_lock, flags);
+   return -ENOMEM;
}
 
/* pfn_lo will point to size aligned address if size_aligned is set */
-- 
2.5.0


___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


[PATCH v2 5/7] iommu/iova: to optimize the allocation performance of dma64

2017-03-30 Thread Zhen Lei
Currently we always search free iova space for dma64 begin at the last
node of iovad rb-tree. In the worst case, there maybe too many nodes exist
at the tail, so that we should traverse many times for the first loop in
__alloc_and_insert_iova_range. As we traced, more than 10K times for the
case of iperf.

__alloc_and_insert_iova_range:
..
curr = __get_cached_rbnode(iovad, _pfn);
//--> return rb_last(>rbroot);
while (curr) {
..
curr = rb_prev(curr);
}

So add cached64_node to take the same effect as cached32_node, and add
the start_pfn boundary of dma64, to prevent a iova cross both dma32 and
dma64 area.
|---|--|
|<--cached32_node-->||
|   |
start_pfn dma_32bit_pfn + 1

Signed-off-by: Zhen Lei 
---
 drivers/iommu/iova.c | 46 +++---
 include/linux/iova.h |  5 +++--
 2 files changed, 30 insertions(+), 21 deletions(-)

diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index 87a9332..23abe84 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -37,10 +37,15 @@ insert_iova_boundary(struct iova_domain *iovad)
 {
struct iova *iova;
unsigned long start_pfn_32bit = iovad->start_pfn;
+   unsigned long start_pfn_64bit = iovad->dma_32bit_pfn + 1;
 
iova = reserve_iova(iovad, start_pfn_32bit, start_pfn_32bit);
BUG_ON(!iova);
iovad->cached32_node = >node;
+
+   iova = reserve_iova(iovad, start_pfn_64bit, start_pfn_64bit);
+   BUG_ON(!iova);
+   iovad->cached64_node = >node;
 }
 
 void
@@ -62,8 +67,8 @@ init_iova_domain(struct iova_domain *iovad, unsigned long 
granule,
init_iova_rcaches(iovad);
 
/*
-* Insert boundary nodes for dma32. So cached32_node can not be NULL in
-* future.
+* Insert boundary nodes for dma32 and dma64. So cached32_node and
+* cached64_node can not be NULL in future.
 */
insert_iova_boundary(iovad);
 }
@@ -75,10 +80,10 @@ __get_cached_rbnode(struct iova_domain *iovad, unsigned 
long *limit_pfn)
struct rb_node *cached_node;
struct rb_node *next_node;
 
-   if (*limit_pfn > iovad->dma_32bit_pfn)
-   return rb_last(>rbroot);
-   else
+   if (*limit_pfn <= iovad->dma_32bit_pfn)
cached_node = iovad->cached32_node;
+   else
+   cached_node = iovad->cached64_node;
 
next_node = rb_next(cached_node);
if (next_node) {
@@ -94,29 +99,32 @@ static void
 __cached_rbnode_insert_update(struct iova_domain *iovad, struct iova *new)
 {
struct iova *cached_iova;
+   struct rb_node **cached_node;
 
-   if (new->pfn_hi > iovad->dma_32bit_pfn)
-   return;
+   if (new->pfn_hi <= iovad->dma_32bit_pfn)
+   cached_node = >cached32_node;
+   else
+   cached_node = >cached64_node;
 
-   cached_iova = rb_entry(iovad->cached32_node, struct iova, node);
+   cached_iova = rb_entry(*cached_node, struct iova, node);
if (new->pfn_lo <= cached_iova->pfn_lo)
-   iovad->cached32_node = rb_prev(>node);
+   *cached_node = rb_prev(>node);
 }
 
 static void
 __cached_rbnode_delete_update(struct iova_domain *iovad, struct iova *free)
 {
struct iova *cached_iova;
-   struct rb_node *curr;
+   struct rb_node **cached_node;
 
-   curr = iovad->cached32_node;
-   cached_iova = rb_entry(curr, struct iova, node);
+   if (free->pfn_hi <= iovad->dma_32bit_pfn)
+   cached_node = >cached32_node;
+   else
+   cached_node = >cached64_node;
 
-   if (free->pfn_lo >= cached_iova->pfn_lo) {
-   /* only cache if it's below 32bit pfn */
-   if (free->pfn_hi <= iovad->dma_32bit_pfn)
-   iovad->cached32_node = rb_prev(>node);
-   }
+   cached_iova = rb_entry(*cached_node, struct iova, node);
+   if (free->pfn_lo >= cached_iova->pfn_lo)
+   *cached_node = rb_prev(>node);
 }
 
 /*
@@ -283,7 +291,7 @@ EXPORT_SYMBOL_GPL(iova_cache_put);
  * alloc_iova - allocates an iova
  * @iovad: - iova domain in question
  * @size: - size of page frames to allocate
- * @limit_pfn: - max limit address
+ * @limit_pfn: - max limit address(included)
  * @size_aligned: - set if size_aligned address range is required
  * This function allocates an iova in the range iovad->start_pfn to limit_pfn,
  * searching top-down from limit_pfn to iovad->start_pfn. If the size_aligned
@@ -402,7 +410,7 @@ EXPORT_SYMBOL_GPL(free_iova);
  * alloc_iova_fast - allocates an iova from rcache
  * @iovad: - iova domain in question
  * @size: - size of page frames to allocate
- * @limit_pfn: - max limit address
+ * @limit_pfn: - max limit address(included)
  * This function tries 

[PATCH v2 4/7] iommu/iova: adjust __cached_rbnode_insert_update

2017-03-30 Thread Zhen Lei
For case 2 and 3, adjust cached32_node to the new place, case 1 keep no
change.

For example:
case1: (the right part was allocated)
|--|
|<-free>|<--new_iova-->|
|
|
   cached32_node

case2: (all was allocated)
|--|
|<-new_iova--->|
|
|
   cached32_node

case3:
|---|..|-|
|..free..|<--new_iova-->|
|  |
|  |
   cached32_node(new) cached32_node(old)

Signed-off-by: Zhen Lei 
---
 drivers/iommu/iova.c | 17 +
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index b5a148e..87a9332 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -91,12 +91,16 @@ __get_cached_rbnode(struct iova_domain *iovad, unsigned 
long *limit_pfn)
 }
 
 static void
-__cached_rbnode_insert_update(struct iova_domain *iovad,
-   unsigned long limit_pfn, struct iova *new)
+__cached_rbnode_insert_update(struct iova_domain *iovad, struct iova *new)
 {
-   if (limit_pfn != iovad->dma_32bit_pfn)
+   struct iova *cached_iova;
+
+   if (new->pfn_hi > iovad->dma_32bit_pfn)
return;
-   iovad->cached32_node = >node;
+
+   cached_iova = rb_entry(iovad->cached32_node, struct iova, node);
+   if (new->pfn_lo <= cached_iova->pfn_lo)
+   iovad->cached32_node = rb_prev(>node);
 }
 
 static void
@@ -131,12 +135,10 @@ static int __alloc_and_insert_iova_range(struct 
iova_domain *iovad,
 {
struct rb_node *prev, *curr;
unsigned long flags;
-   unsigned long saved_pfn;
unsigned long pad_size = 0;
 
/* Walk the tree backwards */
spin_lock_irqsave(>iova_rbtree_lock, flags);
-   saved_pfn = limit_pfn;
curr = __get_cached_rbnode(iovad, _pfn);
prev = curr;
while (curr) {
@@ -197,11 +199,10 @@ static int __alloc_and_insert_iova_range(struct 
iova_domain *iovad,
rb_link_node(>node, parent, entry);
rb_insert_color(>node, >rbroot);
}
-   __cached_rbnode_insert_update(iovad, saved_pfn, new);
+   __cached_rbnode_insert_update(iovad, new);
 
spin_unlock_irqrestore(>iova_rbtree_lock, flags);
 
-
return 0;
 }
 
-- 
2.5.0


___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [PATCH v2 4/4] iommu/arm-smmu: Poll for TLB sync completion more effectively

2017-03-30 Thread Jordan Crouse
On Thu, Mar 30, 2017 at 05:56:32PM +0100, Robin Murphy wrote:
> On relatively slow development platforms and software models, the
> inefficiency of our TLB sync loop tends not to show up - for instance on
> a Juno r1 board I typically see the TLBI has completed of its own accord
> by the time we get to the sync, such that the latter finishes instantly.
> 
> However, on larger systems doing real I/O, it's less realistic for the
> TLBs to go idle immediately, and at that point falling into the 1MHz
> polling loop turns out to throw away performance drastically. Let's
> strike a balance by polling more than once between pauses, such that we
> have much more chance of catching normal operations completing before
> committing to the fixed delay, but also backing off exponentially, since
> if a sync really hasn't completed within one or two "reasonable time"
> periods, it becomes increasingly unlikely that it ever will.

I really really like this.

Reviewed-by: Jordan Crouse 

> Signed-off-by: Robin Murphy 
> ---
> 
> v2: Restored the cpu_relax() to the inner loop
> 
>  drivers/iommu/arm-smmu.c | 18 ++
>  1 file changed, 10 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
> index 759d5f261160..a15ca86e9703 100644
> --- a/drivers/iommu/arm-smmu.c
> +++ b/drivers/iommu/arm-smmu.c
> @@ -162,6 +162,7 @@
>  #define ARM_SMMU_GR0_sTLBGSTATUS 0x74
>  #define sTLBGSTATUS_GSACTIVE (1 << 0)
>  #define TLB_LOOP_TIMEOUT 100 /* 1s! */
> +#define TLB_SPIN_COUNT   10
>  
>  /* Stream mapping registers */
>  #define ARM_SMMU_GR0_SMR(n)  (0x800 + ((n) << 2))
> @@ -574,18 +575,19 @@ static void __arm_smmu_free_bitmap(unsigned long *map, 
> int idx)
>  static void __arm_smmu_tlb_sync(struct arm_smmu_device *smmu,
>   void __iomem *sync, void __iomem *status)
>  {
> - int count = 0;
> + unsigned int spin_cnt, delay;
>  
>   writel_relaxed(0, sync);
> - while (readl_relaxed(status) & sTLBGSTATUS_GSACTIVE) {
> - cpu_relax();
> - if (++count == TLB_LOOP_TIMEOUT) {
> - dev_err_ratelimited(smmu->dev,
> - "TLB sync timed out -- SMMU may be deadlocked\n");
> - return;
> + for (delay = 1; delay < TLB_LOOP_TIMEOUT; delay *= 2) {
> + for (spin_cnt = TLB_SPIN_COUNT; spin_cnt > 0; spin_cnt--) {
> + if (!(readl_relaxed(status) & sTLBGSTATUS_GSACTIVE))
> + return;
> + cpu_relax();
>   }
> - udelay(1);
> + udelay(delay);
>   }
> + dev_err_ratelimited(smmu->dev,
> + "TLB sync timed out -- SMMU may be deadlocked\n");
>  }
>  
>  static void arm_smmu_tlb_sync_global(struct arm_smmu_device *smmu)
> -- 
> 2.11.0.dirty
> 
> ___
> iommu mailing list
> iommu@lists.linux-foundation.org
> https://lists.linuxfoundation.org/mailman/listinfo/iommu

-- 
The Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
a Linux Foundation Collaborative Project
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [PATCH v2 2/4] iommu/arm-smmu: Tidy up context bank indexing

2017-03-30 Thread Jordan Crouse
On Thu, Mar 30, 2017 at 05:56:30PM +0100, Robin Murphy wrote:
> ARM_AMMU_CB() is calculated relative to ARM_SMMU_CB_BASE(), but the
> latter is never of use on its own, and what we end up with is the same
> ARM_SMMU_CB_BASE() + ARM_AMMU_CB() expression being duplicated at every
> callsite. Folding the two together gives us a self-contained context
> bank accessor which is much more pleasant to work with.
> 
> Secondly, we might as well simplify CB_BASE itself at the same time.
> We use the address space size for its own sake precisely once, at probe
> time, and every other usage is to dynamically calculate CB_BASE over
> and over and over again. Let's flip things around so that we just
> maintain the CB_BASE address directly.

Reviewed-by: Jordan Crouse 

> Signed-off-by: Robin Murphy 
> ---
> 
> v2: No change
> 
>  drivers/iommu/arm-smmu.c | 29 ++---
>  1 file changed, 14 insertions(+), 15 deletions(-)
> 
> diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
> index 34b745bf59f2..e79b623f9165 100644
> --- a/drivers/iommu/arm-smmu.c
> +++ b/drivers/iommu/arm-smmu.c
> @@ -216,8 +216,7 @@ enum arm_smmu_s2cr_privcfg {
>  #define CBA2R_VMID_MASK  0x
>  
>  /* Translation context bank */
> -#define ARM_SMMU_CB_BASE(smmu)   ((smmu)->base + ((smmu)->size 
> >> 1))
> -#define ARM_SMMU_CB(smmu, n) ((n) * (1 << (smmu)->pgshift))
> +#define ARM_SMMU_CB(smmu, n) ((smmu)->cb_base + ((n) << (smmu)->pgshift))
>  
>  #define ARM_SMMU_CB_SCTLR0x0
>  #define ARM_SMMU_CB_ACTLR0x4
> @@ -344,7 +343,7 @@ struct arm_smmu_device {
>   struct device   *dev;
>  
>   void __iomem*base;
> - unsigned long   size;
> + void __iomem*cb_base;
>   unsigned long   pgshift;
>  
>  #define ARM_SMMU_FEAT_COHERENT_WALK  (1 << 0)
> @@ -603,7 +602,7 @@ static void arm_smmu_tlb_inv_context(void *cookie)
>   void __iomem *base;
>  
>   if (stage1) {
> - base = ARM_SMMU_CB_BASE(smmu) + ARM_SMMU_CB(smmu, cfg->cbndx);
> + base = ARM_SMMU_CB(smmu, cfg->cbndx);
>   writel_relaxed(cfg->asid, base + ARM_SMMU_CB_S1_TLBIASID);
>   } else {
>   base = ARM_SMMU_GR0(smmu);
> @@ -623,7 +622,7 @@ static void arm_smmu_tlb_inv_range_nosync(unsigned long 
> iova, size_t size,
>   void __iomem *reg;
>  
>   if (stage1) {
> - reg = ARM_SMMU_CB_BASE(smmu) + ARM_SMMU_CB(smmu, cfg->cbndx);
> + reg = ARM_SMMU_CB(smmu, cfg->cbndx);
>   reg += leaf ? ARM_SMMU_CB_S1_TLBIVAL : ARM_SMMU_CB_S1_TLBIVA;
>  
>   if (cfg->fmt != ARM_SMMU_CTX_FMT_AARCH64) {
> @@ -642,7 +641,7 @@ static void arm_smmu_tlb_inv_range_nosync(unsigned long 
> iova, size_t size,
>   } while (size -= granule);
>   }
>   } else if (smmu->version == ARM_SMMU_V2) {
> - reg = ARM_SMMU_CB_BASE(smmu) + ARM_SMMU_CB(smmu, cfg->cbndx);
> + reg = ARM_SMMU_CB(smmu, cfg->cbndx);
>   reg += leaf ? ARM_SMMU_CB_S2_TLBIIPAS2L :
> ARM_SMMU_CB_S2_TLBIIPAS2;
>   iova >>= 12;
> @@ -672,7 +671,7 @@ static irqreturn_t arm_smmu_context_fault(int irq, void 
> *dev)
>   struct arm_smmu_device *smmu = smmu_domain->smmu;
>   void __iomem *cb_base;
>  
> - cb_base = ARM_SMMU_CB_BASE(smmu) + ARM_SMMU_CB(smmu, cfg->cbndx);
> + cb_base = ARM_SMMU_CB(smmu, cfg->cbndx);
>   fsr = readl_relaxed(cb_base + ARM_SMMU_CB_FSR);
>  
>   if (!(fsr & FSR_FAULT))
> @@ -725,7 +724,7 @@ static void arm_smmu_init_context_bank(struct 
> arm_smmu_domain *smmu_domain,
>  
>   gr1_base = ARM_SMMU_GR1(smmu);
>   stage1 = cfg->cbar != CBAR_TYPE_S2_TRANS;
> - cb_base = ARM_SMMU_CB_BASE(smmu) + ARM_SMMU_CB(smmu, cfg->cbndx);
> + cb_base = ARM_SMMU_CB(smmu, cfg->cbndx);
>  
>   if (smmu->version > ARM_SMMU_V1) {
>   if (cfg->fmt == ARM_SMMU_CTX_FMT_AARCH64)
> @@ -1007,7 +1006,7 @@ static void arm_smmu_destroy_domain_context(struct 
> iommu_domain *domain)
>* Disable the context bank and free the page tables before freeing
>* it.
>*/
> - cb_base = ARM_SMMU_CB_BASE(smmu) + ARM_SMMU_CB(smmu, cfg->cbndx);
> + cb_base = ARM_SMMU_CB(smmu, cfg->cbndx);
>   writel_relaxed(0, cb_base + ARM_SMMU_CB_SCTLR);
>  
>   if (cfg->irptndx != INVALID_IRPTNDX) {
> @@ -1358,7 +1357,7 @@ static phys_addr_t arm_smmu_iova_to_phys_hard(struct 
> iommu_domain *domain,
>   u64 phys;
>   unsigned long va;
>  
> - cb_base = ARM_SMMU_CB_BASE(smmu) + ARM_SMMU_CB(smmu, cfg->cbndx);
> + cb_base = ARM_SMMU_CB(smmu, cfg->cbndx);
>  
>   /* ATS1 registers can only be written atomically */
>   va = iova & ~0xfffUL;
> @@ -1685,7 +1684,7 @@ static void arm_smmu_device_reset(struct 
> arm_smmu_device 

Re: [PATCH v2 1/4] iommu/arm-smmu: Simplify ASID/VMID handling

2017-03-30 Thread Jordan Crouse
On Thu, Mar 30, 2017 at 05:56:29PM +0100, Robin Murphy wrote:
> Calculating ASIDs/VMIDs dynamically from arm_smmu_cfg was a neat trick,
> but the global uniqueness workaround makes it somewhat more awkward, and
> means we end up having to pass extra state around in certain cases just
> to keep a handle on the offset.
> 
> We already have 16 bits going spare in arm_smmu_cfg; let's just
> precalculate an ASID/VMID, plop it in there, and tidy up the users
> accordingly. We'd also need something like this anyway if we ever get
> near to thinking about SVM, so it's no bad thing.

If it helps:

Reviewed-by: Jordan Crouse 

> Signed-off-by: Robin Murphy 
> ---
> 
> v2: No change
> 
>  drivers/iommu/arm-smmu.c | 36 +++-
>  1 file changed, 19 insertions(+), 17 deletions(-)
> 
> diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
> index abf6496843a6..34b745bf59f2 100644
> --- a/drivers/iommu/arm-smmu.c
> +++ b/drivers/iommu/arm-smmu.c
> @@ -404,14 +404,15 @@ enum arm_smmu_context_fmt {
>  struct arm_smmu_cfg {
>   u8  cbndx;
>   u8  irptndx;
> + union {
> + u16 asid;
> + u16 vmid;
> + };
>   u32 cbar;
>   enum arm_smmu_context_fmt   fmt;
>  };
>  #define INVALID_IRPTNDX  0xff
>  
> -#define ARM_SMMU_CB_ASID(smmu, cfg) ((u16)(smmu)->cavium_id_base + 
> (cfg)->cbndx)
> -#define ARM_SMMU_CB_VMID(smmu, cfg) ((u16)(smmu)->cavium_id_base + 
> (cfg)->cbndx + 1)
> -
>  enum arm_smmu_domain_stage {
>   ARM_SMMU_DOMAIN_S1 = 0,
>   ARM_SMMU_DOMAIN_S2,
> @@ -603,12 +604,10 @@ static void arm_smmu_tlb_inv_context(void *cookie)
>  
>   if (stage1) {
>   base = ARM_SMMU_CB_BASE(smmu) + ARM_SMMU_CB(smmu, cfg->cbndx);
> - writel_relaxed(ARM_SMMU_CB_ASID(smmu, cfg),
> -base + ARM_SMMU_CB_S1_TLBIASID);
> + writel_relaxed(cfg->asid, base + ARM_SMMU_CB_S1_TLBIASID);
>   } else {
>   base = ARM_SMMU_GR0(smmu);
> - writel_relaxed(ARM_SMMU_CB_VMID(smmu, cfg),
> -base + ARM_SMMU_GR0_TLBIVMID);
> + writel_relaxed(cfg->vmid, base + ARM_SMMU_GR0_TLBIVMID);
>   }
>  
>   __arm_smmu_tlb_sync(smmu);
> @@ -629,14 +628,14 @@ static void arm_smmu_tlb_inv_range_nosync(unsigned long 
> iova, size_t size,
>  
>   if (cfg->fmt != ARM_SMMU_CTX_FMT_AARCH64) {
>   iova &= ~12UL;
> - iova |= ARM_SMMU_CB_ASID(smmu, cfg);
> + iova |= cfg->asid;
>   do {
>   writel_relaxed(iova, reg);
>   iova += granule;
>   } while (size -= granule);
>   } else {
>   iova >>= 12;
> - iova |= (u64)ARM_SMMU_CB_ASID(smmu, cfg) << 48;
> + iova |= (u64)cfg->asid << 48;
>   do {
>   writeq_relaxed(iova, reg);
>   iova += granule >> 12;
> @@ -653,7 +652,7 @@ static void arm_smmu_tlb_inv_range_nosync(unsigned long 
> iova, size_t size,
>   } while (size -= granule);
>   } else {
>   reg = ARM_SMMU_GR0(smmu) + ARM_SMMU_GR0_TLBIVMID;
> - writel_relaxed(ARM_SMMU_CB_VMID(smmu, cfg), reg);
> + writel_relaxed(cfg->vmid, reg);
>   }
>  }
>  
> @@ -735,7 +734,7 @@ static void arm_smmu_init_context_bank(struct 
> arm_smmu_domain *smmu_domain,
>   reg = CBA2R_RW64_32BIT;
>   /* 16-bit VMIDs live in CBA2R */
>   if (smmu->features & ARM_SMMU_FEAT_VMID16)
> - reg |= ARM_SMMU_CB_VMID(smmu, cfg) << CBA2R_VMID_SHIFT;
> + reg |= cfg->vmid << CBA2R_VMID_SHIFT;
>  
>   writel_relaxed(reg, gr1_base + ARM_SMMU_GR1_CBA2R(cfg->cbndx));
>   }
> @@ -754,26 +753,24 @@ static void arm_smmu_init_context_bank(struct 
> arm_smmu_domain *smmu_domain,
>   (CBAR_S1_MEMATTR_WB << CBAR_S1_MEMATTR_SHIFT);
>   } else if (!(smmu->features & ARM_SMMU_FEAT_VMID16)) {
>   /* 8-bit VMIDs live in CBAR */
> - reg |= ARM_SMMU_CB_VMID(smmu, cfg) << CBAR_VMID_SHIFT;
> + reg |= cfg->vmid << CBAR_VMID_SHIFT;
>   }
>   writel_relaxed(reg, gr1_base + ARM_SMMU_GR1_CBAR(cfg->cbndx));
>  
>   /* TTBRs */
>   if (stage1) {
> - u16 asid = ARM_SMMU_CB_ASID(smmu, cfg);
> -
>   if (cfg->fmt == ARM_SMMU_CTX_FMT_AARCH32_S) {
>   reg = pgtbl_cfg->arm_v7s_cfg.ttbr[0];
>   writel_relaxed(reg, cb_base + ARM_SMMU_CB_TTBR0);
>   reg = pgtbl_cfg->arm_v7s_cfg.ttbr[1];
>   

[PATCH v2 1/4] iommu/arm-smmu: Simplify ASID/VMID handling

2017-03-30 Thread Robin Murphy
Calculating ASIDs/VMIDs dynamically from arm_smmu_cfg was a neat trick,
but the global uniqueness workaround makes it somewhat more awkward, and
means we end up having to pass extra state around in certain cases just
to keep a handle on the offset.

We already have 16 bits going spare in arm_smmu_cfg; let's just
precalculate an ASID/VMID, plop it in there, and tidy up the users
accordingly. We'd also need something like this anyway if we ever get
near to thinking about SVM, so it's no bad thing.

Signed-off-by: Robin Murphy 
---

v2: No change

 drivers/iommu/arm-smmu.c | 36 +++-
 1 file changed, 19 insertions(+), 17 deletions(-)

diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
index abf6496843a6..34b745bf59f2 100644
--- a/drivers/iommu/arm-smmu.c
+++ b/drivers/iommu/arm-smmu.c
@@ -404,14 +404,15 @@ enum arm_smmu_context_fmt {
 struct arm_smmu_cfg {
u8  cbndx;
u8  irptndx;
+   union {
+   u16 asid;
+   u16 vmid;
+   };
u32 cbar;
enum arm_smmu_context_fmt   fmt;
 };
 #define INVALID_IRPTNDX0xff
 
-#define ARM_SMMU_CB_ASID(smmu, cfg) ((u16)(smmu)->cavium_id_base + 
(cfg)->cbndx)
-#define ARM_SMMU_CB_VMID(smmu, cfg) ((u16)(smmu)->cavium_id_base + 
(cfg)->cbndx + 1)
-
 enum arm_smmu_domain_stage {
ARM_SMMU_DOMAIN_S1 = 0,
ARM_SMMU_DOMAIN_S2,
@@ -603,12 +604,10 @@ static void arm_smmu_tlb_inv_context(void *cookie)
 
if (stage1) {
base = ARM_SMMU_CB_BASE(smmu) + ARM_SMMU_CB(smmu, cfg->cbndx);
-   writel_relaxed(ARM_SMMU_CB_ASID(smmu, cfg),
-  base + ARM_SMMU_CB_S1_TLBIASID);
+   writel_relaxed(cfg->asid, base + ARM_SMMU_CB_S1_TLBIASID);
} else {
base = ARM_SMMU_GR0(smmu);
-   writel_relaxed(ARM_SMMU_CB_VMID(smmu, cfg),
-  base + ARM_SMMU_GR0_TLBIVMID);
+   writel_relaxed(cfg->vmid, base + ARM_SMMU_GR0_TLBIVMID);
}
 
__arm_smmu_tlb_sync(smmu);
@@ -629,14 +628,14 @@ static void arm_smmu_tlb_inv_range_nosync(unsigned long 
iova, size_t size,
 
if (cfg->fmt != ARM_SMMU_CTX_FMT_AARCH64) {
iova &= ~12UL;
-   iova |= ARM_SMMU_CB_ASID(smmu, cfg);
+   iova |= cfg->asid;
do {
writel_relaxed(iova, reg);
iova += granule;
} while (size -= granule);
} else {
iova >>= 12;
-   iova |= (u64)ARM_SMMU_CB_ASID(smmu, cfg) << 48;
+   iova |= (u64)cfg->asid << 48;
do {
writeq_relaxed(iova, reg);
iova += granule >> 12;
@@ -653,7 +652,7 @@ static void arm_smmu_tlb_inv_range_nosync(unsigned long 
iova, size_t size,
} while (size -= granule);
} else {
reg = ARM_SMMU_GR0(smmu) + ARM_SMMU_GR0_TLBIVMID;
-   writel_relaxed(ARM_SMMU_CB_VMID(smmu, cfg), reg);
+   writel_relaxed(cfg->vmid, reg);
}
 }
 
@@ -735,7 +734,7 @@ static void arm_smmu_init_context_bank(struct 
arm_smmu_domain *smmu_domain,
reg = CBA2R_RW64_32BIT;
/* 16-bit VMIDs live in CBA2R */
if (smmu->features & ARM_SMMU_FEAT_VMID16)
-   reg |= ARM_SMMU_CB_VMID(smmu, cfg) << CBA2R_VMID_SHIFT;
+   reg |= cfg->vmid << CBA2R_VMID_SHIFT;
 
writel_relaxed(reg, gr1_base + ARM_SMMU_GR1_CBA2R(cfg->cbndx));
}
@@ -754,26 +753,24 @@ static void arm_smmu_init_context_bank(struct 
arm_smmu_domain *smmu_domain,
(CBAR_S1_MEMATTR_WB << CBAR_S1_MEMATTR_SHIFT);
} else if (!(smmu->features & ARM_SMMU_FEAT_VMID16)) {
/* 8-bit VMIDs live in CBAR */
-   reg |= ARM_SMMU_CB_VMID(smmu, cfg) << CBAR_VMID_SHIFT;
+   reg |= cfg->vmid << CBAR_VMID_SHIFT;
}
writel_relaxed(reg, gr1_base + ARM_SMMU_GR1_CBAR(cfg->cbndx));
 
/* TTBRs */
if (stage1) {
-   u16 asid = ARM_SMMU_CB_ASID(smmu, cfg);
-
if (cfg->fmt == ARM_SMMU_CTX_FMT_AARCH32_S) {
reg = pgtbl_cfg->arm_v7s_cfg.ttbr[0];
writel_relaxed(reg, cb_base + ARM_SMMU_CB_TTBR0);
reg = pgtbl_cfg->arm_v7s_cfg.ttbr[1];
writel_relaxed(reg, cb_base + ARM_SMMU_CB_TTBR1);
-   writel_relaxed(asid, cb_base + ARM_SMMU_CB_CONTEXTIDR);
+   writel_relaxed(cfg->asid, cb_base + 
ARM_SMMU_CB_CONTEXTIDR);
   

[PATCH v2 4/4] iommu/arm-smmu: Poll for TLB sync completion more effectively

2017-03-30 Thread Robin Murphy
On relatively slow development platforms and software models, the
inefficiency of our TLB sync loop tends not to show up - for instance on
a Juno r1 board I typically see the TLBI has completed of its own accord
by the time we get to the sync, such that the latter finishes instantly.

However, on larger systems doing real I/O, it's less realistic for the
TLBs to go idle immediately, and at that point falling into the 1MHz
polling loop turns out to throw away performance drastically. Let's
strike a balance by polling more than once between pauses, such that we
have much more chance of catching normal operations completing before
committing to the fixed delay, but also backing off exponentially, since
if a sync really hasn't completed within one or two "reasonable time"
periods, it becomes increasingly unlikely that it ever will.

Signed-off-by: Robin Murphy 
---

v2: Restored the cpu_relax() to the inner loop

 drivers/iommu/arm-smmu.c | 18 ++
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
index 759d5f261160..a15ca86e9703 100644
--- a/drivers/iommu/arm-smmu.c
+++ b/drivers/iommu/arm-smmu.c
@@ -162,6 +162,7 @@
 #define ARM_SMMU_GR0_sTLBGSTATUS   0x74
 #define sTLBGSTATUS_GSACTIVE   (1 << 0)
 #define TLB_LOOP_TIMEOUT   100 /* 1s! */
+#define TLB_SPIN_COUNT 10
 
 /* Stream mapping registers */
 #define ARM_SMMU_GR0_SMR(n)(0x800 + ((n) << 2))
@@ -574,18 +575,19 @@ static void __arm_smmu_free_bitmap(unsigned long *map, 
int idx)
 static void __arm_smmu_tlb_sync(struct arm_smmu_device *smmu,
void __iomem *sync, void __iomem *status)
 {
-   int count = 0;
+   unsigned int spin_cnt, delay;
 
writel_relaxed(0, sync);
-   while (readl_relaxed(status) & sTLBGSTATUS_GSACTIVE) {
-   cpu_relax();
-   if (++count == TLB_LOOP_TIMEOUT) {
-   dev_err_ratelimited(smmu->dev,
-   "TLB sync timed out -- SMMU may be deadlocked\n");
-   return;
+   for (delay = 1; delay < TLB_LOOP_TIMEOUT; delay *= 2) {
+   for (spin_cnt = TLB_SPIN_COUNT; spin_cnt > 0; spin_cnt--) {
+   if (!(readl_relaxed(status) & sTLBGSTATUS_GSACTIVE))
+   return;
+   cpu_relax();
}
-   udelay(1);
+   udelay(delay);
}
+   dev_err_ratelimited(smmu->dev,
+   "TLB sync timed out -- SMMU may be deadlocked\n");
 }
 
 static void arm_smmu_tlb_sync_global(struct arm_smmu_device *smmu)
-- 
2.11.0.dirty

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


[PATCH v2 2/4] iommu/arm-smmu: Tidy up context bank indexing

2017-03-30 Thread Robin Murphy
ARM_AMMU_CB() is calculated relative to ARM_SMMU_CB_BASE(), but the
latter is never of use on its own, and what we end up with is the same
ARM_SMMU_CB_BASE() + ARM_AMMU_CB() expression being duplicated at every
callsite. Folding the two together gives us a self-contained context
bank accessor which is much more pleasant to work with.

Secondly, we might as well simplify CB_BASE itself at the same time.
We use the address space size for its own sake precisely once, at probe
time, and every other usage is to dynamically calculate CB_BASE over
and over and over again. Let's flip things around so that we just
maintain the CB_BASE address directly.

Signed-off-by: Robin Murphy 
---

v2: No change

 drivers/iommu/arm-smmu.c | 29 ++---
 1 file changed, 14 insertions(+), 15 deletions(-)

diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
index 34b745bf59f2..e79b623f9165 100644
--- a/drivers/iommu/arm-smmu.c
+++ b/drivers/iommu/arm-smmu.c
@@ -216,8 +216,7 @@ enum arm_smmu_s2cr_privcfg {
 #define CBA2R_VMID_MASK0x
 
 /* Translation context bank */
-#define ARM_SMMU_CB_BASE(smmu) ((smmu)->base + ((smmu)->size >> 1))
-#define ARM_SMMU_CB(smmu, n)   ((n) * (1 << (smmu)->pgshift))
+#define ARM_SMMU_CB(smmu, n)   ((smmu)->cb_base + ((n) << (smmu)->pgshift))
 
 #define ARM_SMMU_CB_SCTLR  0x0
 #define ARM_SMMU_CB_ACTLR  0x4
@@ -344,7 +343,7 @@ struct arm_smmu_device {
struct device   *dev;
 
void __iomem*base;
-   unsigned long   size;
+   void __iomem*cb_base;
unsigned long   pgshift;
 
 #define ARM_SMMU_FEAT_COHERENT_WALK(1 << 0)
@@ -603,7 +602,7 @@ static void arm_smmu_tlb_inv_context(void *cookie)
void __iomem *base;
 
if (stage1) {
-   base = ARM_SMMU_CB_BASE(smmu) + ARM_SMMU_CB(smmu, cfg->cbndx);
+   base = ARM_SMMU_CB(smmu, cfg->cbndx);
writel_relaxed(cfg->asid, base + ARM_SMMU_CB_S1_TLBIASID);
} else {
base = ARM_SMMU_GR0(smmu);
@@ -623,7 +622,7 @@ static void arm_smmu_tlb_inv_range_nosync(unsigned long 
iova, size_t size,
void __iomem *reg;
 
if (stage1) {
-   reg = ARM_SMMU_CB_BASE(smmu) + ARM_SMMU_CB(smmu, cfg->cbndx);
+   reg = ARM_SMMU_CB(smmu, cfg->cbndx);
reg += leaf ? ARM_SMMU_CB_S1_TLBIVAL : ARM_SMMU_CB_S1_TLBIVA;
 
if (cfg->fmt != ARM_SMMU_CTX_FMT_AARCH64) {
@@ -642,7 +641,7 @@ static void arm_smmu_tlb_inv_range_nosync(unsigned long 
iova, size_t size,
} while (size -= granule);
}
} else if (smmu->version == ARM_SMMU_V2) {
-   reg = ARM_SMMU_CB_BASE(smmu) + ARM_SMMU_CB(smmu, cfg->cbndx);
+   reg = ARM_SMMU_CB(smmu, cfg->cbndx);
reg += leaf ? ARM_SMMU_CB_S2_TLBIIPAS2L :
  ARM_SMMU_CB_S2_TLBIIPAS2;
iova >>= 12;
@@ -672,7 +671,7 @@ static irqreturn_t arm_smmu_context_fault(int irq, void 
*dev)
struct arm_smmu_device *smmu = smmu_domain->smmu;
void __iomem *cb_base;
 
-   cb_base = ARM_SMMU_CB_BASE(smmu) + ARM_SMMU_CB(smmu, cfg->cbndx);
+   cb_base = ARM_SMMU_CB(smmu, cfg->cbndx);
fsr = readl_relaxed(cb_base + ARM_SMMU_CB_FSR);
 
if (!(fsr & FSR_FAULT))
@@ -725,7 +724,7 @@ static void arm_smmu_init_context_bank(struct 
arm_smmu_domain *smmu_domain,
 
gr1_base = ARM_SMMU_GR1(smmu);
stage1 = cfg->cbar != CBAR_TYPE_S2_TRANS;
-   cb_base = ARM_SMMU_CB_BASE(smmu) + ARM_SMMU_CB(smmu, cfg->cbndx);
+   cb_base = ARM_SMMU_CB(smmu, cfg->cbndx);
 
if (smmu->version > ARM_SMMU_V1) {
if (cfg->fmt == ARM_SMMU_CTX_FMT_AARCH64)
@@ -1007,7 +1006,7 @@ static void arm_smmu_destroy_domain_context(struct 
iommu_domain *domain)
 * Disable the context bank and free the page tables before freeing
 * it.
 */
-   cb_base = ARM_SMMU_CB_BASE(smmu) + ARM_SMMU_CB(smmu, cfg->cbndx);
+   cb_base = ARM_SMMU_CB(smmu, cfg->cbndx);
writel_relaxed(0, cb_base + ARM_SMMU_CB_SCTLR);
 
if (cfg->irptndx != INVALID_IRPTNDX) {
@@ -1358,7 +1357,7 @@ static phys_addr_t arm_smmu_iova_to_phys_hard(struct 
iommu_domain *domain,
u64 phys;
unsigned long va;
 
-   cb_base = ARM_SMMU_CB_BASE(smmu) + ARM_SMMU_CB(smmu, cfg->cbndx);
+   cb_base = ARM_SMMU_CB(smmu, cfg->cbndx);
 
/* ATS1 registers can only be written atomically */
va = iova & ~0xfffUL;
@@ -1685,7 +1684,7 @@ static void arm_smmu_device_reset(struct arm_smmu_device 
*smmu)
 
/* Make sure all context banks are disabled and clear CB_FSR  */
for (i = 0; i < smmu->num_context_banks; ++i) {
-   cb_base = ARM_SMMU_CB_BASE(smmu) + ARM_SMMU_CB(smmu, i);
+   cb_base = 

[PATCH v2 3/4] iommu/arm-smmu: Use per-context TLB sync as appropriate

2017-03-30 Thread Robin Murphy
TLB synchronisation typically involves the SMMU blocking all incoming
transactions until the TLBs report completion of all outstanding
operations. In the common SMMUv2 configuration of a single distributed
SMMU serving multiple peripherals, that means that a single unmap
request has the potential to bring the hammer down on the entire system
if synchronised globally. Since stage 1 contexts, and stage 2 contexts
under SMMUv2, offer local sync operations, let's make use of those
wherever we can in the hope of minimising global disruption.

To that end, rather than add any more branches to the already unwieldy
monolithic TLB maintenance ops, break them up into smaller, neater,
functions which we can then mix and match as appropriate.

Signed-off-by: Robin Murphy 
---

v2: Undo unnecessary refactoring of arm_smmu_tlb_inv_range_nosync()
(and forget about further attempts, since this turns out to be
all of 3 lines longer than v1)

 drivers/iommu/arm-smmu.c | 115 +--
 1 file changed, 81 insertions(+), 34 deletions(-)

diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
index e79b623f9165..759d5f261160 100644
--- a/drivers/iommu/arm-smmu.c
+++ b/drivers/iommu/arm-smmu.c
@@ -237,6 +237,8 @@ enum arm_smmu_s2cr_privcfg {
 #define ARM_SMMU_CB_S1_TLBIVAL 0x620
 #define ARM_SMMU_CB_S2_TLBIIPAS2   0x630
 #define ARM_SMMU_CB_S2_TLBIIPAS2L  0x638
+#define ARM_SMMU_CB_TLBSYNC0x7f0
+#define ARM_SMMU_CB_TLBSTATUS  0x7f4
 #define ARM_SMMU_CB_ATS1PR 0x800
 #define ARM_SMMU_CB_ATSR   0x8f0
 
@@ -569,14 +571,13 @@ static void __arm_smmu_free_bitmap(unsigned long *map, 
int idx)
 }
 
 /* Wait for any pending TLB invalidations to complete */
-static void __arm_smmu_tlb_sync(struct arm_smmu_device *smmu)
+static void __arm_smmu_tlb_sync(struct arm_smmu_device *smmu,
+   void __iomem *sync, void __iomem *status)
 {
int count = 0;
-   void __iomem *gr0_base = ARM_SMMU_GR0(smmu);
 
-   writel_relaxed(0, gr0_base + ARM_SMMU_GR0_sTLBGSYNC);
-   while (readl_relaxed(gr0_base + ARM_SMMU_GR0_sTLBGSTATUS)
-  & sTLBGSTATUS_GSACTIVE) {
+   writel_relaxed(0, sync);
+   while (readl_relaxed(status) & sTLBGSTATUS_GSACTIVE) {
cpu_relax();
if (++count == TLB_LOOP_TIMEOUT) {
dev_err_ratelimited(smmu->dev,
@@ -587,29 +588,49 @@ static void __arm_smmu_tlb_sync(struct arm_smmu_device 
*smmu)
}
 }
 
-static void arm_smmu_tlb_sync(void *cookie)
+static void arm_smmu_tlb_sync_global(struct arm_smmu_device *smmu)
 {
-   struct arm_smmu_domain *smmu_domain = cookie;
-   __arm_smmu_tlb_sync(smmu_domain->smmu);
+   void __iomem *base = ARM_SMMU_GR0(smmu);
+
+   __arm_smmu_tlb_sync(smmu, base + ARM_SMMU_GR0_sTLBGSYNC,
+   base + ARM_SMMU_GR0_sTLBGSTATUS);
 }
 
-static void arm_smmu_tlb_inv_context(void *cookie)
+static void arm_smmu_tlb_sync_context(void *cookie)
+{
+   struct arm_smmu_domain *smmu_domain = cookie;
+   struct arm_smmu_device *smmu = smmu_domain->smmu;
+   void __iomem *base = ARM_SMMU_CB(smmu, smmu_domain->cfg.cbndx);
+
+   __arm_smmu_tlb_sync(smmu, base + ARM_SMMU_CB_TLBSYNC,
+   base + ARM_SMMU_CB_TLBSTATUS);
+}
+
+static void arm_smmu_tlb_sync_vmid(void *cookie)
+{
+   struct arm_smmu_domain *smmu_domain = cookie;
+
+   arm_smmu_tlb_sync_global(smmu_domain->smmu);
+}
+
+static void arm_smmu_tlb_inv_context_s1(void *cookie)
 {
struct arm_smmu_domain *smmu_domain = cookie;
struct arm_smmu_cfg *cfg = _domain->cfg;
+   void __iomem *base = ARM_SMMU_CB(smmu_domain->smmu, cfg->cbndx);
+
+   writel_relaxed(cfg->asid, base + ARM_SMMU_CB_S1_TLBIASID);
+   arm_smmu_tlb_sync_context(cookie);
+}
+
+static void arm_smmu_tlb_inv_context_s2(void *cookie)
+{
+   struct arm_smmu_domain *smmu_domain = cookie;
struct arm_smmu_device *smmu = smmu_domain->smmu;
-   bool stage1 = cfg->cbar != CBAR_TYPE_S2_TRANS;
-   void __iomem *base;
+   void __iomem *base = ARM_SMMU_GR0(smmu);
 
-   if (stage1) {
-   base = ARM_SMMU_CB(smmu, cfg->cbndx);
-   writel_relaxed(cfg->asid, base + ARM_SMMU_CB_S1_TLBIASID);
-   } else {
-   base = ARM_SMMU_GR0(smmu);
-   writel_relaxed(cfg->vmid, base + ARM_SMMU_GR0_TLBIVMID);
-   }
-
-   __arm_smmu_tlb_sync(smmu);
+   writel_relaxed(smmu_domain->cfg.vmid, base + ARM_SMMU_GR0_TLBIVMID);
+   arm_smmu_tlb_sync_global(smmu);
 }
 
 static void arm_smmu_tlb_inv_range_nosync(unsigned long iova, size_t size,
@@ -617,12 +638,10 @@ static void arm_smmu_tlb_inv_range_nosync(unsigned long 
iova, size_t size,
 {
struct arm_smmu_domain *smmu_domain = cookie;
struct arm_smmu_cfg *cfg = _domain->cfg;
-   struct arm_smmu_device *smmu = 

[PATCH v2 0/4] ARM SMMU TLB sync improvements

2017-03-30 Thread Robin Murphy
Hi Will,

Here's a quick v2 to address your comments and drop the needless meddling
(whaddaya know, it makes the whole lot look simpler!)

I'll put it on my list to take a look at SMMUv3 queue polling as suggested.

Robin.

Robin Murphy (4):
  iommu/arm-smmu: Simplify ASID/VMID handling
  iommu/arm-smmu: Tidy up context bank indexing
  iommu/arm-smmu: Use per-context TLB sync as appropriate
  iommu/arm-smmu: Poll for TLB sync completion more effectively

 drivers/iommu/arm-smmu.c | 182 ++-
 1 file changed, 116 insertions(+), 66 deletions(-)

-- 
2.11.0.dirty

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [PATCH 4/4] iommu/arm-smmu: Use per-context TLB sync as appropriate

2017-03-30 Thread Robin Murphy
On 30/03/17 15:37, Will Deacon wrote:
> Hi Robin,
> 
> This mostly looks great, but I have a couple of minor comments below.
> 
> On Tue, Mar 07, 2017 at 06:09:07PM +, Robin Murphy wrote:
>> TLB synchronisation typically involves the SMMU blocking all incoming
>> transactions until the TLBs report completion of all outstanding
>> operations. In the common SMMUv2 configuration of a single distributed
>> SMMU serving multiple peripherals, that means that a single unmap
>> request has the potential to bring the hammer down on the entire system
>> if synchronised globally. Since stage 1 contexts, and stage 2 contexts
>> under SMMUv2, offer local sync operations, let's make use of those
>> wherever we can in the hope of minimising global disruption.
>>
>> To that end, rather than add any more branches to the already unwieldy
>> monolithic TLB maintenance ops, break them up into smaller, neater,
>> functions which we can then mix and match as appropriate.
>>
>> Signed-off-by: Robin Murphy 
>> ---
>>  drivers/iommu/arm-smmu.c | 156 
>> ++-
>>  1 file changed, 100 insertions(+), 56 deletions(-)
>>
>> diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
>> index c8aafe304171..f7411109670f 100644
>> --- a/drivers/iommu/arm-smmu.c
>> +++ b/drivers/iommu/arm-smmu.c
>> @@ -237,6 +237,8 @@ enum arm_smmu_s2cr_privcfg {
>>  #define ARM_SMMU_CB_S1_TLBIVAL  0x620
>>  #define ARM_SMMU_CB_S2_TLBIIPAS20x630
>>  #define ARM_SMMU_CB_S2_TLBIIPAS2L   0x638
>> +#define ARM_SMMU_CB_TLBSYNC 0x7f0
>> +#define ARM_SMMU_CB_TLBSTATUS   0x7f4
>>  #define ARM_SMMU_CB_ATS1PR  0x800
>>  #define ARM_SMMU_CB_ATSR0x8f0
>>  
>> @@ -569,14 +571,13 @@ static void __arm_smmu_free_bitmap(unsigned long *map, 
>> int idx)
>>  }
>>  
>>  /* Wait for any pending TLB invalidations to complete */
>> -static void __arm_smmu_tlb_sync(struct arm_smmu_device *smmu)
>> +static void __arm_smmu_tlb_sync(struct arm_smmu_device *smmu,
>> +void __iomem *sync, void __iomem *status)
> 
> Given that you take the arm_smmu_device anyway, I think I'd prefer just
> passing the offsets for sync and status and avoiding the additions
> in the caller (a bit like your other patch in this series ;).

Note that the sole reason for passing the arm_smmu_device is for the
dev_err(), but I neither want to remove that nor duplicate it across the
callers...

However, the concrete reason for not passing offsets is that this
function serves for both global and local syncs, so there is no single
base address that can be assumed. At one point I toyed with just passing
a context bank number (using -1 for "global") but even I thought that
ended up looking awful ;)

>>  static void arm_smmu_tlb_inv_range_nosync(unsigned long iova, size_t size,
>> @@ -617,48 +638,66 @@ static void arm_smmu_tlb_inv_range_nosync(unsigned 
>> long iova, size_t size,
>>  {
>>  struct arm_smmu_domain *smmu_domain = cookie;
>>  struct arm_smmu_cfg *cfg = _domain->cfg;
>> -struct arm_smmu_device *smmu = smmu_domain->smmu;
>>  bool stage1 = cfg->cbar != CBAR_TYPE_S2_TRANS;
>> -void __iomem *reg;
>> +void __iomem *reg = ARM_SMMU_CB(smmu_domain->smmu, cfg->cbndx);
>> +size_t step;
>>  
>> -if (stage1) {
>> -reg = ARM_SMMU_CB(smmu, cfg->cbndx);
>> -reg += leaf ? ARM_SMMU_CB_S1_TLBIVAL : ARM_SMMU_CB_S1_TLBIVA;
>> -
>> -if (cfg->fmt != ARM_SMMU_CTX_FMT_AARCH64) {
>> -iova &= ~12UL;
>> -iova |= cfg->asid;
>> -do {
>> -writel_relaxed(iova, reg);
>> -iova += granule;
>> -} while (size -= granule);
>> -} else {
>> -iova >>= 12;
>> -iova |= (u64)cfg->asid << 48;
>> -do {
>> -writeq_relaxed(iova, reg);
>> -iova += granule >> 12;
>> -} while (size -= granule);
>> -}
>> -} else if (smmu->version == ARM_SMMU_V2) {
>> -reg = ARM_SMMU_CB(smmu, cfg->cbndx);
>> +if (stage1)
>> +reg += leaf ? ARM_SMMU_CB_S1_TLBIVAL :
>> +  ARM_SMMU_CB_S1_TLBIVA;
>> +else
>>  reg += leaf ? ARM_SMMU_CB_S2_TLBIIPAS2L :
>>ARM_SMMU_CB_S2_TLBIIPAS2;
>> -iova >>= 12;
>> -do {
>> -smmu_write_atomic_lq(iova, reg);
>> -iova += granule >> 12;
>> -} while (size -= granule);
>> +
>> +if (stage1 && cfg->fmt != ARM_SMMU_CTX_FMT_AARCH64) {
>> +iova &= ~12UL;
>> +iova |= cfg->asid;
>> +step = granule;
>>  } else {
>> -reg = ARM_SMMU_GR0(smmu) + ARM_SMMU_GR0_TLBIVMID;
>> -writel_relaxed(cfg->vmid, reg);
>> +   

RE: [PATCH] iommu/amd: flush IOTLB for specific domains only

2017-03-30 Thread Nath, Arindam
>-Original Message-
>From: Daniel Drake [mailto:dr...@endlessm.com]
>Sent: Thursday, March 30, 2017 7:15 PM
>To: Nath, Arindam
>Cc: j...@8bytes.org; Deucher, Alexander; Bridgman, John; amd-
>g...@lists.freedesktop.org; iommu@lists.linux-foundation.org; Suthikulpanit,
>Suravee; Linux Upstreaming Team
>Subject: Re: [PATCH] iommu/amd: flush IOTLB for specific domains only
>
>On Thu, Mar 30, 2017 at 12:23 AM, Nath, Arindam 
>wrote:
>> Daniel, did you get chance to test this patch?
>
>Not yet. Should we test it alone or alongside "PCI: Blacklist AMD
>Stoney GPU devices for ATS"?

You should test this patch alone.

Thanks,
Arindam

>
>Thanks
>Daniel
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [PATCH 0/4] ARM SMMU per-context TLB sync

2017-03-30 Thread Will Deacon
On Tue, Mar 07, 2017 at 06:09:03PM +, Robin Murphy wrote:
> The discussion around context-level access for Qualcomm SMMUs reminded
> me to dig up this patch I started ages ago and finish it off. As it's
> ended up, it's now a mini-series, with some new preparatory cleanup
> manifesting as patches 2 and 3. Patch 1 is broken out of patch 3 for
> clarity as somewhat of a fix in its own right, in that it's really an
> entirely unrelated thing which came up in parallel, but happens to
> be inherent to code I'm touching here anyway.

Apart from the first patch, most of this is looking good. Please can you
respin the series without the first patch and with my comments addressed?

Cheers,

Will
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [PATCH 5/4] iommu/arm-smmu: Poll for TLB sync completion more effectively

2017-03-30 Thread Will Deacon
On Thu, Mar 23, 2017 at 05:59:40PM +, Robin Murphy wrote:
> On relatively slow development platforms and software models, the
> inefficiency of our TLB sync loop tends not to show up - for instance on
> a Juno r1 board I typically see the TLBI has completed of its own accord
> by the time we get to the sync, such that the latter finishes instantly.
> 
> However, on larger systems doing real I/O, it's less realistic for the
> TLBs to go idle immediately, and at that point falling into the 1MHz
> polling loop turns out to throw away performance drastically. Let's
> strike a balance by polling more than once between pauses, such that we
> have much more chance of catching normal operations completing before
> committing to the fixed delay, but also backing off exponentially, since
> if a sync really hasn't completed within one or two "reasonable time"
> periods, it becomes increasingly unlikely that it ever will.
> 
> Signed-off-by: Robin Murphy 
> ---
>  drivers/iommu/arm-smmu.c | 18 +-
>  1 file changed, 9 insertions(+), 9 deletions(-)

Thanks, I like this patch.

> diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
> index f7411109670f..aa17f3d937a0 100644
> --- a/drivers/iommu/arm-smmu.c
> +++ b/drivers/iommu/arm-smmu.c
> @@ -162,6 +162,7 @@
>  #define ARM_SMMU_GR0_sTLBGSTATUS 0x74
>  #define sTLBGSTATUS_GSACTIVE (1 << 0)
>  #define TLB_LOOP_TIMEOUT 100 /* 1s! */
> +#define TLB_SPIN_COUNT   10
>  
>  /* Stream mapping registers */
>  #define ARM_SMMU_GR0_SMR(n)  (0x800 + ((n) << 2))
> @@ -574,18 +575,17 @@ static void __arm_smmu_free_bitmap(unsigned long *map, 
> int idx)
>  static void __arm_smmu_tlb_sync(struct arm_smmu_device *smmu,
>   void __iomem *sync, void __iomem *status)
>  {
> - int count = 0;
> + unsigned int spin_count, delay;
>  
>   writel_relaxed(0, sync);
> - while (readl_relaxed(status) & sTLBGSTATUS_GSACTIVE) {
> - cpu_relax();
> - if (++count == TLB_LOOP_TIMEOUT) {
> - dev_err_ratelimited(smmu->dev,
> - "TLB sync timed out -- SMMU may be deadlocked\n");
> - return;
> - }
> - udelay(1);
> + for (delay = 1; delay < TLB_LOOP_TIMEOUT; delay *= 2) {
> + for (spin_count = TLB_SPIN_COUNT; spin_count > 0; spin_count--)
> + if (!(readl_relaxed(status) & sTLBGSTATUS_GSACTIVE))
> + return;

Can you keep the cpu_relax in the inner loop please?

> + udelay(delay);
>   }
> + dev_err_ratelimited(smmu->dev,
> + "TLB sync timed out -- SMMU may be deadlocked\n");

Whilst we can have WFE-based spinning with SMMUv3, I suppose we should
do something similar in queue_poll_cons... Fancy taking a look?

Will
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [PATCH 4/4] iommu/arm-smmu: Use per-context TLB sync as appropriate

2017-03-30 Thread Will Deacon
Hi Robin,

This mostly looks great, but I have a couple of minor comments below.

On Tue, Mar 07, 2017 at 06:09:07PM +, Robin Murphy wrote:
> TLB synchronisation typically involves the SMMU blocking all incoming
> transactions until the TLBs report completion of all outstanding
> operations. In the common SMMUv2 configuration of a single distributed
> SMMU serving multiple peripherals, that means that a single unmap
> request has the potential to bring the hammer down on the entire system
> if synchronised globally. Since stage 1 contexts, and stage 2 contexts
> under SMMUv2, offer local sync operations, let's make use of those
> wherever we can in the hope of minimising global disruption.
> 
> To that end, rather than add any more branches to the already unwieldy
> monolithic TLB maintenance ops, break them up into smaller, neater,
> functions which we can then mix and match as appropriate.
> 
> Signed-off-by: Robin Murphy 
> ---
>  drivers/iommu/arm-smmu.c | 156 
> ++-
>  1 file changed, 100 insertions(+), 56 deletions(-)
> 
> diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
> index c8aafe304171..f7411109670f 100644
> --- a/drivers/iommu/arm-smmu.c
> +++ b/drivers/iommu/arm-smmu.c
> @@ -237,6 +237,8 @@ enum arm_smmu_s2cr_privcfg {
>  #define ARM_SMMU_CB_S1_TLBIVAL   0x620
>  #define ARM_SMMU_CB_S2_TLBIIPAS2 0x630
>  #define ARM_SMMU_CB_S2_TLBIIPAS2L0x638
> +#define ARM_SMMU_CB_TLBSYNC  0x7f0
> +#define ARM_SMMU_CB_TLBSTATUS0x7f4
>  #define ARM_SMMU_CB_ATS1PR   0x800
>  #define ARM_SMMU_CB_ATSR 0x8f0
>  
> @@ -569,14 +571,13 @@ static void __arm_smmu_free_bitmap(unsigned long *map, 
> int idx)
>  }
>  
>  /* Wait for any pending TLB invalidations to complete */
> -static void __arm_smmu_tlb_sync(struct arm_smmu_device *smmu)
> +static void __arm_smmu_tlb_sync(struct arm_smmu_device *smmu,
> + void __iomem *sync, void __iomem *status)

Given that you take the arm_smmu_device anyway, I think I'd prefer just
passing the offsets for sync and status and avoiding the additions
in the caller (a bit like your other patch in this series ;).

>  static void arm_smmu_tlb_inv_range_nosync(unsigned long iova, size_t size,
> @@ -617,48 +638,66 @@ static void arm_smmu_tlb_inv_range_nosync(unsigned long 
> iova, size_t size,
>  {
>   struct arm_smmu_domain *smmu_domain = cookie;
>   struct arm_smmu_cfg *cfg = _domain->cfg;
> - struct arm_smmu_device *smmu = smmu_domain->smmu;
>   bool stage1 = cfg->cbar != CBAR_TYPE_S2_TRANS;
> - void __iomem *reg;
> + void __iomem *reg = ARM_SMMU_CB(smmu_domain->smmu, cfg->cbndx);
> + size_t step;
>  
> - if (stage1) {
> - reg = ARM_SMMU_CB(smmu, cfg->cbndx);
> - reg += leaf ? ARM_SMMU_CB_S1_TLBIVAL : ARM_SMMU_CB_S1_TLBIVA;
> -
> - if (cfg->fmt != ARM_SMMU_CTX_FMT_AARCH64) {
> - iova &= ~12UL;
> - iova |= cfg->asid;
> - do {
> - writel_relaxed(iova, reg);
> - iova += granule;
> - } while (size -= granule);
> - } else {
> - iova >>= 12;
> - iova |= (u64)cfg->asid << 48;
> - do {
> - writeq_relaxed(iova, reg);
> - iova += granule >> 12;
> - } while (size -= granule);
> - }
> - } else if (smmu->version == ARM_SMMU_V2) {
> - reg = ARM_SMMU_CB(smmu, cfg->cbndx);
> + if (stage1)
> + reg += leaf ? ARM_SMMU_CB_S1_TLBIVAL :
> +   ARM_SMMU_CB_S1_TLBIVA;
> + else
>   reg += leaf ? ARM_SMMU_CB_S2_TLBIIPAS2L :
> ARM_SMMU_CB_S2_TLBIIPAS2;
> - iova >>= 12;
> - do {
> - smmu_write_atomic_lq(iova, reg);
> - iova += granule >> 12;
> - } while (size -= granule);
> +
> + if (stage1 && cfg->fmt != ARM_SMMU_CTX_FMT_AARCH64) {
> + iova &= ~12UL;
> + iova |= cfg->asid;
> + step = granule;
>   } else {
> - reg = ARM_SMMU_GR0(smmu) + ARM_SMMU_GR0_TLBIVMID;
> - writel_relaxed(cfg->vmid, reg);
> + iova >>= 12;
> + step = granule >> 12;
> + if (stage1)
> + iova |= (u64)cfg->asid << 48;
>   }
> +
> + do {
> + smmu_write_atomic_lq(iova, reg);
> + iova += step;
> + } while (size -= granule);

There seems to be a lot of refactoring going on here, but I'm not entirely
comfortable with the unconditional move to smmu_write_atomic_lq. Given the
way in which arm_smmu_tlb_inv_range_nosync is now called (i.e. only for
stage-1 or SMMUv2 stage-2), then I think you 

Re: [PATCH] arm64/dma-mapping: fix DMA_ATTR_FORCE_CONTIGUOUS mmaping code

2017-03-30 Thread Robin Murphy
On 30/03/17 12:53, Geert Uytterhoeven wrote:
> Hi Robin,
> 
> On Thu, Mar 30, 2017 at 1:46 PM, Robin Murphy  wrote:
>> On 30/03/17 12:16, Andrzej Hajda wrote:
>> [...]
>>> I guess Geert's proposition to create pages permanently is also not
>>> acceptable[2]. So how to fix crashes which appeared after patch adding
>> If I'm not mistaken, creating the pages permanently is what the
>> !DMA_ATTR_FORCE_CONTIGUOUS does? The only difference is where
>> the underlying memory is allocated from.
>>
>> Am I missing something?
> Quoting Robin from his response:
>> in general is is not
>> safe to assume a coherent allocation is backed by struct pages at all
> As I said before I am not familiar with the subject, so it is possible I
> misunderstood something.
 That's from the perspective of external callers of
 dma_alloc_coherent()/dma_alloc_attrs(), wherein
 dma_alloc_from_coherent() may have returned device-specific memory
 without calling into the arch dma_map_ops implementation. Internal to
 the arm64 implementation, however, everything *we* allocate comes from
 either CMA or the normal page allocator, so should always be backed by
 real kernel pages.

 Robin.
>>>
>>> OK, so what do you exactly mean by "The general point still stands", my
>>> patch fixes handling of allocations made internally?
>>
>> That since FORCE_CONTIGUOUS allocations always come from CMA, you can
>> let the existing CMA-based implementations handle them just by fixing up
>> dma_addr appropriately.
>>
>>> Anyway as I understand approach "creating the pages permanently" in
>>> __iommu_alloc_attrs is OK. Is it worth to create patch adding it? It
>>> should solve the issue as well and also looks saner (for me).
>>
>> Sure, you *could* - there's nothing technically wrong with that other
>> than violating a strict interpretation of the iommu-dma API
>> documentation if you pass it into iommu_dma_mmap() - it's just that the
>> only point of using the pages array here in the first place is to cover
>> the general case where an allocation might not be physically contiguous.
>> If you have a guarantee that a given allocation definitely *is* both
>> physically and virtually contiguous, then that's a bunch of extra work
>> you simply have no need to do.
> 
> The same can be said for dma_common_contiguous_remap() below...

Indeed it can. See if you can spot anything I've said in defence of that
particular function ;)

>> If you do want to go down that route, then I would much rather we fix
>> dma_common_contiguous_remap() to leave a valid array in area->pages in
>> the first place, than be temporarily faking them up around individual calls.
> 
> The only point of using the pages array here in the first place is to cover
> the general case in dma_common_pages_remap().
>
> Providing a contiguous variant of map_vm_area() could resolve that.

Isn't that what remap_pfn_range() already is? iommu_dma_mmap() had to
exist specifically because the likes of dma_common_mmap() *are* using
that on the assumption of physically contiguous ranges. I don't really
have the time or inclination to go digging into this myself, but there's
almost certainly room for some improvement and/or cleanup in the common
code too (and as I said, if something can be done there than I would
rather it were tackled head-on than worked around with point fixes in
the arch code).

Robin.

> 
> Gr{oetje,eeting}s,
> 
> Geert
> 
> --
> Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- 
> ge...@linux-m68k.org
> 
> In personal conversations with technical people, I call myself a hacker. But
> when I'm talking to journalists I just say "programmer" or something like 
> that.
> -- Linus Torvalds
> 

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [PATCH 1/4] iommu/arm-smmu: Handle size mismatches better

2017-03-30 Thread Will Deacon
Hi Robin,

On Tue, Mar 07, 2017 at 06:09:04PM +, Robin Murphy wrote:
> We currently warn if the firmware-described region size differs from the
> SMMU address space size reported by the hardware, but continue to use
> the former to calculate where our context bank base should be,
> effectively guaranteeing that things will not work correctly.
> 
> Since over-mapping is effectively harmless, and under-mapping can be OK
> provided all the usable context banks are still covered, let's let the
> hardware information take precedence in the case of a mismatch, such
> that we get the correct context bank base and in most cases things will
> actually work instead of silently misbehaving. And at worst, if the
> firmware is wrong enough to have not mapped something we actually try to
> use, the resulting out-of-bounds access will hopefully provide a much
> more obvious clue.
> 
> Signed-off-by: Robin Murphy 
> ---
>  drivers/iommu/arm-smmu.c | 4 +++-
>  1 file changed, 3 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
> index abf6496843a6..bc7ef6a0c54d 100644
> --- a/drivers/iommu/arm-smmu.c
> +++ b/drivers/iommu/arm-smmu.c
> @@ -1864,10 +1864,12 @@ static int arm_smmu_device_cfg_probe(struct 
> arm_smmu_device *smmu)
>   /* Check for size mismatch of SMMU address space from mapped region */
>   size = 1 << (((id >> ID1_NUMPAGENDXB_SHIFT) & ID1_NUMPAGENDXB_MASK) + 
> 1);
>   size *= 2 << smmu->pgshift;
> - if (smmu->size != size)
> + if (smmu->size != size) {
>   dev_warn(smmu->dev,
>   "SMMU address space size (0x%lx) differs from mapped 
> region size (0x%lx)!\n",
>   size, smmu->size);
> + smmu->size = size;
> + }

I'm not really in favour of this, but I admit that this case is a bit weird.
Basically, we always have two ways to determine the size of the SMMU:

  1. The device-tree reg property, since we need the base address to be
 passed there and this will include a size.

  2. The ID register on the hardware itself

Generally, I prefer that properties passed by firmware override those
baked into the hardware, since this allows us to deal with broken ID
registers easily. In this case, we basically have the size override from
the reg property, so it takes precedence, but we warn if it differs from
the hardware value so hopefully broken DTs are easily diagnosed.

Will
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [PATCH 5/9] iommu: add qcom_iommu

2017-03-30 Thread Rob Clark
On Thu, Mar 30, 2017 at 2:19 AM, Archit Taneja  wrote:
> Hi,
>
> On 03/14/2017 08:48 PM, Rob Clark wrote:
>>
>> An iommu driver for Qualcomm "B" family devices which do not completely
>> implement the ARM SMMU spec.  These devices have context-bank register
>> layout that is similar to ARM SMMU, but no global register space (or at
>> least not one that is accessible).
>>
>> Signed-off-by: Rob Clark 
>> Signed-off-by: Stanimir Varbanov 
>> ---
>>  drivers/iommu/Kconfig |  10 +
>>  drivers/iommu/Makefile|   1 +
>>  drivers/iommu/arm-smmu-regs.h |   2 +
>>  drivers/iommu/qcom_iommu.c| 818
>> ++
>>  4 files changed, 831 insertions(+)
>>  create mode 100644 drivers/iommu/qcom_iommu.c
>
>
> 
>
>> +
>> +static int qcom_iommu_add_device(struct device *dev)
>> +{
>> +   struct qcom_iommu_dev *qcom_iommu = __to_iommu(dev->iommu_fwspec);
>
>
> __to_iommu() has a WARN_ON() that gets triggered here for all devices on
> the platform bus that aren't backed by our iommu. We should return -ENODEV
> for all of them without throwing a warning.
>
>> +   struct iommu_group *group;
>> +   struct device_link *link;
>> +
>
>
> We could do something like:
>
> if (fwspec && fwspec->ops == _iommu_ops)
> qcom_iommu = __to_iommu(fwspec);
> else
> qcom_iommu = NULL;

thanks.. I wonder how I wasn't hitting that?

I'll incorporate this (plus small dt bindings doc update) into next
version.. probably won't have time to send until the weekend or next
week

BR,
-R


> Thanks,
> Archit
>
>
>> +   if (!qcom_iommu)
>> +   return -ENODEV;
>> +
>> +   /*
>> +* Establish the link between iommu and master, so that the
>> +* iommu gets runtime enabled/disabled as per the master's
>> +* needs.
>> +*/
>> +   link = device_link_add(dev, qcom_iommu->dev, DL_FLAG_PM_RUNTIME);
>> +   if (!link) {
>> +   dev_err(qcom_iommu->dev, "Unable to create device link
>> between %s and %s\n",
>> +   dev_name(qcom_iommu->dev), dev_name(dev));
>> +   return -ENODEV;
>> +   }
>> +
>> +   group = iommu_group_get_for_dev(dev);
>> +   if (IS_ERR_OR_NULL(group))
>> +   return PTR_ERR_OR_ZERO(group);
>> +
>> +   iommu_group_put(group);
>> +   iommu_device_link(_iommu->iommu, dev);
>> +
>> +   return 0;
>> +}
>> +
>> +static void qcom_iommu_remove_device(struct device *dev)
>> +{
>> +   struct qcom_iommu_dev *qcom_iommu = __to_iommu(dev->iommu_fwspec);
>> +
>> +   if (!qcom_iommu)
>> +   return;
>> +
>> +   iommu_group_remove_device(dev);
>> +   iommu_device_unlink(_iommu->iommu, dev);
>> +   iommu_fwspec_free(dev);
>> +}
>> +
>> +static struct iommu_group *qcom_iommu_device_group(struct device *dev)
>> +{
>> +   struct iommu_fwspec *fwspec = dev->iommu_fwspec;
>> +   struct iommu_group *group = NULL;
>> +   unsigned i;
>> +
>> +   for (i = 0; i < fwspec->num_ids; i++) {
>> +   struct qcom_iommu_ctx *ctx = __to_ctx(fwspec,
>> fwspec->ids[i]);
>> +
>> +   if (group && ctx->group && group != ctx->group)
>> +   return ERR_PTR(-EINVAL);
>> +
>> +   group = ctx->group;
>> +   }
>> +
>> +   if (group)
>> +   return iommu_group_ref_get(group);
>> +
>> +   group = generic_device_group(dev);
>> +
>> +   for (i = 0; i < fwspec->num_ids; i++) {
>> +   struct qcom_iommu_ctx *ctx = __to_ctx(fwspec,
>> fwspec->ids[i]);
>> +   ctx->group = iommu_group_ref_get(group);
>> +   }
>> +
>> +   return group;
>> +}
>> +
>> +static int qcom_iommu_of_xlate(struct device *dev, struct of_phandle_args
>> *args)
>> +{
>> +   struct platform_device *iommu_pdev;
>> +
>> +   if (args->args_count != 1) {
>> +   dev_err(dev, "incorrect number of iommu params found for
>> %s "
>> +   "(found %d, expected 1)\n",
>> +   args->np->full_name, args->args_count);
>> +   return -EINVAL;
>> +   }
>> +
>> +   if (!dev->iommu_fwspec->iommu_priv) {
>> +   iommu_pdev = of_find_device_by_node(args->np);
>> +   if (WARN_ON(!iommu_pdev))
>> +   return -EINVAL;
>> +
>> +   dev->iommu_fwspec->iommu_priv =
>> platform_get_drvdata(iommu_pdev);
>> +   }
>> +
>> +   return iommu_fwspec_add_ids(dev, >args[0], 1);
>> +}
>> +
>> +static const struct iommu_ops qcom_iommu_ops = {
>> +   .capable= qcom_iommu_capable,
>> +   .domain_alloc   = qcom_iommu_domain_alloc,
>> +   .domain_free= qcom_iommu_domain_free,
>> +   .attach_dev = qcom_iommu_attach_dev,
>> +   .detach_dev = qcom_iommu_detach_dev,
>> +   .map= qcom_iommu_map,
>> +   .unmap  = qcom_iommu_unmap,
>> + 

Re: [PATCH] arm64/dma-mapping: fix DMA_ATTR_FORCE_CONTIGUOUS mmaping code

2017-03-30 Thread Geert Uytterhoeven
Hi Robin,

On Thu, Mar 30, 2017 at 1:46 PM, Robin Murphy  wrote:
> On 30/03/17 12:16, Andrzej Hajda wrote:
> [...]
>> I guess Geert's proposition to create pages permanently is also not
>> acceptable[2]. So how to fix crashes which appeared after patch adding
> If I'm not mistaken, creating the pages permanently is what the
> !DMA_ATTR_FORCE_CONTIGUOUS does? The only difference is where
> the underlying memory is allocated from.
>
> Am I missing something?
 Quoting Robin from his response:
> in general is is not
> safe to assume a coherent allocation is backed by struct pages at all
 As I said before I am not familiar with the subject, so it is possible I
 misunderstood something.
>>> That's from the perspective of external callers of
>>> dma_alloc_coherent()/dma_alloc_attrs(), wherein
>>> dma_alloc_from_coherent() may have returned device-specific memory
>>> without calling into the arch dma_map_ops implementation. Internal to
>>> the arm64 implementation, however, everything *we* allocate comes from
>>> either CMA or the normal page allocator, so should always be backed by
>>> real kernel pages.
>>>
>>> Robin.
>>
>> OK, so what do you exactly mean by "The general point still stands", my
>> patch fixes handling of allocations made internally?
>
> That since FORCE_CONTIGUOUS allocations always come from CMA, you can
> let the existing CMA-based implementations handle them just by fixing up
> dma_addr appropriately.
>
>> Anyway as I understand approach "creating the pages permanently" in
>> __iommu_alloc_attrs is OK. Is it worth to create patch adding it? It
>> should solve the issue as well and also looks saner (for me).
>
> Sure, you *could* - there's nothing technically wrong with that other
> than violating a strict interpretation of the iommu-dma API
> documentation if you pass it into iommu_dma_mmap() - it's just that the
> only point of using the pages array here in the first place is to cover
> the general case where an allocation might not be physically contiguous.
> If you have a guarantee that a given allocation definitely *is* both
> physically and virtually contiguous, then that's a bunch of extra work
> you simply have no need to do.

The same can be said for dma_common_contiguous_remap() below...

> If you do want to go down that route, then I would much rather we fix
> dma_common_contiguous_remap() to leave a valid array in area->pages in
> the first place, than be temporarily faking them up around individual calls.

The only point of using the pages array here in the first place is to cover
the general case in dma_common_pages_remap().

Providing a contiguous variant of map_vm_area() could resolve that.

Gr{oetje,eeting}s,

Geert

--
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- ge...@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
-- Linus Torvalds
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [PATCH] arm64/dma-mapping: fix DMA_ATTR_FORCE_CONTIGUOUS mmaping code

2017-03-30 Thread Robin Murphy
On 30/03/17 12:16, Andrzej Hajda wrote:
[...]
> I guess Geert's proposition to create pages permanently is also not
> acceptable[2]. So how to fix crashes which appeared after patch adding
 If I'm not mistaken, creating the pages permanently is what the
 !DMA_ATTR_FORCE_CONTIGUOUS does? The only difference is where
 the underlying memory is allocated from.

 Am I missing something?
>>> Quoting Robin from his response:
 in general is is not
 safe to assume a coherent allocation is backed by struct pages at all
>>> As I said before I am not familiar with the subject, so it is possible I
>>> misunderstood something.
>> That's from the perspective of external callers of
>> dma_alloc_coherent()/dma_alloc_attrs(), wherein
>> dma_alloc_from_coherent() may have returned device-specific memory
>> without calling into the arch dma_map_ops implementation. Internal to
>> the arm64 implementation, however, everything *we* allocate comes from
>> either CMA or the normal page allocator, so should always be backed by
>> real kernel pages.
>>
>> Robin.
> 
> 
> OK, so what do you exactly mean by "The general point still stands", my
> patch fixes handling of allocations made internally?

That since FORCE_CONTIGUOUS allocations always come from CMA, you can
let the existing CMA-based implementations handle them just by fixing up
dma_addr appropriately.

> Anyway as I understand approach "creating the pages permanently" in
> __iommu_alloc_attrs is OK. Is it worth to create patch adding it? It
> should solve the issue as well and also looks saner (for me).

Sure, you *could* - there's nothing technically wrong with that other
than violating a strict interpretation of the iommu-dma API
documentation if you pass it into iommu_dma_mmap() - it's just that the
only point of using the pages array here in the first place is to cover
the general case where an allocation might not be physically contiguous.
If you have a guarantee that a given allocation definitely *is* both
physically and virtually contiguous, then that's a bunch of extra work
you simply have no need to do.

If you do want to go down that route, then I would much rather we fix
dma_common_contiguous_remap() to leave a valid array in area->pages in
the first place, than be temporarily faking them up around individual calls.

Robin.
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [PATCH] arm64/dma-mapping: fix DMA_ATTR_FORCE_CONTIGUOUS mmaping code

2017-03-30 Thread Andrzej Hajda
Hi Robin,

On 30.03.2017 12:44, Robin Murphy wrote:
> On 30/03/17 09:30, Andrzej Hajda wrote:
>> On 30.03.2017 09:40, Geert Uytterhoeven wrote:
>>> Hi Andrzej,
>>>
>>> On Thu, Mar 30, 2017 at 8:51 AM, Andrzej Hajda  wrote:
 On 29.03.2017 17:33, Robin Murphy wrote:
> On 29/03/17 16:12, Andrzej Hajda wrote:
>> On 29.03.2017 14:55, Robin Murphy wrote:
>>> On 29/03/17 11:05, Andrzej Hajda wrote:
 In case of DMA_ATTR_FORCE_CONTIGUOUS allocations vm_area->pages
 is invalid. __iommu_mmap_attrs and __iommu_get_sgtable cannot use
 it. In first case temporary pages array is passed to iommu_dma_mmap,
 in 2nd case single entry sg table is created directly instead
 of calling helper.

 Fixes: 44176bb ("arm64: Add support for DMA_ATTR_FORCE_CONTIGUOUS to 
 IOMMU")
 Signed-off-by: Andrzej Hajda 
 ---
 Hi,

 I am not familiar with this framework so please don't be too cruel ;)
 Alternative solution I see is to always create vm_area->pages,
 I do not know which approach is preferred.

 Regards
 Andrzej
 ---
  arch/arm64/mm/dma-mapping.c | 40 
 ++--
  1 file changed, 38 insertions(+), 2 deletions(-)

 diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
 index f7b5401..bba2bc8 100644
 --- a/arch/arm64/mm/dma-mapping.c
 +++ b/arch/arm64/mm/dma-mapping.c
 @@ -704,7 +704,30 @@ static int __iommu_mmap_attrs(struct device *dev, 
 struct vm_area_struct *vma,
return ret;

area = find_vm_area(cpu_addr);
 -  if (WARN_ON(!area || !area->pages))
 +  if (WARN_ON(!area))
>>> >From the look of things, it doesn't seem strictly necessary to change
>>> this, but whether that's a good thing is another matter. I'm not sure
>>> that dma_common_contiguous_remap() should really be leaving a dangling
>>> pointer in area->pages as it apparently does... :/
>>>
 +  return -ENXIO;
 +
 +  if (attrs & DMA_ATTR_FORCE_CONTIGUOUS) {
 +  struct page *page = vmalloc_to_page(cpu_addr);
 +  unsigned int count = size >> PAGE_SHIFT;
 +  struct page **pages;
 +  unsigned long pfn;
 +  int i;
 +
 +  pages = kmalloc_array(count, sizeof(*pages), GFP_KERNEL);
 +  if (!pages)
 +  return -ENOMEM;
 +
 +  for (i = 0, pfn = page_to_pfn(page); i < count; i++)
 +  pages[i] = pfn_to_page(pfn + i);
 +
 +  ret = iommu_dma_mmap(pages, size, vma);
>>> /**
>>>  * iommu_dma_mmap - Map a buffer into provided user VMA
>>>  * @pages: Array representing buffer from iommu_dma_alloc()
>>>...
>>>
>>> In this case, the buffer has not come from iommu_dma_alloc(), so passing
>>> into iommu_dma_mmap() is wrong by contract, even if having to fake up a
>>> page array wasn't enough of a red flag. Given that a FORCE_CONTIGUOUS
>>> allocation is essentially the same as for the non-IOMMU case, I think it
>>> should be sufficient to defer to that path, i.e.:
>>>
>>> if (attrs & DMA_ATTR_FORCE_CONTIGUOUS)
>>> return __swiotlb_mmap(dev, vma, cpu_addr,
>>> phys_to_dma(virt_to_phys(cpu_addr)),
>>> size, attrs);
>> Maybe I have make mistake somewhere but it does not work, here and below
>> (hangs or crashes). I suppose it can be due to different address
>> translations, my patch uses
>> page = vmalloc_to_page(cpu_addr).
>> And here we have:
>> handle = phys_to_dma(dev, virt_to_phys(cpu_addr)); // in
>> __iommu_mmap_attrs
>> page = phys_to_page(dma_to_phys(dev, handle)); // in
>> __swiotlb_get_sgtable
>> I guess it is similarly in __swiotlb_mmap.
>>
>> Are these translations equivalent?
> Ah, my mistake, sorry - I managed to forget that cpu_addr is always
> remapped for FORCE_CONTIGUOUS (*and* somehow overlook the use of
> vmalloc_to_page() in the patch itself), so the virt_to_phys() part of my
> example is indeed bogus. The general point still stands, though.
 I guess Geert's proposition to create pages permanently is also not
 acceptable[2]. So how to fix crashes which appeared after patch adding
>>> If I'm not mistaken, creating the pages permanently is what the
>>> !DMA_ATTR_FORCE_CONTIGUOUS does? The only difference is where
>>> the underlying memory is allocated from.
>>>
>>> Am I missing something?
>> Quoting Robin from his response:
>>> in general is is not
>>> safe to assume a coherent allocation is backed by struct pages 

Re: [PATCH] arm64/dma-mapping: fix DMA_ATTR_FORCE_CONTIGUOUS mmaping code

2017-03-30 Thread Robin Murphy
On 30/03/17 09:30, Andrzej Hajda wrote:
> On 30.03.2017 09:40, Geert Uytterhoeven wrote:
>> Hi Andrzej,
>>
>> On Thu, Mar 30, 2017 at 8:51 AM, Andrzej Hajda  wrote:
>>> On 29.03.2017 17:33, Robin Murphy wrote:
 On 29/03/17 16:12, Andrzej Hajda wrote:
> On 29.03.2017 14:55, Robin Murphy wrote:
>> On 29/03/17 11:05, Andrzej Hajda wrote:
>>> In case of DMA_ATTR_FORCE_CONTIGUOUS allocations vm_area->pages
>>> is invalid. __iommu_mmap_attrs and __iommu_get_sgtable cannot use
>>> it. In first case temporary pages array is passed to iommu_dma_mmap,
>>> in 2nd case single entry sg table is created directly instead
>>> of calling helper.
>>>
>>> Fixes: 44176bb ("arm64: Add support for DMA_ATTR_FORCE_CONTIGUOUS to 
>>> IOMMU")
>>> Signed-off-by: Andrzej Hajda 
>>> ---
>>> Hi,
>>>
>>> I am not familiar with this framework so please don't be too cruel ;)
>>> Alternative solution I see is to always create vm_area->pages,
>>> I do not know which approach is preferred.
>>>
>>> Regards
>>> Andrzej
>>> ---
>>>  arch/arm64/mm/dma-mapping.c | 40 
>>> ++--
>>>  1 file changed, 38 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
>>> index f7b5401..bba2bc8 100644
>>> --- a/arch/arm64/mm/dma-mapping.c
>>> +++ b/arch/arm64/mm/dma-mapping.c
>>> @@ -704,7 +704,30 @@ static int __iommu_mmap_attrs(struct device *dev, 
>>> struct vm_area_struct *vma,
>>>return ret;
>>>
>>>area = find_vm_area(cpu_addr);
>>> -  if (WARN_ON(!area || !area->pages))
>>> +  if (WARN_ON(!area))
>> >From the look of things, it doesn't seem strictly necessary to change
>> this, but whether that's a good thing is another matter. I'm not sure
>> that dma_common_contiguous_remap() should really be leaving a dangling
>> pointer in area->pages as it apparently does... :/
>>
>>> +  return -ENXIO;
>>> +
>>> +  if (attrs & DMA_ATTR_FORCE_CONTIGUOUS) {
>>> +  struct page *page = vmalloc_to_page(cpu_addr);
>>> +  unsigned int count = size >> PAGE_SHIFT;
>>> +  struct page **pages;
>>> +  unsigned long pfn;
>>> +  int i;
>>> +
>>> +  pages = kmalloc_array(count, sizeof(*pages), GFP_KERNEL);
>>> +  if (!pages)
>>> +  return -ENOMEM;
>>> +
>>> +  for (i = 0, pfn = page_to_pfn(page); i < count; i++)
>>> +  pages[i] = pfn_to_page(pfn + i);
>>> +
>>> +  ret = iommu_dma_mmap(pages, size, vma);
>> /**
>>  * iommu_dma_mmap - Map a buffer into provided user VMA
>>  * @pages: Array representing buffer from iommu_dma_alloc()
>>...
>>
>> In this case, the buffer has not come from iommu_dma_alloc(), so passing
>> into iommu_dma_mmap() is wrong by contract, even if having to fake up a
>> page array wasn't enough of a red flag. Given that a FORCE_CONTIGUOUS
>> allocation is essentially the same as for the non-IOMMU case, I think it
>> should be sufficient to defer to that path, i.e.:
>>
>> if (attrs & DMA_ATTR_FORCE_CONTIGUOUS)
>> return __swiotlb_mmap(dev, vma, cpu_addr,
>> phys_to_dma(virt_to_phys(cpu_addr)),
>> size, attrs);
> Maybe I have make mistake somewhere but it does not work, here and below
> (hangs or crashes). I suppose it can be due to different address
> translations, my patch uses
> page = vmalloc_to_page(cpu_addr).
> And here we have:
> handle = phys_to_dma(dev, virt_to_phys(cpu_addr)); // in
> __iommu_mmap_attrs
> page = phys_to_page(dma_to_phys(dev, handle)); // in
> __swiotlb_get_sgtable
> I guess it is similarly in __swiotlb_mmap.
>
> Are these translations equivalent?
 Ah, my mistake, sorry - I managed to forget that cpu_addr is always
 remapped for FORCE_CONTIGUOUS (*and* somehow overlook the use of
 vmalloc_to_page() in the patch itself), so the virt_to_phys() part of my
 example is indeed bogus. The general point still stands, though.
>>> I guess Geert's proposition to create pages permanently is also not
>>> acceptable[2]. So how to fix crashes which appeared after patch adding
>> If I'm not mistaken, creating the pages permanently is what the
>> !DMA_ATTR_FORCE_CONTIGUOUS does? The only difference is where
>> the underlying memory is allocated from.
>>
>> Am I missing something?
> 
> Quoting Robin from his response:
>> in general is is not
>> safe to assume a coherent allocation is backed by struct pages at all
> 
> As I said before I am not familiar with the subject, so it is possible I
> misunderstood something.

That's from the perspective of external 

Re: [RFC PATCH 1/3] of/pci: dma-ranges to account highest possible host bridge dma_mask

2017-03-30 Thread Oza Oza via iommu
On Tue, Mar 28, 2017 at 7:43 PM, Rob Herring  wrote:
> On Tue, Mar 28, 2017 at 12:27 AM, Oza Oza  wrote:
>> On Mon, Mar 27, 2017 at 8:16 PM, Rob Herring  wrote:
>>> On Sat, Mar 25, 2017 at 12:31 AM, Oza Pawandeep  
>>> wrote:
 it is possible that PCI device supports 64-bit DMA addressing,
 and thus it's driver sets device's dma_mask to DMA_BIT_MASK(64),
 however PCI host bridge may have limitations on the inbound
 transaction addressing. As an example, consider NVME SSD device
 connected to iproc-PCIe controller.

 Currently, the IOMMU DMA ops only considers PCI device dma_mask
 when allocating an IOVA. This is particularly problematic on
 ARM/ARM64 SOCs where the IOMMU (i.e. SMMU) translates IOVA to
 PA for in-bound transactions only after PCI Host has forwarded
 these transactions on SOC IO bus. This means on such ARM/ARM64
 SOCs the IOVA of in-bound transactions has to honor the addressing
 restrictions of the PCI Host.

 current pcie frmework and of framework integration assumes dma-ranges
 in a way where memory-mapped devices define their dma-ranges.
 dma-ranges: (child-bus-address, parent-bus-address, length).

 but iproc based SOCs and even Rcar based SOCs has PCI world dma-ranges.
 dma-ranges = <0x4300 0x00 0x00 0x00 0x00 0x80 0x00>;
>>>
>>> If you implement a common function, then I expect to see other users
>>> converted to use it. There's also PCI hosts in arch/powerpc that parse
>>> dma-ranges.
>>
>> the common function should be similar to what
>> of_pci_get_host_bridge_resources is doing right now.
>> it parses ranges property right now.
>>
>> the new function would look look following.
>>
>> of_pci_get_dma_ranges(struct device_node *dev, struct list_head *resources)
>> where resources would return the dma-ranges.
>>
>> but right now if you see the patch, of_dma_configure calls the new
>> function, which actually returns the largest possible size.
>> so this new function has to be generic in a way where other PCI hosts
>> can use it. but certainly iproc(Broadcom SOC) , rcar based SOCs can
>> use it for sure.
>>
>> although having powerpc using it;  is a separate exercise, since I do
>> not have any access to other PCI hosts such as powerpc. but we can
>> workout with them on thsi forum if required.
>
> You don't need h/w. You can analyze what parts are common, write
> patches to convert to common implementation, and build test. The PPC
> and rcar folks can test on h/w.
>
> Rob


Hi Rob,

I have addressed your comment and made generic function.
Gentle request to have a look at following approach and patch.

[RFC PATCH 2/2] of/pci: call pci specific dma-ranges instead of memory-mapped.
[RFC PATCH 1/2] of/pci: implement inbound dma-ranges for PCI

I have tested this on our platform, with and without iommu, and seems to work.

let me know your view on this.

Regards,
Oza.
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [RFC PATCH] of: Fix DMA configuration for non-DT masters

2017-03-30 Thread Oza Oza via iommu
On Thu, Mar 30, 2017 at 8:51 AM, Oza Oza  wrote:
> On Wed, Mar 29, 2017 at 11:12 PM, Robin Murphy  wrote:
>> On 29/03/17 06:46, Oza Oza wrote:
>>> On Wed, Mar 29, 2017 at 10:23 AM, Oza Oza  wrote:
 On Wed, Mar 29, 2017 at 12:27 AM, Robin Murphy  
 wrote:
> For PCI masters not represented in DT, we pass the OF node of their
> associated host bridge to of_dma_configure(), such that they can inherit
> the appropriate DMA configuration from whatever is described there.
> Unfortunately, whilst this has worked for the "dma-coherent" property,
> it turns out to miss the case where the host bridge node has a non-empty
> "dma-ranges", since nobody is expecting the 'device' to be a bus itself.
>
> It transpires, though, that the de-facto interface since the prototype
> change in 1f5c69aa51f9 ("of: Move of_dma_configure() to device.c to help
> re-use") is very clear-cut: either the master_np argument is redundant
> with dev->of_node, or dev->of_node is NULL and master_np is the relevant
> parent bus. Let's ratify that behaviour, then teach the whole
> of_dma_configure() pipeline to cope with both cases properly.
>
> Signed-off-by: Robin Murphy 
>>
>> [...]
>>

 pci and memory mapped device world is different.
>>
>> ???
>>
>> No they aren't. There is nothing magic about PCI. PCI *is* a
>> memory-mapped bus.
>>
>> The only PCI-specific aspect here is the Linux code path which passes a
>> host controller node into of_dma_configure() where the latter expects a
>> node for the actual endpoint device. It managed to work for
>> "dma-coherent", because that may appear either directly on a device node
>> or on any of its parent buses, but "dma-ranges" is *only* valid for DT
>> nodes representing buses, thus reveals that using a parent bus to stand
>> in for a device isn't actually correct. I only posted that horrible hack
>> patch to prove the point that having a child node to represent the
>> actual device is indeed sufficient to make of_dma_configure() work
>> correctly for PCI masters as-is (modulo the other issues).
>>
>>> of course if you talk
 from iommu perspective, all the master are same for IOMMU
>>
>> I don't understand what you mean by that. There's nothing about IOMMUs
>> here, it's purely about parsing DT properties correctly.
>>
 but the original intention of the patch is to try to solve 2 problems.
 please refer to https://lkml.org/lkml/2017/3/29/10
>>
>> One patch should not solve two separate problems anyway. Taking a step
>> back, there are at least 3 things that this discussion has brought up:
>>
>> 1) The way in which we call of_dma_configure() for PCI devices causes
>> the "dma-ranges" property on PCI host controllers to be ignored.
>>
>> 2) of_dma_get_range() only considers the first entry in any "dma-ranges"
>> property.
>>
>> 3) When of_dma_configure() does set a DMA mask, there is nothing on
>> arm64 and other architectures to prevent drivers overriding that with a
>> wider mask later.
>>
>> Those are 3 separate problems, to solve with 3 or more separate patches,
>> and I have deliberately put the second and third to one side for the
>> moment. This patch fixes problem 1.
>>
 1) expose generic API for pci world clients to configure their
 dma-ranges. right now there is none.
 2) same API can be used by IOMMU to derive dma_mask.

 while the current patch posted to handle dma-ranges for both memory
 mapped and pci devices, which I think is overdoing.
>>
>> No, of_dma_configure() handles the "dma-ranges" property as it is
>> defined in the DT spec to describe the mapping between a child bus
>> address space and a parent bus address space. Whether those
>> memory-mapped parent and child buses are PCI, ISA, AMBA, HyperTransport,
>> or anything else is irrelevant other than for the encoding of the
>> address specifiers. All this patch does is sort out the cases where we
>> have a real device node to start at, from the cases where we don't and
>> so start directly at the device's parent bus node instead.
>>
 besides we have different configuration of dma-ranges based on iommu
 is enabled or not enabled.
>>
>> That doesn't sound right, unless you mean the firmware actually programs
>> the host controller's AXI bridge differently for system configurations
>> where the IOMMU is expected to be used or not? (and even then, I don't
>> really see why it would be necessary to do that...)
>>
  #define PCIE_DMA_RANGES dma-ranges = <0x4300 0x00 0x8000 0x00
 0x8000 0x00 0x8000 \
   0x4300 0x08 0x 0x08
 0x 0x08 0x \
   0x4300 0x80 0x 0x80
 0x 0x40 0x>;
 Not sure if this patch will consider above dma-ranges.

 my 

[RFC PATCH 1/2] of/pci: implement inbound dma-ranges for PCI

2017-03-30 Thread Oza Pawandeep via iommu
current device frmework and of framework integration assumes dma-ranges
in a way where memory-mapped devices define their dma-ranges.
dma-ranges: (child-bus-address, parent-bus-address, length).

but iproc based SOCs and other SOCs(rcar) have PCI world dma-ranges.
dma-ranges = <0x4300 0x00 0x00 0x00 0x00 0x80 0x00>;

of_dma_configure is specifically witten to take care of memory mapped devices.
but no implementation exists for pci to take care of pcie based memory ranges.
in fact pci world doesnt seem to define standard dma-ranges

this exposes intrface not only to the pci host driver, but also
it aims to provide an interface to callers such as of_dma_configure.
so then the returned size get best possible (largest) dma_mask.
for e.g.
dma-ranges = <0x4300 0x00 0x00 0x00 0x00 0x80 0x00>;
we should get dev->coherent_dma_mask=0x7f.

also this patch intends to handle multiple inbound windows and dma-ranges.
it is left to the caller, how it wants to use them.

Signed-off-by: Oza Pawandeep 

diff --git a/drivers/of/of_pci.c b/drivers/of/of_pci.c
index 0ee42c3..5299438 100644
--- a/drivers/of/of_pci.c
+++ b/drivers/of/of_pci.c
@@ -283,6 +283,80 @@ int of_pci_get_host_bridge_resources(struct device_node 
*dev,
return err;
 }
 EXPORT_SYMBOL_GPL(of_pci_get_host_bridge_resources);
+
+/**
+ * of_pci_get_dma_ranges - Parse PCI host bridge inbound resources from DT
+ * @np: device node of the host bridge having the dma-ranges property
+ * @resources: list where the range of resources will be added after DT parsing
+ *
+ * It is the caller's job to free the @resources list.
+ *
+ * This function will parse the "dma-ranges" property of a PCI host bridge 
device
+ * node and setup the resource mapping based on its content.
+ *
+ * It returns zero if the range parsing has been successful or a standard error
+ * value if it failed.
+ */
+
+int of_pci_get_dma_ranges(struct device_node *np, struct list_head *resources)
+{
+   struct device_node *node = of_node_get(np);
+   int rlen;
+   int ret = 0;
+   const int na = 3, ns = 2;
+   struct resource *res;
+   struct of_pci_range_parser parser;
+   struct of_pci_range range;
+
+   if (!node)
+   return -EINVAL;
+
+   parser.node = node;
+   parser.pna = of_n_addr_cells(node);
+   parser.np = parser.pna + na + ns;
+
+   parser.range = of_get_property(node, "dma-ranges", );
+
+   if (!parser.range) {
+   pr_debug("pcie device has no dma-ranges defined for 
node(%s)\n", np->full_name);
+   ret = -ENODEV;
+   goto out;
+   }
+
+   parser.end = parser.range + rlen / sizeof(__be32);
+
+   for_each_of_pci_range(, ) {
+   /*
+* If we failed translation or got a zero-sized region
+* then skip this range
+*/
+   if (range.cpu_addr == OF_BAD_ADDR || range.size == 0)
+   continue;
+
+   res = kzalloc(sizeof(struct resource), GFP_KERNEL);
+   if (!res) {
+   ret = -ENOMEM;
+   goto parse_failed;
+   }
+
+   ret = of_pci_range_to_resource(, np, res);
+   if (ret) {
+   kfree(res);
+   continue;
+   }
+
+   pci_add_resource_offset(resources, res, res->start - 
range.pci_addr);
+}
+
+   return ret;
+
+parse_failed:
+   pci_free_resource_list(resources);
+out:
+   of_node_put(node);
+   return ret;
+}
+EXPORT_SYMBOL_GPL(of_pci_get_dma_ranges);
 #endif /* CONFIG_OF_ADDRESS */
 
 #ifdef CONFIG_PCI_MSI
diff --git a/include/linux/of_pci.h b/include/linux/of_pci.h
index 0e0974e..8509e3d 100644
--- a/include/linux/of_pci.h
+++ b/include/linux/of_pci.h
@@ -76,6 +76,7 @@ static inline void of_pci_check_probe_only(void) { }
 int of_pci_get_host_bridge_resources(struct device_node *dev,
unsigned char busno, unsigned char bus_max,
struct list_head *resources, resource_size_t *io_base);
+int of_pci_get_dma_ranges(struct device_node *np, struct list_head *resources);
 #else
 static inline int of_pci_get_host_bridge_resources(struct device_node *dev,
unsigned char busno, unsigned char bus_max,
@@ -83,6 +84,11 @@ static inline int of_pci_get_host_bridge_resources(struct 
device_node *dev,
 {
return -EINVAL;
 }
+
+static inline int of_pci_get_dma_ranges(struct device_node *np, struct 
list_head *resources)
+{
+   return -EINVAL;
+}
 #endif
 
 #if defined(CONFIG_OF) && defined(CONFIG_PCI_MSI)
-- 
1.9.1

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [PATCH] arm64/dma-mapping: fix DMA_ATTR_FORCE_CONTIGUOUS mmaping code

2017-03-30 Thread Andrzej Hajda
On 30.03.2017 09:40, Geert Uytterhoeven wrote:
> Hi Andrzej,
>
> On Thu, Mar 30, 2017 at 8:51 AM, Andrzej Hajda  wrote:
>> On 29.03.2017 17:33, Robin Murphy wrote:
>>> On 29/03/17 16:12, Andrzej Hajda wrote:
 On 29.03.2017 14:55, Robin Murphy wrote:
> On 29/03/17 11:05, Andrzej Hajda wrote:
>> In case of DMA_ATTR_FORCE_CONTIGUOUS allocations vm_area->pages
>> is invalid. __iommu_mmap_attrs and __iommu_get_sgtable cannot use
>> it. In first case temporary pages array is passed to iommu_dma_mmap,
>> in 2nd case single entry sg table is created directly instead
>> of calling helper.
>>
>> Fixes: 44176bb ("arm64: Add support for DMA_ATTR_FORCE_CONTIGUOUS to 
>> IOMMU")
>> Signed-off-by: Andrzej Hajda 
>> ---
>> Hi,
>>
>> I am not familiar with this framework so please don't be too cruel ;)
>> Alternative solution I see is to always create vm_area->pages,
>> I do not know which approach is preferred.
>>
>> Regards
>> Andrzej
>> ---
>>  arch/arm64/mm/dma-mapping.c | 40 
>> ++--
>>  1 file changed, 38 insertions(+), 2 deletions(-)
>>
>> diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
>> index f7b5401..bba2bc8 100644
>> --- a/arch/arm64/mm/dma-mapping.c
>> +++ b/arch/arm64/mm/dma-mapping.c
>> @@ -704,7 +704,30 @@ static int __iommu_mmap_attrs(struct device *dev, 
>> struct vm_area_struct *vma,
>>return ret;
>>
>>area = find_vm_area(cpu_addr);
>> -  if (WARN_ON(!area || !area->pages))
>> +  if (WARN_ON(!area))
> >From the look of things, it doesn't seem strictly necessary to change
> this, but whether that's a good thing is another matter. I'm not sure
> that dma_common_contiguous_remap() should really be leaving a dangling
> pointer in area->pages as it apparently does... :/
>
>> +  return -ENXIO;
>> +
>> +  if (attrs & DMA_ATTR_FORCE_CONTIGUOUS) {
>> +  struct page *page = vmalloc_to_page(cpu_addr);
>> +  unsigned int count = size >> PAGE_SHIFT;
>> +  struct page **pages;
>> +  unsigned long pfn;
>> +  int i;
>> +
>> +  pages = kmalloc_array(count, sizeof(*pages), GFP_KERNEL);
>> +  if (!pages)
>> +  return -ENOMEM;
>> +
>> +  for (i = 0, pfn = page_to_pfn(page); i < count; i++)
>> +  pages[i] = pfn_to_page(pfn + i);
>> +
>> +  ret = iommu_dma_mmap(pages, size, vma);
> /**
>  * iommu_dma_mmap - Map a buffer into provided user VMA
>  * @pages: Array representing buffer from iommu_dma_alloc()
>...
>
> In this case, the buffer has not come from iommu_dma_alloc(), so passing
> into iommu_dma_mmap() is wrong by contract, even if having to fake up a
> page array wasn't enough of a red flag. Given that a FORCE_CONTIGUOUS
> allocation is essentially the same as for the non-IOMMU case, I think it
> should be sufficient to defer to that path, i.e.:
>
> if (attrs & DMA_ATTR_FORCE_CONTIGUOUS)
> return __swiotlb_mmap(dev, vma, cpu_addr,
> phys_to_dma(virt_to_phys(cpu_addr)),
> size, attrs);
 Maybe I have make mistake somewhere but it does not work, here and below
 (hangs or crashes). I suppose it can be due to different address
 translations, my patch uses
 page = vmalloc_to_page(cpu_addr).
 And here we have:
 handle = phys_to_dma(dev, virt_to_phys(cpu_addr)); // in
 __iommu_mmap_attrs
 page = phys_to_page(dma_to_phys(dev, handle)); // in
 __swiotlb_get_sgtable
 I guess it is similarly in __swiotlb_mmap.

 Are these translations equivalent?
>>> Ah, my mistake, sorry - I managed to forget that cpu_addr is always
>>> remapped for FORCE_CONTIGUOUS (*and* somehow overlook the use of
>>> vmalloc_to_page() in the patch itself), so the virt_to_phys() part of my
>>> example is indeed bogus. The general point still stands, though.
>> I guess Geert's proposition to create pages permanently is also not
>> acceptable[2]. So how to fix crashes which appeared after patch adding
> If I'm not mistaken, creating the pages permanently is what the
> !DMA_ATTR_FORCE_CONTIGUOUS does? The only difference is where
> the underlying memory is allocated from.
>
> Am I missing something?

Quoting Robin from his response:
> in general is is not
> safe to assume a coherent allocation is backed by struct pages at all

As I said before I am not familiar with the subject, so it is possible I
misunderstood something.

Regards
Andrzej


>
> Thanks!
>
>> support to DMA_ATTR_FORCE_CONTIGUOUS in IOMMU on arm64 [1]?
>> Maybe temporary solution is to drop the patch until proper handling of
>> mmapping is proposed, 

Re: [PATCH] arm64/dma-mapping: fix DMA_ATTR_FORCE_CONTIGUOUS mmaping code

2017-03-30 Thread Geert Uytterhoeven
Hi Andrzej,

On Thu, Mar 30, 2017 at 8:51 AM, Andrzej Hajda  wrote:
> On 29.03.2017 17:33, Robin Murphy wrote:
>> On 29/03/17 16:12, Andrzej Hajda wrote:
>>> On 29.03.2017 14:55, Robin Murphy wrote:
 On 29/03/17 11:05, Andrzej Hajda wrote:
> In case of DMA_ATTR_FORCE_CONTIGUOUS allocations vm_area->pages
> is invalid. __iommu_mmap_attrs and __iommu_get_sgtable cannot use
> it. In first case temporary pages array is passed to iommu_dma_mmap,
> in 2nd case single entry sg table is created directly instead
> of calling helper.
>
> Fixes: 44176bb ("arm64: Add support for DMA_ATTR_FORCE_CONTIGUOUS to 
> IOMMU")
> Signed-off-by: Andrzej Hajda 
> ---
> Hi,
>
> I am not familiar with this framework so please don't be too cruel ;)
> Alternative solution I see is to always create vm_area->pages,
> I do not know which approach is preferred.
>
> Regards
> Andrzej
> ---
>  arch/arm64/mm/dma-mapping.c | 40 ++--
>  1 file changed, 38 insertions(+), 2 deletions(-)
>
> diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
> index f7b5401..bba2bc8 100644
> --- a/arch/arm64/mm/dma-mapping.c
> +++ b/arch/arm64/mm/dma-mapping.c
> @@ -704,7 +704,30 @@ static int __iommu_mmap_attrs(struct device *dev, 
> struct vm_area_struct *vma,
>return ret;
>
>area = find_vm_area(cpu_addr);
> -  if (WARN_ON(!area || !area->pages))
> +  if (WARN_ON(!area))
 >From the look of things, it doesn't seem strictly necessary to change
 this, but whether that's a good thing is another matter. I'm not sure
 that dma_common_contiguous_remap() should really be leaving a dangling
 pointer in area->pages as it apparently does... :/

> +  return -ENXIO;
> +
> +  if (attrs & DMA_ATTR_FORCE_CONTIGUOUS) {
> +  struct page *page = vmalloc_to_page(cpu_addr);
> +  unsigned int count = size >> PAGE_SHIFT;
> +  struct page **pages;
> +  unsigned long pfn;
> +  int i;
> +
> +  pages = kmalloc_array(count, sizeof(*pages), GFP_KERNEL);
> +  if (!pages)
> +  return -ENOMEM;
> +
> +  for (i = 0, pfn = page_to_pfn(page); i < count; i++)
> +  pages[i] = pfn_to_page(pfn + i);
> +
> +  ret = iommu_dma_mmap(pages, size, vma);
 /**
  * iommu_dma_mmap - Map a buffer into provided user VMA
  * @pages: Array representing buffer from iommu_dma_alloc()
...

 In this case, the buffer has not come from iommu_dma_alloc(), so passing
 into iommu_dma_mmap() is wrong by contract, even if having to fake up a
 page array wasn't enough of a red flag. Given that a FORCE_CONTIGUOUS
 allocation is essentially the same as for the non-IOMMU case, I think it
 should be sufficient to defer to that path, i.e.:

 if (attrs & DMA_ATTR_FORCE_CONTIGUOUS)
 return __swiotlb_mmap(dev, vma, cpu_addr,
 phys_to_dma(virt_to_phys(cpu_addr)),
 size, attrs);
>>> Maybe I have make mistake somewhere but it does not work, here and below
>>> (hangs or crashes). I suppose it can be due to different address
>>> translations, my patch uses
>>> page = vmalloc_to_page(cpu_addr).
>>> And here we have:
>>> handle = phys_to_dma(dev, virt_to_phys(cpu_addr)); // in
>>> __iommu_mmap_attrs
>>> page = phys_to_page(dma_to_phys(dev, handle)); // in
>>> __swiotlb_get_sgtable
>>> I guess it is similarly in __swiotlb_mmap.
>>>
>>> Are these translations equivalent?
>> Ah, my mistake, sorry - I managed to forget that cpu_addr is always
>> remapped for FORCE_CONTIGUOUS (*and* somehow overlook the use of
>> vmalloc_to_page() in the patch itself), so the virt_to_phys() part of my
>> example is indeed bogus. The general point still stands, though.
>
> I guess Geert's proposition to create pages permanently is also not
> acceptable[2]. So how to fix crashes which appeared after patch adding

If I'm not mistaken, creating the pages permanently is what the
!DMA_ATTR_FORCE_CONTIGUOUS does? The only difference is where
the underlying memory is allocated from.

Am I missing something?

Thanks!

> support to DMA_ATTR_FORCE_CONTIGUOUS in IOMMU on arm64 [1]?
> Maybe temporary solution is to drop the patch until proper handling of
> mmapping is proposed, without it the patch looks incomplete and causes
> regression.
> Moreover __iommu_alloc_attrs uses also dma_common_contiguous_remap which
> also assumes existence of struct pages.
>
> [1]: https://patchwork.kernel.org/patch/9609551/
> [2]: https://www.spinics.net/lists/arm-kernel/msg572688.html

Gr{oetje,eeting}s,

Geert

--
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- 

Re: [PATCH] arm64/dma-mapping: fix DMA_ATTR_FORCE_CONTIGUOUS mmaping code

2017-03-30 Thread Andrzej Hajda
On 29.03.2017 17:33, Robin Murphy wrote:
> On 29/03/17 16:12, Andrzej Hajda wrote:
>> On 29.03.2017 14:55, Robin Murphy wrote:
>>> On 29/03/17 11:05, Andrzej Hajda wrote:
 In case of DMA_ATTR_FORCE_CONTIGUOUS allocations vm_area->pages
 is invalid. __iommu_mmap_attrs and __iommu_get_sgtable cannot use
 it. In first case temporary pages array is passed to iommu_dma_mmap,
 in 2nd case single entry sg table is created directly instead
 of calling helper.

 Fixes: 44176bb ("arm64: Add support for DMA_ATTR_FORCE_CONTIGUOUS to 
 IOMMU")
 Signed-off-by: Andrzej Hajda 
 ---
 Hi,

 I am not familiar with this framework so please don't be too cruel ;)
 Alternative solution I see is to always create vm_area->pages,
 I do not know which approach is preferred.

 Regards
 Andrzej
 ---
  arch/arm64/mm/dma-mapping.c | 40 ++--
  1 file changed, 38 insertions(+), 2 deletions(-)

 diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
 index f7b5401..bba2bc8 100644
 --- a/arch/arm64/mm/dma-mapping.c
 +++ b/arch/arm64/mm/dma-mapping.c
 @@ -704,7 +704,30 @@ static int __iommu_mmap_attrs(struct device *dev, 
 struct vm_area_struct *vma,
return ret;
  
area = find_vm_area(cpu_addr);
 -  if (WARN_ON(!area || !area->pages))
 +  if (WARN_ON(!area))
>>> >From the look of things, it doesn't seem strictly necessary to change
>>> this, but whether that's a good thing is another matter. I'm not sure
>>> that dma_common_contiguous_remap() should really be leaving a dangling
>>> pointer in area->pages as it apparently does... :/
>>>
 +  return -ENXIO;
 +
 +  if (attrs & DMA_ATTR_FORCE_CONTIGUOUS) {
 +  struct page *page = vmalloc_to_page(cpu_addr);
 +  unsigned int count = size >> PAGE_SHIFT;
 +  struct page **pages;
 +  unsigned long pfn;
 +  int i;
 +
 +  pages = kmalloc_array(count, sizeof(*pages), GFP_KERNEL);
 +  if (!pages)
 +  return -ENOMEM;
 +
 +  for (i = 0, pfn = page_to_pfn(page); i < count; i++)
 +  pages[i] = pfn_to_page(pfn + i);
 +
 +  ret = iommu_dma_mmap(pages, size, vma);
>>> /**
>>>  * iommu_dma_mmap - Map a buffer into provided user VMA
>>>  * @pages: Array representing buffer from iommu_dma_alloc()
>>>...
>>>
>>> In this case, the buffer has not come from iommu_dma_alloc(), so passing
>>> into iommu_dma_mmap() is wrong by contract, even if having to fake up a
>>> page array wasn't enough of a red flag. Given that a FORCE_CONTIGUOUS
>>> allocation is essentially the same as for the non-IOMMU case, I think it
>>> should be sufficient to defer to that path, i.e.:
>>>
>>> if (attrs & DMA_ATTR_FORCE_CONTIGUOUS)
>>> return __swiotlb_mmap(dev, vma, cpu_addr,
>>> phys_to_dma(virt_to_phys(cpu_addr)),
>>> size, attrs);
>> Maybe I have make mistake somewhere but it does not work, here and below
>> (hangs or crashes). I suppose it can be due to different address
>> translations, my patch uses
>> page = vmalloc_to_page(cpu_addr).
>> And here we have:
>> handle = phys_to_dma(dev, virt_to_phys(cpu_addr)); // in
>> __iommu_mmap_attrs
>> page = phys_to_page(dma_to_phys(dev, handle)); // in
>> __swiotlb_get_sgtable
>> I guess it is similarly in __swiotlb_mmap.
>>
>> Are these translations equivalent?
> Ah, my mistake, sorry - I managed to forget that cpu_addr is always
> remapped for FORCE_CONTIGUOUS (*and* somehow overlook the use of
> vmalloc_to_page() in the patch itself), so the virt_to_phys() part of my
> example is indeed bogus. The general point still stands, though.

I guess Geert's proposition to create pages permanently is also not
acceptable[2]. So how to fix crashes which appeared after patch adding
support to DMA_ATTR_FORCE_CONTIGUOUS in IOMMU on arm64 [1]?
Maybe temporary solution is to drop the patch until proper handling of
mmapping is proposed, without it the patch looks incomplete and causes
regression.
Moreover __iommu_alloc_attrs uses also dma_common_contiguous_remap which
also assumes existence of struct pages.

[1]: https://patchwork.kernel.org/patch/9609551/
[2]: https://www.spinics.net/lists/arm-kernel/msg572688.html

Regards
Andrzej


>
> Robin.
>
>>
>> Regards
>> Andrzej
>>
 +  kfree(pages);
 +
 +  return ret;
 +  }
 +
 +  if (WARN_ON(!area->pages))
return -ENXIO;
  
return iommu_dma_mmap(area->pages, size, vma);
 @@ -717,7 +740,20 @@ static int __iommu_get_sgtable(struct device *dev, 
 struct sg_table *sgt,
unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
struct vm_struct *area = find_vm_area(cpu_addr);
  
 -  if 

RE: [PATCH] iommu/amd: flush IOTLB for specific domains only

2017-03-30 Thread Nath, Arindam
>-Original Message-
>From: Nath, Arindam
>Sent: Monday, March 27, 2017 5:57 PM
>To: 'Daniel Drake'
>Cc: j...@8bytes.org; Deucher, Alexander; Bridgman, John; amd-
>g...@lists.freedesktop.org; iommu@lists.linux-foundation.org; Suthikulpanit,
>Suravee; Linux Upstreaming Team
>Subject: RE: [PATCH] iommu/amd: flush IOTLB for specific domains only
>
>>-Original Message-
>>From: Daniel Drake [mailto:dr...@endlessm.com]
>>Sent: Monday, March 27, 2017 5:56 PM
>>To: Nath, Arindam
>>Cc: j...@8bytes.org; Deucher, Alexander; Bridgman, John; amd-
>>g...@lists.freedesktop.org; iommu@lists.linux-foundation.org; Suthikulpanit,
>>Suravee; Linux Upstreaming Team
>>Subject: Re: [PATCH] iommu/amd: flush IOTLB for specific domains only
>>
>>Hi Arindam,
>>
>>You CC'd me on this - does this mean that it is a fix for the issue
>>described in the thread "amd-iommu: can't boot with amdgpu, AMD-Vi:
>>Completion-Wait loop timed out" ?
>
>Yes Daniel, please test this patch to confirm if the issue gets resolved.

Daniel, did you get chance to test this patch?

Thanks,
Arindam

>
>Thanks,
>Arindam
>
>>
>>Thanks
>>Daniel
>>
>>
>>On Mon, Mar 27, 2017 at 12:17 AM,   wrote:
>>> From: Arindam Nath 
>>>
>>> The idea behind flush queues is to defer the IOTLB flushing
>>> for domains for which the mappings are no longer valid. We
>>> add such domains in queue_add(), and when the queue size
>>> reaches FLUSH_QUEUE_SIZE, we perform __queue_flush().
>>>
>>> Since we have already taken lock before __queue_flush()
>>> is called, we need to make sure the IOTLB flushing is
>>> performed as quickly as possible.
>>>
>>> In the current implementation, we perform IOTLB flushing
>>> for all domains irrespective of which ones were actually
>>> added in the flush queue initially. This can be quite
>>> expensive especially for domains for which unmapping is
>>> not required at this point of time.
>>>
>>> This patch makes use of domain information in
>>> 'struct flush_queue_entry' to make sure we only flush
>>> IOTLBs for domains who need it, skipping others.
>>>
>>> Signed-off-by: Arindam Nath 
>>> ---
>>>  drivers/iommu/amd_iommu.c | 15 ---
>>>  1 file changed, 8 insertions(+), 7 deletions(-)
>>>
>>> diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
>>> index 98940d1..6a9a048 100644
>>> --- a/drivers/iommu/amd_iommu.c
>>> +++ b/drivers/iommu/amd_iommu.c
>>> @@ -2227,15 +2227,16 @@ static struct iommu_group
>>*amd_iommu_device_group(struct device *dev)
>>>
>>>  static void __queue_flush(struct flush_queue *queue)
>>>  {
>>> -   struct protection_domain *domain;
>>> -   unsigned long flags;
>>> int idx;
>>>
>>> -   /* First flush TLB of all known domains */
>>> -   spin_lock_irqsave(_iommu_pd_lock, flags);
>>> -   list_for_each_entry(domain, _iommu_pd_list, list)
>>> -   domain_flush_tlb(domain);
>>> -   spin_unlock_irqrestore(_iommu_pd_lock, flags);
>>> +   /* First flush TLB of all domains which were added to flush queue */
>>> +   for (idx = 0; idx < queue->next; ++idx) {
>>> +   struct flush_queue_entry *entry;
>>> +
>>> +   entry = queue->entries + idx;
>>> +
>>> +   domain_flush_tlb(>dma_dom->domain);
>>> +   }
>>>
>>> /* Wait until flushes have completed */
>>> domain_flush_complete(NULL);
>>> --
>>> 1.9.1
>>>
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [PATCH 5/9] iommu: add qcom_iommu

2017-03-30 Thread Archit Taneja

Hi,

On 03/14/2017 08:48 PM, Rob Clark wrote:

An iommu driver for Qualcomm "B" family devices which do not completely
implement the ARM SMMU spec.  These devices have context-bank register
layout that is similar to ARM SMMU, but no global register space (or at
least not one that is accessible).

Signed-off-by: Rob Clark 
Signed-off-by: Stanimir Varbanov 
---
 drivers/iommu/Kconfig |  10 +
 drivers/iommu/Makefile|   1 +
 drivers/iommu/arm-smmu-regs.h |   2 +
 drivers/iommu/qcom_iommu.c| 818 ++
 4 files changed, 831 insertions(+)
 create mode 100644 drivers/iommu/qcom_iommu.c





+
+static int qcom_iommu_add_device(struct device *dev)
+{
+   struct qcom_iommu_dev *qcom_iommu = __to_iommu(dev->iommu_fwspec);


__to_iommu() has a WARN_ON() that gets triggered here for all devices on
the platform bus that aren't backed by our iommu. We should return -ENODEV
for all of them without throwing a warning.


+   struct iommu_group *group;
+   struct device_link *link;
+


We could do something like:

if (fwspec && fwspec->ops == _iommu_ops)
qcom_iommu = __to_iommu(fwspec);
else
qcom_iommu = NULL;

Thanks,
Archit


+   if (!qcom_iommu)
+   return -ENODEV;
+
+   /*
+* Establish the link between iommu and master, so that the
+* iommu gets runtime enabled/disabled as per the master's
+* needs.
+*/
+   link = device_link_add(dev, qcom_iommu->dev, DL_FLAG_PM_RUNTIME);
+   if (!link) {
+   dev_err(qcom_iommu->dev, "Unable to create device link between %s 
and %s\n",
+   dev_name(qcom_iommu->dev), dev_name(dev));
+   return -ENODEV;
+   }
+
+   group = iommu_group_get_for_dev(dev);
+   if (IS_ERR_OR_NULL(group))
+   return PTR_ERR_OR_ZERO(group);
+
+   iommu_group_put(group);
+   iommu_device_link(_iommu->iommu, dev);
+
+   return 0;
+}
+
+static void qcom_iommu_remove_device(struct device *dev)
+{
+   struct qcom_iommu_dev *qcom_iommu = __to_iommu(dev->iommu_fwspec);
+
+   if (!qcom_iommu)
+   return;
+
+   iommu_group_remove_device(dev);
+   iommu_device_unlink(_iommu->iommu, dev);
+   iommu_fwspec_free(dev);
+}
+
+static struct iommu_group *qcom_iommu_device_group(struct device *dev)
+{
+   struct iommu_fwspec *fwspec = dev->iommu_fwspec;
+   struct iommu_group *group = NULL;
+   unsigned i;
+
+   for (i = 0; i < fwspec->num_ids; i++) {
+   struct qcom_iommu_ctx *ctx = __to_ctx(fwspec, fwspec->ids[i]);
+
+   if (group && ctx->group && group != ctx->group)
+   return ERR_PTR(-EINVAL);
+
+   group = ctx->group;
+   }
+
+   if (group)
+   return iommu_group_ref_get(group);
+
+   group = generic_device_group(dev);
+
+   for (i = 0; i < fwspec->num_ids; i++) {
+   struct qcom_iommu_ctx *ctx = __to_ctx(fwspec, fwspec->ids[i]);
+   ctx->group = iommu_group_ref_get(group);
+   }
+
+   return group;
+}
+
+static int qcom_iommu_of_xlate(struct device *dev, struct of_phandle_args 
*args)
+{
+   struct platform_device *iommu_pdev;
+
+   if (args->args_count != 1) {
+   dev_err(dev, "incorrect number of iommu params found for %s "
+   "(found %d, expected 1)\n",
+   args->np->full_name, args->args_count);
+   return -EINVAL;
+   }
+
+   if (!dev->iommu_fwspec->iommu_priv) {
+   iommu_pdev = of_find_device_by_node(args->np);
+   if (WARN_ON(!iommu_pdev))
+   return -EINVAL;
+
+   dev->iommu_fwspec->iommu_priv = 
platform_get_drvdata(iommu_pdev);
+   }
+
+   return iommu_fwspec_add_ids(dev, >args[0], 1);
+}
+
+static const struct iommu_ops qcom_iommu_ops = {
+   .capable= qcom_iommu_capable,
+   .domain_alloc   = qcom_iommu_domain_alloc,
+   .domain_free= qcom_iommu_domain_free,
+   .attach_dev = qcom_iommu_attach_dev,
+   .detach_dev = qcom_iommu_detach_dev,
+   .map= qcom_iommu_map,
+   .unmap  = qcom_iommu_unmap,
+   .map_sg = default_iommu_map_sg,
+   .iova_to_phys   = qcom_iommu_iova_to_phys,
+   .add_device = qcom_iommu_add_device,
+   .remove_device  = qcom_iommu_remove_device,
+   .device_group   = qcom_iommu_device_group,
+   .of_xlate   = qcom_iommu_of_xlate,
+   .pgsize_bitmap  = SZ_4K | SZ_64K | SZ_1M | SZ_16M,
+};
+
+static int qcom_iommu_enable_clocks(struct qcom_iommu_dev *qcom_iommu)
+{
+   int ret;
+
+   ret = clk_prepare_enable(qcom_iommu->iface_clk);
+   if (ret) {
+   dev_err(qcom_iommu->dev, "Couldn't enable iface_clk\n");
+   return ret;
+   }
+
+   ret =