Re: [PATCH 4/4] iommu/arm-smmu: Use per-context TLB sync as appropriate

2017-03-30 Thread Robin Murphy
On 30/03/17 15:37, Will Deacon wrote:
> Hi Robin,
> 
> This mostly looks great, but I have a couple of minor comments below.
> 
> On Tue, Mar 07, 2017 at 06:09:07PM +0000, Robin Murphy wrote:
>> TLB synchronisation typically involves the SMMU blocking all incoming
>> transactions until the TLBs report completion of all outstanding
>> operations. In the common SMMUv2 configuration of a single distributed
>> SMMU serving multiple peripherals, that means that a single unmap
>> request has the potential to bring the hammer down on the entire system
>> if synchronised globally. Since stage 1 contexts, and stage 2 contexts
>> under SMMUv2, offer local sync operations, let's make use of those
>> wherever we can in the hope of minimising global disruption.
>>
>> To that end, rather than add any more branches to the already unwieldy
>> monolithic TLB maintenance ops, break them up into smaller, neater,
>> functions which we can then mix and match as appropriate.
>>
>> Signed-off-by: Robin Murphy 
>> ---
>>  drivers/iommu/arm-smmu.c | 156 ++-
>>  1 file changed, 100 insertions(+), 56 deletions(-)
>>
>> diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
>> index c8aafe304171..f7411109670f 100644
>> --- a/drivers/iommu/arm-smmu.c
>> +++ b/drivers/iommu/arm-smmu.c
>> @@ -237,6 +237,8 @@ enum arm_smmu_s2cr_privcfg {
>>  #define ARM_SMMU_CB_S1_TLBIVAL  0x620
>>  #define ARM_SMMU_CB_S2_TLBIIPAS20x630
>>  #define ARM_SMMU_CB_S2_TLBIIPAS2L   0x638
>> +#define ARM_SMMU_CB_TLBSYNC 0x7f0
>> +#define ARM_SMMU_CB_TLBSTATUS   0x7f4
>>  #define ARM_SMMU_CB_ATS1PR  0x800
>>  #define ARM_SMMU_CB_ATSR0x8f0
>>  
>> @@ -569,14 +571,13 @@ static void __arm_smmu_free_bitmap(unsigned long *map, int idx)
>>  }
>>  
>>  /* Wait for any pending TLB invalidations to complete */
>> -static void __arm_smmu_tlb_sync(struct arm_smmu_device *smmu)
>> +static void __arm_smmu_tlb_sync(struct arm_smmu_device *smmu,
>> +void __iomem *sync, void __iomem *status)
> 
> Given that you take the arm_smmu_device anyway, I think I'd prefer just
> passing the offsets for sync and status and avoiding the additions
> in the caller (a bit like your other patch in this series ;).

Note that the sole reason for passing the arm_smmu_device is for the
dev_err(), but I neither want to remove that nor duplicate it across the
callers...

However, the concrete reason for not passing offsets is that this
function serves for both global and local syncs, so there is no single
base address that can be assumed. At one point I toyed with just passing
a context bank number (using -1 for "global") but even I thought that
ended up looking awful ;)
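
Concretely, the two callers (see the patch quoted later in this thread) resolve the sync and status registers against different bases, which is why a single pair of offsets can't describe both cases; the global registers hang off GR0 while the per-context ones hang off the domain's context bank:

	/* Global sync: both registers live in global register space 0 */
	void __iomem *gr0 = ARM_SMMU_GR0(smmu);

	__arm_smmu_tlb_sync(smmu, gr0 + ARM_SMMU_GR0_sTLBGSYNC,
			    gr0 + ARM_SMMU_GR0_sTLBGSTATUS);

	/* Per-context sync: both registers live in the domain's context bank */
	void __iomem *cb = ARM_SMMU_CB(smmu, smmu_domain->cfg.cbndx);

	__arm_smmu_tlb_sync(smmu, cb + ARM_SMMU_CB_TLBSYNC,
			    cb + ARM_SMMU_CB_TLBSTATUS);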

>>  static void arm_smmu_tlb_inv_range_nosync(unsigned long iova, size_t size,
>> @@ -617,48 +638,66 @@ static void arm_smmu_tlb_inv_range_nosync(unsigned long iova, size_t size,
>>  {
>>  struct arm_smmu_domain *smmu_domain = cookie;
>>  struct arm_smmu_cfg *cfg = &smmu_domain->cfg;
>> -struct arm_smmu_device *smmu = smmu_domain->smmu;
>>  bool stage1 = cfg->cbar != CBAR_TYPE_S2_TRANS;
>> -void __iomem *reg;
>> +void __iomem *reg = ARM_SMMU_CB(smmu_domain->smmu, cfg->cbndx);
>> +size_t step;
>>  
>> -if (stage1) {
>> -reg = ARM_SMMU_CB(smmu, cfg->cbndx);
>> -reg += leaf ? ARM_SMMU_CB_S1_TLBIVAL : ARM_SMMU_CB_S1_TLBIVA;
>> -
>> -if (cfg->fmt != ARM_SMMU_CTX_FMT_AARCH64) {
>> -iova &= ~12UL;
>> -iova |= cfg->asid;
>> -do {
>> -writel_relaxed(iova, reg);
>> -iova += granule;
>> -} while (size -= granule);
>> -} else {
>> -iova >>= 12;
>> -iova |= (u64)cfg->asid << 48;
>> -do {
>> -writeq_relaxed(iova, reg);
>> -iova += granule >> 12;
>> -} while (size -= granule);
>> -}
>> -} else if (smmu->version == ARM_SMMU_V2) {
>> -reg = ARM_SMMU_CB(smmu, cfg->cbndx);
>> +if (stage1)
>> +reg += leaf ? ARM_SMMU_CB_S1_TLBIVAL :
>> +  ARM_SMMU_CB_S1_TLBIVA;
>> +else
>>  reg += leaf ? ARM_SMMU_CB_S2_TLBIIPAS2L :
>>ARM_SMMU_CB_S2_TLBIIPAS2;
>> -iova >>= 12;
>> -do {
>> -smmu_write_atomic_lq(iova, reg);
>> -iova += granule >> 12;
>> -} while (size -= granule);
>> +
>> +if (stage1 && cfg->fmt != ARM_SMMU_CTX_FMT_AARCH64) {
>> +iova &= ~12UL;
>> +iova |= cfg->asid;
>> +step = granule;
>>  } else {
>> -reg = ARM_SMMU_GR0(smmu) + ARM_SMMU_GR0_TLBIVMID;
>> -writel_relaxed(cfg->vmid, reg);
>> +   

Re: [PATCH 4/4] iommu/arm-smmu: Use per-context TLB sync as appropriate

2017-03-30 Thread Will Deacon
Hi Robin,

This mostly looks great, but I have a couple of minor comments below.

On Tue, Mar 07, 2017 at 06:09:07PM +0000, Robin Murphy wrote:
> TLB synchronisation typically involves the SMMU blocking all incoming
> transactions until the TLBs report completion of all outstanding
> operations. In the common SMMUv2 configuration of a single distributed
> SMMU serving multiple peripherals, that means that a single unmap
> request has the potential to bring the hammer down on the entire system
> if synchronised globally. Since stage 1 contexts, and stage 2 contexts
> under SMMUv2, offer local sync operations, let's make use of those
> wherever we can in the hope of minimising global disruption.
> 
> To that end, rather than add any more branches to the already unwieldy
> monolithic TLB maintenance ops, break them up into smaller, neater,
> functions which we can then mix and match as appropriate.
> 
> Signed-off-by: Robin Murphy 
> ---
>  drivers/iommu/arm-smmu.c | 156 ++-
>  1 file changed, 100 insertions(+), 56 deletions(-)
> 
> diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
> index c8aafe304171..f7411109670f 100644
> --- a/drivers/iommu/arm-smmu.c
> +++ b/drivers/iommu/arm-smmu.c
> @@ -237,6 +237,8 @@ enum arm_smmu_s2cr_privcfg {
>  #define ARM_SMMU_CB_S1_TLBIVAL   0x620
>  #define ARM_SMMU_CB_S2_TLBIIPAS2 0x630
>  #define ARM_SMMU_CB_S2_TLBIIPAS2L0x638
> +#define ARM_SMMU_CB_TLBSYNC  0x7f0
> +#define ARM_SMMU_CB_TLBSTATUS0x7f4
>  #define ARM_SMMU_CB_ATS1PR   0x800
>  #define ARM_SMMU_CB_ATSR 0x8f0
>  
> @@ -569,14 +571,13 @@ static void __arm_smmu_free_bitmap(unsigned long *map, int idx)
>  }
>  
>  /* Wait for any pending TLB invalidations to complete */
> -static void __arm_smmu_tlb_sync(struct arm_smmu_device *smmu)
> +static void __arm_smmu_tlb_sync(struct arm_smmu_device *smmu,
> + void __iomem *sync, void __iomem *status)

Given that you take the arm_smmu_device anyway, I think I'd prefer just
passing the offsets for sync and status and avoiding the additions
in the caller (a bit like your other patch in this series ;).

>  static void arm_smmu_tlb_inv_range_nosync(unsigned long iova, size_t size,
> @@ -617,48 +638,66 @@ static void arm_smmu_tlb_inv_range_nosync(unsigned long iova, size_t size,
>  {
>   struct arm_smmu_domain *smmu_domain = cookie;
>   struct arm_smmu_cfg *cfg = &smmu_domain->cfg;
> - struct arm_smmu_device *smmu = smmu_domain->smmu;
>   bool stage1 = cfg->cbar != CBAR_TYPE_S2_TRANS;
> - void __iomem *reg;
> + void __iomem *reg = ARM_SMMU_CB(smmu_domain->smmu, cfg->cbndx);
> + size_t step;
>  
> - if (stage1) {
> - reg = ARM_SMMU_CB(smmu, cfg->cbndx);
> - reg += leaf ? ARM_SMMU_CB_S1_TLBIVAL : ARM_SMMU_CB_S1_TLBIVA;
> -
> - if (cfg->fmt != ARM_SMMU_CTX_FMT_AARCH64) {
> - iova &= ~12UL;
> - iova |= cfg->asid;
> - do {
> - writel_relaxed(iova, reg);
> - iova += granule;
> - } while (size -= granule);
> - } else {
> - iova >>= 12;
> - iova |= (u64)cfg->asid << 48;
> - do {
> - writeq_relaxed(iova, reg);
> - iova += granule >> 12;
> - } while (size -= granule);
> - }
> - } else if (smmu->version == ARM_SMMU_V2) {
> - reg = ARM_SMMU_CB(smmu, cfg->cbndx);
> + if (stage1)
> + reg += leaf ? ARM_SMMU_CB_S1_TLBIVAL :
> +   ARM_SMMU_CB_S1_TLBIVA;
> + else
>   reg += leaf ? ARM_SMMU_CB_S2_TLBIIPAS2L :
> ARM_SMMU_CB_S2_TLBIIPAS2;
> - iova >>= 12;
> - do {
> - smmu_write_atomic_lq(iova, reg);
> - iova += granule >> 12;
> - } while (size -= granule);
> +
> + if (stage1 && cfg->fmt != ARM_SMMU_CTX_FMT_AARCH64) {
> + iova &= ~12UL;
> + iova |= cfg->asid;
> + step = granule;
>   } else {
> - reg = ARM_SMMU_GR0(smmu) + ARM_SMMU_GR0_TLBIVMID;
> - writel_relaxed(cfg->vmid, reg);
> + iova >>= 12;
> + step = granule >> 12;
> + if (stage1)
> + iova |= (u64)cfg->asid << 48;
>   }
> +
> + do {
> + smmu_write_atomic_lq(iova, reg);
> + iova += step;
> + } while (size -= granule);

There seems to be a lot of refactoring going on here, but I'm not entirely
comfortable with the unconditional move to smmu_write_atomic_lq. Given the
way in which arm_smmu_tlb_inv_range_nosync is now called (i.e. only for
stage-1 or SMMUv2 stage-2), then I think you 
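
For reference, the smmu_write_atomic_lq accessor being questioned selects the register write width at build time; the driver of this era defines it along these lines (from memory, so treat as a sketch):

	/*
	 * Use a 64-bit write where the CPU can issue one atomically, so that
	 * 64-bit TLBI payloads (e.g. an AArch64 VA with the ASID in the top
	 * bits) don't tear; 32-bit hosts fall back to a single 32-bit write.
	 */
	#ifdef CONFIG_64BIT
	#define smmu_write_atomic_lq		writeq_relaxed
	#else
	#define smmu_write_atomic_lq		writel_relaxed
	#endif

The likely concern is that the AArch32-format stage-1 case previously used a plain writel_relaxed(), so routing it through this macro turns a 32-bit register write into a 64-bit one on 64-bit hosts - a behavioural change rather than pure refactoring.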

[PATCH 4/4] iommu/arm-smmu: Use per-context TLB sync as appropriate

2017-03-07 Thread Robin Murphy
TLB synchronisation typically involves the SMMU blocking all incoming
transactions until the TLBs report completion of all outstanding
operations. In the common SMMUv2 configuration of a single distributed
SMMU serving multiple peripherals, that means that a single unmap
request has the potential to bring the hammer down on the entire system
if synchronised globally. Since stage 1 contexts, and stage 2 contexts
under SMMUv2, offer local sync operations, let's make use of those
wherever we can in the hope of minimising global disruption.

To that end, rather than add any more branches to the already unwieldy
monolithic TLB maintenance ops, break them up into smaller, neater,
functions which we can then mix and match as appropriate.

Signed-off-by: Robin Murphy 
---
 drivers/iommu/arm-smmu.c | 156 ++-
 1 file changed, 100 insertions(+), 56 deletions(-)

diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
index c8aafe304171..f7411109670f 100644
--- a/drivers/iommu/arm-smmu.c
+++ b/drivers/iommu/arm-smmu.c
@@ -237,6 +237,8 @@ enum arm_smmu_s2cr_privcfg {
 #define ARM_SMMU_CB_S1_TLBIVAL 0x620
 #define ARM_SMMU_CB_S2_TLBIIPAS2   0x630
 #define ARM_SMMU_CB_S2_TLBIIPAS2L  0x638
+#define ARM_SMMU_CB_TLBSYNC0x7f0
+#define ARM_SMMU_CB_TLBSTATUS  0x7f4
 #define ARM_SMMU_CB_ATS1PR 0x800
 #define ARM_SMMU_CB_ATSR   0x8f0
 
@@ -569,14 +571,13 @@ static void __arm_smmu_free_bitmap(unsigned long *map, int idx)
 }
 
 /* Wait for any pending TLB invalidations to complete */
-static void __arm_smmu_tlb_sync(struct arm_smmu_device *smmu)
+static void __arm_smmu_tlb_sync(struct arm_smmu_device *smmu,
+   void __iomem *sync, void __iomem *status)
 {
int count = 0;
-   void __iomem *gr0_base = ARM_SMMU_GR0(smmu);
 
-   writel_relaxed(0, gr0_base + ARM_SMMU_GR0_sTLBGSYNC);
-   while (readl_relaxed(gr0_base + ARM_SMMU_GR0_sTLBGSTATUS)
-  & sTLBGSTATUS_GSACTIVE) {
+   writel_relaxed(0, sync);
+   while (readl_relaxed(status) & sTLBGSTATUS_GSACTIVE) {
cpu_relax();
if (++count == TLB_LOOP_TIMEOUT) {
dev_err_ratelimited(smmu->dev,
@@ -587,29 +588,49 @@ static void __arm_smmu_tlb_sync(struct arm_smmu_device *smmu)
}
 }
 
-static void arm_smmu_tlb_sync(void *cookie)
+static void arm_smmu_tlb_sync_global(struct arm_smmu_device *smmu)
 {
-   struct arm_smmu_domain *smmu_domain = cookie;
-   __arm_smmu_tlb_sync(smmu_domain->smmu);
+   void __iomem *base = ARM_SMMU_GR0(smmu);
+
+   __arm_smmu_tlb_sync(smmu, base + ARM_SMMU_GR0_sTLBGSYNC,
+   base + ARM_SMMU_GR0_sTLBGSTATUS);
 }
 
-static void arm_smmu_tlb_inv_context(void *cookie)
+static void arm_smmu_tlb_sync_context(void *cookie)
+{
+   struct arm_smmu_domain *smmu_domain = cookie;
+   struct arm_smmu_device *smmu = smmu_domain->smmu;
+   void __iomem *base = ARM_SMMU_CB(smmu, smmu_domain->cfg.cbndx);
+
+   __arm_smmu_tlb_sync(smmu, base + ARM_SMMU_CB_TLBSYNC,
+   base + ARM_SMMU_CB_TLBSTATUS);
+}
+
+static void arm_smmu_tlb_sync_vmid(void *cookie)
+{
+   struct arm_smmu_domain *smmu_domain = cookie;
+
+   arm_smmu_tlb_sync_global(smmu_domain->smmu);
+}
+
+static void arm_smmu_tlb_inv_context_s1(void *cookie)
 {
struct arm_smmu_domain *smmu_domain = cookie;
	struct arm_smmu_cfg *cfg = &smmu_domain->cfg;
+   void __iomem *base = ARM_SMMU_CB(smmu_domain->smmu, cfg->cbndx);
+
+   writel_relaxed(cfg->asid, base + ARM_SMMU_CB_S1_TLBIASID);
+   arm_smmu_tlb_sync_context(cookie);
+}
+
+static void arm_smmu_tlb_inv_context_s2(void *cookie)
+{
+   struct arm_smmu_domain *smmu_domain = cookie;
struct arm_smmu_device *smmu = smmu_domain->smmu;
-   bool stage1 = cfg->cbar != CBAR_TYPE_S2_TRANS;
-   void __iomem *base;
+   void __iomem *base = ARM_SMMU_GR0(smmu);
 
-   if (stage1) {
-   base = ARM_SMMU_CB(smmu, cfg->cbndx);
-   writel_relaxed(cfg->asid, base + ARM_SMMU_CB_S1_TLBIASID);
-   } else {
-   base = ARM_SMMU_GR0(smmu);
-   writel_relaxed(cfg->vmid, base + ARM_SMMU_GR0_TLBIVMID);
-   }
-
-   __arm_smmu_tlb_sync(smmu);
+   writel_relaxed(smmu_domain->cfg.vmid, base + ARM_SMMU_GR0_TLBIVMID);
+   arm_smmu_tlb_sync_global(smmu);
 }
 
 static void arm_smmu_tlb_inv_range_nosync(unsigned long iova, size_t size,
@@ -617,48 +638,66 @@ static void arm_smmu_tlb_inv_range_nosync(unsigned long iova, size_t size,
 {
struct arm_smmu_domain *smmu_domain = cookie;
	struct arm_smmu_cfg *cfg = &smmu_domain->cfg;
-   struct arm_smmu_device *smmu = smmu_domain->smmu;
bool stage1 = cfg->cbar != CBAR_TYPE_S2_TRANS;
-   void __iomem *reg;
+   void __iomem *reg = ARM_SMMU_CB(smmu_domain->smmu, cfg->cbndx);
+   
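
The hunk is truncated here, but the unified loop it builds up to can be reassembled from the fragments quoted in Will's reply above:

	if (stage1 && cfg->fmt != ARM_SMMU_CTX_FMT_AARCH64) {
		/* AArch32 format: ASID folded into the low address bits */
		iova &= ~12UL;
		iova |= cfg->asid;
		step = granule;
	} else {
		/* AArch64/IPA format: page number, with the ASID
		 * (stage 1 only) in bits [63:48] */
		iova >>= 12;
		step = granule >> 12;
		if (stage1)
			iova |= (u64)cfg->asid << 48;
	}

	do {
		smmu_write_atomic_lq(iova, reg);
		iova += step;
	} while (size -= granule);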

Re: [PATCH 4/4] iommu/arm-smmu: Use per-context TLB sync as appropriate

2016-02-09 Thread Will Deacon
On Tue, Jan 26, 2016 at 06:06:37PM +0000, Robin Murphy wrote:
> TLB synchronisation is a mighty big hammer to bring down on the
> transaction stream, typically stalling all in-flight transactions until
> the sync completes. Since in most cases (except at stage 2 on SMMUv1)
> a per-context sync operation is available, prefer that over the global
> operation when performing TLB maintenance for a single domain, to avoid
> unnecessarily disrupting ongoing traffic in other contexts.
> 
> Signed-off-by: Robin Murphy 
> ---
>  drivers/iommu/arm-smmu.c | 32 
>  1 file changed, 24 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
> index 18e0e10..bf1895c 100644
> --- a/drivers/iommu/arm-smmu.c
> +++ b/drivers/iommu/arm-smmu.c
> @@ -219,6 +219,8 @@
>  #define ARM_SMMU_CB_S1_TLBIVAL   0x620
>  #define ARM_SMMU_CB_S2_TLBIIPAS2 0x630
>  #define ARM_SMMU_CB_S2_TLBIIPAS2L0x638
> +#define ARM_SMMU_CB_TLBSYNC  0x7f0
> +#define ARM_SMMU_CB_TLBSTATUS0x7f4
>  #define ARM_SMMU_CB_ATS1PR   0x800
>  #define ARM_SMMU_CB_ATSR 0x8f0
>  
> @@ -546,14 +548,22 @@ static void __arm_smmu_free_bitmap(unsigned long *map, int idx)
>  }
>  
>  /* Wait for any pending TLB invalidations to complete */
> -static void __arm_smmu_tlb_sync(struct arm_smmu_device *smmu)
> +static void __arm_smmu_tlb_sync(struct arm_smmu_device *smmu, int cbndx)
>  {
>   int count = 0;
> - void __iomem *gr0_base = ARM_SMMU_GR0(smmu);
> + void __iomem *base, __iomem *status;
>  
> - writel_relaxed(0, gr0_base + ARM_SMMU_GR0_sTLBGSYNC);
> - while (readl_relaxed(gr0_base + ARM_SMMU_GR0_sTLBGSTATUS)
> -& sTLBGSTATUS_GSACTIVE) {
> + if (cbndx < 0) {
> + base = ARM_SMMU_GR0(smmu);
> + status = base + ARM_SMMU_GR0_sTLBGSTATUS;
> + writel_relaxed(0, base + ARM_SMMU_GR0_sTLBGSYNC);
> + } else {
> + base = ARM_SMMU_CB_BASE(smmu) + ARM_SMMU_CB(smmu, cbndx);
> + status = base + ARM_SMMU_CB_TLBSTATUS;
> + writel_relaxed(0, base + ARM_SMMU_CB_TLBSYNC);
> + }
> +
> + while (readl_relaxed(status) & sTLBGSTATUS_GSACTIVE) {
>   cpu_relax();
>   if (++count == TLB_LOOP_TIMEOUT) {
>   dev_err_ratelimited(smmu->dev,
> @@ -567,7 +577,13 @@ static void __arm_smmu_tlb_sync(struct arm_smmu_device *smmu)
>  static void arm_smmu_tlb_sync(void *cookie)
>  {
>   struct arm_smmu_domain *smmu_domain = cookie;
> - __arm_smmu_tlb_sync(smmu_domain->smmu);
> + int cbndx = smmu_domain->cfg.cbndx;
> +
> + if (smmu_domain->stage == ARM_SMMU_DOMAIN_S2 &&
> + smmu_domain->smmu->version < ARM_SMMU_V2)
> + cbndx = -1;

I think it would be cleaner just to override the sync function pointer
when we initialise a stage-2 page table for an SMMUv1 implementation.

Any reason not to go that way?

Will
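
For context, the "sync function pointer" here is the tlb_sync callback that io-pgtable invokes with the domain as its cookie, so the override amounts to registering a different ops table when the page table is initialised. A sketch of the idea (the iommu_gather_ops layout matches io-pgtable of this period; the per-configuration tables are illustrative, though the 2017 respin quoted earlier ends up with much this shape):

	/* io-pgtable's TLB maintenance interface (drivers/iommu/io-pgtable.h) */
	struct iommu_gather_ops {
		void (*tlb_flush_all)(void *cookie);
		void (*tlb_add_flush)(unsigned long iova, size_t size,
				      size_t granule, bool leaf, void *cookie);
		void (*tlb_sync)(void *cookie);
	};

	/* Stage 1 (and SMMUv2 stage 2) can sync on its own context bank... */
	static const struct iommu_gather_ops arm_smmu_s1_tlb_ops = {
		.tlb_flush_all	= arm_smmu_tlb_inv_context_s1,
		.tlb_add_flush	= arm_smmu_tlb_inv_range_nosync,
		.tlb_sync	= arm_smmu_tlb_sync_context,
	};

	/* ...while SMMUv1 stage 2 has only the global, VMID-based sync */
	static const struct iommu_gather_ops arm_smmu_s2_tlb_ops_v1 = {
		.tlb_flush_all	= arm_smmu_tlb_inv_context_s2,
		.tlb_add_flush	= arm_smmu_tlb_inv_vmid_nosync,	/* hypothetical name */
		.tlb_sync	= arm_smmu_tlb_sync_vmid,
	};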


[PATCH 4/4] iommu/arm-smmu: Use per-context TLB sync as appropriate

2016-01-26 Thread Robin Murphy
TLB synchronisation is a mighty big hammer to bring down on the
transaction stream, typically stalling all in-flight transactions until
the sync completes. Since in most cases (except at stage 2 on SMMUv1)
a per-context sync operation is available, prefer that over the global
operation when performing TLB maintenance for a single domain, to avoid
unnecessarily disrupting ongoing traffic in other contexts.

Signed-off-by: Robin Murphy 
---
 drivers/iommu/arm-smmu.c | 32 
 1 file changed, 24 insertions(+), 8 deletions(-)

diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
index 18e0e10..bf1895c 100644
--- a/drivers/iommu/arm-smmu.c
+++ b/drivers/iommu/arm-smmu.c
@@ -219,6 +219,8 @@
 #define ARM_SMMU_CB_S1_TLBIVAL 0x620
 #define ARM_SMMU_CB_S2_TLBIIPAS2   0x630
 #define ARM_SMMU_CB_S2_TLBIIPAS2L  0x638
+#define ARM_SMMU_CB_TLBSYNC0x7f0
+#define ARM_SMMU_CB_TLBSTATUS  0x7f4
 #define ARM_SMMU_CB_ATS1PR 0x800
 #define ARM_SMMU_CB_ATSR   0x8f0
 
@@ -546,14 +548,22 @@ static void __arm_smmu_free_bitmap(unsigned long *map, int idx)
 }
 
 /* Wait for any pending TLB invalidations to complete */
-static void __arm_smmu_tlb_sync(struct arm_smmu_device *smmu)
+static void __arm_smmu_tlb_sync(struct arm_smmu_device *smmu, int cbndx)
 {
int count = 0;
-   void __iomem *gr0_base = ARM_SMMU_GR0(smmu);
+   void __iomem *base, __iomem *status;
 
-   writel_relaxed(0, gr0_base + ARM_SMMU_GR0_sTLBGSYNC);
-   while (readl_relaxed(gr0_base + ARM_SMMU_GR0_sTLBGSTATUS)
-  & sTLBGSTATUS_GSACTIVE) {
+   if (cbndx < 0) {
+   base = ARM_SMMU_GR0(smmu);
+   status = base + ARM_SMMU_GR0_sTLBGSTATUS;
+   writel_relaxed(0, base + ARM_SMMU_GR0_sTLBGSYNC);
+   } else {
+   base = ARM_SMMU_CB_BASE(smmu) + ARM_SMMU_CB(smmu, cbndx);
+   status = base + ARM_SMMU_CB_TLBSTATUS;
+   writel_relaxed(0, base + ARM_SMMU_CB_TLBSYNC);
+   }
+
+   while (readl_relaxed(status) & sTLBGSTATUS_GSACTIVE) {
cpu_relax();
if (++count == TLB_LOOP_TIMEOUT) {
dev_err_ratelimited(smmu->dev,
@@ -567,7 +577,13 @@ static void __arm_smmu_tlb_sync(struct arm_smmu_device *smmu)
 static void arm_smmu_tlb_sync(void *cookie)
 {
struct arm_smmu_domain *smmu_domain = cookie;
-   __arm_smmu_tlb_sync(smmu_domain->smmu);
+   int cbndx = smmu_domain->cfg.cbndx;
+
+   if (smmu_domain->stage == ARM_SMMU_DOMAIN_S2 &&
+   smmu_domain->smmu->version < ARM_SMMU_V2)
+   cbndx = -1;
+
+   __arm_smmu_tlb_sync(smmu_domain->smmu, cbndx);
 }
 
 static void arm_smmu_tlb_inv_context(void *cookie)
@@ -588,7 +604,7 @@ static void arm_smmu_tlb_inv_context(void *cookie)
   base + ARM_SMMU_GR0_TLBIVMID);
}
 
-   __arm_smmu_tlb_sync(smmu);
+   __arm_smmu_tlb_sync(smmu, cfg->cbndx);
 }
 
 static void arm_smmu_tlb_inv_range_nosync(unsigned long iova, size_t size,
@@ -1534,7 +1550,7 @@ static void arm_smmu_device_reset(struct arm_smmu_device *smmu)
reg &= ~(sCR0_BSU_MASK << sCR0_BSU_SHIFT);
 
/* Push the button */
-   __arm_smmu_tlb_sync(smmu);
+   __arm_smmu_tlb_sync(smmu, -1);
writel(reg, ARM_SMMU_GR0_NS(smmu) + ARM_SMMU_GR0_sCR0);
 }
 
-- 
2.7.0.25.gfc10eb5.dirty
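
One addressing note for this version of the patch: here ARM_SMMU_CB() yields only a byte offset, so a context bank is reached as ARM_SMMU_CB_BASE(smmu) + ARM_SMMU_CB(smmu, cbndx). From memory, the driver of this period defines the pair roughly as:

	/*
	 * Context banks occupy the upper half of the SMMU register space,
	 * one (possibly 64K) page per bank.
	 */
	#define ARM_SMMU_CB_BASE(smmu)	((smmu)->base + ((smmu)->size >> 1))
	#define ARM_SMMU_CB(smmu, n)	((n) * (1 << (smmu)->pgshift))

The 2017 respin quoted earlier folds the base into the macro itself, which is why that version passes ARM_SMMU_CB(smmu, cfg->cbndx) around as a complete pointer.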
