[PATCHv3] iommu/arm-smmu-qcom: Add debug support for TLB sync timeouts
TLB sync timeouts can be due to various reasons such as TBU power down or pending TCU/TBU invalidation/sync and so on. Debugging these often require dumping of some implementation defined registers to know the status of TBU/TCU operations and some of these registers are not accessible in non-secure world such as from kernel and requires SMC calls to read them in the secure world. So, add this debug support to dump implementation defined registers for TLB sync timeout issues. Signed-off-by: Sai Prakash Ranjan --- Changes in v3: * Move this debug feature to arm-smmu-qcom-debug.c (Will Deacon). * Keep single ratelimit state and remove local variable (Robin). Changes in v2: * Use scm call consistently so that it works on older chipsets where some of these regs are secure registers. * Add device specific data to get the implementation defined register offsets. --- drivers/iommu/Kconfig | 10 ++ drivers/iommu/arm/arm-smmu/Makefile | 1 + .../iommu/arm/arm-smmu/arm-smmu-qcom-debug.c | 142 ++ drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c| 32 +++- drivers/iommu/arm/arm-smmu/arm-smmu-qcom.h| 28 drivers/iommu/arm/arm-smmu/arm-smmu.c | 6 +- drivers/iommu/arm/arm-smmu/arm-smmu.h | 1 + 7 files changed, 211 insertions(+), 9 deletions(-) create mode 100644 drivers/iommu/arm/arm-smmu/arm-smmu-qcom-debug.c create mode 100644 drivers/iommu/arm/arm-smmu/arm-smmu-qcom.h diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index c79a0df090c0..5c5cb5bee8b6 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -363,6 +363,16 @@ config ARM_SMMU_QCOM When running on a Qualcomm platform that has the custom variant of the ARM SMMU, this needs to be built into the SMMU driver. +config ARM_SMMU_QCOM_DEBUG + bool "ARM SMMU QCOM implementation defined debug support" + depends on ARM_SMMU_QCOM + help + Support for implementation specific debug features in ARM SMMU + hardware found in QTI platforms. 
+ + Say Y here to enable debug for issues such as TLB sync timeouts + which require implementation defined register dumps. + config ARM_SMMU_V3 tristate "ARM Ltd. System MMU Version 3 (SMMUv3) Support" depends on ARM64 diff --git a/drivers/iommu/arm/arm-smmu/Makefile b/drivers/iommu/arm/arm-smmu/Makefile index b0cc01aa20c9..2a5a95e8e3f9 100644 --- a/drivers/iommu/arm/arm-smmu/Makefile +++ b/drivers/iommu/arm/arm-smmu/Makefile @@ -3,3 +3,4 @@ obj-$(CONFIG_QCOM_IOMMU) += qcom_iommu.o obj-$(CONFIG_ARM_SMMU) += arm_smmu.o arm_smmu-objs += arm-smmu.o arm-smmu-impl.o arm-smmu-nvidia.o arm_smmu-$(CONFIG_ARM_SMMU_QCOM) += arm-smmu-qcom.o +arm_smmu-$(CONFIG_ARM_SMMU_QCOM_DEBUG) += arm-smmu-qcom-debug.o diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom-debug.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom-debug.c new file mode 100644 index ..6eed8e67a0ca --- /dev/null +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom-debug.c @@ -0,0 +1,142 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2022 Qualcomm Innovation Center, Inc. All rights reserved. 
+ */ + +#include +#include +#include + +#include "arm-smmu.h" +#include "arm-smmu-qcom.h" + +enum qcom_smmu_impl_reg_offset { + QCOM_SMMU_TBU_PWR_STATUS, + QCOM_SMMU_STATS_SYNC_INV_TBU_ACK, + QCOM_SMMU_MMU2QSS_AND_SAFE_WAIT_CNTR, +}; + +struct qcom_smmu_config { + const u32 *reg_offset; +}; + +void qcom_smmu_tlb_sync_debug(struct arm_smmu_device *smmu) +{ + int ret; + u32 tbu_pwr_status, sync_inv_ack, sync_inv_progress; + struct qcom_smmu *qsmmu = container_of(smmu, struct qcom_smmu, smmu); + const struct qcom_smmu_config *cfg; + static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + + if (__ratelimit(&rs)) { + dev_err(smmu->dev, "TLB sync timed out -- SMMU may be deadlocked\n"); + + cfg = qsmmu->cfg; + if (!cfg) + return; + + ret = qcom_scm_io_readl(smmu->ioaddr + cfg->reg_offset[QCOM_SMMU_TBU_PWR_STATUS], + &tbu_pwr_status); + if (ret) + dev_err(smmu->dev, + "Failed to read TBU power status: %d\n", ret); + + ret = qcom_scm_io_readl(smmu->ioaddr + cfg->reg_offset[QCOM_SMMU_STATS_SYNC_INV_TBU_ACK], + &sync_inv_ack); + if (ret) + dev_err(smmu->dev, + "Failed to read TBU sync/inv ack status: %d\n", ret); + + ret = qcom_scm_io_readl(smmu->ioaddr + cfg->reg_offset[QCOM_SMMU_MMU2QSS_AND_SAFE_WAIT_CNTR], +
Re: [PATCHv2] iommu/arm-smmu-qcom: Add debug support for TLB sync timeouts
Hi Robin, On 7/6/2022 10:15 PM, Robin Murphy wrote: On 2022-05-26 05:14, Sai Prakash Ranjan wrote: TLB sync timeouts can be due to various reasons such as TBU power down or pending TCU/TBU invalidation/sync and so on. Debugging these often require dumping of some implementation defined registers to know the status of TBU/TCU operations and some of these registers are not accessible in non-secure world such as from kernel and requires SMC calls to read them in the secure world. So, add this debug support to dump implementation defined registers for TLB sync timeout issues. Signed-off-by: Sai Prakash Ranjan --- Changes in v2: * Use scm call consistently so that it works on older chipsets where some of these regs are secure registers. * Add device specific data to get the implementation defined register offsets. --- drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 161 ++--- drivers/iommu/arm/arm-smmu/arm-smmu.c | 2 + drivers/iommu/arm/arm-smmu/arm-smmu.h | 1 + 3 files changed, 146 insertions(+), 18 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c index 7820711c4560..bb68aa85b28b 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c @@ -5,13 +5,27 @@ #include #include +#include #include #include #include "arm-smmu.h" +#define QCOM_DUMMY_VAL -1 + +enum qcom_smmu_impl_reg_offset { + QCOM_SMMU_TBU_PWR_STATUS, + QCOM_SMMU_STATS_SYNC_INV_TBU_ACK, + QCOM_SMMU_MMU2QSS_AND_SAFE_WAIT_CNTR, +}; + +struct qcom_smmu_config { + const u32 *reg_offset; +}; + struct qcom_smmu { struct arm_smmu_device smmu; + const struct qcom_smmu_config *cfg; bool bypass_quirk; u8 bypass_cbndx; u32 stall_enabled; @@ -22,6 +36,56 @@ static struct qcom_smmu *to_qcom_smmu(struct arm_smmu_device *smmu) return container_of(smmu, struct qcom_smmu, smmu); } +static void qcom_smmu_tlb_sync(struct arm_smmu_device *smmu, int page, + int sync, int status) +{ + int ret; + unsigned int spin_cnt, delay; 
+ u32 reg, tbu_pwr_status, sync_inv_ack, sync_inv_progress; + struct qcom_smmu *qsmmu = to_qcom_smmu(smmu); + const struct qcom_smmu_config *cfg; + + arm_smmu_writel(smmu, page, sync, QCOM_DUMMY_VAL); + for (delay = 1; delay < TLB_LOOP_TIMEOUT; delay *= 2) { + for (spin_cnt = TLB_SPIN_COUNT; spin_cnt > 0; spin_cnt--) { + reg = arm_smmu_readl(smmu, page, status); + if (!(reg & ARM_SMMU_sTLBGSTATUS_GSACTIVE)) + return; + cpu_relax(); + } + udelay(delay); + } + + dev_err_ratelimited(smmu->dev, + "TLB sync timed out -- SMMU may be deadlocked\n"); Maybe consider a single ratelimit state for the whole function so all the output stays together. If things go sufficiently wrong, mixed up bits of partial output from different events may be misleadingly unhelpful (and at the very least it'll be up to 5x more effective at the intent of limiting log spam). Right, makes sense. Will change it. + cfg = qsmmu->cfg; + if (!cfg) + return; + + ret = qcom_scm_io_readl(smmu->ioaddr + cfg->reg_offset[QCOM_SMMU_TBU_PWR_STATUS], + &tbu_pwr_status); + if (ret) + dev_err_ratelimited(smmu->dev, + "Failed to read TBU power status: %d\n", ret); + + ret = qcom_scm_io_readl(smmu->ioaddr + cfg->reg_offset[QCOM_SMMU_STATS_SYNC_INV_TBU_ACK], + &sync_inv_ack); + if (ret) + dev_err_ratelimited(smmu->dev, + "Failed to read TBU sync/inv ack status: %d\n", ret); + + ret = qcom_scm_io_readl(smmu->ioaddr + cfg->reg_offset[QCOM_SMMU_MMU2QSS_AND_SAFE_WAIT_CNTR], + &sync_inv_progress); + if (ret) + dev_err_ratelimited(smmu->dev, + "Failed to read TCU syn/inv progress: %d\n", ret); + + dev_err_ratelimited(smmu->dev, + "TBU: power_status %#x sync_inv_ack %#x sync_inv_progress %#x\n", + tbu_pwr_status, sync_inv_ack, sync_inv_progress); +} + static void qcom_adreno_smmu_write_sctlr(struct arm_smmu_device *smmu, int idx, u32 reg) { @@ -374,6 +438,7 @@ static const struct arm_smmu_impl qcom_smmu_impl = { .def_domain_type = qcom_smmu_def_domain_type, .reset = qcom_smmu500_reset, .write_s2cr = qcom_smmu_write_s2cr, + 
.tlb_sync = qcom_smmu_tlb_sync, }; static const struct arm_smmu_impl qcom_adreno_smmu_impl = { @@ -382,12 +447,84 @@ static const struct arm_smmu_impl qcom_adreno_smmu_impl = { .reset = qcom_smmu500_reset, .alloc_context_bank = qcom_adreno_smmu_alloc_context_bank, .write_sctlr = qcom_adreno_smmu_write_sctlr, + .tlb_sync = qcom_smmu_t
Re: [PATCHv2] iommu/arm-smmu-qcom: Add debug support for TLB sync timeouts
On 7/6/2022 5:26 PM, Will Deacon wrote: On Thu, May 26, 2022 at 09:44:03AM +0530, Sai Prakash Ranjan wrote: TLB sync timeouts can be due to various reasons such as TBU power down or pending TCU/TBU invalidation/sync and so on. Debugging these often require dumping of some implementation defined registers to know the status of TBU/TCU operations and some of these registers are not accessible in non-secure world such as from kernel and requires SMC calls to read them in the secure world. So, add this debug support to dump implementation defined registers for TLB sync timeout issues. Signed-off-by: Sai Prakash Ranjan --- Changes in v2: * Use scm call consistently so that it works on older chipsets where some of these regs are secure registers. * Add device specific data to get the implementation defined register offsets. --- drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 161 ++--- drivers/iommu/arm/arm-smmu/arm-smmu.c | 2 + drivers/iommu/arm/arm-smmu/arm-smmu.h | 1 + 3 files changed, 146 insertions(+), 18 deletions(-) If this is useful to you, then I suppose it's something we could support, however I'm pretty worried about our ability to maintain/scale this stuff as it is extended to support additional SoCs and other custom debugging features. Perhaps you could stick it all in arm-smmu-qcom-debug.c and have a new config option for that, so at least it's even further out of the way? Will Sounds good to me, will do that. Thanks, Sai ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCHv2] iommu/arm-smmu-qcom: Add debug support for TLB sync timeouts
On 6/23/2022 11:32 AM, Sai Prakash Ranjan wrote: On 5/26/2022 9:44 AM, Sai Prakash Ranjan wrote: TLB sync timeouts can be due to various reasons such as TBU power down or pending TCU/TBU invalidation/sync and so on. Debugging these often require dumping of some implementation defined registers to know the status of TBU/TCU operations and some of these registers are not accessible in non-secure world such as from kernel and requires SMC calls to read them in the secure world. So, add this debug support to dump implementation defined registers for TLB sync timeout issues. Signed-off-by: Sai Prakash Ranjan --- Changes in v2: * Use scm call consistently so that it works on older chipsets where some of these regs are secure registers. * Add device specific data to get the implementation defined register offsets. --- drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 161 ++--- drivers/iommu/arm/arm-smmu/arm-smmu.c | 2 + drivers/iommu/arm/arm-smmu/arm-smmu.h | 1 + 3 files changed, 146 insertions(+), 18 deletions(-) Any comments on this patch? Gentle Ping !! Thanks, Sai ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCHv2] iommu/arm-smmu-qcom: Add debug support for TLB sync timeouts
On 5/26/2022 9:44 AM, Sai Prakash Ranjan wrote: TLB sync timeouts can be due to various reasons such as TBU power down or pending TCU/TBU invalidation/sync and so on. Debugging these often require dumping of some implementation defined registers to know the status of TBU/TCU operations and some of these registers are not accessible in non-secure world such as from kernel and requires SMC calls to read them in the secure world. So, add this debug support to dump implementation defined registers for TLB sync timeout issues. Signed-off-by: Sai Prakash Ranjan --- Changes in v2: * Use scm call consistently so that it works on older chipsets where some of these regs are secure registers. * Add device specific data to get the implementation defined register offsets. --- drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 161 ++--- drivers/iommu/arm/arm-smmu/arm-smmu.c | 2 + drivers/iommu/arm/arm-smmu/arm-smmu.h | 1 + 3 files changed, 146 insertions(+), 18 deletions(-) Any comments on this patch? Thanks, Sai ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCHv2] iommu/arm-smmu-qcom: Add debug support for TLB sync timeouts
Hi Vincent, On 6/9/2022 2:52 AM, Vincent Knecht wrote: Le jeudi 26 mai 2022 à 09:44 +0530, Sai Prakash Ranjan a écrit : TLB sync timeouts can be due to various reasons such as TBU power down or pending TCU/TBU invalidation/sync and so on. Debugging these often require dumping of some implementation defined registers to know the status of TBU/TCU operations and some of these registers are not accessible in non-secure world such as from kernel and requires SMC calls to read them in the secure world. So, add this debug support to dump implementation defined registers for TLB sync timeout issues. Signed-off-by: Sai Prakash Ranjan --- Changes in v2: * Use scm call consistently so that it works on older chipsets where some of these regs are secure registers. * Add device specific data to get the implementation defined register offsets. --- drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 161 ++--- drivers/iommu/arm/arm-smmu/arm-smmu.c | 2 + drivers/iommu/arm/arm-smmu/arm-smmu.h | 1 + 3 files changed, 146 insertions(+), 18 deletions(-) Hi Sai, and thanks for this patch ! I've encountered TLB sync timeouts with msm8939 SoC recently. What would be needed to add to this patch so this SoC is supported ? Like, where could one check the values to be used in an equivalent of qcom_smmu_impl0_reg_offset values for this SoC (if any change needed) ? Current values are not found by simply greping in downstream/vendor dtsi/dts files... These are implementation defined registers and some might not be present on older SoCs and sometimes they don't add this support in downstream kernels even if the registers are present. I looked up the IP doc for msm8939 and I could find only TBU_PWR_STATUS register and you can use the same offset for it as given in this patch. Thanks, Sai ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCHv2] iommu/arm-smmu-qcom: Add debug support for TLB sync timeouts
TLB sync timeouts can be due to various reasons such as TBU power down or pending TCU/TBU invalidation/sync and so on. Debugging these often require dumping of some implementation defined registers to know the status of TBU/TCU operations and some of these registers are not accessible in non-secure world such as from kernel and requires SMC calls to read them in the secure world. So, add this debug support to dump implementation defined registers for TLB sync timeout issues. Signed-off-by: Sai Prakash Ranjan --- Changes in v2: * Use scm call consistently so that it works on older chipsets where some of these regs are secure registers. * Add device specific data to get the implementation defined register offsets. --- drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 161 ++--- drivers/iommu/arm/arm-smmu/arm-smmu.c | 2 + drivers/iommu/arm/arm-smmu/arm-smmu.h | 1 + 3 files changed, 146 insertions(+), 18 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c index 7820711c4560..bb68aa85b28b 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c @@ -5,13 +5,27 @@ #include #include +#include #include #include #include "arm-smmu.h" +#define QCOM_DUMMY_VAL -1 + +enum qcom_smmu_impl_reg_offset { + QCOM_SMMU_TBU_PWR_STATUS, + QCOM_SMMU_STATS_SYNC_INV_TBU_ACK, + QCOM_SMMU_MMU2QSS_AND_SAFE_WAIT_CNTR, +}; + +struct qcom_smmu_config { + const u32 *reg_offset; +}; + struct qcom_smmu { struct arm_smmu_device smmu; + const struct qcom_smmu_config *cfg; bool bypass_quirk; u8 bypass_cbndx; u32 stall_enabled; @@ -22,6 +36,56 @@ static struct qcom_smmu *to_qcom_smmu(struct arm_smmu_device *smmu) return container_of(smmu, struct qcom_smmu, smmu); } +static void qcom_smmu_tlb_sync(struct arm_smmu_device *smmu, int page, + int sync, int status) +{ + int ret; + unsigned int spin_cnt, delay; + u32 reg, tbu_pwr_status, sync_inv_ack, sync_inv_progress; + struct qcom_smmu *qsmmu = 
to_qcom_smmu(smmu); + const struct qcom_smmu_config *cfg; + + arm_smmu_writel(smmu, page, sync, QCOM_DUMMY_VAL); + for (delay = 1; delay < TLB_LOOP_TIMEOUT; delay *= 2) { + for (spin_cnt = TLB_SPIN_COUNT; spin_cnt > 0; spin_cnt--) { + reg = arm_smmu_readl(smmu, page, status); + if (!(reg & ARM_SMMU_sTLBGSTATUS_GSACTIVE)) + return; + cpu_relax(); + } + udelay(delay); + } + + dev_err_ratelimited(smmu->dev, + "TLB sync timed out -- SMMU may be deadlocked\n"); + + cfg = qsmmu->cfg; + if (!cfg) + return; + + ret = qcom_scm_io_readl(smmu->ioaddr + cfg->reg_offset[QCOM_SMMU_TBU_PWR_STATUS], + &tbu_pwr_status); + if (ret) + dev_err_ratelimited(smmu->dev, + "Failed to read TBU power status: %d\n", ret); + + ret = qcom_scm_io_readl(smmu->ioaddr + cfg->reg_offset[QCOM_SMMU_STATS_SYNC_INV_TBU_ACK], + &sync_inv_ack); + if (ret) + dev_err_ratelimited(smmu->dev, + "Failed to read TBU sync/inv ack status: %d\n", ret); + + ret = qcom_scm_io_readl(smmu->ioaddr + cfg->reg_offset[QCOM_SMMU_MMU2QSS_AND_SAFE_WAIT_CNTR], + &sync_inv_progress); + if (ret) + dev_err_ratelimited(smmu->dev, + "Failed to read TCU syn/inv progress: %d\n", ret); + + dev_err_ratelimited(smmu->dev, + "TBU: power_status %#x sync_inv_ack %#x sync_inv_progress %#x\n", + tbu_pwr_status, sync_inv_ack, sync_inv_progress); +} + static void qcom_adreno_smmu_write_sctlr(struct arm_smmu_device *smmu, int idx, u32 reg) { @@ -374,6 +438,7 @@ static const struct arm_smmu_impl qcom_smmu_impl = { .def_domain_type = qcom_smmu_def_domain_type, .reset = qcom_smmu500_reset, .write_s2cr = qcom_smmu_write_s2cr, + .tlb_sync = qcom_smmu_tlb_sync, }; static const struct arm_smmu_impl qcom_adreno_smmu_impl = { @@ -382,12 +447,84 @@ static const struct arm_smmu_impl qcom_adreno_smmu_impl = { .reset = qcom_smmu500_reset, .alloc_context_bank = qcom_adreno_smmu_alloc_context_bank, .write_sctlr = qcom_adreno_smmu_write_sctlr, + .tlb_sync = qcom_smmu_tlb_sync, +}; + +/* Implementation Defined Register Space 0 register offsets */ +static const 
u32 qcom_smmu_impl0_reg_offset[]
Re: [PATCH] iommu/arm-smmu-qcom: Add debug support for TLB sync timeouts
On 5/23/2022 10:48 PM, Sai Prakash Ranjan wrote: TLB sync timeouts can be due to various reasons such as TBU power down or pending TCU/TBU invalidation/sync and so on. Debugging these often require dumping of some implementation defined registers to know the status of TBU/TCU operations and some of these registers are not accessible in non-secure world such as from kernel and requires SMC calls to read them in the secure world. So, add this debug support to dump implementation defined registers for TLB sync timeout issues. Signed-off-by: Sai Prakash Ranjan --- drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 50 ++ drivers/iommu/arm/arm-smmu/arm-smmu.c | 2 + drivers/iommu/arm/arm-smmu/arm-smmu.h | 4 ++ 3 files changed, 56 insertions(+) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c index 7820711c4560..22e9a0085475 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c @@ -5,11 +5,19 @@ #include #include +#include #include #include #include "arm-smmu.h" +#define QCOM_DUMMY_VAL -1 + +/* Implementation Defined Register Space 0 registers */ +#define QCOM_SMMU_STATS_SYNC_INV_TBU_ACK 0x5dc +#define QCOM_SMMU_TBU_PWR_STATUS 0x2204 +#define QCOM_SMMU_MMU2QSS_AND_SAFE_WAIT_CNTR 0x2670 + struct qcom_smmu { struct arm_smmu_device smmu; bool bypass_quirk; @@ -22,6 +30,46 @@ static struct qcom_smmu *to_qcom_smmu(struct arm_smmu_device *smmu) return container_of(smmu, struct qcom_smmu, smmu); } +static void qcom_smmu_tlb_sync(struct arm_smmu_device *smmu, int page, + int sync, int status) +{ + u32 sync_inv_ack, sync_inv_progress, tbu_pwr_status; + unsigned int spin_cnt, delay; + u32 reg; + int ret; + + arm_smmu_writel(smmu, page, sync, QCOM_DUMMY_VAL); + for (delay = 1; delay < TLB_LOOP_TIMEOUT; delay *= 2) { + for (spin_cnt = TLB_SPIN_COUNT; spin_cnt > 0; spin_cnt--) { + reg = arm_smmu_readl(smmu, page, status); + if (!(reg & ARM_SMMU_sTLBGSTATUS_GSACTIVE)) + return; + 
cpu_relax(); + } + udelay(delay); + } + + sync_inv_ack = arm_smmu_readl(smmu, ARM_SMMU_IMPL_DEF0, + QCOM_SMMU_STATS_SYNC_INV_TBU_ACK); Sorry, this doesn't work always, looks like on earlier chipsets this is a secure register and reading it from non-secure world would probably blow. Also this register can be in other implementation defined space for different chipsets. So I think we can use SCM call here and have a device specific data based on already existing compatible for QCOM SoCs to identify IMP_DEF space used. + ret = qcom_scm_io_readl(smmu->ioaddr + QCOM_SMMU_TBU_PWR_STATUS, + &tbu_pwr_status); + if (ret) + dev_err_ratelimited(smmu->dev, + "Failed to read TBU power status: %d\n", ret); + + ret = qcom_scm_io_readl(smmu->ioaddr + QCOM_SMMU_MMU2QSS_AND_SAFE_WAIT_CNTR, + &sync_inv_progress); + if (ret) + dev_err_ratelimited(smmu->dev, + "Failed to read SAFE WAIT counter: %d\n", ret); + + dev_err_ratelimited(smmu->dev, + "TLB sync timed out -- SMMU may be deadlocked\n" + "TBU: sync_inv_ack %#x power_status %#x sync_inv_progress %#x\n", + sync_inv_ack, tbu_pwr_status, sync_inv_progress); +} + static void qcom_adreno_smmu_write_sctlr(struct arm_smmu_device *smmu, int idx, u32 reg) { @@ -374,6 +422,7 @@ static const struct arm_smmu_impl qcom_smmu_impl = { .def_domain_type = qcom_smmu_def_domain_type, .reset = qcom_smmu500_reset, .write_s2cr = qcom_smmu_write_s2cr, + .tlb_sync = qcom_smmu_tlb_sync, }; static const struct arm_smmu_impl qcom_adreno_smmu_impl = { @@ -382,6 +431,7 @@ static const struct arm_smmu_impl qcom_adreno_smmu_impl = { .reset = qcom_smmu500_reset, .alloc_context_bank = qcom_adreno_smmu_alloc_context_bank, .write_sctlr = qcom_adreno_smmu_write_sctlr, + .tlb_sync = qcom_smmu_tlb_sync, }; static struct arm_smmu_device *qcom_smmu_create(struct arm_smmu_device *smmu, diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index 2ed3594f384e..4c5b51109835 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ 
b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -2099,6 +2099,8 @@ static int arm_smmu_device_probe(struct platform_device *pdev) if (IS_ERR(smmu->base))
[PATCH] iommu/arm-smmu-qcom: Add debug support for TLB sync timeouts
TLB sync timeouts can be due to various reasons such as TBU power down or pending TCU/TBU invalidation/sync and so on. Debugging these often require dumping of some implementation defined registers to know the status of TBU/TCU operations and some of these registers are not accessible in non-secure world such as from kernel and requires SMC calls to read them in the secure world. So, add this debug support to dump implementation defined registers for TLB sync timeout issues. Signed-off-by: Sai Prakash Ranjan --- drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 50 ++ drivers/iommu/arm/arm-smmu/arm-smmu.c | 2 + drivers/iommu/arm/arm-smmu/arm-smmu.h | 4 ++ 3 files changed, 56 insertions(+) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c index 7820711c4560..22e9a0085475 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c @@ -5,11 +5,19 @@ #include #include +#include #include #include #include "arm-smmu.h" +#define QCOM_DUMMY_VAL -1 + +/* Implementation Defined Register Space 0 registers */ +#define QCOM_SMMU_STATS_SYNC_INV_TBU_ACK 0x5dc +#define QCOM_SMMU_TBU_PWR_STATUS 0x2204 +#define QCOM_SMMU_MMU2QSS_AND_SAFE_WAIT_CNTR 0x2670 + struct qcom_smmu { struct arm_smmu_device smmu; bool bypass_quirk; @@ -22,6 +30,46 @@ static struct qcom_smmu *to_qcom_smmu(struct arm_smmu_device *smmu) return container_of(smmu, struct qcom_smmu, smmu); } +static void qcom_smmu_tlb_sync(struct arm_smmu_device *smmu, int page, + int sync, int status) +{ + u32 sync_inv_ack, sync_inv_progress, tbu_pwr_status; + unsigned int spin_cnt, delay; + u32 reg; + int ret; + + arm_smmu_writel(smmu, page, sync, QCOM_DUMMY_VAL); + for (delay = 1; delay < TLB_LOOP_TIMEOUT; delay *= 2) { + for (spin_cnt = TLB_SPIN_COUNT; spin_cnt > 0; spin_cnt--) { + reg = arm_smmu_readl(smmu, page, status); + if (!(reg & ARM_SMMU_sTLBGSTATUS_GSACTIVE)) + return; + cpu_relax(); + } + udelay(delay); + } + + sync_inv_ack = 
arm_smmu_readl(smmu, ARM_SMMU_IMPL_DEF0, + QCOM_SMMU_STATS_SYNC_INV_TBU_ACK); + + ret = qcom_scm_io_readl(smmu->ioaddr + QCOM_SMMU_TBU_PWR_STATUS, + &tbu_pwr_status); + if (ret) + dev_err_ratelimited(smmu->dev, + "Failed to read TBU power status: %d\n", ret); + + ret = qcom_scm_io_readl(smmu->ioaddr + QCOM_SMMU_MMU2QSS_AND_SAFE_WAIT_CNTR, + &sync_inv_progress); + if (ret) + dev_err_ratelimited(smmu->dev, + "Failed to read SAFE WAIT counter: %d\n", ret); + + dev_err_ratelimited(smmu->dev, + "TLB sync timed out -- SMMU may be deadlocked\n" + "TBU: sync_inv_ack %#x power_status %#x sync_inv_progress %#x\n", + sync_inv_ack, tbu_pwr_status, sync_inv_progress); +} + static void qcom_adreno_smmu_write_sctlr(struct arm_smmu_device *smmu, int idx, u32 reg) { @@ -374,6 +422,7 @@ static const struct arm_smmu_impl qcom_smmu_impl = { .def_domain_type = qcom_smmu_def_domain_type, .reset = qcom_smmu500_reset, .write_s2cr = qcom_smmu_write_s2cr, + .tlb_sync = qcom_smmu_tlb_sync, }; static const struct arm_smmu_impl qcom_adreno_smmu_impl = { @@ -382,6 +431,7 @@ static const struct arm_smmu_impl qcom_adreno_smmu_impl = { .reset = qcom_smmu500_reset, .alloc_context_bank = qcom_adreno_smmu_alloc_context_bank, .write_sctlr = qcom_adreno_smmu_write_sctlr, + .tlb_sync = qcom_smmu_tlb_sync, }; static struct arm_smmu_device *qcom_smmu_create(struct arm_smmu_device *smmu, diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index 2ed3594f384e..4c5b51109835 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -2099,6 +2099,8 @@ static int arm_smmu_device_probe(struct platform_device *pdev) if (IS_ERR(smmu->base)) return PTR_ERR(smmu->base); ioaddr = res->start; + smmu->ioaddr = ioaddr; + /* * The resource size should effectively match the value of SMMU_TOP; * stash that temporarily until we know PAGESIZE to validate it with. 
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.h b/drivers/iommu/arm/arm-smmu/arm-smmu.h index 2b9b42fb6f30..8cf6567d970f 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.h +++ b/drivers/iommu/arm/arm-smmu/arm-s
Re: [PATCHv4] iommu/arm-smmu: Optimize ->tlb_flush_walk() for qcom implementation
On 2021-08-11 21:23, Robin Murphy wrote: On 2021-08-11 11:30, Will Deacon wrote: On Wed, Aug 11, 2021 at 11:37:25AM +0530, Sai Prakash Ranjan wrote: diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index f7da8953afbe..3904b598e0f9 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -327,9 +327,16 @@ static void arm_smmu_tlb_inv_range_s2(unsigned long iova, size_t size, static void arm_smmu_tlb_inv_walk_s1(unsigned long iova, size_t size, size_t granule, void *cookie) { - arm_smmu_tlb_inv_range_s1(iova, size, granule, cookie, - ARM_SMMU_CB_S1_TLBIVA); - arm_smmu_tlb_sync_context(cookie); + struct arm_smmu_domain *smmu_domain = cookie; + struct arm_smmu_cfg *cfg = &smmu_domain->cfg; + + if (cfg->flush_walk_prefer_tlbiasid) { + arm_smmu_tlb_inv_context_s1(cookie); Hmm, this introduces an unconditional wmb() if tlbiasid is preferred. I think that should be predicated on ARM_SMMU_FEAT_COHERENT_WALK like it is for the by-VA ops. Worth doing as a separate patch. + } else { + arm_smmu_tlb_inv_range_s1(iova, size, granule, cookie, + ARM_SMMU_CB_S1_TLBIVA); + arm_smmu_tlb_sync_context(cookie); + } } static void arm_smmu_tlb_add_page_s1(struct iommu_iotlb_gather *gather, @@ -765,8 +772,10 @@ static int arm_smmu_init_domain_context(struct iommu_domain *domain, .iommu_dev = smmu->dev, }; - if (!iommu_get_dma_strict(domain)) + if (!iommu_get_dma_strict(domain)) { pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_NON_STRICT; + cfg->flush_walk_prefer_tlbiasid = true; This is going to interact badly with Robin's series to allow dynamic transition to non-strict mode, as we don't have a mechanism to switch over to the by-ASID behaviour. Yes, it should _work_, but it's ugly having different TLBI behaviour just because of the how the domain became non-strict. Robin -- I think this originated from your idea at [1]. 
Any idea how to make it work with your other series, or shall we drop this part for now and leave the TLB invalidation behaviour the same for now? Yeah, I'd say drop it - I'm currently half an hour into a first attempt at removing io_pgtable_tlb_flush_walk() entirely, which would make it moot for non-strict anyway. I have dropped it and sent a v5. Thanks, Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCHv5] iommu/arm-smmu: Optimize ->tlb_flush_walk() for qcom implementation
Currently for iommu_unmap() of large scatter-gather list with page size elements, the majority of time is spent in flushing of partial walks in __arm_lpae_unmap() which is a VA based TLB invalidation invalidating page-by-page on iommus like arm-smmu-v2 (TLBIVA). For example: to unmap a 32MB scatter-gather list with page size elements (8192 entries), there are 16->2MB buffer unmaps based on the pgsize (2MB for 4K granule) and each of 2MB will further result in 512 TLBIVAs (2MB/4K) resulting in a total of 8192 TLBIVAs (512*16) for 16->2MB causing a huge overhead. On qcom implementation, there are several performance improvements for TLB cache invalidations in HW like wait-for-safe (for realtime clients such as camera and display) and few others to allow for cache lookups/updates when TLBI is in progress for the same context bank. So the cost of over-invalidation is less compared to the unmap latency on several usecases like camera which deals with large buffers. So, ASID based TLB invalidations (TLBIASID) can be used to invalidate the entire context for partial walk flush thereby improving the unmap latency. For this example of 32MB scatter-gather list unmap, this change results in just 16 ASID based TLB invalidations (TLBIASIDs) as opposed to 8192 TLBIVAs thereby increasing the performance of unmaps drastically. 
Test on QTI SM8150 SoC for 10 iterations of iommu_{map_sg}/unmap: (average over 10 iterations) Before this optimization: sizeiommu_map_sg iommu_unmap 4K2.067 us 1.854 us 64K9.598 us 8.802 us 1M 148.890 us 130.718 us 2M 305.864 us67.291 us 12M 1793.604 us 390.838 us 16M 2386.848 us 518.187 us 24M 3563.296 us 775.989 us 32M 4747.171 us 1033.364 us After this optimization: sizeiommu_map_sg iommu_unmap 4K1.723 us 1.765 us 64K9.880 us 8.869 us 1M 155.364 us 135.223 us 2M 303.906 us 5.385 us 12M 1786.557 us21.250 us 16M 2391.890 us27.437 us 24M 3570.895 us39.937 us 32M 4755.234 us51.797 us Real world data also shows big difference in unmap performance as below: There were reports of camera frame drops because of high overhead in iommu unmap without this optimization because of frequent unmaps issued by camera of about 100MB/s taking more than 100ms thereby causing frame drops. Signed-off-by: Sai Prakash Ranjan --- Changes in v5: * Drop non-strict mode change as it will conflict with Robin's series Changes in v4: * Use a flag in struct arm_smmu_cfg to prefer TLBIASID (Will) Changes in v3: * Move the logic to arm-smmu driver from io-pgtable (Robin) * Use a new set of iommu_flush_ops->arm_smmu_s1_tlb_impl_ops and use it for qcom impl Changes in v2: * Add a quirk to choose tlb_flush_all in partial walk flush * Set the quirk for QTI SoC implementation --- drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 11 +++ drivers/iommu/arm/arm-smmu/arm-smmu.c | 13 ++--- drivers/iommu/arm/arm-smmu/arm-smmu.h | 1 + 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c index 9b9d13ec5a88..55690af1b25d 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c @@ -193,6 +193,8 @@ static int qcom_adreno_smmu_init_context(struct arm_smmu_domain *smmu_domain, { struct adreno_smmu_priv *priv; + smmu_domain->cfg.flush_walk_prefer_tlbiasid = true; + /* Only 
enable split pagetables for the GPU device (SID 0) */ if (!qcom_adreno_smmu_is_gpu_device(dev)) return 0; @@ -235,6 +237,14 @@ static const struct of_device_id qcom_smmu_client_of_match[] __maybe_unused = { { } }; +static int qcom_smmu_init_context(struct arm_smmu_domain *smmu_domain, + struct io_pgtable_cfg *pgtbl_cfg, struct device *dev) +{ + smmu_domain->cfg.flush_walk_prefer_tlbiasid = true; + + return 0; +} + static int qcom_smmu_cfg_probe(struct arm_smmu_device *smmu) { unsigned int last_s2cr = ARM_SMMU_GR0_S2CR(smmu->num_mapping_groups - 1); @@ -358,6 +368,7 @@ static int qcom_smmu500_reset(struct arm_smmu_device *smmu) } static const struct arm_smmu_impl qcom_smmu_impl = { + .init_context = qcom_smmu_init_context, .cfg_probe = qcom_smmu_cfg_probe, .def_domain_type = qcom_smmu_def_domain_type, .reset = qcom_smmu500_reset, diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index f7da8953afbe..67b660b0551d 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -327,9
Re: [PATCHv4] iommu/arm-smmu: Optimize ->tlb_flush_walk() for qcom implementation
On 2021-08-11 16:00, Will Deacon wrote: On Wed, Aug 11, 2021 at 11:37:25AM +0530, Sai Prakash Ranjan wrote: diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index f7da8953afbe..3904b598e0f9 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -327,9 +327,16 @@ static void arm_smmu_tlb_inv_range_s2(unsigned long iova, size_t size, static void arm_smmu_tlb_inv_walk_s1(unsigned long iova, size_t size, size_t granule, void *cookie) { - arm_smmu_tlb_inv_range_s1(iova, size, granule, cookie, - ARM_SMMU_CB_S1_TLBIVA); - arm_smmu_tlb_sync_context(cookie); + struct arm_smmu_domain *smmu_domain = cookie; + struct arm_smmu_cfg *cfg = &smmu_domain->cfg; + + if (cfg->flush_walk_prefer_tlbiasid) { + arm_smmu_tlb_inv_context_s1(cookie); Hmm, this introduces an unconditional wmb() if tlbiasid is preferred. I think that should be predicated on ARM_SMMU_FEAT_COHERENT_WALK like it is for the by-VA ops. Worth doing as a separate patch. Ok I will keep this as-is for now then. + } else { + arm_smmu_tlb_inv_range_s1(iova, size, granule, cookie, + ARM_SMMU_CB_S1_TLBIVA); + arm_smmu_tlb_sync_context(cookie); + } } static void arm_smmu_tlb_add_page_s1(struct iommu_iotlb_gather *gather, @@ -765,8 +772,10 @@ static int arm_smmu_init_domain_context(struct iommu_domain *domain, .iommu_dev = smmu->dev, }; - if (!iommu_get_dma_strict(domain)) + if (!iommu_get_dma_strict(domain)) { pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_NON_STRICT; + cfg->flush_walk_prefer_tlbiasid = true; This is going to interact badly with Robin's series to allow dynamic transition to non-strict mode, as we don't have a mechanism to switch over to the by-ASID behaviour. Yes, it should _work_, but it's ugly having different TLBI behaviour just because of the how the domain became non-strict. Robin -- I think this originated from your idea at [1]. 
Any idea how to make it work with your other series, or shall we drop this part for now and leave the TLB invalidation behaviour the same for now? Will [1] https://lore.kernel.org/r/da62ff1c-9b49-34d3-69a1-1a674e4a3...@arm.com Right, I think we can drop this non-strict change for now because it also makes it a pain to backport it to 5.4/5.10 kernels because of large number of changes in dma apis in recent kernels. I will let you and Robin decide if it's ok to drop this change and introduce it later with a different patch. Thanks, Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCHv3] iommu/arm-smmu: Optimize ->tlb_flush_walk() for qcom implementation
On 2021-08-10 23:38, Will Deacon wrote: On Tue, Aug 03, 2021 at 11:09:17AM +0530, Sai Prakash Ranjan wrote: On 2021-08-02 21:13, Will Deacon wrote: > On Wed, Jun 23, 2021 at 07:12:01PM +0530, Sai Prakash Ranjan wrote: > > diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c > > b/drivers/iommu/arm/arm-smmu/arm-smmu.c > > index d3c6f54110a5..f3845e822565 100644 > > --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c > > +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c > > @@ -341,6 +341,12 @@ static void arm_smmu_tlb_add_page_s1(struct > > iommu_iotlb_gather *gather, > > ARM_SMMU_CB_S1_TLBIVAL); > > } > > > > +static void arm_smmu_tlb_inv_walk_impl_s1(unsigned long iova, > > size_t size, > > + size_t granule, void *cookie) > > +{ > > + arm_smmu_tlb_inv_context_s1(cookie); > > +} > > + > > static void arm_smmu_tlb_inv_walk_s2(unsigned long iova, size_t size, > >size_t granule, void *cookie) > > { > > @@ -388,6 +394,12 @@ static const struct iommu_flush_ops > > arm_smmu_s1_tlb_ops = { > > .tlb_add_page = arm_smmu_tlb_add_page_s1, > > }; > > > > +const struct iommu_flush_ops arm_smmu_s1_tlb_impl_ops = { > > + .tlb_flush_all = arm_smmu_tlb_inv_context_s1, > > + .tlb_flush_walk = arm_smmu_tlb_inv_walk_impl_s1, > > + .tlb_add_page = arm_smmu_tlb_add_page_s1, > > +}; > > Hmm, dunno about this. Wouldn't it be a lot cleaner if the > tlb_flush_walk > callbacks just did the right thing based on the smmu_domain (maybe in > the > arm_smmu_cfg?) rather than having an entirely new set of ops just > because > they're const and you can't overide the bit you want? > > I don't think there's really an awful lot qcom-specific about the > principle > here -- there's a trade-off between over-invalidation and invalidation > latency. That happens on the CPU as well. > Sorry didn't understand, based on smmu_domain what? How do we make this implementation specific? Do you mean something like a quirk? 
The reason we didn't make this common was because nvidia folks weren't so happy with that, you can find the discussion in this thread [1]. [1] https://lore.kernel.org/lkml/20210609145315.25750-1-saiprakash.ran...@codeaurora.org/ The ->tlb_flush_walk() callbacks take a 'void *cookie' which, for this driver, is a 'struct arm_smmu_domain *'. From that, you can get to the 'struct arm_smmu_cfg' which could have something as coarse as: boolflush_walk_prefer_tlbiasid; which you can set when you initialise the domain (maybe in the ->init_context callback?). It shouldn't affect anybody else. Ah ok, you meant a new flag in arm_smmu_cfg, right getting it from cookie is no big deal but nonetheless thanks for detailing it. I have made the changes and sent a v4 after testing. Thanks, Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCHv4] iommu/arm-smmu: Optimize ->tlb_flush_walk() for qcom implementation
Currently for iommu_unmap() of large scatter-gather list with page size elements, the majority of time is spent in flushing of partial walks in __arm_lpae_unmap() which is a VA based TLB invalidation invalidating page-by-page on iommus like arm-smmu-v2 (TLBIVA). For example: to unmap a 32MB scatter-gather list with page size elements (8192 entries), there are 16 2MB buffer unmaps based on the pgsize (2MB for 4K granule) and each of 2MB will further result in 512 TLBIVAs (2MB/4K) resulting in a total of 8192 TLBIVAs (512*16) for 16 2MB causing a huge overhead. On qcom implementation, there are several performance improvements for TLB cache invalidations in HW like wait-for-safe (for realtime clients such as camera and display) and a few others to allow for cache lookups/updates when TLBI is in progress for the same context bank. So the cost of over-invalidation is less compared to the unmap latency on several usecases like camera which deals with large buffers. So, ASID based TLB invalidations (TLBIASID) can be used to invalidate the entire context for partial walk flush thereby improving the unmap latency. Non-strict mode can use this by default for all platforms given it's all about over-invalidation saving time on individual unmaps and non-deterministic generally. For this example of 32MB scatter-gather list unmap, this change results in just 16 ASID based TLB invalidations (TLBIASIDs) as opposed to 8192 TLBIVAs thereby increasing the performance of unmaps drastically. 
Test on QTI SM8150 SoC for 10 iterations of iommu_{map_sg}/unmap: (average over 10 iterations) Before this optimization: sizeiommu_map_sg iommu_unmap 4K2.067 us 1.854 us 64K9.598 us 8.802 us 1M 148.890 us 130.718 us 2M 305.864 us67.291 us 12M 1793.604 us 390.838 us 16M 2386.848 us 518.187 us 24M 3563.296 us 775.989 us 32M 4747.171 us 1033.364 us After this optimization: sizeiommu_map_sg iommu_unmap 4K1.723 us 1.765 us 64K9.880 us 8.869 us 1M 155.364 us 135.223 us 2M 303.906 us 5.385 us 12M 1786.557 us21.250 us 16M 2391.890 us27.437 us 24M 3570.895 us39.937 us 32M 4755.234 us51.797 us Real world data also shows big difference in unmap performance as below: There were reports of camera frame drops because of high overhead in iommu unmap without this optimization because of frequent unmaps issued by camera of about 100MB/s taking more than 100ms thereby causing frame drops. Signed-off-by: Sai Prakash Ranjan --- Changes in v4: * Use a flag in struct arm_smmu_cfg to prefer TLBIASID (Will) Changes in v3: * Move the logic to arm-smmu driver from io-pgtable (Robin) * Use a new set of iommu_flush_ops->arm_smmu_s1_tlb_impl_ops and use it for qcom impl Changes in v2: * Add a quirk to choose tlb_flush_all in partial walk flush * Set the quirk for QTI SoC implementation --- drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 11 +++ drivers/iommu/arm/arm-smmu/arm-smmu.c | 17 + drivers/iommu/arm/arm-smmu/arm-smmu.h | 1 + 3 files changed, 25 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c index 9b9d13ec5a88..55690af1b25d 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c @@ -193,6 +193,8 @@ static int qcom_adreno_smmu_init_context(struct arm_smmu_domain *smmu_domain, { struct adreno_smmu_priv *priv; + smmu_domain->cfg.flush_walk_prefer_tlbiasid = true; + /* Only enable split pagetables for the GPU device (SID 0) */ if 
(!qcom_adreno_smmu_is_gpu_device(dev)) return 0; @@ -235,6 +237,14 @@ static const struct of_device_id qcom_smmu_client_of_match[] __maybe_unused = { { } }; +static int qcom_smmu_init_context(struct arm_smmu_domain *smmu_domain, + struct io_pgtable_cfg *pgtbl_cfg, struct device *dev) +{ + smmu_domain->cfg.flush_walk_prefer_tlbiasid = true; + + return 0; +} + static int qcom_smmu_cfg_probe(struct arm_smmu_device *smmu) { unsigned int last_s2cr = ARM_SMMU_GR0_S2CR(smmu->num_mapping_groups - 1); @@ -358,6 +368,7 @@ static int qcom_smmu500_reset(struct arm_smmu_device *smmu) } static const struct arm_smmu_impl qcom_smmu_impl = { + .init_context = qcom_smmu_init_context, .cfg_probe = qcom_smmu_cfg_probe, .def_domain_type = qcom_smmu_def_domain_type, .reset = qcom_smmu500_reset, diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index f7da8953afbe..3904b598e0f9 100644 --- a/drivers/iommu/arm/a
Re: [Freedreno] [PATCH 0/3] iommu/drm/msm: Allow non-coherent masters to use system cache
On 2021-08-10 14:46, Will Deacon wrote: On Mon, Aug 09, 2021 at 11:17:40PM +0530, Sai Prakash Ranjan wrote: On 2021-08-09 23:10, Will Deacon wrote: > On Mon, Aug 09, 2021 at 10:18:21AM -0700, Rob Clark wrote: > > On Mon, Aug 9, 2021 at 10:05 AM Will Deacon wrote: > > > On Mon, Aug 09, 2021 at 09:57:08AM -0700, Rob Clark wrote: > > > > But I suppose we could call it instead IOMMU_QCOM_LLC or something > > > > like that to make it more clear that it is not necessarily something > > > > that would work with a different outer level cache implementation? > > > > > > ... or we could just deal with the problem so that other people can reuse > > > the code. I haven't really understood the reluctance to solve this properly. > > > > > > Am I missing some reason this isn't solvable? > > > > Oh, was there another way to solve it (other than foregoing setting > > INC_OCACHE in the pgtables)? Maybe I misunderstood, is there a > > corresponding setting on the MMU pgtables side of things? > > Right -- we just need to program the CPU's MMU with the matching memory > attributes! It's a bit more fiddly if you're just using ioremap_wc() > though, as it's usually the DMA API which handles the attributes under > the > hood. > > Anyway, sorry, I should've said that explicitly earlier on. We've done > this > sort of thing in the Android tree so I assumed Sai knew what needed to > be > done and then I didn't think to explain to you :( > Right I was aware of that but even in the android tree there is no user :) I'm assuming there are vendor modules using it there, otherwise we wouldn't have been asked to put it in. Since you work at Qualcomm, maybe you could talk to your colleagues (Isaac and Patrick) directly? Right I will check with them regarding the vendor modules in android. I think we can't have a new memory type without any user right in upstream like android tree? Correct. But I don't think we should be adding IOMMU_* anything upstream if we don't have a user. 
Agreed, once we have the fix for GPU crash I can continue further on using this properly. Thanks, Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH] iommu/arm-smmu: Add clk_bulk_{prepare/unprepare} to system pm callbacks
On 2021-08-03 11:36, Sai Prakash Ranjan wrote: On 2021-08-02 21:42, Will Deacon wrote: On Tue, Jul 27, 2021 at 03:03:22PM +0530, Sai Prakash Ranjan wrote: Some clocks for SMMU can have parent as XO such as gpu_cc_hub_cx_int_clk of GPU SMMU in QTI SC7280 SoC and in order to enter deep sleep states in such cases, we would need to drop the XO clock vote in unprepare call and this unprepare callback for XO is in RPMh (Resource Power Manager-Hardened) clock driver which controls RPMh managed clock resources for new QTI SoCs and is a blocking call. Given we cannot have a sleeping calls such as clk_bulk_prepare() and clk_bulk_unprepare() in arm-smmu runtime pm callbacks since the iommu operations like map and unmap can be in atomic context and are in fast path, add this prepare and unprepare call to drop the XO vote only for system pm callbacks since it is not a fast path and we expect the system to enter deep sleep states with system pm as opposed to runtime pm. This is a similar sequence of clock requests (prepare,enable and disable,unprepare) in arm-smmu probe and remove. Signed-off-by: Sai Prakash Ranjan Co-developed-by: Rajendra Nayak Signed-off-by: Rajendra Nayak --- drivers/iommu/arm/arm-smmu/arm-smmu.c | 20 ++-- 1 file changed, 18 insertions(+), 2 deletions(-) [+Rob] How does this work with that funny GPU which writes to the SMMU registers directly? Does the SMMU need to remain independently clocked for that to work or is it all in the same clock domain? As Rob mentioned, device link should take care of all the dependencies between SMMU and its consumers. But not sure how the question relates to this patch as this change is for system pm and not runtime pm, so it is exactly the sequence of SMMU probe/remove which if works currently for that GPU SMMU, then it should work just fine for system suspend and resume as well. 
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index d3c6f54110a5..9561ba4c5d39 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -2277,6 +2277,13 @@ static int __maybe_unused arm_smmu_runtime_suspend(struct device *dev) static int __maybe_unused arm_smmu_pm_resume(struct device *dev) { + int ret; + struct arm_smmu_device *smmu = dev_get_drvdata(dev); + + ret = clk_bulk_prepare(smmu->num_clks, smmu->clks); + if (ret) + return ret; + if (pm_runtime_suspended(dev)) return 0; If we subsequently fail to enable the clks in arm_smmu_runtime_resume() should we unprepare them again? If we are unable to turn on the clks then its fatal and we will not live for long. Nonetheless, it won't hurt to unprepare if clk enable fails as that is the correct thing anyway, so I have added it and sent a v2. Thanks, Sai Will @@ -2285,10 +2292,19 @@ static int __maybe_unused arm_smmu_pm_resume(struct device *dev) static int __maybe_unused arm_smmu_pm_suspend(struct device *dev) { + int ret = 0; + struct arm_smmu_device *smmu = dev_get_drvdata(dev); + if (pm_runtime_suspended(dev)) - return 0; + goto clk_unprepare; - return arm_smmu_runtime_suspend(dev); + ret = arm_smmu_runtime_suspend(dev); + if (ret) + return ret; + +clk_unprepare: + clk_bulk_unprepare(smmu->num_clks, smmu->clks); + return ret; } static const struct dev_pm_ops arm_smmu_pm_ops = { -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCHv2] iommu/arm-smmu: Add clk_bulk_{prepare/unprepare} to system pm callbacks
Some clocks for SMMU can have parent as XO such as gpu_cc_hub_cx_int_clk of GPU SMMU in QTI SC7280 SoC and in order to enter deep sleep states in such cases, we would need to drop the XO clock vote in unprepare call and this unprepare callback for XO is in RPMh (Resource Power Manager-Hardened) clock driver which controls RPMh managed clock resources for new QTI SoCs. Given we cannot have a sleeping calls such as clk_bulk_prepare() and clk_bulk_unprepare() in arm-smmu runtime pm callbacks since the iommu operations like map and unmap can be in atomic context and are in fast path, add this prepare and unprepare call to drop the XO vote only for system pm callbacks since it is not a fast path and we expect the system to enter deep sleep states with system pm as opposed to runtime pm. This is a similar sequence of clock requests (prepare,enable and disable,unprepare) in arm-smmu probe and remove. Signed-off-by: Sai Prakash Ranjan Co-developed-by: Rajendra Nayak Signed-off-by: Rajendra Nayak --- Changes in v2: * Add clk unprepare when clk enable fails in resume (Will) --- drivers/iommu/arm/arm-smmu/arm-smmu.c | 26 +++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index d3c6f54110a5..da8ef9d82d79 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -2277,18 +2277,38 @@ static int __maybe_unused arm_smmu_runtime_suspend(struct device *dev) static int __maybe_unused arm_smmu_pm_resume(struct device *dev) { + int ret; + struct arm_smmu_device *smmu = dev_get_drvdata(dev); + + ret = clk_bulk_prepare(smmu->num_clks, smmu->clks); + if (ret) + return ret; + if (pm_runtime_suspended(dev)) return 0; - return arm_smmu_runtime_resume(dev); + ret = arm_smmu_runtime_resume(dev); + if (ret) + clk_bulk_unprepare(smmu->num_clks, smmu->clks); + + return ret; } static int __maybe_unused arm_smmu_pm_suspend(struct device *dev) { + int ret = 0; + 
struct arm_smmu_device *smmu = dev_get_drvdata(dev); + if (pm_runtime_suspended(dev)) - return 0; + goto clk_unprepare; - return arm_smmu_runtime_suspend(dev); + ret = arm_smmu_runtime_suspend(dev); + if (ret) + return ret; + +clk_unprepare: + clk_bulk_unprepare(smmu->num_clks, smmu->clks); + return ret; } static const struct dev_pm_ops arm_smmu_pm_ops = { -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [Freedreno] [PATCH 0/3] iommu/drm/msm: Allow non-coherent masters to use system cache
On 2021-08-10 00:00, Rob Clark wrote: On Mon, Aug 9, 2021 at 11:11 AM Sai Prakash Ranjan wrote: On 2021-08-09 23:37, Rob Clark wrote: > On Mon, Aug 9, 2021 at 10:47 AM Sai Prakash Ranjan > wrote: >> >> On 2021-08-09 23:10, Will Deacon wrote: >> > On Mon, Aug 09, 2021 at 10:18:21AM -0700, Rob Clark wrote: >> >> On Mon, Aug 9, 2021 at 10:05 AM Will Deacon wrote: >> >> > >> >> > On Mon, Aug 09, 2021 at 09:57:08AM -0700, Rob Clark wrote: >> >> > > On Mon, Aug 9, 2021 at 7:56 AM Will Deacon wrote: >> >> > > > On Mon, Aug 02, 2021 at 06:36:04PM -0700, Rob Clark wrote: >> >> > > > > On Mon, Aug 2, 2021 at 8:14 AM Will Deacon wrote: >> >> > > > > > On Mon, Aug 02, 2021 at 08:08:07AM -0700, Rob Clark wrote: >> >> > > > > > > On Mon, Aug 2, 2021 at 3:55 AM Will Deacon wrote: >> >> > > > > > > > On Thu, Jul 29, 2021 at 10:08:22AM +0530, Sai Prakash Ranjan wrote: >> >> > > > > > > > > On 2021-07-28 19:30, Georgi Djakov wrote: >> >> > > > > > > > > > On Mon, Jan 11, 2021 at 07:45:02PM +0530, Sai Prakash Ranjan wrote: >> >> > > > > > > > > > > commit ecd7274fb4cd ("iommu: Remove unused IOMMU_SYS_CACHE_ONLY flag") >> >> > > > > > > > > > > removed unused IOMMU_SYS_CACHE_ONLY prot flag and along with it went >> >> > > > > > > > > > > the memory type setting required for the non-coherent masters to use >> >> > > > > > > > > > > system cache. Now that system cache support for GPU is added, we will >> >> > > > > > > > > > > need to set the right PTE attribute for GPU buffers to be sys cached. >> >> > > > > > > > > > > Without this, the system cache lines are not allocated for GPU. >> >> > > > > > > > > > > >> >> > > > > > > > > > > So the patches in this series introduces a new prot flag IOMMU_LLC, >> >> > > > > > > > > > > renames IO_PGTABLE_QUIRK_ARM_OUTER_WBWA to IO_PGTABLE_QUIRK_PTW_LLC >> >> > > > > > > > > > > and makes GPU the user of this protection flag. >> >> > > > > > > > > > >> >> > > > > > > > > > Thank you for the patchset! 
Are you planning to refresh it, as it does >> >> > > > > > > > > > not apply anymore? >> >> > > > > > > > > > >> >> > > > > > > > > >> >> > > > > > > > > I was waiting on Will's reply [1]. If there are no changes needed, then >> >> > > > > > > > > I can repost the patch. >> >> > > > > > > > >> >> > > > > > > > I still think you need to handle the mismatched alias, no? You're adding >> >> > > > > > > > a new memory type to the SMMU which doesn't exist on the CPU side. That >> >> > > > > > > > can't be right. >> >> > > > > > > > >> >> > > > > > > >> >> > > > > > > Just curious, and maybe this is a dumb question, but what is your >> >> > > > > > > concern about mismatched aliases? I mean the cache hierarchy on the >> >> > > > > > > GPU device side (anything beyond the LLC) is pretty different and >> >> > > > > > > doesn't really care about the smmu pgtable attributes.. >> >> > > > > > >> >> > > > > > If the CPU accesses a shared buffer with different attributes to those which >> >> > > > > > the device is using then you fall into the "mismatched memory attributes" >> >> > > > > > part of the Arm architecture. It's reasonably unforgiving (you should go and >> >> > > > > > read it) and in some cases can apply to speculative accesses as well, but >> >> > > > > > the end result is typically loss of coherency. >> >> > > > > >> >> > > > > Ok, I might have a few other sections to read first to decipher the >> >> > >
Re: [Freedreno] [PATCH 0/3] iommu/drm/msm: Allow non-coherent masters to use system cache
On 2021-08-09 23:37, Rob Clark wrote: On Mon, Aug 9, 2021 at 10:47 AM Sai Prakash Ranjan wrote: On 2021-08-09 23:10, Will Deacon wrote: > On Mon, Aug 09, 2021 at 10:18:21AM -0700, Rob Clark wrote: >> On Mon, Aug 9, 2021 at 10:05 AM Will Deacon wrote: >> > >> > On Mon, Aug 09, 2021 at 09:57:08AM -0700, Rob Clark wrote: >> > > On Mon, Aug 9, 2021 at 7:56 AM Will Deacon wrote: >> > > > On Mon, Aug 02, 2021 at 06:36:04PM -0700, Rob Clark wrote: >> > > > > On Mon, Aug 2, 2021 at 8:14 AM Will Deacon wrote: >> > > > > > On Mon, Aug 02, 2021 at 08:08:07AM -0700, Rob Clark wrote: >> > > > > > > On Mon, Aug 2, 2021 at 3:55 AM Will Deacon wrote: >> > > > > > > > On Thu, Jul 29, 2021 at 10:08:22AM +0530, Sai Prakash Ranjan wrote: >> > > > > > > > > On 2021-07-28 19:30, Georgi Djakov wrote: >> > > > > > > > > > On Mon, Jan 11, 2021 at 07:45:02PM +0530, Sai Prakash Ranjan wrote: >> > > > > > > > > > > commit ecd7274fb4cd ("iommu: Remove unused IOMMU_SYS_CACHE_ONLY flag") >> > > > > > > > > > > removed unused IOMMU_SYS_CACHE_ONLY prot flag and along with it went >> > > > > > > > > > > the memory type setting required for the non-coherent masters to use >> > > > > > > > > > > system cache. Now that system cache support for GPU is added, we will >> > > > > > > > > > > need to set the right PTE attribute for GPU buffers to be sys cached. >> > > > > > > > > > > Without this, the system cache lines are not allocated for GPU. >> > > > > > > > > > > >> > > > > > > > > > > So the patches in this series introduces a new prot flag IOMMU_LLC, >> > > > > > > > > > > renames IO_PGTABLE_QUIRK_ARM_OUTER_WBWA to IO_PGTABLE_QUIRK_PTW_LLC >> > > > > > > > > > > and makes GPU the user of this protection flag. >> > > > > > > > > > >> > > > > > > > > > Thank you for the patchset! Are you planning to refresh it, as it does >> > > > > > > > > > not apply anymore? >> > > > > > > > > > >> > > > > > > > > >> > > > > > > > > I was waiting on Will's reply [1]. 
If there are no changes needed, then >> > > > > > > > > I can repost the patch. >> > > > > > > > >> > > > > > > > I still think you need to handle the mismatched alias, no? You're adding >> > > > > > > > a new memory type to the SMMU which doesn't exist on the CPU side. That >> > > > > > > > can't be right. >> > > > > > > > >> > > > > > > >> > > > > > > Just curious, and maybe this is a dumb question, but what is your >> > > > > > > concern about mismatched aliases? I mean the cache hierarchy on the >> > > > > > > GPU device side (anything beyond the LLC) is pretty different and >> > > > > > > doesn't really care about the smmu pgtable attributes.. >> > > > > > >> > > > > > If the CPU accesses a shared buffer with different attributes to those which >> > > > > > the device is using then you fall into the "mismatched memory attributes" >> > > > > > part of the Arm architecture. It's reasonably unforgiving (you should go and >> > > > > > read it) and in some cases can apply to speculative accesses as well, but >> > > > > > the end result is typically loss of coherency. >> > > > > >> > > > > Ok, I might have a few other sections to read first to decipher the >> > > > > terminology.. >> > > > > >> > > > > But my understanding of LLC is that it looks just like system memory >> > > > > to the CPU and GPU (I think that would make it "the point of >> > > > > coherence" between the GPU and CPU?) If that is true, shouldn't it be >> > > > > invisible from the point of view of different CPU mapping options? >> > > > >> > > > You could certainly build a system where mismatched attributes do
Re: [Freedreno] [PATCH 0/3] iommu/drm/msm: Allow non-coherent masters to use system cache
On 2021-08-09 23:10, Will Deacon wrote: On Mon, Aug 09, 2021 at 10:18:21AM -0700, Rob Clark wrote: On Mon, Aug 9, 2021 at 10:05 AM Will Deacon wrote: > > On Mon, Aug 09, 2021 at 09:57:08AM -0700, Rob Clark wrote: > > On Mon, Aug 9, 2021 at 7:56 AM Will Deacon wrote: > > > On Mon, Aug 02, 2021 at 06:36:04PM -0700, Rob Clark wrote: > > > > On Mon, Aug 2, 2021 at 8:14 AM Will Deacon wrote: > > > > > On Mon, Aug 02, 2021 at 08:08:07AM -0700, Rob Clark wrote: > > > > > > On Mon, Aug 2, 2021 at 3:55 AM Will Deacon wrote: > > > > > > > On Thu, Jul 29, 2021 at 10:08:22AM +0530, Sai Prakash Ranjan wrote: > > > > > > > > On 2021-07-28 19:30, Georgi Djakov wrote: > > > > > > > > > On Mon, Jan 11, 2021 at 07:45:02PM +0530, Sai Prakash Ranjan wrote: > > > > > > > > > > commit ecd7274fb4cd ("iommu: Remove unused IOMMU_SYS_CACHE_ONLY flag") > > > > > > > > > > removed unused IOMMU_SYS_CACHE_ONLY prot flag and along with it went > > > > > > > > > > the memory type setting required for the non-coherent masters to use > > > > > > > > > > system cache. Now that system cache support for GPU is added, we will > > > > > > > > > > need to set the right PTE attribute for GPU buffers to be sys cached. > > > > > > > > > > Without this, the system cache lines are not allocated for GPU. > > > > > > > > > > > > > > > > > > > > So the patches in this series introduces a new prot flag IOMMU_LLC, > > > > > > > > > > renames IO_PGTABLE_QUIRK_ARM_OUTER_WBWA to IO_PGTABLE_QUIRK_PTW_LLC > > > > > > > > > > and makes GPU the user of this protection flag. > > > > > > > > > > > > > > > > > > Thank you for the patchset! Are you planning to refresh it, as it does > > > > > > > > > not apply anymore? > > > > > > > > > > > > > > > > > > > > > > > > > I was waiting on Will's reply [1]. If there are no changes needed, then > > > > > > > > I can repost the patch. > > > > > > > > > > > > > > I still think you need to handle the mismatched alias, no? 
You're adding > > > > > > > a new memory type to the SMMU which doesn't exist on the CPU side. That > > > > > > > can't be right. > > > > > > > > > > > > > > > > > > > Just curious, and maybe this is a dumb question, but what is your > > > > > > concern about mismatched aliases? I mean the cache hierarchy on the > > > > > > GPU device side (anything beyond the LLC) is pretty different and > > > > > > doesn't really care about the smmu pgtable attributes.. > > > > > > > > > > If the CPU accesses a shared buffer with different attributes to those which > > > > > the device is using then you fall into the "mismatched memory attributes" > > > > > part of the Arm architecture. It's reasonably unforgiving (you should go and > > > > > read it) and in some cases can apply to speculative accesses as well, but > > > > > the end result is typically loss of coherency. > > > > > > > > Ok, I might have a few other sections to read first to decipher the > > > > terminology.. > > > > > > > > But my understanding of LLC is that it looks just like system memory > > > > to the CPU and GPU (I think that would make it "the point of > > > > coherence" between the GPU and CPU?) If that is true, shouldn't it be > > > > invisible from the point of view of different CPU mapping options? > > > > > > You could certainly build a system where mismatched attributes don't cause > > > loss of coherence, but as it's not guaranteed by the architecture and the > > > changes proposed here affect APIs which are exposed across SoCs, then I > > > don't think it helps much. > > > > > > > Hmm, the description of the new mapping flag is that it applies only > > to transparent outer level cache: > > > > +/* > > + * Non-coherent masters can use this page protection flag to set cacheable > > + * memory attributes for only a transparent outer level of cache, also known a
Re: [PATCH] iommu/arm-smmu: Add clk_bulk_{prepare/unprepare} to system pm callbacks
On 2021-08-02 21:42, Will Deacon wrote: On Tue, Jul 27, 2021 at 03:03:22PM +0530, Sai Prakash Ranjan wrote: Some clocks for SMMU can have parent as XO such as gpu_cc_hub_cx_int_clk of GPU SMMU in QTI SC7280 SoC and in order to enter deep sleep states in such cases, we would need to drop the XO clock vote in unprepare call and this unprepare callback for XO is in RPMh (Resource Power Manager-Hardened) clock driver which controls RPMh managed clock resources for new QTI SoCs and is a blocking call. Given we cannot have a sleeping calls such as clk_bulk_prepare() and clk_bulk_unprepare() in arm-smmu runtime pm callbacks since the iommu operations like map and unmap can be in atomic context and are in fast path, add this prepare and unprepare call to drop the XO vote only for system pm callbacks since it is not a fast path and we expect the system to enter deep sleep states with system pm as opposed to runtime pm. This is a similar sequence of clock requests (prepare,enable and disable,unprepare) in arm-smmu probe and remove. Signed-off-by: Sai Prakash Ranjan Co-developed-by: Rajendra Nayak Signed-off-by: Rajendra Nayak --- drivers/iommu/arm/arm-smmu/arm-smmu.c | 20 ++-- 1 file changed, 18 insertions(+), 2 deletions(-) [+Rob] How does this work with that funny GPU which writes to the SMMU registers directly? Does the SMMU need to remain independently clocked for that to work or is it all in the same clock domain? As Rob mentioned, device link should take care of all the dependencies between SMMU and its consumers. But not sure how the question relates to this patch as this change is for system pm and not runtime pm, so it is exactly the sequence of SMMU probe/remove which if works currently for that GPU SMMU, then it should work just fine for system suspend and resume as well. 
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index d3c6f54110a5..9561ba4c5d39 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -2277,6 +2277,13 @@ static int __maybe_unused arm_smmu_runtime_suspend(struct device *dev) static int __maybe_unused arm_smmu_pm_resume(struct device *dev) { + int ret; + struct arm_smmu_device *smmu = dev_get_drvdata(dev); + + ret = clk_bulk_prepare(smmu->num_clks, smmu->clks); + if (ret) + return ret; + if (pm_runtime_suspended(dev)) return 0; If we subsequently fail to enable the clks in arm_smmu_runtime_resume() should we unprepare them again? If we are unable to turn on the clks then its fatal and we will not live for long. Thanks, Sai Will @@ -2285,10 +2292,19 @@ static int __maybe_unused arm_smmu_pm_resume(struct device *dev) static int __maybe_unused arm_smmu_pm_suspend(struct device *dev) { + int ret = 0; + struct arm_smmu_device *smmu = dev_get_drvdata(dev); + if (pm_runtime_suspended(dev)) - return 0; + goto clk_unprepare; - return arm_smmu_runtime_suspend(dev); + ret = arm_smmu_runtime_suspend(dev); + if (ret) + return ret; + +clk_unprepare: + clk_bulk_unprepare(smmu->num_clks, smmu->clks); + return ret; } static const struct dev_pm_ops arm_smmu_pm_ops = { -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCHv3] iommu/arm-smmu: Optimize ->tlb_flush_walk() for qcom implementation
On 2021-08-02 21:13, Will Deacon wrote: On Wed, Jun 23, 2021 at 07:12:01PM +0530, Sai Prakash Ranjan wrote: Currently for iommu_unmap() of large scatter-gather list with page size elements, the majority of time is spent in flushing of partial walks in __arm_lpae_unmap() which is a VA based TLB invalidation invalidating page-by-page on iommus like arm-smmu-v2 (TLBIVA). For example: to unmap a 32MB scatter-gather list with page size elements (8192 entries), there are 16->2MB buffer unmaps based on the pgsize (2MB for 4K granule) and each of 2MB will further result in 512 TLBIVAs (2MB/4K) resulting in a total of 8192 TLBIVAs (512*16) for 16->2MB causing a huge overhead. On qcom implementation, there are several performance improvements for TLB cache invalidations in HW like wait-for-safe (for realtime clients such as camera and display) and few others to allow for cache lookups/updates when TLBI is in progress for the same context bank. So the cost of over-invalidation is less compared to the unmap latency on several usecases like camera which deals with large buffers. So, ASID based TLB invalidations (TLBIASID) can be used to invalidate the entire context for partial walk flush thereby improving the unmap latency. Non-strict mode can use this by default for all platforms given its all about over-invalidation saving time on individual unmaps and non-deterministic generally. For this example of 32MB scatter-gather list unmap, this change results in just 16 ASID based TLB invalidations (TLBIASIDs) as opposed to 8192 TLBIVAs thereby increasing the performance of unmaps drastically. 
Test on QTI SM8150 SoC for 10 iterations of iommu_{map_sg}/unmap: (average over 10 iterations) Before this optimization: size iommu_map_sg iommu_unmap 4K 2.067 us 1.854 us 64K 9.598 us 8.802 us 1M 148.890 us 130.718 us 2M 305.864 us 67.291 us 12M 1793.604 us 390.838 us 16M 2386.848 us 518.187 us 24M 3563.296 us 775.989 us 32M 4747.171 us 1033.364 us After this optimization: size iommu_map_sg iommu_unmap 4K 1.723 us 1.765 us 64K 9.880 us 8.869 us 1M 155.364 us 135.223 us 2M 303.906 us 5.385 us 12M 1786.557 us 21.250 us 16M 2391.890 us 27.437 us 24M 3570.895 us 39.937 us 32M 4755.234 us 51.797 us This is further reduced once the map/unmap_pages() support gets in which will result in just 1 TLBIASID as compared to 16 TLBIASIDs. Real world data also shows big difference in unmap performance as below: There were reports of camera frame drops because of high overhead in iommu unmap without this optimization because of frequent unmaps issued by camera of about 100MB/s taking more than 100ms thereby causing frame drops. 
Signed-off-by: Sai Prakash Ranjan --- Changes in v3: * Move the logic to arm-smmu driver from io-pgtable (Robin) * Use a new set of iommu_flush_ops->arm_smmu_s1_tlb_impl_ops and use it for qcom impl Changes in v2: * Add a quirk to choose tlb_flush_all in partial walk flush * Set the quirk for QTI SoC implementation --- drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 13 + drivers/iommu/arm/arm-smmu/arm-smmu.c | 17 - 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c index 7771d40176de..218c71465819 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c @@ -10,6 +10,8 @@ #include "arm-smmu.h" +extern const struct iommu_flush_ops arm_smmu_s1_tlb_impl_ops; + struct qcom_smmu { struct arm_smmu_device smmu; bool bypass_quirk; @@ -146,6 +148,8 @@ static int qcom_adreno_smmu_init_context(struct arm_smmu_domain *smmu_domain, { struct adreno_smmu_priv *priv; + pgtbl_cfg->tlb = &arm_smmu_s1_tlb_impl_ops; + /* Only enable split pagetables for the GPU device (SID 0) */ if (!qcom_adreno_smmu_is_gpu_device(dev)) return 0; @@ -185,6 +189,14 @@ static const struct of_device_id qcom_smmu_client_of_match[] __maybe_unused = { { } }; +static int qcom_smmu_init_context(struct arm_smmu_domain *smmu_domain, + struct io_pgtable_cfg *pgtbl_cfg, struct device *dev) +{ + pgtbl_cfg->tlb = &arm_smmu_s1_tlb_impl_ops; + + return 0; +} + static int qcom_smmu_cfg_probe(struct arm_smmu_device *smmu) { unsigned int last_s2cr = ARM_SMMU_GR0_S2CR(smmu->num_mapping_groups - 1); @@ -308,6 +320,7 @@ static int qcom_smmu500_reset(struct arm_smmu_device *smmu) } static const struct arm_smmu_impl qcom_smmu_impl = { + .i
Re: [PATCH 0/3] iommu/drm/msm: Allow non-coherent masters to use system cache
Hi Georgi, On 2021-07-28 19:30, Georgi Djakov wrote: On Mon, Jan 11, 2021 at 07:45:02PM +0530, Sai Prakash Ranjan wrote: commit ecd7274fb4cd ("iommu: Remove unused IOMMU_SYS_CACHE_ONLY flag") removed unused IOMMU_SYS_CACHE_ONLY prot flag and along with it went the memory type setting required for the non-coherent masters to use system cache. Now that system cache support for GPU is added, we will need to set the right PTE attribute for GPU buffers to be sys cached. Without this, the system cache lines are not allocated for GPU. So the patches in this series introduces a new prot flag IOMMU_LLC, renames IO_PGTABLE_QUIRK_ARM_OUTER_WBWA to IO_PGTABLE_QUIRK_PTW_LLC and makes GPU the user of this protection flag. Hi Sai, Thank you for the patchset! Are you planning to refresh it, as it does not apply anymore? I was waiting on Will's reply [1]. If there are no changes needed, then I can repost the patch. [1] https://lore.kernel.org/lkml/21239ba603d0bdc4e4c696588a905...@codeaurora.org/ Thanks, Sai The series slightly depends on following 2 patches posted earlier and is based on msm-next branch: * https://lore.kernel.org/patchwork/patch/1363008/ * https://lore.kernel.org/patchwork/patch/1363010/ Sai Prakash Ranjan (3): iommu/io-pgtable: Rename last-level cache quirk to IO_PGTABLE_QUIRK_PTW_LLC iommu/io-pgtable-arm: Add IOMMU_LLC page protection flag drm/msm: Use IOMMU_LLC page protection flag to map gpu buffers drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 3 +++ drivers/gpu/drm/msm/adreno/adreno_gpu.c | 2 +- drivers/gpu/drm/msm/msm_iommu.c | 3 +++ drivers/gpu/drm/msm/msm_mmu.h | 4 drivers/iommu/io-pgtable-arm.c | 9 ++--- include/linux/io-pgtable.h | 6 +++--- include/linux/iommu.h | 6 ++ 7 files changed, 26 insertions(+), 7 deletions(-) base-commit: 00fd44a1a4700718d5d962432b55c09820f7e709 -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. 
is a member of Code Aurora Forum, hosted by The Linux Foundation -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH] iommu/arm-smmu: Add clk_bulk_{prepare/unprepare} to system pm callbacks
Hi Robin, On 2021-07-27 16:03, Robin Murphy wrote: On 2021-07-27 11:25, Robin Murphy wrote: On 2021-07-27 10:33, Sai Prakash Ranjan wrote: Some clocks for SMMU can have parent as XO such as gpu_cc_hub_cx_int_clk of GPU SMMU in QTI SC7280 SoC and in order to enter deep sleep states in such cases, we would need to drop the XO clock vote in unprepare call and this unprepare callback for XO is in RPMh (Resource Power Manager-Hardened) clock driver which controls RPMh managed clock resources for new QTI SoCs and is a blocking call. Given we cannot have a sleeping calls such as clk_bulk_prepare() and clk_bulk_unprepare() in arm-smmu runtime pm callbacks since the iommu operations like map and unmap can be in atomic context and are in fast path, add this prepare and unprepare call to drop the XO vote only for system pm callbacks since it is not a fast path and we expect the system to enter deep sleep states with system pm as opposed to runtime pm. This is a similar sequence of clock requests (prepare,enable and disable,unprepare) in arm-smmu probe and remove. Nope. We call arm_smmu_rpm_get(), which may resume the device, from atomic contexts. clk_prepare() may sleep. This doesn't work. Urgh, or maybe I skimmed the commit message too lightly *and* managed to totally misread the patch, sorry :( I'll wake up some more and try again later... No worries, we took our time looking through that many times before posting this :) Thanks, Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH] iommu/arm-smmu: Add clk_bulk_{prepare/unprepare} to system pm callbacks
Some clocks for SMMU can have parent as XO such as gpu_cc_hub_cx_int_clk of GPU SMMU in QTI SC7280 SoC and in order to enter deep sleep states in such cases, we would need to drop the XO clock vote in unprepare call and this unprepare callback for XO is in RPMh (Resource Power Manager-Hardened) clock driver which controls RPMh managed clock resources for new QTI SoCs and is a blocking call. Given we cannot have a sleeping calls such as clk_bulk_prepare() and clk_bulk_unprepare() in arm-smmu runtime pm callbacks since the iommu operations like map and unmap can be in atomic context and are in fast path, add this prepare and unprepare call to drop the XO vote only for system pm callbacks since it is not a fast path and we expect the system to enter deep sleep states with system pm as opposed to runtime pm. This is a similar sequence of clock requests (prepare,enable and disable,unprepare) in arm-smmu probe and remove. Signed-off-by: Sai Prakash Ranjan Co-developed-by: Rajendra Nayak Signed-off-by: Rajendra Nayak --- drivers/iommu/arm/arm-smmu/arm-smmu.c | 20 ++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index d3c6f54110a5..9561ba4c5d39 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -2277,6 +2277,13 @@ static int __maybe_unused arm_smmu_runtime_suspend(struct device *dev) static int __maybe_unused arm_smmu_pm_resume(struct device *dev) { + int ret; + struct arm_smmu_device *smmu = dev_get_drvdata(dev); + + ret = clk_bulk_prepare(smmu->num_clks, smmu->clks); + if (ret) + return ret; + if (pm_runtime_suspended(dev)) return 0; @@ -2285,10 +2292,19 @@ static int __maybe_unused arm_smmu_pm_resume(struct device *dev) static int __maybe_unused arm_smmu_pm_suspend(struct device *dev) { + int ret = 0; + struct arm_smmu_device *smmu = dev_get_drvdata(dev); + if (pm_runtime_suspended(dev)) - return 0; + goto clk_unprepare; - return 
arm_smmu_runtime_suspend(dev); + ret = arm_smmu_runtime_suspend(dev); + if (ret) + return ret; + +clk_unprepare: + clk_bulk_unprepare(smmu->num_clks, smmu->clks); + return ret; } static const struct dev_pm_ops arm_smmu_pm_ops = { -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCHv3] iommu/arm-smmu: Optimize ->tlb_flush_walk() for qcom implementation
Hi Robin, Will, On 2021-07-12 09:39, Sai Prakash Ranjan wrote: Hi Robin, On 2021-06-23 19:12, Sai Prakash Ranjan wrote: Currently for iommu_unmap() of large scatter-gather list with page size elements, the majority of time is spent in flushing of partial walks in __arm_lpae_unmap() which is a VA based TLB invalidation invalidating page-by-page on iommus like arm-smmu-v2 (TLBIVA). For example: to unmap a 32MB scatter-gather list with page size elements (8192 entries), there are 16->2MB buffer unmaps based on the pgsize (2MB for 4K granule) and each of 2MB will further result in 512 TLBIVAs (2MB/4K) resulting in a total of 8192 TLBIVAs (512*16) for 16->2MB causing a huge overhead. On qcom implementation, there are several performance improvements for TLB cache invalidations in HW like wait-for-safe (for realtime clients such as camera and display) and few others to allow for cache lookups/updates when TLBI is in progress for the same context bank. So the cost of over-invalidation is less compared to the unmap latency on several usecases like camera which deals with large buffers. So, ASID based TLB invalidations (TLBIASID) can be used to invalidate the entire context for partial walk flush thereby improving the unmap latency. Non-strict mode can use this by default for all platforms given its all about over-invalidation saving time on individual unmaps and non-deterministic generally. For this example of 32MB scatter-gather list unmap, this change results in just 16 ASID based TLB invalidations (TLBIASIDs) as opposed to 8192 TLBIVAs thereby increasing the performance of unmaps drastically. 
Test on QTI SM8150 SoC for 10 iterations of iommu_{map_sg}/unmap: (average over 10 iterations) Before this optimization: size iommu_map_sg iommu_unmap 4K 2.067 us 1.854 us 64K 9.598 us 8.802 us 1M 148.890 us 130.718 us 2M 305.864 us 67.291 us 12M 1793.604 us 390.838 us 16M 2386.848 us 518.187 us 24M 3563.296 us 775.989 us 32M 4747.171 us 1033.364 us After this optimization: size iommu_map_sg iommu_unmap 4K 1.723 us 1.765 us 64K 9.880 us 8.869 us 1M 155.364 us 135.223 us 2M 303.906 us 5.385 us 12M 1786.557 us 21.250 us 16M 2391.890 us 27.437 us 24M 3570.895 us 39.937 us 32M 4755.234 us 51.797 us This is further reduced once the map/unmap_pages() support gets in which will result in just 1 TLBIASID as compared to 16 TLBIASIDs. Real world data also shows big difference in unmap performance as below: There were reports of camera frame drops because of high overhead in iommu unmap without this optimization because of frequent unmaps issued by camera of about 100MB/s taking more than 100ms thereby causing frame drops. 
Signed-off-by: Sai Prakash Ranjan --- Changes in v3: * Move the logic to arm-smmu driver from io-pgtable (Robin) * Use a new set of iommu_flush_ops->arm_smmu_s1_tlb_impl_ops and use it for qcom impl Changes in v2: * Add a quirk to choose tlb_flush_all in partial walk flush * Set the quirk for QTI SoC implementation --- drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 13 + drivers/iommu/arm/arm-smmu/arm-smmu.c | 17 - 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c index 7771d40176de..218c71465819 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c @@ -10,6 +10,8 @@ #include "arm-smmu.h" +extern const struct iommu_flush_ops arm_smmu_s1_tlb_impl_ops; + struct qcom_smmu { struct arm_smmu_device smmu; bool bypass_quirk; @@ -146,6 +148,8 @@ static int qcom_adreno_smmu_init_context(struct arm_smmu_domain *smmu_domain, { struct adreno_smmu_priv *priv; + pgtbl_cfg->tlb = &arm_smmu_s1_tlb_impl_ops; + /* Only enable split pagetables for the GPU device (SID 0) */ if (!qcom_adreno_smmu_is_gpu_device(dev)) return 0; @@ -185,6 +189,14 @@ static const struct of_device_id qcom_smmu_client_of_match[] __maybe_unused = { { } }; +static int qcom_smmu_init_context(struct arm_smmu_domain *smmu_domain, + struct io_pgtable_cfg *pgtbl_cfg, struct device *dev) +{ + pgtbl_cfg->tlb = &arm_smmu_s1_tlb_impl_ops; + + return 0; +} + static int qcom_smmu_cfg_probe(struct arm_smmu_device *smmu) { unsigned int last_s2cr = ARM_SMMU_GR0_S2CR(smmu->num_mapping_groups - 1); @@ -308,6 +320,7 @@ static int qcom_smmu500_reset(struct arm_smmu_device *smmu) } static const struct arm_smmu_impl qcom
Re: [PATCHv3] iommu/arm-smmu: Optimize ->tlb_flush_walk() for qcom implementation
Hi Robin, On 2021-06-23 19:12, Sai Prakash Ranjan wrote: Currently for iommu_unmap() of large scatter-gather list with page size elements, the majority of time is spent in flushing of partial walks in __arm_lpae_unmap() which is a VA based TLB invalidation invalidating page-by-page on iommus like arm-smmu-v2 (TLBIVA). For example: to unmap a 32MB scatter-gather list with page size elements (8192 entries), there are 16->2MB buffer unmaps based on the pgsize (2MB for 4K granule) and each of 2MB will further result in 512 TLBIVAs (2MB/4K) resulting in a total of 8192 TLBIVAs (512*16) for 16->2MB causing a huge overhead. On qcom implementation, there are several performance improvements for TLB cache invalidations in HW like wait-for-safe (for realtime clients such as camera and display) and few others to allow for cache lookups/updates when TLBI is in progress for the same context bank. So the cost of over-invalidation is less compared to the unmap latency on several usecases like camera which deals with large buffers. So, ASID based TLB invalidations (TLBIASID) can be used to invalidate the entire context for partial walk flush thereby improving the unmap latency. Non-strict mode can use this by default for all platforms given its all about over-invalidation saving time on individual unmaps and non-deterministic generally. For this example of 32MB scatter-gather list unmap, this change results in just 16 ASID based TLB invalidations (TLBIASIDs) as opposed to 8192 TLBIVAs thereby increasing the performance of unmaps drastically. 
Test on QTI SM8150 SoC for 10 iterations of iommu_{map_sg}/unmap: (average over 10 iterations) Before this optimization: size iommu_map_sg iommu_unmap 4K 2.067 us 1.854 us 64K 9.598 us 8.802 us 1M 148.890 us 130.718 us 2M 305.864 us 67.291 us 12M 1793.604 us 390.838 us 16M 2386.848 us 518.187 us 24M 3563.296 us 775.989 us 32M 4747.171 us 1033.364 us After this optimization: size iommu_map_sg iommu_unmap 4K 1.723 us 1.765 us 64K 9.880 us 8.869 us 1M 155.364 us 135.223 us 2M 303.906 us 5.385 us 12M 1786.557 us 21.250 us 16M 2391.890 us 27.437 us 24M 3570.895 us 39.937 us 32M 4755.234 us 51.797 us This is further reduced once the map/unmap_pages() support gets in which will result in just 1 TLBIASID as compared to 16 TLBIASIDs. Real world data also shows big difference in unmap performance as below: There were reports of camera frame drops because of high overhead in iommu unmap without this optimization because of frequent unmaps issued by camera of about 100MB/s taking more than 100ms thereby causing frame drops. 
Signed-off-by: Sai Prakash Ranjan --- Changes in v3: * Move the logic to arm-smmu driver from io-pgtable (Robin) * Use a new set of iommu_flush_ops->arm_smmu_s1_tlb_impl_ops and use it for qcom impl Changes in v2: * Add a quirk to choose tlb_flush_all in partial walk flush * Set the quirk for QTI SoC implementation --- drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 13 + drivers/iommu/arm/arm-smmu/arm-smmu.c | 17 - 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c index 7771d40176de..218c71465819 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c @@ -10,6 +10,8 @@ #include "arm-smmu.h" +extern const struct iommu_flush_ops arm_smmu_s1_tlb_impl_ops; + struct qcom_smmu { struct arm_smmu_device smmu; bool bypass_quirk; @@ -146,6 +148,8 @@ static int qcom_adreno_smmu_init_context(struct arm_smmu_domain *smmu_domain, { struct adreno_smmu_priv *priv; + pgtbl_cfg->tlb = &arm_smmu_s1_tlb_impl_ops; + /* Only enable split pagetables for the GPU device (SID 0) */ if (!qcom_adreno_smmu_is_gpu_device(dev)) return 0; @@ -185,6 +189,14 @@ static const struct of_device_id qcom_smmu_client_of_match[] __maybe_unused = { { } }; +static int qcom_smmu_init_context(struct arm_smmu_domain *smmu_domain, + struct io_pgtable_cfg *pgtbl_cfg, struct device *dev) +{ + pgtbl_cfg->tlb = &arm_smmu_s1_tlb_impl_ops; + + return 0; +} + static int qcom_smmu_cfg_probe(struct arm_smmu_device *smmu) { unsigned int last_s2cr = ARM_SMMU_GR0_S2CR(smmu->num_mapping_groups - 1); @@ -308,6 +320,7 @@ static int qcom_smmu500_reset(struct arm_smmu_device *smmu) } static const struct arm_smmu_impl qcom_smmu_impl = { + .init_context = qcom_smmu_init_context, .cfg_probe = qcom_smmu
Re: [PATCH 2/3] iommu/io-pgtable-arm: Add IOMMU_LLC page protection flag
Hi Will, On 2021-03-25 23:03, Will Deacon wrote: On Tue, Mar 09, 2021 at 12:10:44PM +0530, Sai Prakash Ranjan wrote: On 2021-02-05 17:38, Sai Prakash Ranjan wrote: > On 2021-02-04 03:16, Will Deacon wrote: > > On Tue, Feb 02, 2021 at 11:56:27AM +0530, Sai Prakash Ranjan wrote: > > > On 2021-02-01 23:50, Jordan Crouse wrote: > > > > On Mon, Feb 01, 2021 at 08:20:44AM -0800, Rob Clark wrote: > > > > > On Mon, Feb 1, 2021 at 3:16 AM Will Deacon wrote: > > > > > > On Fri, Jan 29, 2021 at 03:12:59PM +0530, Sai Prakash Ranjan wrote: > > > > > > > On 2021-01-29 14:35, Will Deacon wrote: > > > > > > > > On Mon, Jan 11, 2021 at 07:45:04PM +0530, Sai Prakash Ranjan wrote: > > > > > > > > > +#define IOMMU_LLC(1 << 6) > > > > > > > > > > > > > > > > On reflection, I'm a bit worried about exposing this because I think it > > > > > > > > will > > > > > > > > introduce a mismatched virtual alias with the CPU (we don't even have a > > > > > > > > MAIR > > > > > > > > set up for this memory type). Now, we also have that issue for the PTW, > > > > > > > > but > > > > > > > > since we always use cache maintenance (i.e. the streaming API) for > > > > > > > > publishing the page-tables to a non-coheren walker, it works out. > > > > > > > > However, > > > > > > > > if somebody expects IOMMU_LLC to be coherent with a DMA API coherent > > > > > > > > allocation, then they're potentially in for a nasty surprise due to the > > > > > > > > mismatched outer-cacheability attributes. > > > > > > > > > > > > > > > > > > > > > > Can't we add the syscached memory type similar to what is done on android? > > > > > > > > > > > > Maybe. How does the GPU driver map these things on the CPU side? 
> > > > > > > > > > Currently we use writecombine mappings for everything, although there > > > > > are some cases that we'd like to use cached (but have not merged > > > > > patches that would give userspace a way to flush/invalidate) > > > > > > > > > > > > > LLC/system cache doesn't have a relationship with the CPU cache. Its > > > > just a > > > > little accelerator that sits on the connection from the GPU to DDR and > > > > caches > > > > accesses. The hint that Sai is suggesting is used to mark the buffers as > > > > 'no-write-allocate' to prevent GPU write operations from being cached in > > > > the LLC > > > > which a) isn't interesting and b) takes up cache space for read > > > > operations. > > > > > > > > Its easiest to think of the LLC as a bonus accelerator that has no cost > > > > for > > > > us to use outside of the unfortunate per buffer hint. > > > > > > > > We do have to worry about the CPU cache w.r.t I/O coherency (which is a > > > > different hint) and in that case we have all of concerns that Will > > > > identified. > > > > > > > > > > For mismatched outer cacheability attributes which Will > > > mentioned, I was > > > referring to [1] in android kernel. > > > > I've lost track of the conversation here :/ > > > > When the GPU has a buffer mapped with IOMMU_LLC, is the buffer also > > mapped > > into the CPU and with what attributes? Rob said "writecombine for > > everything" -- does that mean ioremap_wc() / MEMREMAP_WC? > > > > Rob answered this. > > > Finally, we need to be careful when we use the word "hint" as > > "allocation > > hint" has a specific meaning in the architecture, and if we only > > mismatch on > > those then we're actually ok. But I think IOMMU_LLC is more than > > just a > > hint, since it actually drives eviction policy (i.e. it enables > > writeback). > > > > Sorry for the pedantry, but I just want to make sure we're all talking > > about the same things! 
> > > > Sorry for the confusion which probably was caused by my mentioning of > android, NWA(no write allocate) is an allocation hint which we can > ignore > for now as it is not introduced yet in upstream. >
Re: [PATCHv2 1/3] iommu/io-pgtable: Add a quirk to use tlb_flush_all() for partial walk flush
Hi Robin, On 2021-06-23 00:07, Robin Murphy wrote: On 2021-06-22 15:27, Sai Prakash Ranjan wrote: Hi Robin, On 2021-06-22 17:41, Robin Murphy wrote: On 2021-06-22 08:11, Sai Prakash Ranjan wrote: Hi Robin, On 2021-06-21 21:15, Robin Murphy wrote: On 2021-06-18 03:51, Sai Prakash Ranjan wrote: Add a quirk IO_PGTABLE_QUIRK_TLB_INV_ALL to invalidate entire context with tlb_flush_all() callback in partial walk flush to improve unmap performance on select few platforms where the cost of over-invalidation is less than the unmap latency. I still think this doesn't belong anywhere near io-pgtable at all. It's a driver-internal decision how exactly it implements a non-leaf invalidation, and that may be more complex than a predetermined boolean decision. For example, I've just realised for SMMUv3 we can't invalidate multiple levels of table at once with a range command, since if we assume the whole thing is mapped at worst-case page granularity we may fail to invalidate any parts which are mapped as intermediate-level blocks. If invalidating a 1GB region (with 4KB granule) means having to fall back to 256K non-range commands, we may not want to invalidate by VA then, even though doing so for a 2MB region is still optimal. It's also quite feasible that drivers might want to do this for leaf invalidations too - if you don't like issuing 512 commands to invalidate 2MB, do you like issuing 511 commands to invalidate 2044KB? - and at that point the logic really has to be in the driver anyway. Ok I will move this to tlb_flush_walk() functions in the drivers. In the previous v1 thread, you suggested to make the choice in iommu_get_dma_strict() test, I assume you meant the test in iommu_dma_init_domain() with a flag or was it the leaf driver(ex:arm-smmu.c) test of iommu_get_dma_strict() in init_domain? Yes, I meant literally inside the same condition where we currently set "pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_NON_STRICT;" in arm_smmu_init_domain_context(). Ok got it, thanks. 
I am still a bit confused on where this flag would be? Should this be a part of struct iommu_domain? Well, if you were to rewrite the config with an alternative set of flush_ops at that point it would be implicit. For a flag, probably either in arm_smmu_domain or arm_smmu_impl. Maybe a flag would be less useful than generalising straight to a "maximum number of by-VA invalidations it's worth sending individually" threshold value? But then we would still need some flag to make this implementation specific (qcom specific for now) and this threshold would just be another condition although it would have been useful if this was generic enough. Well, for that approach I assume we could do something like special-case 0, or if it's a mutable per-domain value maybe just initialise it to SIZE_MAX or whatever such that it would never be reached in practice. Whichever way, it was meant to be implied that anything at the domain level would still be subject to final adjustment by the init_context hook. Ok that should work, so I went ahead with another set of flush_ops and posted out v3. Thanks, Sai It's clear to me what overall shape and separation of responsibility is most logical, but beyond that I don't have a particularly strong opinion on the exact implementation; I've just been chucking ideas around :) Your ideas are very informative and useful :) Thanks, Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCHv3] iommu/arm-smmu: Optimize ->tlb_flush_walk() for qcom implementation
Currently for iommu_unmap() of large scatter-gather list with page size elements, the majority of time is spent in flushing of partial walks in __arm_lpae_unmap() which is a VA based TLB invalidation invalidating page-by-page on iommus like arm-smmu-v2 (TLBIVA). For example: to unmap a 32MB scatter-gather list with page size elements (8192 entries), there are 16->2MB buffer unmaps based on the pgsize (2MB for 4K granule) and each of 2MB will further result in 512 TLBIVAs (2MB/4K) resulting in a total of 8192 TLBIVAs (512*16) for 16->2MB causing a huge overhead. On qcom implementation, there are several performance improvements for TLB cache invalidations in HW like wait-for-safe (for realtime clients such as camera and display) and few others to allow for cache lookups/updates when TLBI is in progress for the same context bank. So the cost of over-invalidation is less compared to the unmap latency on several usecases like camera which deals with large buffers. So, ASID based TLB invalidations (TLBIASID) can be used to invalidate the entire context for partial walk flush thereby improving the unmap latency. Non-strict mode can use this by default for all platforms given its all about over-invalidation saving time on individual unmaps and non-deterministic generally. For this example of 32MB scatter-gather list unmap, this change results in just 16 ASID based TLB invalidations (TLBIASIDs) as opposed to 8192 TLBIVAs thereby increasing the performance of unmaps drastically. 
Test on QTI SM8150 SoC for 10 iterations of iommu_{map_sg}/unmap: (average over 10 iterations) Before this optimization: size iommu_map_sg iommu_unmap 4K 2.067 us 1.854 us 64K 9.598 us 8.802 us 1M 148.890 us 130.718 us 2M 305.864 us 67.291 us 12M 1793.604 us 390.838 us 16M 2386.848 us 518.187 us 24M 3563.296 us 775.989 us 32M 4747.171 us 1033.364 us After this optimization: size iommu_map_sg iommu_unmap 4K 1.723 us 1.765 us 64K 9.880 us 8.869 us 1M 155.364 us 135.223 us 2M 303.906 us 5.385 us 12M 1786.557 us 21.250 us 16M 2391.890 us 27.437 us 24M 3570.895 us 39.937 us 32M 4755.234 us 51.797 us This is further reduced once the map/unmap_pages() support gets in which will result in just 1 TLBIASID as compared to 16 TLBIASIDs. Real world data also shows big difference in unmap performance as below: There were reports of camera frame drops because of high overhead in iommu unmap without this optimization because of frequent unmaps issued by camera of about 100MB/s taking more than 100ms thereby causing frame drops. 
Signed-off-by: Sai Prakash Ranjan --- Changes in v3: * Move the logic to arm-smmu driver from io-pgtable (Robin) * Use a new set of iommu_flush_ops->arm_smmu_s1_tlb_impl_ops and use it for qcom impl Changes in v2: * Add a quirk to choose tlb_flush_all in partial walk flush * Set the quirk for QTI SoC implementation --- drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 13 + drivers/iommu/arm/arm-smmu/arm-smmu.c | 17 - 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c index 7771d40176de..218c71465819 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c @@ -10,6 +10,8 @@ #include "arm-smmu.h" +extern const struct iommu_flush_ops arm_smmu_s1_tlb_impl_ops; + struct qcom_smmu { struct arm_smmu_device smmu; bool bypass_quirk; @@ -146,6 +148,8 @@ static int qcom_adreno_smmu_init_context(struct arm_smmu_domain *smmu_domain, { struct adreno_smmu_priv *priv; + pgtbl_cfg->tlb = &arm_smmu_s1_tlb_impl_ops; + /* Only enable split pagetables for the GPU device (SID 0) */ if (!qcom_adreno_smmu_is_gpu_device(dev)) return 0; @@ -185,6 +189,14 @@ static const struct of_device_id qcom_smmu_client_of_match[] __maybe_unused = { { } }; +static int qcom_smmu_init_context(struct arm_smmu_domain *smmu_domain, + struct io_pgtable_cfg *pgtbl_cfg, struct device *dev) +{ + pgtbl_cfg->tlb = &arm_smmu_s1_tlb_impl_ops; + + return 0; +} + static int qcom_smmu_cfg_probe(struct arm_smmu_device *smmu) { unsigned int last_s2cr = ARM_SMMU_GR0_S2CR(smmu->num_mapping_groups - 1); @@ -308,6 +320,7 @@ static int qcom_smmu500_reset(struct arm_smmu_device *smmu) } static const struct arm_smmu_impl qcom_smmu_impl = { + .init_context = qcom_smmu_init_context, .cfg_probe = qcom_smmu_cfg_probe, .def_domain_type = qcom_
Re: [PATCHv2 1/3] iommu/io-pgtable: Add a quirk to use tlb_flush_all() for partial walk flush
Hi Robin, On 2021-06-22 17:41, Robin Murphy wrote: On 2021-06-22 08:11, Sai Prakash Ranjan wrote: Hi Robin, On 2021-06-21 21:15, Robin Murphy wrote: On 2021-06-18 03:51, Sai Prakash Ranjan wrote: Add a quirk IO_PGTABLE_QUIRK_TLB_INV_ALL to invalidate entire context with tlb_flush_all() callback in partial walk flush to improve unmap performance on select few platforms where the cost of over-invalidation is less than the unmap latency. I still think this doesn't belong anywhere near io-pgtable at all. It's a driver-internal decision how exactly it implements a non-leaf invalidation, and that may be more complex than a predetermined boolean decision. For example, I've just realised for SMMUv3 we can't invalidate multiple levels of table at once with a range command, since if we assume the whole thing is mapped at worst-case page granularity we may fail to invalidate any parts which are mapped as intermediate-level blocks. If invalidating a 1GB region (with 4KB granule) means having to fall back to 256K non-range commands, we may not want to invalidate by VA then, even though doing so for a 2MB region is still optimal. It's also quite feasible that drivers might want to do this for leaf invalidations too - if you don't like issuing 512 commands to invalidate 2MB, do you like issuing 511 commands to invalidate 2044KB? - and at that point the logic really has to be in the driver anyway. Ok I will move this to tlb_flush_walk() functions in the drivers. In the previous v1 thread, you suggested to make the choice in iommu_get_dma_strict() test, I assume you meant the test in iommu_dma_init_domain() with a flag or was it the leaf driver(ex:arm-smmu.c) test of iommu_get_dma_strict() in init_domain? Yes, I meant literally inside the same condition where we currently set "pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_NON_STRICT;" in arm_smmu_init_domain_context(). Ok got it, thanks. I am still a bit confused on where this flag would be? Should this be a part of struct iommu_domain? 
Well, if you were to rewrite the config with an alternative set of flush_ops at that point it would be implicit. For a flag, probably either in arm_smmu_domain or arm_smmu_impl. Maybe a flag would be less useful than generalising straight to a "maximum number of by-VA invalidations it's worth sending individually" threshold value? But then we would still need some flag to make this implementation specific (qcom specific for now) and this threshold would just be another condition although it would have been useful if this was generic enough. It's clear to me what overall shape and separation of responsibility is most logical, but beyond that I don't have a particularly strong opinion on the exact implementation; I've just been chucking ideas around :) Your ideas are very informative and useful :) Thanks, Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCHv2 1/3] iommu/io-pgtable: Add a quirk to use tlb_flush_all() for partial walk flush
Hi Robin, On 2021-06-21 21:15, Robin Murphy wrote: On 2021-06-18 03:51, Sai Prakash Ranjan wrote: Add a quirk IO_PGTABLE_QUIRK_TLB_INV_ALL to invalidate entire context with tlb_flush_all() callback in partial walk flush to improve unmap performance on select few platforms where the cost of over-invalidation is less than the unmap latency. I still think this doesn't belong anywhere near io-pgtable at all. It's a driver-internal decision how exactly it implements a non-leaf invalidation, and that may be more complex than a predetermined boolean decision. For example, I've just realised for SMMUv3 we can't invalidate multiple levels of table at once with a range command, since if we assume the whole thing is mapped at worst-case page granularity we may fail to invalidate any parts which are mapped as intermediate-level blocks. If invalidating a 1GB region (with 4KB granule) means having to fall back to 256K non-range commands, we may not want to invalidate by VA then, even though doing so for a 2MB region is still optimal. It's also quite feasible that drivers might want to do this for leaf invalidations too - if you don't like issuing 512 commands to invalidate 2MB, do you like issuing 511 commands to invalidate 2044KB? - and at that point the logic really has to be in the driver anyway. Ok I will move this to tlb_flush_walk() functions in the drivers. In the previous v1 thread, you suggested to make the choice in iommu_get_dma_strict() test, I assume you meant the test in iommu_dma_init_domain() with a flag or was it the leaf driver(ex:arm-smmu.c) test of iommu_get_dma_strict() in init_domain? I am still a bit confused on where this flag would be? Should this be a part of struct iommu_domain? 
Thanks, Sai Signed-off-by: Sai Prakash Ranjan --- drivers/iommu/io-pgtable-arm.c | 3 ++- include/linux/io-pgtable.h | 5 + 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index 87def58e79b5..5d362f2214bd 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -768,7 +768,8 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie) if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_NS | IO_PGTABLE_QUIRK_NON_STRICT | IO_PGTABLE_QUIRK_ARM_TTBR1 | - IO_PGTABLE_QUIRK_ARM_OUTER_WBWA)) + IO_PGTABLE_QUIRK_ARM_OUTER_WBWA | + IO_PGTABLE_QUIRK_TLB_INV_ALL)) return NULL; data = arm_lpae_alloc_pgtable(cfg); diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h index 4d40dfa75b55..45441592a0e6 100644 --- a/include/linux/io-pgtable.h +++ b/include/linux/io-pgtable.h @@ -82,6 +82,10 @@ struct io_pgtable_cfg { * * IO_PGTABLE_QUIRK_ARM_OUTER_WBWA: Override the outer-cacheability * attributes set in the TCR for a non-coherent page-table walker. +* +* IO_PGTABLE_QUIRK_TLB_INV_ALL: Use TLBIALL/TLBIASID to invalidate +* entire context for partial walk flush to increase unmap +* performance on select few platforms. */ #define IO_PGTABLE_QUIRK_ARM_NS BIT(0) #define IO_PGTABLE_QUIRK_NO_PERMS BIT(1) @@ -89,6 +93,7 @@ struct io_pgtable_cfg { #define IO_PGTABLE_QUIRK_NON_STRICT BIT(4) #define IO_PGTABLE_QUIRK_ARM_TTBR1 BIT(5) #define IO_PGTABLE_QUIRK_ARM_OUTER_WBWA BIT(6) + #define IO_PGTABLE_QUIRK_TLB_INV_ALLBIT(7) unsigned long quirks; unsigned long pgsize_bitmap; unsigned intias; -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCHv2 2/3] iommu/io-pgtable: Optimize partial walk flush for large scatter-gather list
Hi, On 2021-06-19 03:39, Doug Anderson wrote: Hi, On Thu, Jun 17, 2021 at 7:51 PM Sai Prakash Ranjan wrote: Currently for iommu_unmap() of large scatter-gather list with page size elements, the majority of time is spent in flushing of partial walks in __arm_lpae_unmap() which is a VA based TLB invalidation invalidating page-by-page on iommus like arm-smmu-v2 (TLBIVA) which do not support range based invalidations like on arm-smmu-v3.2. For example: to unmap a 32MB scatter-gather list with page size elements (8192 entries), there are 16->2MB buffer unmaps based on the pgsize (2MB for 4K granule) and each of 2MB will further result in 512 TLBIVAs (2MB/4K) resulting in a total of 8192 TLBIVAs (512*16) for 16->2MB causing a huge overhead. So instead use tlb_flush_all() callback (TLBIALL/TLBIASID) to invalidate the entire context for partial walk flush on select few platforms where cost of over-invalidation is less than unmap latency It would probably be worth punching this description up a little bit. Elsewhere you said in more detail why this over-invalidation is less of a big deal for the Qualcomm SMMU. It's probably worth saying something like that here, too. Like this bit paraphrased from your other email: On qcom impl, we have several performance improvements for TLB cache invalidations in HW like wait-for-safe (for realtime clients such as camera and display) and few others to allow for cache lookups/updates when TLBI is in progress for the same context bank. Sure will add this info as well in the next version. using the newly introduced quirk IO_PGTABLE_QUIRK_TLB_INV_ALL. We also do this for non-strict mode given its all about over-invalidation saving time on individual unmaps and non-deterministic generally. As per usual I'm mostly clueless, but I don't quite understand why you want this new behavior for non-strict mode. To me it almost seems like the opposite? 
Specifically, non-strict mode is already outside the critical path today and so there's no need to optimize it. I'm probably not explaining myself clearly, but I guess i'm thinking: a) today for strict, unmap is in the critical path and it's important to get it out of there. Getting it out of the critical path is so important that we're willing to over-invalidate to speed up the critical path. b) today for non-strict, unmap is not in the critical path. So I would almost expect your patch to _disable_ your new feature for non-strict mappings, not auto-enable your new feature for non-strict mappings. If I'm babbling, feel free to ignore. ;-) Looking back, I guess Robin was the one that suggested the behavior you're implementing, so it's more likely he's right than I am. ;-) Thanks for taking a look. Non-strict mode is only for leaf entries and dma domains and this optimization is for non-leaf entries and is applicable for both, see __arm_lpae_unmap(). In other words, if you have iommu.strict=0 (non-strict mode) and try unmapping a large sg buffer as the problem described in the commit text, you would still go via this path in unmap and see the delay without this patch. So what Robin suggested is that, let's do this unconditionally for all users with non-strict mode as opposed to only restricting it to implementation specific in case of strict mode. Thanks, Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH] iommu/io-pgtable-arm: Optimize partial walk flush for large scatter-gather list
On 2021-06-15 17:21, Sai Prakash Ranjan wrote: > Hi Krishna, > > On 2021-06-14 23:18, Krishna Reddy wrote: >>> Right but we won't know until we profile the specific usecases or try them >>> in >>> generic workload to see if they affect the performance. Sure, over >>> invalidation is >>> a concern where multiple buffers can be mapped to same context and the cache >>> is not usable at the time for lookup and such but we don't do it for small >>> buffers >>> and only for large buffers which means thousands of TLB entry mappings in >>> which case TLBIASID is preferred (note: I mentioned the HW team >>> recommendation to use it for anything greater than 128 TLB entries) in my >>> earlier reply. And also note that we do this only for partial walk flush, >>> we are not >>> arbitrarily changing all the TLBIs to ASID based. >> >> Most of the heavy bw use cases does involve processing larger buffers. >> When the physical memory is allocated dis-contiguously at page_size >> (let's use 4KB here) >> granularity, each aligned 2MB chunks IOVA unmap would involve >> performing a TLBIASID >> as 2MB is not a leaf. Essentially, It happens all the time during >> large buffer unmaps and >> potentially impact active traffic on other large buffers. Depending on how >> much >> latency HW engines can absorb, the overflow/underflow issues for ISO >> engines can be >> sporadic and vendor specific. >> Performing TLBIASID as default for all SoCs is not a safe operation. >> > > Ok so from what I gather from this is that its not easy to test for the > negative impact and you don't have data on such yet and the behaviour is > very vendor specific. To add on qcom impl, we have several performance > improvements for TLB cache invalidations in HW like wait-for-safe(for realtime > clients such as camera and display) and few others to allow for cache > lookups/updates when TLBI is in progress for the same context bank, so atleast > we are good here. 
> >> >>> I am no camera expert but from what the camera team mentioned is that there >>> is a thread which frees memory (large unused memory buffers) periodically >>> which >>> ends up taking around 100+ms and causing some camera test failures with >>> frame drops. Parallel efforts are already being made to optimize this usage >>> of >>> thread but as I mentioned previously, this is *not a camera specific*, let's >>> say >>> someone else invokes such large unmaps, it's going to face the same issue. >> >> From the above, It doesn't look like the root cause of frame drops is >> fully understood. >> Why is 100+ms delay causing camera frame drop? Is the same thread >> submitting the buffers >> to camera after unmap is complete? If not, how is the unmap latency >> causing issue here? >> > > Ok since you are interested in the camera use case, I have requested more > details > from the camera team and will give them once they come back. However I don't > think > it's good to have unmap latency at all and that is being addressed by this > patch. > As promised, here are some more details shared by the camera team: Mapping of a framework buffer happens at the time of process request and unmapping of a framework buffer happens once the buffer is available from hardware and the result will be notified to the camera framework. * When there is a delay in unmapping of a buffer, result notification to the framework will be delayed and, based on pipeline delay depth, new requests from the framework will be delayed. * Camera stack uses internal buffer managers for internal and framework buffers. While mapping and unmapping these managers will be accessed, so it uses a common lock and hence is a blocking call. So unmapping delay will delay the mapping of a new request and lead to frame drops. Map and unmap happen in the camera service process context. There is no separate perf path to perform unmapping. In the camera stack, along with map/unmap delay, additional delays are due to HW. 
So HW should be able to get the requests in time from SW to avoid frame drops. Thanks, Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCHv2 2/3] iommu/io-pgtable: Optimize partial walk flush for large scatter-gather list
Currently for iommu_unmap() of large scatter-gather list with page size elements, the majority of time is spent in flushing of partial walks in __arm_lpae_unmap() which is a VA based TLB invalidation invalidating page-by-page on iommus like arm-smmu-v2 (TLBIVA) which do not support range based invalidations like on arm-smmu-v3.2. For example: to unmap a 32MB scatter-gather list with page size elements (8192 entries), there are 16->2MB buffer unmaps based on the pgsize (2MB for 4K granule) and each of 2MB will further result in 512 TLBIVAs (2MB/4K) resulting in a total of 8192 TLBIVAs (512*16) for 16->2MB causing a huge overhead. So instead use tlb_flush_all() callback (TLBIALL/TLBIASID) to invalidate the entire context for partial walk flush on select few platforms where cost of over-invalidation is less than unmap latency using the newly introduced quirk IO_PGTABLE_QUIRK_TLB_INV_ALL. We also do this for non-strict mode given its all about over-invalidation saving time on individual unmaps and non-deterministic generally. For this example of 32MB scatter-gather list unmap, this results in just 16 ASID based TLB invalidations (TLBIASIDs) as opposed to 8192 TLBIVAs thereby increasing the performance of unmaps drastically. Test on QTI SM8150 SoC for 10 iterations of iommu_{map_sg}/unmap: (average over 10 iterations) Before this optimization: sizeiommu_map_sg iommu_unmap 4K2.067 us 1.854 us 64K9.598 us 8.802 us 1M 148.890 us 130.718 us 2M 305.864 us67.291 us 12M 1793.604 us 390.838 us 16M 2386.848 us 518.187 us 24M 3563.296 us 775.989 us 32M 4747.171 us 1033.364 us After this optimization: sizeiommu_map_sg iommu_unmap 4K1.723 us 1.765 us 64K9.880 us 8.869 us 1M 155.364 us 135.223 us 2M 303.906 us 5.385 us 12M 1786.557 us21.250 us 16M 2391.890 us27.437 us 24M 3570.895 us39.937 us 32M 4755.234 us51.797 us This is further reduced once the map/unmap_pages() support gets in which will result in just 1 TLBIASID as compared to 16 TLBIASIDs. 
Real world data also shows big difference in unmap performance as below: There were reports of camera frame drops because of high overhead in iommu unmap without this optimization because of frequent unmaps issued by camera of about 100MB/s taking more than 100ms thereby causing frame drops. Signed-off-by: Sai Prakash Ranjan --- include/linux/io-pgtable.h | 6 ++ 1 file changed, 6 insertions(+) diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h index 45441592a0e6..fd6b30cfdbf7 100644 --- a/include/linux/io-pgtable.h +++ b/include/linux/io-pgtable.h @@ -219,6 +219,12 @@ static inline void io_pgtable_tlb_flush_walk(struct io_pgtable *iop, unsigned long iova, size_t size, size_t granule) { + if (iop->cfg.quirks & IO_PGTABLE_QUIRK_NON_STRICT || + iop->cfg.quirks & IO_PGTABLE_QUIRK_TLB_INV_ALL) { + iop->cfg.tlb->tlb_flush_all(iop->cookie); + return; + } + if (iop->cfg.tlb && iop->cfg.tlb->tlb_flush_walk) iop->cfg.tlb->tlb_flush_walk(iova, size, granule, iop->cookie); } -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCHv2 3/3] iommu/arm-smmu-qcom: Set IO_PGTABLE_QUIRK_TLB_INV_ALL for QTI SoC impl
Set the pgtable quirk IO_PGTABLE_QUIRK_TLB_INV_ALL for QTI SoC implementation to use ::tlb_flush_all() for partial walk flush to improve unmap performance. Signed-off-by: Sai Prakash Ranjan --- drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 11 +++ 1 file changed, 11 insertions(+) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c index 7771d40176de..b8ae51592d00 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c @@ -146,6 +146,8 @@ static int qcom_adreno_smmu_init_context(struct arm_smmu_domain *smmu_domain, { struct adreno_smmu_priv *priv; + pgtbl_cfg->quirks |= IO_PGTABLE_QUIRK_TLB_INV_ALL; + /* Only enable split pagetables for the GPU device (SID 0) */ if (!qcom_adreno_smmu_is_gpu_device(dev)) return 0; @@ -185,6 +187,14 @@ static const struct of_device_id qcom_smmu_client_of_match[] __maybe_unused = { { } }; +static int qcom_smmu_init_context(struct arm_smmu_domain *smmu_domain, + struct io_pgtable_cfg *pgtbl_cfg, struct device *dev) +{ + pgtbl_cfg->quirks |= IO_PGTABLE_QUIRK_TLB_INV_ALL; + + return 0; +} + static int qcom_smmu_cfg_probe(struct arm_smmu_device *smmu) { unsigned int last_s2cr = ARM_SMMU_GR0_S2CR(smmu->num_mapping_groups - 1); @@ -308,6 +318,7 @@ static int qcom_smmu500_reset(struct arm_smmu_device *smmu) } static const struct arm_smmu_impl qcom_smmu_impl = { + .init_context = qcom_smmu_init_context, .cfg_probe = qcom_smmu_cfg_probe, .def_domain_type = qcom_smmu_def_domain_type, .reset = qcom_smmu500_reset, -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCHv2 1/3] iommu/io-pgtable: Add a quirk to use tlb_flush_all() for partial walk flush
Add a quirk IO_PGTABLE_QUIRK_TLB_INV_ALL to invalidate entire context with tlb_flush_all() callback in partial walk flush to improve unmap performance on select few platforms where the cost of over-invalidation is less than the unmap latency. Signed-off-by: Sai Prakash Ranjan --- drivers/iommu/io-pgtable-arm.c | 3 ++- include/linux/io-pgtable.h | 5 + 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index 87def58e79b5..5d362f2214bd 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -768,7 +768,8 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie) if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_NS | IO_PGTABLE_QUIRK_NON_STRICT | IO_PGTABLE_QUIRK_ARM_TTBR1 | - IO_PGTABLE_QUIRK_ARM_OUTER_WBWA)) + IO_PGTABLE_QUIRK_ARM_OUTER_WBWA | + IO_PGTABLE_QUIRK_TLB_INV_ALL)) return NULL; data = arm_lpae_alloc_pgtable(cfg); diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h index 4d40dfa75b55..45441592a0e6 100644 --- a/include/linux/io-pgtable.h +++ b/include/linux/io-pgtable.h @@ -82,6 +82,10 @@ struct io_pgtable_cfg { * * IO_PGTABLE_QUIRK_ARM_OUTER_WBWA: Override the outer-cacheability * attributes set in the TCR for a non-coherent page-table walker. +* +* IO_PGTABLE_QUIRK_TLB_INV_ALL: Use TLBIALL/TLBIASID to invalidate +* entire context for partial walk flush to increase unmap +* performance on select few platforms. */ #define IO_PGTABLE_QUIRK_ARM_NS BIT(0) #define IO_PGTABLE_QUIRK_NO_PERMS BIT(1) @@ -89,6 +93,7 @@ struct io_pgtable_cfg { #define IO_PGTABLE_QUIRK_NON_STRICT BIT(4) #define IO_PGTABLE_QUIRK_ARM_TTBR1 BIT(5) #define IO_PGTABLE_QUIRK_ARM_OUTER_WBWA BIT(6) + #define IO_PGTABLE_QUIRK_TLB_INV_ALLBIT(7) unsigned long quirks; unsigned long pgsize_bitmap; unsigned intias; -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. 
is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCHv2 0/3] iommu/io-pgtable: Optimize partial walk flush for large scatter-gather list
Currently for iommu_unmap() of large scatter-gather list with page size elements, the majority of time is spent in flushing of partial walks in __arm_lpae_unmap() which is a VA based TLB invalidation invalidating page-by-page on iommus like arm-smmu-v2 (TLBIVA) which do not support range based invalidations like on arm-smmu-v3.2. For example: to unmap a 32MB scatter-gather list with page size elements (8192 entries), there are 16->2MB buffer unmaps based on the pgsize (2MB for 4K granule) and each of 2MB will further result in 512 TLBIVAs (2MB/4K) resulting in a total of 8192 TLBIVAs (512*16) for 16->2MB causing a huge overhead. So instead use tlb_flush_all() callback (TLBIALL/TLBIASID) to invalidate the entire context for partial walk flush on select few platforms where cost of over-invalidation is less than unmap latency using the newly introduced quirk IO_PGTABLE_QUIRK_TLB_INV_ALL. We also do this for non-strict mode given its all about over-invalidation saving time on individual unmaps and non-deterministic generally. For this example of 32MB scatter-gather list unmap, this results in just 16 ASID based TLB invalidations (TLBIASIDs) as opposed to 8192 TLBIVAs thereby increasing the performance of unmaps drastically. Test on QTI SM8150 SoC for 10 iterations of iommu_{map_sg}/unmap: (average over 10 iterations) Before this optimization: sizeiommu_map_sg iommu_unmap 4K2.067 us 1.854 us 64K9.598 us 8.802 us 1M 148.890 us 130.718 us 2M 305.864 us67.291 us 12M 1793.604 us 390.838 us 16M 2386.848 us 518.187 us 24M 3563.296 us 775.989 us 32M 4747.171 us 1033.364 us After this optimization: sizeiommu_map_sg iommu_unmap 4K1.723 us 1.765 us 64K9.880 us 8.869 us 1M 155.364 us 135.223 us 2M 303.906 us 5.385 us 12M 1786.557 us21.250 us 16M 2391.890 us27.437 us 24M 3570.895 us39.937 us 32M 4755.234 us51.797 us This is further reduced once the map/unmap_pages() support gets in which will result in just 1 TLBIASID as compared to 16 TLBIASIDs. 
Real world data also shows big difference in unmap performance as below: There were reports of camera frame drops because of high overhead in iommu unmap without this optimization because of frequent unmaps issued by camera of about 100MB/s taking more than 100ms thereby causing frame drops. Changes in v2: * Add a quirk to choose tlb_flush_all in partial walk flush * Set the quirk for QTI SoC implementation Sai Prakash Ranjan (3): iommu/io-pgtable: Add a quirk to use tlb_flush_all() for partial walk flush iommu/io-pgtable: Optimize partial walk flush for large scatter-gather list iommu/arm-smmu-qcom: Set IO_PGTABLE_QUIRK_TLB_INV_ALL for QTI SoC impl drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 11 +++ drivers/iommu/io-pgtable-arm.c | 3 ++- include/linux/io-pgtable.h | 11 +++ 3 files changed, 24 insertions(+), 1 deletion(-) -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH] iommu/io-pgtable-arm: Optimize partial walk flush for large scatter-gather list
Hi Krishna, On 2021-06-18 02:48, Krishna Reddy wrote: Instead of flush_ops in init_context hook, perhaps an io_pgtable quirk since this is related to tlb, probably a bad name but IO_PGTABLE_QUIRK_TLB_INV which will be set in init_context impl hook and the prev condition in io_pgtable_tlb_flush_walk() becomes something like below. Seems very minimal and neat instead of poking into tlb_flush_walk functions or touching dma strict with some flag? if (iop->cfg.quirks & IO_PGTABLE_QUIRK_NON_STRICT || iop->cfg.quirks & IO_PGTABLE_QUIRK_TLB_INV) { iop->cfg.tlb->tlb_flush_all(iop->cookie); return; } Can you name it as IO_PGTABLE_QUIRK_TLB_INV_ASID or IO_PGTABLE_QUIRK_TLB_INV_ALL_ASID? tlb_flush_all() callback implementations can use TLBIALL or TLBIASID. So having ASID in the quirk name doesn't sound right given this quirk should be generic enough to be usable on other implementations as well. Instead I will go with IO_PGTABLE_QUIRK_TLB_INV_ALL and will be happy to change if others have some other preference. Thanks, Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH] iommu/io-pgtable-arm: Optimize partial walk flush for large scatter-gather list
On 2021-06-16 12:28, Sai Prakash Ranjan wrote: Hi Robin, On 2021-06-15 19:23, Robin Murphy wrote: On 2021-06-15 12:51, Sai Prakash Ranjan wrote: ... Hi @Robin, from these discussions it seems they are not ok with the change for all SoC vendor implementations and do not have any data on such impact. As I mentioned above, on QCOM platforms we do have several optimizations in HW for TLBIs and would like to make use of it and reduce the unmap latency. What do you think, should this be made implementation specific? Yes, it sounds like there's enough uncertainty for now that this needs to be an opt-in feature. However, I still think that non-strict mode could use it generically, since that's all about over-invalidating to save time on individual unmaps - and relatively non-deterministic - already. So maybe we have a second set of iommu_flush_ops, or just a flag somewhere to control the tlb_flush_walk functions internally, and the choice can be made in the iommu_get_dma_strict() test, but also forced on all the time by your init_context hook. What do you reckon? Sounds good to me. Since you mentioned non-strict mode using it generically, can't we just set tlb_flush_all() in io_pgtable_tlb_flush_walk() like below based on quirk so that we don't need to add any check in iommu_get_dma_strict() and just force the new flush_ops in init_context hook? if (iop->cfg.quirks & IO_PGTABLE_QUIRK_NON_STRICT) { iop->cfg.tlb->tlb_flush_all(iop->cookie); return; } Instead of flush_ops in init_context hook, perhaps a io_pgtable quirk since this is related to tlb, probably a bad name but IO_PGTABLE_QUIRK_TLB_INV which will be set in init_context impl hook and the prev condition in io_pgtable_tlb_flush_walk() becomes something like below. Seems very minimal and neat instead of poking into tlb_flush_walk functions or touching dma strict with some flag? 
if (iop->cfg.quirks & IO_PGTABLE_QUIRK_NON_STRICT || iop->cfg.quirks & IO_PGTABLE_QUIRK_TLB_INV) { iop->cfg.tlb->tlb_flush_all(iop->cookie); return; } Thanks, Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH] iommu/io-pgtable-arm: Optimize partial walk flush for large scatter-gather list
Hi Robin, On 2021-06-15 19:23, Robin Murphy wrote: On 2021-06-15 12:51, Sai Prakash Ranjan wrote: ... Hi @Robin, from these discussions it seems they are not ok with the change for all SoC vendor implementations and do not have any data on such impact. As I mentioned above, on QCOM platforms we do have several optimizations in HW for TLBIs and would like to make use of it and reduce the unmap latency. What do you think, should this be made implementation specific? Yes, it sounds like there's enough uncertainty for now that this needs to be an opt-in feature. However, I still think that non-strict mode could use it generically, since that's all about over-invalidating to save time on individual unmaps - and relatively non-deterministic - already. So maybe we have a second set of iommu_flush_ops, or just a flag somewhere to control the tlb_flush_walk functions internally, and the choice can be made in the iommu_get_dma_strict() test, but also forced on all the time by your init_context hook. What do you reckon? Sounds good to me. Since you mentioned non-strict mode using it generically, can't we just set tlb_flush_all() in io_pgtable_tlb_flush_walk() like below based on quirk so that we don't need to add any check in iommu_get_dma_strict() and just force the new flush_ops in init_context hook? if (iop->cfg.quirks & IO_PGTABLE_QUIRK_NON_STRICT) { iop->cfg.tlb->tlb_flush_all(iop->cookie); return; } Thanks, Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH] iommu/io-pgtable-arm: Optimize partial walk flush for large scatter-gather list
Hi Krishna, On 2021-06-14 23:18, Krishna Reddy wrote: Right but we won't know until we profile the specific usecases or try them in generic workload to see if they affect the performance. Sure, over invalidation is a concern where multiple buffers can be mapped to same context and the cache is not usable at the time for lookup and such but we don't do it for small buffers and only for large buffers which means thousands of TLB entry mappings in which case TLBIASID is preferred (note: I mentioned the HW team recommendation to use it for anything greater than 128 TLB entries) in my earlier reply. And also note that we do this only for partial walk flush, we are not arbitrarily changing all the TLBIs to ASID based. Most of the heavy bw use cases does involve processing larger buffers. When the physical memory is allocated dis-contiguously at page_size (let's use 4KB here) granularity, each aligned 2MB chunks IOVA unmap would involve performing a TLBIASID as 2MB is not a leaf. Essentially, It happens all the time during large buffer unmaps and potentially impact active traffic on other large buffers. Depending on how much latency HW engines can absorb, the overflow/underflow issues for ISO engines can be sporadic and vendor specific. Performing TLBIASID as default for all SoCs is not a safe operation. Ok so from what I gather from this is that its not easy to test for the negative impact and you don't have data on such yet and the behaviour is very vendor specific. To add on qcom impl, we have several performance improvements for TLB cache invalidations in HW like wait-for-safe(for realtime clients such as camera and display) and few others to allow for cache lookups/updates when TLBI is in progress for the same context bank, so atleast we are good here. 
I am no camera expert but from what the camera team mentioned is that there is a thread which frees memory(large unused memory buffers) periodically which ends up taking around 100+ms and causing some camera test failures with frame drops. Parallel efforts are already being made to optimize this usage of thread but as I mentioned previously, this is *not camera specific*; let's say someone else invokes such large unmaps, it's going to face the same issue. From the above, it doesn't look like the root cause of frame drops is fully understood. Why is 100+ms delay causing camera frame drop? Is the same thread submitting the buffers to camera after unmap is complete? If not, how is the unmap latency causing issue here? Ok since you are interested in camera usecase, I have requested for more details from the camera team and will give it once they come back. However I don't think it's good to have unmap latency at all and that is being addressed by this patch. > If unmap is queued and performed on a background thread, would it > resolve the frame drops? Not sure I understand what you mean by queuing on background thread but with that or not, we still do the same number of TLBIs and hop through iommu->io-pgtable->arm-smmu to perform the unmap, so how will that help? I mean adding the unmap requests into a queue and processing them from a different thread. It is not to reduce the TLBIs. But, not to block subsequent buffer allocation, IOVA map requests, if they are being requested from same thread that is performing unmap. If unmap is already performed from a different thread, then the issue still needs to be root caused to understand it fully. Check for any serialization issues. This patch is to optimize unmap latency because of large number of mmio writes(TLBIVAs) wasting CPU cycles and not to fix camera issue which can probably be solved by parallelization.
It seems to me like you are ok with the unmap latency in general which we are not and want to avoid that latency. Hi @Robin, from these discussions it seems they are not ok with the change for all SoC vendor implementations and do not have any data on such impact. As I mentioned above, on QCOM platforms we do have several optimizations in HW for TLBIs and would like to make use of it and reduce the unmap latency. What do you think, should this be made implementation specific? Thanks, Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH] iommu/io-pgtable-arm: Optimize partial walk flush for large scatter-gather list
Hi Krishna, On 2021-06-11 22:19, Krishna Reddy wrote: Hi Sai, >> > No, the unmap latency is not just in some test case written, the >> > issue is very real and we have workloads where camera is reporting >> > frame drops because of this unmap latency in the order of 100s of milliseconds. Not exactly, this issue is not specific to camera. If you look at the numbers in the commit text, even for the test device it's the same observation. It depends on the buffer size we are unmapping which affects the number of TLBIs issued. I am not aware of any such HW side bw issues for camera specifically on QCOM devices. It is clear that reducing number of TLBIs reduces the unmap API latency. But, it is at the expense of throwing away valid tlb entries. Quantifying the impact of arbitrary invalidation of valid tlb entries at context level is not straightforward and use case dependent. The side-effects might be rare or won't be known until they are noticed. Right but we won't know until we profile the specific usecases or try them in generic workload to see if they affect the performance. Sure, over invalidation is a concern where multiple buffers can be mapped to same context and the cache is not usable at the time for lookup and such but we don't do it for small buffers and only for large buffers which means thousands of TLB entry mappings in which case TLBIASID is preferred (note: I mentioned the HW team recommendation to use it for anything greater than 128 TLB entries) in my earlier reply. And also note that we do this only for partial walk flush, we are not arbitrarily changing all the TLBIs to ASID based. Can you provide more details on how the unmap latency is causing camera to drop frames? Is unmap performed in the perf path? I am no camera expert but from what the camera team mentioned is that there is a thread which frees memory(large unused memory buffers) periodically which ends up taking around 100+ms and causing some camera test failures with frame drops. 
Parallel efforts are already being made to optimize this usage of thread but as I mentioned previously, this is *not camera specific*; let's say someone else invokes such large unmaps, it's going to face the same issue. If unmap is queued and performed on a background thread, would it resolve the frame drops? Not sure I understand what you mean by queuing on background thread but with that or not, we still do the same number of TLBIs and hop through iommu->io-pgtable->arm-smmu to perform the unmap, so how will that help? Thanks, Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH] iommu/io-pgtable-arm: Optimize partial walk flush for large scatter-gather list
Hi Krishna, On 2021-06-11 06:07, Krishna Reddy wrote: > No, the unmap latency is not just in some test case written, the issue > is very real and we have workloads where camera is reporting frame > drops because of this unmap latency in the order of 100s of milliseconds. > And hardware team recommends using ASID based invalidations for > anything larger than 128 TLB entries. So yes, we have taken note of > impacts here before going this way and hence feel more inclined to > make this qcom specific if required. Seems like the real issue here is not the unmap API latency. It should be the high number of back to back SMMU TLB invalidate register writes that is resulting in lower ISO BW to Camera and overflow. Isn't it? Even Tegra186 SoC has similar issue and HW team recommended to rate limit the number of back to back SMMU tlb invalidate registers writes. The subsequent Tegra194 SoC has a dedicated SMMU for ISO clients to avoid the impact of TLB invalidates from NISO clients on ISO BW. Not exactly, this issue is not specific to camera. If you look at the numbers in the commit text, even for the test device its the same observation. It depends on the buffer size we are unmapping which affects the number of TLBIs issue. I am not aware of any such HW side bw issues for camera specifically on QCOM devices. Thanks, Sai Thinking some more, I wonder if the Tegra folks might have an opinion to add here, given that their multiple-SMMU solution was seemingly about trying to get enough TLB and pagetable walk bandwidth in the first place? While it is good to reduce the number of tlb register writes, Flushing all TLB entries at context granularity arbitrarily can have negative impact on active traffic and BW. I don't have much data on possible impact at this point. Can the flushing at context granularity be made a quirk than performing it as default? -KR -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. 
is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH] iommu/io-pgtable-arm: Optimize partial walk flush for large scatter-gather list
Hi Robin, On 2021-06-10 20:59, Robin Murphy wrote: On 2021-06-10 12:54, Sai Prakash Ranjan wrote: Hi Robin, On 2021-06-10 17:03, Robin Murphy wrote: On 2021-06-10 10:36, Sai Prakash Ranjan wrote: Hi Robin, On 2021-06-10 14:38, Robin Murphy wrote: On 2021-06-10 06:24, Sai Prakash Ranjan wrote: Hi Robin, On 2021-06-10 00:14, Robin Murphy wrote: On 2021-06-09 15:53, Sai Prakash Ranjan wrote: Currently for iommu_unmap() of large scatter-gather list with page size elements, the majority of time is spent in flushing of partial walks in __arm_lpae_unmap() which is a VA based TLB invalidation (TLBIVA for arm-smmu). For example: to unmap a 32MB scatter-gather list with page size elements (8192 entries), there are 16->2MB buffer unmaps based on the pgsize (2MB for 4K granule) and each of 2MB will further result in 512 TLBIVAs (2MB/4K) resulting in a total of 8192 TLBIVAs (512*16) for 16->2MB causing a huge overhead. So instead use io_pgtable_tlb_flush_all() to invalidate the entire context if size (pgsize) is greater than the granule size (4K, 16K, 64K). For this example of 32MB scatter-gather list unmap, this results in just 16 ASID based TLB invalidations or tlb_flush_all() callback (TLBIASID in case of arm-smmu) as opposed to 8192 TLBIVAs thereby increasing the performance of unmaps drastically. Condition (size > granule size) is chosen for io_pgtable_tlb_flush_all() because for any granule with supported pgsizes, we will have at least 512 TLB invalidations for which tlb_flush_all() is already recommended. For example, take 4K granule with 2MB pgsize, this will result in 512 TLBIVA in partial walk flush. 
Test on QTI SM8150 SoC for 10 iterations of iommu_{map_sg}/unmap: (average over 10 iterations) Before this optimization: size iommu_map_sg iommu_unmap 4K 2.067 us 1.854 us 64K 9.598 us 8.802 us 1M 148.890 us 130.718 us 2M 305.864 us 67.291 us 12M 1793.604 us 390.838 us 16M 2386.848 us 518.187 us 24M 3563.296 us 775.989 us 32M 4747.171 us 1033.364 us After this optimization: size iommu_map_sg iommu_unmap 4K 1.723 us 1.765 us 64K 9.880 us 8.869 us 1M 155.364 us 135.223 us 2M 303.906 us 5.385 us 12M 1786.557 us 21.250 us 16M 2391.890 us 27.437 us 24M 3570.895 us 39.937 us 32M 4755.234 us 51.797 us This is further reduced once the map/unmap_pages() support gets in which will result in just 1 tlb_flush_all() as opposed to 16 tlb_flush_all(). Signed-off-by: Sai Prakash Ranjan --- drivers/iommu/io-pgtable-arm.c | 7 +-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index 87def58e79b5..c3cb9add3179 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -589,8 +589,11 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data, if (!iopte_leaf(pte, lvl, iop->fmt)) { /* Also flush any partial walks */ - io_pgtable_tlb_flush_walk(iop, iova, size, - ARM_LPAE_GRANULE(data)); + if (size > ARM_LPAE_GRANULE(data)) + io_pgtable_tlb_flush_all(iop); + else Erm, when will the above condition ever not be true? ;) Ah right, silly me :) Taking a step back, though, what about the impact to drivers other than SMMUv2? Other drivers would be msm_iommu.c, qcom_iommu.c which does the same thing as arm-smmu-v2 (page based invalidations), then there is ipmmu-vmsa.c which does tlb_flush_all() for flush walk. In particular I'm thinking of SMMUv3.2 where the whole range can be invalidated by VA in a single command anyway, so the additional penalties of TLBIALL are undesirable. 
Right, so I am thinking we can have a new generic quirk IO_PGTABLE_QUIRK_RANGE_INV to choose between range based invalidations(tlb_flush_walk) and tlb_flush_all(). In this case of arm-smmu-v3.2, we can tie up ARM_SMMU_FEAT_RANGE_INV with this quirk and have something like below, thoughts? if (iop->cfg.quirks & IO_PGTABLE_QUIRK_RANGE_INV) io_pgtable_tlb_flush_walk(iop, iova, size, ARM_LPAE_GRANULE(data)); else io_pgtable_tlb_flush_all(iop); The design here has always been that io-pgtable says *what* needs invalidating, and we left it up to the drivers to decide exactly *how*. Even though things have evolved a bit I don't think that has fundamentally changed - tlb_flush_walk is now only used in this one place (technically I suppose it could be renamed tlb_flush_table but it's not worth the churn), so driv
Re: [PATCH] iommu/io-pgtable-arm: Optimize partial walk flush for large scatter-gather list
Hi Robin, On 2021-06-10 17:03, Robin Murphy wrote: On 2021-06-10 10:36, Sai Prakash Ranjan wrote: Hi Robin, On 2021-06-10 14:38, Robin Murphy wrote: On 2021-06-10 06:24, Sai Prakash Ranjan wrote: Hi Robin, On 2021-06-10 00:14, Robin Murphy wrote: On 2021-06-09 15:53, Sai Prakash Ranjan wrote: Currently for iommu_unmap() of large scatter-gather list with page size elements, the majority of time is spent in flushing of partial walks in __arm_lpae_unmap() which is a VA based TLB invalidation (TLBIVA for arm-smmu). For example: to unmap a 32MB scatter-gather list with page size elements (8192 entries), there are 16->2MB buffer unmaps based on the pgsize (2MB for 4K granule) and each of 2MB will further result in 512 TLBIVAs (2MB/4K) resulting in a total of 8192 TLBIVAs (512*16) for 16->2MB causing a huge overhead. So instead use io_pgtable_tlb_flush_all() to invalidate the entire context if size (pgsize) is greater than the granule size (4K, 16K, 64K). For this example of 32MB scatter-gather list unmap, this results in just 16 ASID based TLB invalidations or tlb_flush_all() callback (TLBIASID in case of arm-smmu) as opposed to 8192 TLBIVAs thereby increasing the performance of unmaps drastically. Condition (size > granule size) is chosen for io_pgtable_tlb_flush_all() because for any granule with supported pgsizes, we will have at least 512 TLB invalidations for which tlb_flush_all() is already recommended. For example, take 4K granule with 2MB pgsize, this will result in 512 TLBIVA in partial walk flush. 
Test on QTI SM8150 SoC for 10 iterations of iommu_{map_sg}/unmap: (average over 10 iterations) Before this optimization: size iommu_map_sg iommu_unmap 4K 2.067 us 1.854 us 64K 9.598 us 8.802 us 1M 148.890 us 130.718 us 2M 305.864 us 67.291 us 12M 1793.604 us 390.838 us 16M 2386.848 us 518.187 us 24M 3563.296 us 775.989 us 32M 4747.171 us 1033.364 us After this optimization: size iommu_map_sg iommu_unmap 4K 1.723 us 1.765 us 64K 9.880 us 8.869 us 1M 155.364 us 135.223 us 2M 303.906 us 5.385 us 12M 1786.557 us 21.250 us 16M 2391.890 us 27.437 us 24M 3570.895 us 39.937 us 32M 4755.234 us 51.797 us This is further reduced once the map/unmap_pages() support gets in which will result in just 1 tlb_flush_all() as opposed to 16 tlb_flush_all(). Signed-off-by: Sai Prakash Ranjan --- drivers/iommu/io-pgtable-arm.c | 7 +-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index 87def58e79b5..c3cb9add3179 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -589,8 +589,11 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data, if (!iopte_leaf(pte, lvl, iop->fmt)) { /* Also flush any partial walks */ - io_pgtable_tlb_flush_walk(iop, iova, size, - ARM_LPAE_GRANULE(data)); + if (size > ARM_LPAE_GRANULE(data)) + io_pgtable_tlb_flush_all(iop); + else Erm, when will the above condition ever not be true? ;) Ah right, silly me :) Taking a step back, though, what about the impact to drivers other than SMMUv2? Other drivers would be msm_iommu.c, qcom_iommu.c which does the same thing as arm-smmu-v2 (page based invalidations), then there is ipmmu-vmsa.c which does tlb_flush_all() for flush walk. In particular I'm thinking of SMMUv3.2 where the whole range can be invalidated by VA in a single command anyway, so the additional penalties of TLBIALL are undesirable. 
Right, so I am thinking we can have a new generic quirk IO_PGTABLE_QUIRK_RANGE_INV to choose between range based invalidations(tlb_flush_walk) and tlb_flush_all(). In this case of arm-smmu-v3.2, we can tie up ARM_SMMU_FEAT_RANGE_INV with this quirk and have something like below, thoughts? if (iop->cfg.quirks & IO_PGTABLE_QUIRK_RANGE_INV) io_pgtable_tlb_flush_walk(iop, iova, size, ARM_LPAE_GRANULE(data)); else io_pgtable_tlb_flush_all(iop); The design here has always been that io-pgtable says *what* needs invalidating, and we left it up to the drivers to decide exactly *how*. Even though things have evolved a bit I don't think that has fundamentally changed - tlb_flush_walk is now only used in this one place (technically I suppose it could be renamed tlb_flush_table but it's not worth the churn), so drivers can implement their own preferred table-invalidating behaviour even more easily than choosing whether
Re: [PATCH] iommu/io-pgtable-arm: Optimize partial walk flush for large scatter-gather list
Hi Robin, On 2021-06-10 14:38, Robin Murphy wrote: On 2021-06-10 06:24, Sai Prakash Ranjan wrote: Hi Robin, On 2021-06-10 00:14, Robin Murphy wrote: On 2021-06-09 15:53, Sai Prakash Ranjan wrote: Currently for iommu_unmap() of large scatter-gather list with page size elements, the majority of time is spent in flushing of partial walks in __arm_lpae_unmap() which is a VA based TLB invalidation (TLBIVA for arm-smmu). For example: to unmap a 32MB scatter-gather list with page size elements (8192 entries), there are 16->2MB buffer unmaps based on the pgsize (2MB for 4K granule) and each of 2MB will further result in 512 TLBIVAs (2MB/4K) resulting in a total of 8192 TLBIVAs (512*16) for 16->2MB causing a huge overhead. So instead use io_pgtable_tlb_flush_all() to invalidate the entire context if size (pgsize) is greater than the granule size (4K, 16K, 64K). For this example of 32MB scatter-gather list unmap, this results in just 16 ASID based TLB invalidations or tlb_flush_all() callback (TLBIASID in case of arm-smmu) as opposed to 8192 TLBIVAs thereby increasing the performance of unmaps drastically. Condition (size > granule size) is chosen for io_pgtable_tlb_flush_all() because for any granule with supported pgsizes, we will have at least 512 TLB invalidations for which tlb_flush_all() is already recommended. For example, take 4K granule with 2MB pgsize, this will result in 512 TLBIVA in partial walk flush. 
Test on QTI SM8150 SoC for 10 iterations of iommu_{map_sg}/unmap: (average over 10 iterations) Before this optimization: size iommu_map_sg iommu_unmap 4K 2.067 us 1.854 us 64K 9.598 us 8.802 us 1M 148.890 us 130.718 us 2M 305.864 us 67.291 us 12M 1793.604 us 390.838 us 16M 2386.848 us 518.187 us 24M 3563.296 us 775.989 us 32M 4747.171 us 1033.364 us After this optimization: size iommu_map_sg iommu_unmap 4K 1.723 us 1.765 us 64K 9.880 us 8.869 us 1M 155.364 us 135.223 us 2M 303.906 us 5.385 us 12M 1786.557 us 21.250 us 16M 2391.890 us 27.437 us 24M 3570.895 us 39.937 us 32M 4755.234 us 51.797 us This is further reduced once the map/unmap_pages() support gets in which will result in just 1 tlb_flush_all() as opposed to 16 tlb_flush_all(). Signed-off-by: Sai Prakash Ranjan --- drivers/iommu/io-pgtable-arm.c | 7 +-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index 87def58e79b5..c3cb9add3179 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -589,8 +589,11 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data, if (!iopte_leaf(pte, lvl, iop->fmt)) { /* Also flush any partial walks */ - io_pgtable_tlb_flush_walk(iop, iova, size, - ARM_LPAE_GRANULE(data)); + if (size > ARM_LPAE_GRANULE(data)) + io_pgtable_tlb_flush_all(iop); + else Erm, when will the above condition ever not be true? ;) Ah right, silly me :) Taking a step back, though, what about the impact to drivers other than SMMUv2? Other drivers would be msm_iommu.c, qcom_iommu.c which does the same thing as arm-smmu-v2 (page based invalidations), then there is ipmmu-vmsa.c which does tlb_flush_all() for flush walk. In particular I'm thinking of SMMUv3.2 where the whole range can be invalidated by VA in a single command anyway, so the additional penalties of TLBIALL are undesirable. 
Right, so I am thinking we can have a new generic quirk IO_PGTABLE_QUIRK_RANGE_INV to choose between range based invalidations(tlb_flush_walk) and tlb_flush_all(). In this case of arm-smmu-v3.2, we can tie up ARM_SMMU_FEAT_RANGE_INV with this quirk and have something like below, thoughts? if (iop->cfg.quirks & IO_PGTABLE_QUIRK_RANGE_INV) io_pgtable_tlb_flush_walk(iop, iova, size, ARM_LPAE_GRANULE(data)); else io_pgtable_tlb_flush_all(iop); The design here has always been that io-pgtable says *what* needs invalidating, and we left it up to the drivers to decide exactly *how*. Even though things have evolved a bit I don't think that has fundamentally changed - tlb_flush_walk is now only used in this one place (technically I suppose it could be renamed tlb_flush_table but it's not worth the churn), so drivers can implement their own preferred table-invalidating behaviour even more easily than choosing whether to bounce a quirk through the common code or not. Consider what you've already seen for the Renesas
Re: [PATCH] iommu/io-pgtable-arm: Optimize partial walk flush for large scatter-gather list
Hi Robin, On 2021-06-10 00:14, Robin Murphy wrote: On 2021-06-09 15:53, Sai Prakash Ranjan wrote: Currently for iommu_unmap() of large scatter-gather list with page size elements, the majority of time is spent in flushing of partial walks in __arm_lpae_unmap() which is a VA based TLB invalidation (TLBIVA for arm-smmu). For example: to unmap a 32MB scatter-gather list with page size elements (8192 entries), there are 16->2MB buffer unmaps based on the pgsize (2MB for 4K granule) and each of 2MB will further result in 512 TLBIVAs (2MB/4K) resulting in a total of 8192 TLBIVAs (512*16) for 16->2MB causing a huge overhead. So instead use io_pgtable_tlb_flush_all() to invalidate the entire context if size (pgsize) is greater than the granule size (4K, 16K, 64K). For this example of 32MB scatter-gather list unmap, this results in just 16 ASID based TLB invalidations or tlb_flush_all() callback (TLBIASID in case of arm-smmu) as opposed to 8192 TLBIVAs thereby increasing the performance of unmaps drastically. Condition (size > granule size) is chosen for io_pgtable_tlb_flush_all() because for any granule with supported pgsizes, we will have at least 512 TLB invalidations for which tlb_flush_all() is already recommended. For example, take 4K granule with 2MB pgsize, this will result in 512 TLBIVA in partial walk flush. 
Test on QTI SM8150 SoC for 10 iterations of iommu_{map_sg}/unmap: (average over 10 iterations) Before this optimization: sizeiommu_map_sg iommu_unmap 4K2.067 us 1.854 us 64K9.598 us 8.802 us 1M 148.890 us 130.718 us 2M 305.864 us67.291 us 12M 1793.604 us 390.838 us 16M 2386.848 us 518.187 us 24M 3563.296 us 775.989 us 32M 4747.171 us 1033.364 us After this optimization: sizeiommu_map_sg iommu_unmap 4K1.723 us 1.765 us 64K9.880 us 8.869 us 1M 155.364 us 135.223 us 2M 303.906 us 5.385 us 12M 1786.557 us21.250 us 16M 2391.890 us27.437 us 24M 3570.895 us39.937 us 32M 4755.234 us51.797 us This is further reduced once the map/unmap_pages() support gets in which will result in just 1 tlb_flush_all() as opposed to 16 tlb_flush_all(). Signed-off-by: Sai Prakash Ranjan --- drivers/iommu/io-pgtable-arm.c | 7 +-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index 87def58e79b5..c3cb9add3179 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -589,8 +589,11 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data, if (!iopte_leaf(pte, lvl, iop->fmt)) { /* Also flush any partial walks */ - io_pgtable_tlb_flush_walk(iop, iova, size, - ARM_LPAE_GRANULE(data)); + if (size > ARM_LPAE_GRANULE(data)) + io_pgtable_tlb_flush_all(iop); + else Erm, when will the above condition ever not be true? ;) Ah right, silly me :) Taking a step back, though, what about the impact to drivers other than SMMUv2? Other drivers would be msm_iommu.c, qcom_iommu.c which does the same thing as arm-smmu-v2 (page based invalidations), then there is ipmmu-vmsa.c which does tlb_flush_all() for flush walk. In particular I'm thinking of SMMUv3.2 where the whole range can be invalidated by VA in a single command anyway, so the additional penalties of TLBIALL are undesirable. 
Right, so I am thinking we can have a new generic quirk IO_PGTABLE_QUIRK_RANGE_INV to choose between range based invalidations(tlb_flush_walk) and tlb_flush_all(). In this case of arm-smmu-v3.2, we can tie up ARM_SMMU_FEAT_RANGE_INV with this quirk and have something like below, thoughts? if (iop->cfg.quirks & IO_PGTABLE_QUIRK_RANGE_INV) io_pgtable_tlb_flush_walk(iop, iova, size, ARM_LPAE_GRANULE(data)); else io_pgtable_tlb_flush_all(iop); Thanks, Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH] iommu/io-pgtable-arm: Optimize partial walk flush for large scatter-gather list
Currently for iommu_unmap() of large scatter-gather list with page size elements, the majority of time is spent in flushing of partial walks in __arm_lpae_unmap() which is a VA based TLB invalidation (TLBIVA for arm-smmu). For example: to unmap a 32MB scatter-gather list with page size elements (8192 entries), there are 16->2MB buffer unmaps based on the pgsize (2MB for 4K granule) and each of 2MB will further result in 512 TLBIVAs (2MB/4K) resulting in a total of 8192 TLBIVAs (512*16) for 16->2MB causing a huge overhead. So instead use io_pgtable_tlb_flush_all() to invalidate the entire context if size (pgsize) is greater than the granule size (4K, 16K, 64K). For this example of 32MB scatter-gather list unmap, this results in just 16 ASID based TLB invalidations or tlb_flush_all() callback (TLBIASID in case of arm-smmu) as opposed to 8192 TLBIVAs thereby increasing the performance of unmaps drastically. Condition (size > granule size) is chosen for io_pgtable_tlb_flush_all() because for any granule with supported pgsizes, we will have at least 512 TLB invalidations for which tlb_flush_all() is already recommended. For example, take 4K granule with 2MB pgsize, this will result in 512 TLBIVA in partial walk flush. Test on QTI SM8150 SoC for 10 iterations of iommu_{map_sg}/unmap: (average over 10 iterations) Before this optimization: sizeiommu_map_sg iommu_unmap 4K2.067 us 1.854 us 64K9.598 us 8.802 us 1M 148.890 us 130.718 us 2M 305.864 us67.291 us 12M 1793.604 us 390.838 us 16M 2386.848 us 518.187 us 24M 3563.296 us 775.989 us 32M 4747.171 us 1033.364 us After this optimization: sizeiommu_map_sg iommu_unmap 4K1.723 us 1.765 us 64K9.880 us 8.869 us 1M 155.364 us 135.223 us 2M 303.906 us 5.385 us 12M 1786.557 us21.250 us 16M 2391.890 us27.437 us 24M 3570.895 us39.937 us 32M 4755.234 us51.797 us This is further reduced once the map/unmap_pages() support gets in which will result in just 1 tlb_flush_all() as opposed to 16 tlb_flush_all(). 
Signed-off-by: Sai Prakash Ranjan --- drivers/iommu/io-pgtable-arm.c | 7 +-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index 87def58e79b5..c3cb9add3179 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -589,8 +589,11 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data, if (!iopte_leaf(pte, lvl, iop->fmt)) { /* Also flush any partial walks */ - io_pgtable_tlb_flush_walk(iop, iova, size, - ARM_LPAE_GRANULE(data)); + if (size > ARM_LPAE_GRANULE(data)) + io_pgtable_tlb_flush_all(iop); + else + io_pgtable_tlb_flush_walk(iop, iova, size, + ARM_LPAE_GRANULE(data)); ptep = iopte_deref(pte, data); __arm_lpae_free_pgtable(data, lvl + 1, ptep); } else if (iop->cfg.quirks & IO_PGTABLE_QUIRK_NON_STRICT) { -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCHv3 0/2] iommu/arm-smmu-qcom: Add SC7280 support
On 2021-06-08 17:31, Will Deacon wrote: On Tue, Apr 20, 2021 at 11:34:55AM +0530, Sai Prakash Ranjan wrote: Patch 1 adds the sc7280 smmu compatible. Patch 2 moves the adreno smmu check before apss smmu to enable adreno smmu specific implementation. Note that dt-binding for sc7280 is already merged. This conflicts with what I've already got queued at [1]. Please can you send an updated version, as I wasn't sure about the initialisation order you need here wrt to the ACPI parts. Thanks, Will [1] https://git.kernel.org/pub/scm/linux/kernel/git/will/linux.git/log/?h=for-joerg/arm-smmu/updates Sure, have rebased and sent the updated patch [1] after testing for the order. Thanks, Sai [1] https://lore.kernel.org/lkml/cover.1623155117.git.saiprakash.ran...@codeaurora.org/ -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCHv4 1/2] iommu/arm-smmu-qcom: Add SC7280 SMMU compatible
Add compatible for SC7280 SMMU to use the Qualcomm Technologies, Inc. specific implementation. Signed-off-by: Sai Prakash Ranjan Reviewed-by: Bjorn Andersson --- drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c index 6f70f0e57c64..e93b5dbda7de 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c @@ -178,6 +178,7 @@ static const struct of_device_id qcom_smmu_client_of_match[] __maybe_unused = { { .compatible = "qcom,mdss" }, { .compatible = "qcom,sc7180-mdss" }, { .compatible = "qcom,sc7180-mss-pil" }, + { .compatible = "qcom,sc7280-mdss" }, { .compatible = "qcom,sc8180x-mdss" }, { .compatible = "qcom,sdm845-mdss" }, { .compatible = "qcom,sdm845-mss-pil" }, @@ -342,6 +343,7 @@ static struct arm_smmu_device *qcom_smmu_create(struct arm_smmu_device *smmu, static const struct of_device_id __maybe_unused qcom_smmu_impl_of_match[] = { { .compatible = "qcom,msm8998-smmu-v2" }, { .compatible = "qcom,sc7180-smmu-500" }, + { .compatible = "qcom,sc7280-smmu-500" }, { .compatible = "qcom,sc8180x-smmu-500" }, { .compatible = "qcom,sdm630-smmu-v2" }, { .compatible = "qcom,sdm845-smmu-500" }, -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCHv4 2/2] iommu/arm-smmu-qcom: Move the adreno smmu specific impl
Adreno(GPU) SMMU and APSS(Application Processor SubSystem) SMMU both implement "arm,mmu-500" in some QTI SoCs and to run through adreno smmu specific implementation such as enabling split pagetables support, we need to match the "qcom,adreno-smmu" compatible first before apss smmu or else we will be running apps smmu implementation for adreno smmu and the additional features for adreno smmu is never set. For ex: we have "qcom,sc7280-smmu-500" compatible for both apps and adreno smmu implementing "arm,mmu-500", so the adreno smmu implementation is never reached because the current sequence checks for apps smmu compatible(qcom,sc7280-smmu-500) first and runs that specific impl and we never reach adreno smmu specific implementation. Suggested-by: Akhil P Oommen Signed-off-by: Sai Prakash Ranjan Reviewed-by: Bjorn Andersson Acked-by: Jordan Crouse --- drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 12 +--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c index e93b5dbda7de..83c32566bf64 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c @@ -370,11 +370,17 @@ struct arm_smmu_device *qcom_smmu_impl_init(struct arm_smmu_device *smmu) return qcom_smmu_create(smmu, &qcom_smmu_impl); } - if (of_match_node(qcom_smmu_impl_of_match, np)) - return qcom_smmu_create(smmu, &qcom_smmu_impl); - + /* +* Do not change this order of implementation, i.e., first adreno +* smmu impl and then apss smmu since we can have both implementing +* arm,mmu-500 in which case we will miss setting adreno smmu specific +* features if the order is changed. +*/ if (of_device_is_compatible(np, "qcom,adreno-smmu")) return qcom_smmu_create(smmu, &qcom_adreno_smmu_impl); + if (of_match_node(qcom_smmu_impl_of_match, np)) + return qcom_smmu_create(smmu, &qcom_smmu_impl); + return smmu; } -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. 
is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCHv4 0/2] iommu/arm-smmu-qcom: Add SC7280 support
Patch 1 adds the sc7280 smmu compatible. Patch 2 moves the adreno smmu check before apss smmu to enable adreno smmu specific implementation. Note that dt-binding for sc7280 is already merged. Changes in v4: * Rebased on top of arm-smmu/updates with acpi changes. Changes in v3: * Collect acks and reviews * Rebase on top of for-joerg/arm-smmu/updates Changes in v2: * Add a comment to make sure this order is not changed in future (Jordan) Sai Prakash Ranjan (2): iommu/arm-smmu-qcom: Add SC7280 SMMU compatible iommu/arm-smmu-qcom: Move the adreno smmu specific impl drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 14 +++--- 1 file changed, 11 insertions(+), 3 deletions(-) -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCHv3 0/2] iommu/arm-smmu-qcom: Add SC7280 support
Hi Will, On 2021-05-24 08:13, Sai Prakash Ranjan wrote: Hi Will, On 2021-04-20 11:34, Sai Prakash Ranjan wrote: Patch 1 adds the sc7280 smmu compatible. Patch 2 moves the adreno smmu check before apss smmu to enable adreno smmu specific implementation. Note that dt-binding for sc7280 is already merged. Changes in v3: * Collect acks and reviews * Rebase on top of for-joerg/arm-smmu/updates Changes in v2: * Add a comment to make sure this order is not changed in future (Jordan) Sai Prakash Ranjan (2): iommu/arm-smmu-qcom: Add SC7280 SMMU compatible iommu/arm-smmu-qcom: Move the adreno smmu specific impl earlier drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 14 +++--- 1 file changed, 11 insertions(+), 3 deletions(-) Gentle Ping! Is this going to be taken for 5.14 or needs one more release cycle? Thanks, Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCHv3 0/2] iommu/arm-smmu-qcom: Add SC7280 support
Hi Will, On 2021-04-20 11:34, Sai Prakash Ranjan wrote: Patch 1 adds the sc7280 smmu compatible. Patch 2 moves the adreno smmu check before apss smmu to enable adreno smmu specific implementation. Note that dt-binding for sc7280 is already merged. Changes in v3: * Collect acks and reviews * Rebase on top of for-joerg/arm-smmu/updates Changes in v2: * Add a comment to make sure this order is not changed in future (Jordan) Sai Prakash Ranjan (2): iommu/arm-smmu-qcom: Add SC7280 SMMU compatible iommu/arm-smmu-qcom: Move the adreno smmu specific impl earlier drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 14 +++--- 1 file changed, 11 insertions(+), 3 deletions(-) Gentle Ping! Thanks, Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCHv2 2/2] iommu/arm-smmu-qcom: Move the adreno smmu specific impl earlier
On 2021-04-19 20:08, Bjorn Andersson wrote: On Fri 26 Feb 03:55 CST 2021, Sai Prakash Ranjan wrote: Adreno(GPU) SMMU and APSS(Application Processor SubSystem) SMMU both implement "arm,mmu-500" in some QTI SoCs and to run through adreno smmu specific implementation such as enabling split pagetables support, we need to match the "qcom,adreno-smmu" compatible first before apss smmu or else we will be running apps smmu implementation for adreno smmu and the additional features for adreno smmu is never set. For ex: we have "qcom,sc7280-smmu-500" compatible for both apps and adreno smmu implementing "arm,mmu-500", so the adreno smmu implementation is never reached because the current sequence checks for apps smmu compatible(qcom,sc7280-smmu-500) first and runs that specific impl and we never reach adreno smmu specific implementation. Suggested-by: Akhil P Oommen Signed-off-by: Sai Prakash Ranjan Sorry for taking my time thinking about this. Reviewed-by: Bjorn Andersson No worries, thanks Bjorn. -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCHv3 2/2] iommu/arm-smmu-qcom: Move the adreno smmu specific impl earlier
Adreno(GPU) SMMU and APSS(Application Processor SubSystem) SMMU both implement "arm,mmu-500" in some QTI SoCs and to run through adreno smmu specific implementation such as enabling split pagetables support, we need to match the "qcom,adreno-smmu" compatible first before apss smmu or else we will be running apps smmu implementation for adreno smmu and the additional features for adreno smmu is never set. For ex: we have "qcom,sc7280-smmu-500" compatible for both apps and adreno smmu implementing "arm,mmu-500", so the adreno smmu implementation is never reached because the current sequence checks for apps smmu compatible(qcom,sc7280-smmu-500) first and runs that specific impl and we never reach adreno smmu specific implementation. Suggested-by: Akhil P Oommen Signed-off-by: Sai Prakash Ranjan Reviewed-by: Bjorn Andersson Acked-by: Jordan Crouse --- drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 12 +--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c index bea3ee0dabc2..03f048aebb80 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c @@ -345,11 +345,17 @@ struct arm_smmu_device *qcom_smmu_impl_init(struct arm_smmu_device *smmu) { const struct device_node *np = smmu->dev->of_node; - if (of_match_node(qcom_smmu_impl_of_match, np)) - return qcom_smmu_create(smmu, &qcom_smmu_impl); - + /* +* Do not change this order of implementation, i.e., first adreno +* smmu impl and then apss smmu since we can have both implementing +* arm,mmu-500 in which case we will miss setting adreno smmu specific +* features if the order is changed. +*/ if (of_device_is_compatible(np, "qcom,adreno-smmu")) return qcom_smmu_create(smmu, &qcom_adreno_smmu_impl); + if (of_match_node(qcom_smmu_impl_of_match, np)) + return qcom_smmu_create(smmu, &qcom_smmu_impl); + return smmu; } -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. 
is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCHv3 1/2] iommu/arm-smmu-qcom: Add SC7280 SMMU compatible
Add compatible for SC7280 SMMU to use the Qualcomm Technologies, Inc. specific implementation. Signed-off-by: Sai Prakash Ranjan --- drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c index 98b3a1c2a181..bea3ee0dabc2 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c @@ -166,6 +166,7 @@ static const struct of_device_id qcom_smmu_client_of_match[] __maybe_unused = { { .compatible = "qcom,mdss" }, { .compatible = "qcom,sc7180-mdss" }, { .compatible = "qcom,sc7180-mss-pil" }, + { .compatible = "qcom,sc7280-mdss" }, { .compatible = "qcom,sc8180x-mdss" }, { .compatible = "qcom,sdm845-mdss" }, { .compatible = "qcom,sdm845-mss-pil" }, @@ -330,6 +331,7 @@ static struct arm_smmu_device *qcom_smmu_create(struct arm_smmu_device *smmu, static const struct of_device_id __maybe_unused qcom_smmu_impl_of_match[] = { { .compatible = "qcom,msm8998-smmu-v2" }, { .compatible = "qcom,sc7180-smmu-500" }, + { .compatible = "qcom,sc7280-smmu-500" }, { .compatible = "qcom,sc8180x-smmu-500" }, { .compatible = "qcom,sdm630-smmu-v2" }, { .compatible = "qcom,sdm845-smmu-500" }, -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCHv3 0/2] iommu/arm-smmu-qcom: Add SC7280 support
Patch 1 adds the sc7280 smmu compatible. Patch 2 moves the adreno smmu check before apss smmu to enable adreno smmu specific implementation. Note that dt-binding for sc7280 is already merged. Changes in v3: * Collect acks and reviews * Rebase on top of for-joerg/arm-smmu/updates Changes in v2: * Add a comment to make sure this order is not changed in future (Jordan) Sai Prakash Ranjan (2): iommu/arm-smmu-qcom: Add SC7280 SMMU compatible iommu/arm-smmu-qcom: Move the adreno smmu specific impl earlier drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 14 +++--- 1 file changed, 11 insertions(+), 3 deletions(-) -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCHv2 2/2] iommu/arm-smmu-qcom: Move the adreno smmu specific impl earlier
On 2021-04-05 14:12, Sai Prakash Ranjan wrote: Hi Bjorn, On 2021-03-25 20:35, Will Deacon wrote: On Thu, Mar 25, 2021 at 01:10:12PM +0530, Sai Prakash Ranjan wrote: ... I think there is consensus on this series. I can resend if required but it still applies cleanly, let me know if you have any comments? Please resend with the bindings patch, and I'd like Bjorn's Ack as well. Can we have your review/ack in case there is nothing pending here? Gentle Ping! Thanks, Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCHv2 2/2] iommu/arm-smmu-qcom: Move the adreno smmu specific impl earlier
Hi Bjorn, On 2021-03-25 20:35, Will Deacon wrote: On Thu, Mar 25, 2021 at 01:10:12PM +0530, Sai Prakash Ranjan wrote: ... I think there is consensus on this series. I can resend if required but it still applies cleanly, let me know if you have any comments? Please resend with the bindings patch, and I'd like Bjorn's Ack as well. Can we have your review/ack in case there is nothing pending here? Thanks, Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCHv2 2/2] iommu/arm-smmu-qcom: Move the adreno smmu specific impl earlier
Hi Will, On 2021-03-15 00:31, Sai Prakash Ranjan wrote: On 2021-03-12 04:59, Bjorn Andersson wrote: On Sat 27 Feb 07:53 CST 2021, Sai Prakash Ranjan wrote: Hi Bjorn, On 2021-02-27 00:44, Bjorn Andersson wrote: > On Fri 26 Feb 12:23 CST 2021, Rob Clark wrote: > > > The current logic picks one of: > 1) is the compatible mentioned in qcom_smmu_impl_of_match[] > 2) is the compatible an adreno > 3) no quirks needed > > The change flips the order of these, so the only way I can see this > change affecting things is if we expected a match on #2, but we got one > on #1. > > Which implies that the instance that we want to act according to the > adreno impl was listed in qcom_smmu_impl_of_match[] - which either is > wrong, or there's a single instance that needs both behaviors. > > (And I believe Jordan's answer confirms the latter - there's a single > SMMU instance that needs all them quirks at once) > Let me go through the problem statement in case my commit message wasn't clear. There are two SMMUs (APSS and GPU) on SC7280 and both are SMMU500 (ARM SMMU IP). APSS SMMU compatible - ("qcom,sc7280-smmu-500", "arm,mmu-500") GPU SMMU compatible - ("qcom,sc7280-smmu-500", "qcom,adreno-smmu", "arm,mmu-500") Now if we take SC7180 as an example, GPU SMMU was QSMMU(QCOM SMMU IP) and APSS SMMU was SMMU500(ARM SMMU IP). APSS SMMU compatible - ("qcom,sc7180-smmu-500", "arm,mmu-500") GPU SMMU compatible - ("qcom,sc7180-smmu-v2", "qcom,adreno-smmu", "qcom,smmu-v2") Current code sequence without this patch, if (of_match_node(qcom_smmu_impl_of_match, np)) return qcom_smmu_create(smmu, &qcom_smmu_impl); if (of_device_is_compatible(np, "qcom,adreno-smmu")) return qcom_smmu_create(smmu, &qcom_adreno_smmu_impl); Now if we look at the compatible for SC7180, there is no problem because the APSS SMMU will match the one in qcom_smmu_impl_of_match[] and GPU SMMU will match "qcom,adreno-smmu" because the compatible strings are different. 
But for SC7280, both the APSS SMMU and GPU SMMU compatible("qcom,sc7280-smmu-500") are same. So GPU SMMU will match with the one in qcom_smmu_impl_of_match[] i.e.., "qcom,sc7280-smmu-500" which functionally doesn't cause any problem but we will miss settings for split pagetables which are part of GPU SMMU specific implementation. We can avoid this with yet another new compatible for GPU SMMU something like "qcom,sc7280-adreno-smmu-500" but since we can handle this easily in the driver and since the IPs are same, meaning if there was a hardware quirk required, then we would need to apply to both of them and would this additional compatible be of any help? No, I think you're doing the right thing of having them both. I just didn't remember us doing that. Coming to the part of quirks now, you are right saying both SMMUs will need to have the same quirks in SC7280 and similar others where both are based on same IPs but those should probably be *hardware quirks* and if they are software based like the S2CR quirk depending on the firmware, then it might not be applicable to both. In case if it is applicable, then as Jordan mentioned, we can add the same quirks in GPU SMMU implementation. I do suspect that at some point (probably sooner than later) we'd have to support both inheriting of stream from the bootloader and the Adreno "quirks" in the same instance. But for now this is okay to me. Sure, let me know if you or anyone face any issues without it and I will add it. I will resend this series with the dt-bindings patch for sc7280 smmu which wasn't cc'd to smmu folks by mistake. I think there is consensus on this series. I can resend if required but it still applies cleanly, let me know if you have any comments? Thanks, Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCHv2 2/2] iommu/arm-smmu-qcom: Move the adreno smmu specific impl earlier
On 2021-03-12 04:59, Bjorn Andersson wrote: On Sat 27 Feb 07:53 CST 2021, Sai Prakash Ranjan wrote: Hi Bjorn, On 2021-02-27 00:44, Bjorn Andersson wrote: > On Fri 26 Feb 12:23 CST 2021, Rob Clark wrote: > > > The current logic picks one of: > 1) is the compatible mentioned in qcom_smmu_impl_of_match[] > 2) is the compatible an adreno > 3) no quirks needed > > The change flips the order of these, so the only way I can see this > change affecting things is if we expected a match on #2, but we got one > on #1. > > Which implies that the instance that we want to act according to the > adreno impl was listed in qcom_smmu_impl_of_match[] - which either is > wrong, or there's a single instance that needs both behaviors. > > (And I believe Jordan's answer confirms the latter - there's a single > SMMU instance that needs all them quirks at once) > Let me go through the problem statement in case my commit message wasn't clear. There are two SMMUs (APSS and GPU) on SC7280 and both are SMMU500 (ARM SMMU IP). APSS SMMU compatible - ("qcom,sc7280-smmu-500", "arm,mmu-500") GPU SMMU compatible - ("qcom,sc7280-smmu-500", "qcom,adreno-smmu", "arm,mmu-500") Now if we take SC7180 as an example, GPU SMMU was QSMMU(QCOM SMMU IP) and APSS SMMU was SMMU500(ARM SMMU IP). APSS SMMU compatible - ("qcom,sc7180-smmu-500", "arm,mmu-500") GPU SMMU compatible - ("qcom,sc7180-smmu-v2", "qcom,adreno-smmu", "qcom,smmu-v2") Current code sequence without this patch, if (of_match_node(qcom_smmu_impl_of_match, np)) return qcom_smmu_create(smmu, &qcom_smmu_impl); if (of_device_is_compatible(np, "qcom,adreno-smmu")) return qcom_smmu_create(smmu, &qcom_adreno_smmu_impl); Now if we look at the compatible for SC7180, there is no problem because the APSS SMMU will match the one in qcom_smmu_impl_of_match[] and GPU SMMU will match "qcom,adreno-smmu" because the compatible strings are different. But for SC7280, both the APSS SMMU and GPU SMMU compatible("qcom,sc7280-smmu-500") are same. 
So GPU SMMU will match with the one in qcom_smmu_impl_of_match[] i.e.., "qcom,sc7280-smmu-500" which functionally doesn't cause any problem but we will miss settings for split pagetables which are part of GPU SMMU specific implementation. We can avoid this with yet another new compatible for GPU SMMU something like "qcom,sc7280-adreno-smmu-500" but since we can handle this easily in the driver and since the IPs are same, meaning if there was a hardware quirk required, then we would need to apply to both of them and would this additional compatible be of any help? No, I think you're doing the right thing of having them both. I just didn't remember us doing that. Coming to the part of quirks now, you are right saying both SMMUs will need to have the same quirks in SC7280 and similar others where both are based on same IPs but those should probably be *hardware quirks* and if they are software based like the S2CR quirk depending on the firmware, then it might not be applicable to both. In case if it is applicable, then as Jordan mentioned, we can add the same quirks in GPU SMMU implementation. I do suspect that at some point (probably sooner than later) we'd have to support both inheriting of stream from the bootloader and the Adreno "quirks" in the same instance. But for now this is okay to me. Sure, let me know if you or anyone face any issues without it and I will add it. I will resend this series with the dt-bindings patch for sc7280 smmu which wasn't cc'd to smmu folks by mistake. Thanks, Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH 2/3] iommu/io-pgtable-arm: Add IOMMU_LLC page protection flag
Hi, On 2021-02-05 17:38, Sai Prakash Ranjan wrote: On 2021-02-04 03:16, Will Deacon wrote: On Tue, Feb 02, 2021 at 11:56:27AM +0530, Sai Prakash Ranjan wrote: On 2021-02-01 23:50, Jordan Crouse wrote: > On Mon, Feb 01, 2021 at 08:20:44AM -0800, Rob Clark wrote: > > On Mon, Feb 1, 2021 at 3:16 AM Will Deacon wrote: > > > On Fri, Jan 29, 2021 at 03:12:59PM +0530, Sai Prakash Ranjan wrote: > > > > On 2021-01-29 14:35, Will Deacon wrote: > > > > > On Mon, Jan 11, 2021 at 07:45:04PM +0530, Sai Prakash Ranjan wrote: > > > > > > +#define IOMMU_LLC(1 << 6) > > > > > > > > > > On reflection, I'm a bit worried about exposing this because I think it > > > > > will > > > > > introduce a mismatched virtual alias with the CPU (we don't even have a > > > > > MAIR > > > > > set up for this memory type). Now, we also have that issue for the PTW, > > > > > but > > > > > since we always use cache maintenance (i.e. the streaming API) for > > > > > publishing the page-tables to a non-coheren walker, it works out. > > > > > However, > > > > > if somebody expects IOMMU_LLC to be coherent with a DMA API coherent > > > > > allocation, then they're potentially in for a nasty surprise due to the > > > > > mismatched outer-cacheability attributes. > > > > > > > > > > > > > Can't we add the syscached memory type similar to what is done on android? > > > > > > Maybe. How does the GPU driver map these things on the CPU side? > > > > Currently we use writecombine mappings for everything, although there > > are some cases that we'd like to use cached (but have not merged > > patches that would give userspace a way to flush/invalidate) > > > > LLC/system cache doesn't have a relationship with the CPU cache. Its > just a > little accelerator that sits on the connection from the GPU to DDR and > caches > accesses. 
The hint that Sai is suggesting is used to mark the buffers as > 'no-write-allocate' to prevent GPU write operations from being cached in > the LLC > which a) isn't interesting and b) takes up cache space for read > operations. > > Its easiest to think of the LLC as a bonus accelerator that has no cost > for > us to use outside of the unfortunate per buffer hint. > > We do have to worry about the CPU cache w.r.t I/O coherency (which is a > different hint) and in that case we have all of concerns that Will > identified. > For mismatched outer cacheability attributes which Will mentioned, I was referring to [1] in android kernel. I've lost track of the conversation here :/ When the GPU has a buffer mapped with IOMMU_LLC, is the buffer also mapped into the CPU and with what attributes? Rob said "writecombine for everything" -- does that mean ioremap_wc() / MEMREMAP_WC? Rob answered this. Finally, we need to be careful when we use the word "hint" as "allocation hint" has a specific meaning in the architecture, and if we only mismatch on those then we're actually ok. But I think IOMMU_LLC is more than just a hint, since it actually drives eviction policy (i.e. it enables writeback). Sorry for the pedantry, but I just want to make sure we're all talking about the same things! Sorry for the confusion which probably was caused by my mentioning of android, NWA(no write allocate) is an allocation hint which we can ignore for now as it is not introduced yet in upstream. Any chance of taking this forward? We do not want to miss out on small fps gain when the product gets released. Thanks, Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCHv2 2/2] iommu/arm-smmu-qcom: Move the adreno smmu specific impl earlier
Hi Bjorn, On 2021-02-27 00:44, Bjorn Andersson wrote: > On Fri 26 Feb 12:23 CST 2021, Rob Clark wrote: > > > The current logic picks one of: > 1) is the compatible mentioned in qcom_smmu_impl_of_match[] > 2) is the compatible an adreno > 3) no quirks needed > > The change flips the order of these, so the only way I can see this > change affecting things is if we expected a match on #2, but we got one > on #1. > > Which implies that the instance that we want to act according to the > adreno impl was listed in qcom_smmu_impl_of_match[] - which either is > wrong, or there's a single instance that needs both behaviors. > > (And I believe Jordan's answer confirms the latter - there's a single > SMMU instance that needs all them quirks at once) > Let me go through the problem statement in case my commit message wasn't clear. There are two SMMUs (APSS and GPU) on SC7280 and both are SMMU500 (ARM SMMU IP). APSS SMMU compatible - ("qcom,sc7280-smmu-500", "arm,mmu-500") GPU SMMU compatible - ("qcom,sc7280-smmu-500", "qcom,adreno-smmu", "arm,mmu-500") Now if we take SC7180 as an example, GPU SMMU was QSMMU(QCOM SMMU IP) and APSS SMMU was SMMU500(ARM SMMU IP). APSS SMMU compatible - ("qcom,sc7180-smmu-500", "arm,mmu-500") GPU SMMU compatible - ("qcom,sc7180-smmu-v2", "qcom,adreno-smmu", "qcom,smmu-v2") Current code sequence without this patch, if (of_match_node(qcom_smmu_impl_of_match, np)) return qcom_smmu_create(smmu, &qcom_smmu_impl); if (of_device_is_compatible(np, "qcom,adreno-smmu")) return qcom_smmu_create(smmu, &qcom_adreno_smmu_impl); Now if we look at the compatible for SC7180, there is no problem because the APSS SMMU will match the one in qcom_smmu_impl_of_match[] and GPU SMMU will match "qcom,adreno-smmu" because the compatible strings are different. But for SC7280, both the APSS SMMU and GPU SMMU compatible("qcom,sc7280-smmu-500") are same. 
So GPU SMMU will match with the one in qcom_smmu_impl_of_match[] i.e.., "qcom,sc7280-smmu-500" which functionally doesn't cause any problem but we will miss settings for split pagetables which are part of GPU SMMU specific implementation. We can avoid this with yet another new compatible for GPU SMMU something like "qcom,sc7280-adreno-smmu-500" but since we can handle this easily in the driver and since the IPs are same, meaning if there was a hardware quirk required, then we would need to apply to both of them and would this additional compatible be of any help? Coming to the part of quirks now, you are right saying both SMMUs will need to have the same quirks in SC7280 and similar others where both are based on same IPs but those should probably be *hardware quirks* and if they are software based like the S2CR quirk depending on the firmware, then it might not be applicable to both. In case if it is applicable, then as Jordan mentioned, we can add the same quirks in GPU SMMU implementation. Thanks, Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCHv2 2/2] iommu/arm-smmu-qcom: Move the adreno smmu specific impl earlier
Adreno(GPU) SMMU and APSS(Application Processor SubSystem) SMMU both implement "arm,mmu-500" in some QTI SoCs and to run through adreno smmu specific implementation such as enabling split pagetables support, we need to match the "qcom,adreno-smmu" compatible first before apss smmu or else we will be running apps smmu implementation for adreno smmu and the additional features for adreno smmu is never set. For ex: we have "qcom,sc7280-smmu-500" compatible for both apps and adreno smmu implementing "arm,mmu-500", so the adreno smmu implementation is never reached because the current sequence checks for apps smmu compatible(qcom,sc7280-smmu-500) first and runs that specific impl and we never reach adreno smmu specific implementation. Suggested-by: Akhil P Oommen Signed-off-by: Sai Prakash Ranjan --- drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 12 +--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c index bea3ee0dabc2..03f048aebb80 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c @@ -345,11 +345,17 @@ struct arm_smmu_device *qcom_smmu_impl_init(struct arm_smmu_device *smmu) { const struct device_node *np = smmu->dev->of_node; - if (of_match_node(qcom_smmu_impl_of_match, np)) - return qcom_smmu_create(smmu, &qcom_smmu_impl); - + /* +* Do not change this order of implementation, i.e., first adreno +* smmu impl and then apss smmu since we can have both implementing +* arm,mmu-500 in which case we will miss setting adreno smmu specific +* features if the order is changed. +*/ if (of_device_is_compatible(np, "qcom,adreno-smmu")) return qcom_smmu_create(smmu, &qcom_adreno_smmu_impl); + if (of_match_node(qcom_smmu_impl_of_match, np)) + return qcom_smmu_create(smmu, &qcom_smmu_impl); + return smmu; } -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. 
is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCHv2 1/2] iommu/arm-smmu-qcom: Add SC7280 SMMU compatible
Add compatible for SC7280 SMMU to use the Qualcomm Technologies, Inc. specific implementation. Signed-off-by: Sai Prakash Ranjan --- drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c index 98b3a1c2a181..bea3ee0dabc2 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c @@ -166,6 +166,7 @@ static const struct of_device_id qcom_smmu_client_of_match[] __maybe_unused = { { .compatible = "qcom,mdss" }, { .compatible = "qcom,sc7180-mdss" }, { .compatible = "qcom,sc7180-mss-pil" }, + { .compatible = "qcom,sc7280-mdss" }, { .compatible = "qcom,sc8180x-mdss" }, { .compatible = "qcom,sdm845-mdss" }, { .compatible = "qcom,sdm845-mss-pil" }, @@ -330,6 +331,7 @@ static struct arm_smmu_device *qcom_smmu_create(struct arm_smmu_device *smmu, static const struct of_device_id __maybe_unused qcom_smmu_impl_of_match[] = { { .compatible = "qcom,msm8998-smmu-v2" }, { .compatible = "qcom,sc7180-smmu-500" }, + { .compatible = "qcom,sc7280-smmu-500" }, { .compatible = "qcom,sc8180x-smmu-500" }, { .compatible = "qcom,sdm630-smmu-v2" }, { .compatible = "qcom,sdm845-smmu-500" }, -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCHv2 0/2] iommu/arm-smmu-qcom: Add SC7280 support
Patch 1 adds the sc7280 smmu compatible. Patch 2 moves the adreno smmu check before apss smmu to enable adreno smmu specific implementation. Changes in v2: * Add a comment to make sure this order is not changed in future (Jordan) Sai Prakash Ranjan (2): iommu/arm-smmu-qcom: Add SC7280 SMMU compatible iommu/arm-smmu-qcom: Move the adreno smmu specific impl earlier drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 14 +++--- 1 file changed, 11 insertions(+), 3 deletions(-) base-commit: 7060377ce06f9cd3ed6274c0f2310463feb5baec -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH 2/2] iommu/arm-smmu-qcom: Move the adreno smmu specific impl earlier
On 2021-02-25 23:36, Jordan Crouse wrote: On Thu, Feb 25, 2021 at 03:54:10PM +0530, Sai Prakash Ranjan wrote: Adreno(GPU) SMMU and APSS(Application Processor SubSystem) SMMU both implement "arm,mmu-500" in some QTI SoCs and to run through adreno smmu specific implementation such as enabling split pagetables support, we need to match the "qcom,adreno-smmu" compatible first before apss smmu or else we will be running apps smmu implementation for adreno smmu and the additional features for adreno smmu is never set. For ex: we have "qcom,sc7280-smmu-500" compatible for both apps and adreno smmu implementing "arm,mmu-500", so the adreno smmu implementation is never reached because the current sequence checks for apps smmu compatible(qcom,sc7280-smmu-500) first and runs that specific impl and we never reach adreno smmu specific implementation. Suggested-by: Akhil P Oommen Signed-off-by: Sai Prakash Ranjan --- Its either this or we add a new compatible for adreno smmu implementing arm,mmu-500 like "qcom,sc7280-adreno-smmu-500". --- drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c index bea3ee0dabc2..7d0fc2c8e72f 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c @@ -345,11 +345,11 @@ struct arm_smmu_device *qcom_smmu_impl_init(struct arm_smmu_device *smmu) { const struct device_node *np = smmu->dev->of_node; - if (of_match_node(qcom_smmu_impl_of_match, np)) - return qcom_smmu_create(smmu, &qcom_smmu_impl); - if (of_device_is_compatible(np, "qcom,adreno-smmu")) return qcom_smmu_create(smmu, &qcom_adreno_smmu_impl); + if (of_match_node(qcom_smmu_impl_of_match, np)) + return qcom_smmu_create(smmu, &qcom_smmu_impl); + It would be good to add a comment here explaining the order here so we don't accidentally reorganize ourselves back into a problem later. 
Sure, it's better, will add it. Thanks, Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH 2/2] iommu/arm-smmu-qcom: Move the adreno smmu specific impl earlier
Adreno(GPU) SMMU and APSS(Application Processor SubSystem) SMMU both implement "arm,mmu-500" in some QTI SoCs and to run through adreno smmu specific implementation such as enabling split pagetables support, we need to match the "qcom,adreno-smmu" compatible first before apss smmu or else we will be running apps smmu implementation for adreno smmu and the additional features for adreno smmu is never set. For ex: we have "qcom,sc7280-smmu-500" compatible for both apps and adreno smmu implementing "arm,mmu-500", so the adreno smmu implementation is never reached because the current sequence checks for apps smmu compatible(qcom,sc7280-smmu-500) first and runs that specific impl and we never reach adreno smmu specific implementation. Suggested-by: Akhil P Oommen Signed-off-by: Sai Prakash Ranjan --- Its either this or we add a new compatible for adreno smmu implementing arm,mmu-500 like "qcom,sc7280-adreno-smmu-500". --- drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c index bea3ee0dabc2..7d0fc2c8e72f 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c @@ -345,11 +345,11 @@ struct arm_smmu_device *qcom_smmu_impl_init(struct arm_smmu_device *smmu) { const struct device_node *np = smmu->dev->of_node; - if (of_match_node(qcom_smmu_impl_of_match, np)) - return qcom_smmu_create(smmu, &qcom_smmu_impl); - if (of_device_is_compatible(np, "qcom,adreno-smmu")) return qcom_smmu_create(smmu, &qcom_adreno_smmu_impl); + if (of_match_node(qcom_smmu_impl_of_match, np)) + return qcom_smmu_create(smmu, &qcom_smmu_impl); + return smmu; } -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH 1/2] iommu/arm-smmu-qcom: Add SC7280 SMMU compatible
Add compatible for SC7280 SMMU to use the Qualcomm Technologies, Inc. specific implementation. Signed-off-by: Sai Prakash Ranjan --- drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c index 98b3a1c2a181..bea3ee0dabc2 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c @@ -166,6 +166,7 @@ static const struct of_device_id qcom_smmu_client_of_match[] __maybe_unused = { { .compatible = "qcom,mdss" }, { .compatible = "qcom,sc7180-mdss" }, { .compatible = "qcom,sc7180-mss-pil" }, + { .compatible = "qcom,sc7280-mdss" }, { .compatible = "qcom,sc8180x-mdss" }, { .compatible = "qcom,sdm845-mdss" }, { .compatible = "qcom,sdm845-mss-pil" }, @@ -330,6 +331,7 @@ static struct arm_smmu_device *qcom_smmu_create(struct arm_smmu_device *smmu, static const struct of_device_id __maybe_unused qcom_smmu_impl_of_match[] = { { .compatible = "qcom,msm8998-smmu-v2" }, { .compatible = "qcom,sc7180-smmu-500" }, + { .compatible = "qcom,sc7280-smmu-500" }, { .compatible = "qcom,sc8180x-smmu-500" }, { .compatible = "qcom,sdm630-smmu-v2" }, { .compatible = "qcom,sdm845-smmu-500" }, -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH 0/2] iommu/arm-smmu-qcom: Add SC7280 support
Patch 1 adds the sc7280 smmu compatible. Patch 2 moves the adreno smmu check before apss smmu to enable adreno smmu specific implementation. Sai Prakash Ranjan (2): iommu/arm-smmu-qcom: Add SC7280 SMMU compatible iommu/arm-smmu-qcom: Move the adreno smmu specific impl earlier drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 8 +--- 1 file changed, 5 insertions(+), 3 deletions(-) base-commit: 7060377ce06f9cd3ed6274c0f2310463feb5baec -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH] iommu: Add device name to iommu map/unmap trace events
IOMMU map/unmap traces become hard to decode i.e., it becomes hard to associate the map/unmap events with the particular device from the iova/paddr/size parameters alone when there are multiple devices attached. So it is useful to add the device name to iommu trace events which can be used to filter out map/unmap traces for a particular device when we are debugging iommu faults such as context faults where we are interested with the map/unmap traces for a specific device. Before: map: IOMMU: iova=0x00036000 paddr=0x0001164d8000 size=4096 unmap: IOMMU: iova=0x00036000 size=4096 unmapped_size=4096 After: map: IOMMU: dev=1d84000.ufshc iova=0x000fffa88000 paddr=0x0001063db000 size=4096 unmap: IOMMU: dev=1d84000.ufshc iova=0x000fffa88000 size=4096 unmapped_size=4096 Signed-off-by: Sai Prakash Ranjan --- drivers/iommu/iommu.c| 8 +--- include/linux/iommu.h| 1 + include/trace/events/iommu.h | 20 3 files changed, 18 insertions(+), 11 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index d0b0a15dba84..37081b745f38 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1947,8 +1947,10 @@ static int __iommu_attach_device(struct iommu_domain *domain, return -ENODEV; ret = domain->ops->attach_dev(domain, dev); - if (!ret) + if (!ret) { trace_attach_device_to_domain(dev); + strscpy(domain->dev_name, dev_name(dev), sizeof(domain->dev_name)); + } return ret; } @@ -2440,7 +2442,7 @@ static int __iommu_map(struct iommu_domain *domain, unsigned long iova, if (ret) iommu_unmap(domain, orig_iova, orig_size - size); else - trace_map(orig_iova, orig_paddr, orig_size); + trace_map(orig_iova, orig_paddr, orig_size, domain->dev_name); return ret; } @@ -2523,7 +2525,7 @@ static size_t __iommu_unmap(struct iommu_domain *domain, unmapped += unmapped_page; } - trace_unmap(orig_iova, size, unmapped); + trace_unmap(orig_iova, size, unmapped, domain->dev_name); return unmapped; } diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 
5e7fe519430a..6064187d9bb6 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -87,6 +87,7 @@ struct iommu_domain { void *handler_token; struct iommu_domain_geometry geometry; void *iova_cookie; + char dev_name[32]; }; enum iommu_cap { diff --git a/include/trace/events/iommu.h b/include/trace/events/iommu.h index 72b4582322ff..44e48fb8b677 100644 --- a/include/trace/events/iommu.h +++ b/include/trace/events/iommu.h @@ -85,47 +85,51 @@ DEFINE_EVENT(iommu_device_event, detach_device_from_domain, TRACE_EVENT(map, - TP_PROTO(unsigned long iova, phys_addr_t paddr, size_t size), + TP_PROTO(unsigned long iova, phys_addr_t paddr, size_t size, const char *dev_name), - TP_ARGS(iova, paddr, size), + TP_ARGS(iova, paddr, size, dev_name), TP_STRUCT__entry( __field(u64, iova) __field(u64, paddr) __field(size_t, size) + __string(dev_name, dev_name) ), TP_fast_assign( __entry->iova = iova; __entry->paddr = paddr; __entry->size = size; + __assign_str(dev_name, dev_name); ), - TP_printk("IOMMU: iova=0x%016llx paddr=0x%016llx size=%zu", - __entry->iova, __entry->paddr, __entry->size + TP_printk("IOMMU: dev=%s iova=0x%016llx paddr=0x%016llx size=%zu", + __get_str(dev_name), __entry->iova, __entry->paddr, __entry->size ) ); TRACE_EVENT(unmap, - TP_PROTO(unsigned long iova, size_t size, size_t unmapped_size), + TP_PROTO(unsigned long iova, size_t size, size_t unmapped_size, const char *dev_name), - TP_ARGS(iova, size, unmapped_size), + TP_ARGS(iova, size, unmapped_size, dev_name), TP_STRUCT__entry( __field(u64, iova) __field(size_t, size) __field(size_t, unmapped_size) + __string(dev_name, dev_name) ), TP_fast_assign( __entry->iova = iova; __entry->size = size; __entry->unmapped_size = unmapped_size; + __assign_str(dev_name, dev_name); ), - TP_printk("IOMMU: iova=0x%016llx size=%zu unmapped_size=%zu", - __entry->iova, __entry->size, __entry->unmapped_size + TP_printk("IOMMU: dev=%s iova=0x%016llx size=%zu unmapped_size=%zu", + __get_str(dev_name), __entry->iova, 
__entry->size, __entry->unmapped_size ) ); -- QUALCOMM INDIA
Re: Consult on ARM SMMU debugfs
On 2021-01-15 22:47, Robin Murphy wrote: On 2021-01-15 15:14, Russell King - ARM Linux admin wrote: On Mon, Jan 11, 2021 at 08:01:48PM +, Robin Murphy wrote: On 2021-01-07 02:45, chenxiang (M) wrote: Hi Will, Robin or other guys, When debugging SMMU/SVA issue on huawei ARM64 board, we find that it lacks enough debugfs for ARM SMMU driver (such as the value of STE/CD which we need to check sometimes). Currently it creates top-level iommu directory in debugfs, but there is no debugfs for ARM SMMU driver specially. Do you know whether ARM have the plan to do that recently? FWIW I don't think I've ever felt the need to inspect the Stream Table on a live system. So far the nature of the STE code has been simple enough that it's very hard for any given STE to be *wrong* - either it's set up as expected and thus works fine, or it's not initialised at all and you get C_BAD_STE, where 99% of the time you then just cross-reference the Stream ID against the firmware and find that the DT/IORT is wrong. Similarly I don't think I've ever even *seen* an issue that could be attributed to a context descriptor, although I appreciate that as we start landing more PASID and SVA support the scope for that starts to widen considerably. Feel free to propose a patch if you believe it would be genuinely useful and won't just bit-rot into a maintenance burden, but it's not something that's on our roadmap here. I do think that the IOMMU stuff needs better debugging. I've hit the WARN_ON() in __arm_lpae_map(), and it's been pretty much undebuggable, so I've resorted to putting the IOMMU into bypass mode permanently to work around the issue. The reason that it's undebuggable is if one puts printk() or trace statements in the code, boots the platform, you get flooded with those debugging messages, because every access to the rootfs generates and tears down a mapping.
It would be nice to be able to inspect the IOMMU page tables and state of the IOMMU, rather than having to resort to effectively disabling the IOMMU. Certainly once we get to stuff like unpinned VFIO, having the ability to inspect pagetables for arbitrary IOMMU API usage will indeed be useful. From the DMA mapping perspective, though, unless you're working on the io-pgtable code itself it's not really going to tell you much that dumping the mappings from dma-debug can't already. FWIW whenever I encounter that particular warning in iommu-dma context, I don't care where the existing mapping is pointing, since it's merely a symptom of the damage already having been done. At that point I'd usually go off and audit all the DMA API calls in the offending driver, since it's typically caused by corruption in the IOVA allocator from passing the wrong size in a dma_unmap_*() call, and those can often be spotted by inspection. For active debugging, what you really want to know is the *history* of operations around that IOVA, since you're primarily interested in the request that last mapped it, then the corresponding unmap request for nominally the same buffer (which allowed the IOVA region to be freed for reuse) that for some reason didn't cover one or more pages that it should have. The IOMMU API tracepoints can be a handy tool there. Currently IOMMU trace events are not straight forward to decode if there are multiple devices attached. For ex: consider below: map: IOMMU: iova=0x00035000 paddr=0x000113be2000 size=4096 unmap: IOMMU: iova=0x00034000 size=4096 unmapped_size=4096 unmap: IOMMU: iova=0x00035000 size=4096 unmapped_size=4096 map: IOMMU: iova=0x00036000 paddr=0x0001164d8000 size=4096 map: IOMMU: iova=0x00037000 paddr=0x0001164da000 size=4096 unmap: IOMMU: iova=0x00036000 size=4096 unmapped_size=4096 unmap: IOMMU: iova=0x00037000 size=4096 unmapped_size=4096 How about making it more useful adding the device name as well? 
Ex: map: IOMMU:ae0.mdss iova=0x0002b000 paddr=0x00010a9e6000 size=8192 map: IOMMU:ae0.mdss iova=0x0002d000 paddr=0x00010a9ec000 size=21790 map: IOMMU:ae0.mdss iova=0x00241000 paddr=0x00010c40 size=59392 map: IOMMU:a60.dwc3 iova=0x0004a000 paddr=0x00010a821000 size=4096 map: IOMMU:a60.dwc3 iova=0x00049000 paddr=0x00010a82 size=4096 unmap: IOMMU:a60.dwc3 iova=0x0004a000 size=4096 unmapped_size=4096 unmap: IOMMU:a60.dwc3 iova=0x00049000 size=4096 unmapped_size=4096 We have been carrying a local patch downstream like forever, I can post a patch if you guys think it is useful in general. Thanks Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH 2/3] iommu/io-pgtable-arm: Add IOMMU_LLC page protection flag
On 2021-02-04 03:16, Will Deacon wrote: On Tue, Feb 02, 2021 at 11:56:27AM +0530, Sai Prakash Ranjan wrote: On 2021-02-01 23:50, Jordan Crouse wrote: > On Mon, Feb 01, 2021 at 08:20:44AM -0800, Rob Clark wrote: > > On Mon, Feb 1, 2021 at 3:16 AM Will Deacon wrote: > > > On Fri, Jan 29, 2021 at 03:12:59PM +0530, Sai Prakash Ranjan wrote: > > > > On 2021-01-29 14:35, Will Deacon wrote: > > > > > On Mon, Jan 11, 2021 at 07:45:04PM +0530, Sai Prakash Ranjan wrote: > > > > > > +#define IOMMU_LLC(1 << 6) > > > > > > > > > > On reflection, I'm a bit worried about exposing this because I think it > > > > > will > > > > > introduce a mismatched virtual alias with the CPU (we don't even have a > > > > > MAIR > > > > > set up for this memory type). Now, we also have that issue for the PTW, > > > > > but > > > > > since we always use cache maintenance (i.e. the streaming API) for > > > > > publishing the page-tables to a non-coheren walker, it works out. > > > > > However, > > > > > if somebody expects IOMMU_LLC to be coherent with a DMA API coherent > > > > > allocation, then they're potentially in for a nasty surprise due to the > > > > > mismatched outer-cacheability attributes. > > > > > > > > > > > > > Can't we add the syscached memory type similar to what is done on android? > > > > > > Maybe. How does the GPU driver map these things on the CPU side? > > > > Currently we use writecombine mappings for everything, although there > > are some cases that we'd like to use cached (but have not merged > > patches that would give userspace a way to flush/invalidate) > > > > LLC/system cache doesn't have a relationship with the CPU cache. Its > just a > little accelerator that sits on the connection from the GPU to DDR and > caches > accesses. The hint that Sai is suggesting is used to mark the buffers as > 'no-write-allocate' to prevent GPU write operations from being cached in > the LLC > which a) isn't interesting and b) takes up cache space for read > operations. 
> > Its easiest to think of the LLC as a bonus accelerator that has no cost > for > us to use outside of the unfortunate per buffer hint. > > We do have to worry about the CPU cache w.r.t I/O coherency (which is a > different hint) and in that case we have all of concerns that Will > identified. > For mismatched outer cacheability attributes which Will mentioned, I was referring to [1] in android kernel. I've lost track of the conversation here :/ When the GPU has a buffer mapped with IOMMU_LLC, is the buffer also mapped into the CPU and with what attributes? Rob said "writecombine for everything" -- does that mean ioremap_wc() / MEMREMAP_WC? Rob answered this. Finally, we need to be careful when we use the word "hint" as "allocation hint" has a specific meaning in the architecture, and if we only mismatch on those then we're actually ok. But I think IOMMU_LLC is more than just a hint, since it actually drives eviction policy (i.e. it enables writeback). Sorry for the pedantry, but I just want to make sure we're all talking about the same things! Sorry for the confusion which probably was caused by my mentioning of android, NWA(no write allocate) is an allocation hint which we can ignore for now as it is not introduced yet in upstream. Thanks, Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH 2/3] iommu/io-pgtable-arm: Add IOMMU_LLC page protection flag
On 2021-02-01 23:50, Jordan Crouse wrote: On Mon, Feb 01, 2021 at 08:20:44AM -0800, Rob Clark wrote: On Mon, Feb 1, 2021 at 3:16 AM Will Deacon wrote: > > On Fri, Jan 29, 2021 at 03:12:59PM +0530, Sai Prakash Ranjan wrote: > > On 2021-01-29 14:35, Will Deacon wrote: > > > On Mon, Jan 11, 2021 at 07:45:04PM +0530, Sai Prakash Ranjan wrote: > > > > Add a new page protection flag IOMMU_LLC which can be used > > > > by non-coherent masters to set cacheable memory attributes > > > > for an outer level of cache called as last-level cache or > > > > system cache. Initial user of this page protection flag is > > > > the adreno gpu and then can later be used by other clients > > > > such as video where this can be used for per-buffer based > > > > mapping. > > > > > > > > Signed-off-by: Sai Prakash Ranjan > > > > --- > > > > drivers/iommu/io-pgtable-arm.c | 3 +++ > > > > include/linux/iommu.h | 6 ++ > > > > 2 files changed, 9 insertions(+) > > > > > > > > diff --git a/drivers/iommu/io-pgtable-arm.c > > > > b/drivers/iommu/io-pgtable-arm.c > > > > index 7439ee7fdcdb..ebe653ef601b 100644 > > > > --- a/drivers/iommu/io-pgtable-arm.c > > > > +++ b/drivers/iommu/io-pgtable-arm.c > > > > @@ -415,6 +415,9 @@ static arm_lpae_iopte > > > > arm_lpae_prot_to_pte(struct arm_lpae_io_pgtable *data, > > > > else if (prot & IOMMU_CACHE) > > > > pte |= (ARM_LPAE_MAIR_ATTR_IDX_CACHE > > > > << ARM_LPAE_PTE_ATTRINDX_SHIFT); > > > > + else if (prot & IOMMU_LLC) > > > > + pte |= (ARM_LPAE_MAIR_ATTR_IDX_INC_OCACHE > > > > + << ARM_LPAE_PTE_ATTRINDX_SHIFT); > > > > } > > > > > > > > if (prot & IOMMU_CACHE) > > > > diff --git a/include/linux/iommu.h b/include/linux/iommu.h > > > > index ffaa389ea128..1f82057df531 100644 > > > > --- a/include/linux/iommu.h > > > > +++ b/include/linux/iommu.h > > > > @@ -31,6 +31,12 @@ > > > > * if the IOMMU page table format is equivalent. 
> > > > */ > > > > #define IOMMU_PRIV (1 << 5) > > > > +/* > > > > + * Non-coherent masters can use this page protection flag to set > > > > cacheable > > > > + * memory attributes for only a transparent outer level of cache, > > > > also known as > > > > + * the last-level or system cache. > > > > + */ > > > > +#define IOMMU_LLC(1 << 6) > > > > > > On reflection, I'm a bit worried about exposing this because I think it > > > will > > > introduce a mismatched virtual alias with the CPU (we don't even have a > > > MAIR > > > set up for this memory type). Now, we also have that issue for the PTW, > > > but > > > since we always use cache maintenance (i.e. the streaming API) for > > > publishing the page-tables to a non-coheren walker, it works out. > > > However, > > > if somebody expects IOMMU_LLC to be coherent with a DMA API coherent > > > allocation, then they're potentially in for a nasty surprise due to the > > > mismatched outer-cacheability attributes. > > > > > > > Can't we add the syscached memory type similar to what is done on android? > > Maybe. How does the GPU driver map these things on the CPU side? Currently we use writecombine mappings for everything, although there are some cases that we'd like to use cached (but have not merged patches that would give userspace a way to flush/invalidate) BR, -R LLC/system cache doesn't have a relationship with the CPU cache. Its just a little accelerator that sits on the connection from the GPU to DDR and caches accesses. The hint that Sai is suggesting is used to mark the buffers as 'no-write-allocate' to prevent GPU write operations from being cached in the LLC which a) isn't interesting and b) takes up cache space for read operations. Its easiest to think of the LLC as a bonus accelerator that has no cost for us to use outside of the unfortunate per buffer hint. We do have to worry about the CPU cache w.r.t I/O coherency (which is a different hint) and in that case we have all of concerns that Will identified. 
For mismatched outer cacheability attributes which Will mentioned, I was referring to [1] in android kernel. [1] https://android-review.googlesource.com/c/kernel/common/+/1549097/3 Thanks, Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH 2/3] iommu/io-pgtable-arm: Add IOMMU_LLC page protection flag
On 2021-02-01 23:50, Jordan Crouse wrote: On Mon, Feb 01, 2021 at 08:20:44AM -0800, Rob Clark wrote: On Mon, Feb 1, 2021 at 3:16 AM Will Deacon wrote: > > On Fri, Jan 29, 2021 at 03:12:59PM +0530, Sai Prakash Ranjan wrote: > > On 2021-01-29 14:35, Will Deacon wrote: > > > On Mon, Jan 11, 2021 at 07:45:04PM +0530, Sai Prakash Ranjan wrote: > > > > Add a new page protection flag IOMMU_LLC which can be used > > > > by non-coherent masters to set cacheable memory attributes > > > > for an outer level of cache called as last-level cache or > > > > system cache. Initial user of this page protection flag is > > > > the adreno gpu and then can later be used by other clients > > > > such as video where this can be used for per-buffer based > > > > mapping. > > > > > > > > Signed-off-by: Sai Prakash Ranjan > > > > --- > > > > drivers/iommu/io-pgtable-arm.c | 3 +++ > > > > include/linux/iommu.h | 6 ++ > > > > 2 files changed, 9 insertions(+) > > > > > > > > diff --git a/drivers/iommu/io-pgtable-arm.c > > > > b/drivers/iommu/io-pgtable-arm.c > > > > index 7439ee7fdcdb..ebe653ef601b 100644 > > > > --- a/drivers/iommu/io-pgtable-arm.c > > > > +++ b/drivers/iommu/io-pgtable-arm.c > > > > @@ -415,6 +415,9 @@ static arm_lpae_iopte > > > > arm_lpae_prot_to_pte(struct arm_lpae_io_pgtable *data, > > > > else if (prot & IOMMU_CACHE) > > > > pte |= (ARM_LPAE_MAIR_ATTR_IDX_CACHE > > > > << ARM_LPAE_PTE_ATTRINDX_SHIFT); > > > > + else if (prot & IOMMU_LLC) > > > > + pte |= (ARM_LPAE_MAIR_ATTR_IDX_INC_OCACHE > > > > + << ARM_LPAE_PTE_ATTRINDX_SHIFT); > > > > } > > > > > > > > if (prot & IOMMU_CACHE) > > > > diff --git a/include/linux/iommu.h b/include/linux/iommu.h > > > > index ffaa389ea128..1f82057df531 100644 > > > > --- a/include/linux/iommu.h > > > > +++ b/include/linux/iommu.h > > > > @@ -31,6 +31,12 @@ > > > > * if the IOMMU page table format is equivalent. 
> > > > */ > > > > #define IOMMU_PRIV (1 << 5) > > > > +/* > > > > + * Non-coherent masters can use this page protection flag to set > > > > cacheable > > > > + * memory attributes for only a transparent outer level of cache, > > > > also known as > > > > + * the last-level or system cache. > > > > + */ > > > > +#define IOMMU_LLC(1 << 6) > > > > > > On reflection, I'm a bit worried about exposing this because I think it > > > will > > > introduce a mismatched virtual alias with the CPU (we don't even have a > > > MAIR > > > set up for this memory type). Now, we also have that issue for the PTW, > > > but > > > since we always use cache maintenance (i.e. the streaming API) for > > > publishing the page-tables to a non-coheren walker, it works out. > > > However, > > > if somebody expects IOMMU_LLC to be coherent with a DMA API coherent > > > allocation, then they're potentially in for a nasty surprise due to the > > > mismatched outer-cacheability attributes. > > > > > > > Can't we add the syscached memory type similar to what is done on android? > > Maybe. How does the GPU driver map these things on the CPU side? Currently we use writecombine mappings for everything, although there are some cases that we'd like to use cached (but have not merged patches that would give userspace a way to flush/invalidate) BR, -R LLC/system cache doesn't have a relationship with the CPU cache. Its just a little accelerator that sits on the connection from the GPU to DDR and caches accesses. The hint that Sai is suggesting is used to mark the buffers as 'no-write-allocate' to prevent GPU write operations from being cached in the LLC which a) isn't interesting and b) takes up cache space for read operations. Its easiest to think of the LLC as a bonus accelerator that has no cost for us to use outside of the unfortunate per buffer hint. We do have to worry about the CPU cache w.r.t I/O coherency (which is a different hint) and in that case we have all of concerns that Will identified. 
For mismatched outer cacheability attributes which Will mentioned, I was referring to [1] in android kernel. [1] https://android-review.googlesource.com/c/kernel/common/+/1549097/3 Thanks, Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH 2/3] iommu/io-pgtable-arm: Add IOMMU_LLC page protection flag
On 2021-01-29 14:35, Will Deacon wrote: On Mon, Jan 11, 2021 at 07:45:04PM +0530, Sai Prakash Ranjan wrote: Add a new page protection flag IOMMU_LLC which can be used by non-coherent masters to set cacheable memory attributes for an outer level of cache called as last-level cache or system cache. Initial user of this page protection flag is the adreno gpu and then can later be used by other clients such as video where this can be used for per-buffer based mapping. Signed-off-by: Sai Prakash Ranjan --- drivers/iommu/io-pgtable-arm.c | 3 +++ include/linux/iommu.h | 6 ++ 2 files changed, 9 insertions(+) diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index 7439ee7fdcdb..ebe653ef601b 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -415,6 +415,9 @@ static arm_lpae_iopte arm_lpae_prot_to_pte(struct arm_lpae_io_pgtable *data, else if (prot & IOMMU_CACHE) pte |= (ARM_LPAE_MAIR_ATTR_IDX_CACHE << ARM_LPAE_PTE_ATTRINDX_SHIFT); + else if (prot & IOMMU_LLC) + pte |= (ARM_LPAE_MAIR_ATTR_IDX_INC_OCACHE + << ARM_LPAE_PTE_ATTRINDX_SHIFT); } if (prot & IOMMU_CACHE) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index ffaa389ea128..1f82057df531 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -31,6 +31,12 @@ * if the IOMMU page table format is equivalent. */ #define IOMMU_PRIV (1 << 5) +/* + * Non-coherent masters can use this page protection flag to set cacheable + * memory attributes for only a transparent outer level of cache, also known as + * the last-level or system cache. + */ +#define IOMMU_LLC (1 << 6) On reflection, I'm a bit worried about exposing this because I think it will introduce a mismatched virtual alias with the CPU (we don't even have a MAIR set up for this memory type). Now, we also have that issue for the PTW, but since we always use cache maintenance (i.e. the streaming API) for publishing the page-tables to a non-coherent walker, it works out.
However, if somebody expects IOMMU_LLC to be coherent with a DMA API coherent allocation, then they're potentially in for a nasty surprise due to the mismatched outer-cacheability attributes. Can't we add the syscached memory type similar to what is done on android? So I can take patch (1) as a trivial rename, but unfortunately I think this needs more thought before exposing it beyond the PTW. That wouldn't be of much use, would it :) , we would be losing on perf gain for GPU usecases without the rest of the patches. Thanks, Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH 0/3] iommu/drm/msm: Allow non-coherent masters to use system cache
On 2021-01-20 10:48, Sai Prakash Ranjan wrote: On 2021-01-11 19:45, Sai Prakash Ranjan wrote: commit ecd7274fb4cd ("iommu: Remove unused IOMMU_SYS_CACHE_ONLY flag") removed unused IOMMU_SYS_CACHE_ONLY prot flag and along with it went the memory type setting required for the non-coherent masters to use system cache. Now that system cache support for GPU is added, we will need to set the right PTE attribute for GPU buffers to be sys cached. Without this, the system cache lines are not allocated for GPU. So the patches in this series introduces a new prot flag IOMMU_LLC, renames IO_PGTABLE_QUIRK_ARM_OUTER_WBWA to IO_PGTABLE_QUIRK_PTW_LLC and makes GPU the user of this protection flag. The series slightly depends on following 2 patches posted earlier and is based on msm-next branch: * https://lore.kernel.org/patchwork/patch/1363008/ * https://lore.kernel.org/patchwork/patch/1363010/ Sai Prakash Ranjan (3): iommu/io-pgtable: Rename last-level cache quirk to IO_PGTABLE_QUIRK_PTW_LLC iommu/io-pgtable-arm: Add IOMMU_LLC page protection flag drm/msm: Use IOMMU_LLC page protection flag to map gpu buffers drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 3 +++ drivers/gpu/drm/msm/adreno/adreno_gpu.c | 2 +- drivers/gpu/drm/msm/msm_iommu.c | 3 +++ drivers/gpu/drm/msm/msm_mmu.h | 4 drivers/iommu/io-pgtable-arm.c | 9 ++--- include/linux/io-pgtable.h | 6 +++--- include/linux/iommu.h | 6 ++ 7 files changed, 26 insertions(+), 7 deletions(-) base-commit: 00fd44a1a4700718d5d962432b55c09820f7e709 Gentle Ping! Gentle Ping!! Thanks, Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH 0/3] iommu/drm/msm: Allow non-coherent masters to use system cache
On 2021-01-11 19:45, Sai Prakash Ranjan wrote: commit ecd7274fb4cd ("iommu: Remove unused IOMMU_SYS_CACHE_ONLY flag") removed unused IOMMU_SYS_CACHE_ONLY prot flag and along with it went the memory type setting required for the non-coherent masters to use system cache. Now that system cache support for GPU is added, we will need to set the right PTE attribute for GPU buffers to be sys cached. Without this, the system cache lines are not allocated for GPU. So the patches in this series introduces a new prot flag IOMMU_LLC, renames IO_PGTABLE_QUIRK_ARM_OUTER_WBWA to IO_PGTABLE_QUIRK_PTW_LLC and makes GPU the user of this protection flag. The series slightly depends on following 2 patches posted earlier and is based on msm-next branch: * https://lore.kernel.org/patchwork/patch/1363008/ * https://lore.kernel.org/patchwork/patch/1363010/ Sai Prakash Ranjan (3): iommu/io-pgtable: Rename last-level cache quirk to IO_PGTABLE_QUIRK_PTW_LLC iommu/io-pgtable-arm: Add IOMMU_LLC page protection flag drm/msm: Use IOMMU_LLC page protection flag to map gpu buffers drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 3 +++ drivers/gpu/drm/msm/adreno/adreno_gpu.c | 2 +- drivers/gpu/drm/msm/msm_iommu.c | 3 +++ drivers/gpu/drm/msm/msm_mmu.h | 4 drivers/iommu/io-pgtable-arm.c | 9 ++--- include/linux/io-pgtable.h | 6 +++--- include/linux/iommu.h | 6 ++ 7 files changed, 26 insertions(+), 7 deletions(-) base-commit: 00fd44a1a4700718d5d962432b55c09820f7e709 Gentle Ping! Thanks, Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH 3/3] drm/msm: Use IOMMU_LLC page protection flag to map gpu buffers
Use the newly introduced IOMMU_LLC page protection flag to map GPU buffers. This will make sure that proper stage-1 PTE attributes are set for GPU buffers to use system cache. This also introduces the MMU_FEATURE_USE_LLC feature bit to check for GPUs supporting LLC and set them in the target specific address space creation; in this case we set them for A6XX GPUs. Signed-off-by: Sai Prakash Ranjan --- drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 3 +++ drivers/gpu/drm/msm/msm_iommu.c | 3 +++ drivers/gpu/drm/msm/msm_mmu.h | 4 3 files changed, 10 insertions(+) diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c index 3c7ad51732bb..23da21b6f0ff 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c @@ -1266,6 +1266,9 @@ a6xx_create_address_space(struct msm_gpu *gpu, struct platform_device *pdev) return ERR_CAST(mmu); } + if (!IS_ERR_OR_NULL(a6xx_gpu->llc_slice)) + mmu->features |= MMU_FEATURE_USE_LLC; + /* * Use the aperture start or SZ_16M, whichever is greater. 
This will * ensure that we align with the allocated pagetable range while still diff --git a/drivers/gpu/drm/msm/msm_iommu.c b/drivers/gpu/drm/msm/msm_iommu.c index 22ac7c692a81..a329f9836422 100644 --- a/drivers/gpu/drm/msm/msm_iommu.c +++ b/drivers/gpu/drm/msm/msm_iommu.c @@ -235,6 +235,9 @@ static int msm_iommu_map(struct msm_mmu *mmu, uint64_t iova, if (iova & BIT_ULL(48)) iova |= GENMASK_ULL(63, 49); + if (mmu->features & MMU_FEATURE_USE_LLC) + prot |= IOMMU_LLC; + ret = iommu_map_sgtable(iommu->domain, iova, sgt, prot); WARN_ON(!ret); diff --git a/drivers/gpu/drm/msm/msm_mmu.h b/drivers/gpu/drm/msm/msm_mmu.h index 61ade89d9e48..efcd1939c98e 100644 --- a/drivers/gpu/drm/msm/msm_mmu.h +++ b/drivers/gpu/drm/msm/msm_mmu.h @@ -23,12 +23,16 @@ enum msm_mmu_type { MSM_MMU_IOMMU_PAGETABLE, }; +/* MMU features */ +#define MMU_FEATURE_USE_LLCBIT(0) + struct msm_mmu { const struct msm_mmu_funcs *funcs; struct device *dev; int (*handler)(void *arg, unsigned long iova, int flags); void *arg; enum msm_mmu_type type; + u32 features; }; static inline void msm_mmu_init(struct msm_mmu *mmu, struct device *dev, -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH 2/3] iommu/io-pgtable-arm: Add IOMMU_LLC page protection flag
Add a new page protection flag IOMMU_LLC which can be used by non-coherent masters to set cacheable memory attributes for an outer level of cache called as last-level cache or system cache. Initial user of this page protection flag is the adreno gpu and then can later be used by other clients such as video where this can be used for per-buffer based mapping. Signed-off-by: Sai Prakash Ranjan --- drivers/iommu/io-pgtable-arm.c | 3 +++ include/linux/iommu.h | 6 ++ 2 files changed, 9 insertions(+) diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index 7439ee7fdcdb..ebe653ef601b 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -415,6 +415,9 @@ static arm_lpae_iopte arm_lpae_prot_to_pte(struct arm_lpae_io_pgtable *data, else if (prot & IOMMU_CACHE) pte |= (ARM_LPAE_MAIR_ATTR_IDX_CACHE << ARM_LPAE_PTE_ATTRINDX_SHIFT); + else if (prot & IOMMU_LLC) + pte |= (ARM_LPAE_MAIR_ATTR_IDX_INC_OCACHE + << ARM_LPAE_PTE_ATTRINDX_SHIFT); } if (prot & IOMMU_CACHE) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index ffaa389ea128..1f82057df531 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -31,6 +31,12 @@ * if the IOMMU page table format is equivalent. */ #define IOMMU_PRIV (1 << 5) +/* + * Non-coherent masters can use this page protection flag to set cacheable + * memory attributes for only a transparent outer level of cache, also known as + * the last-level or system cache. + */ +#define IOMMU_LLC (1 << 6) struct iommu_ops; struct iommu_group; -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH 1/3] iommu/io-pgtable: Rename last-level cache quirk to IO_PGTABLE_QUIRK_PTW_LLC
Rename last-level cache quirk IO_PGTABLE_QUIRK_ARM_OUTER_WBWA to IO_PGTABLE_QUIRK_PTW_LLC which is used to set the required TCR attributes for non-coherent page table walker to be more generic and in sync with the upcoming page protection flag IOMMU_LLC. Signed-off-by: Sai Prakash Ranjan --- drivers/gpu/drm/msm/adreno/adreno_gpu.c | 2 +- drivers/iommu/io-pgtable-arm.c | 6 +++--- include/linux/io-pgtable.h | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c b/drivers/gpu/drm/msm/adreno/adreno_gpu.c index 0f184c3dd9d9..82b5e4969195 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c @@ -190,7 +190,7 @@ void adreno_set_llc_attributes(struct iommu_domain *iommu) { struct io_pgtable_domain_attr pgtbl_cfg; - pgtbl_cfg.quirks = IO_PGTABLE_QUIRK_ARM_OUTER_WBWA; + pgtbl_cfg.quirks = IO_PGTABLE_QUIRK_PTW_LLC; iommu_domain_set_attr(iommu, DOMAIN_ATTR_IO_PGTABLE_CFG, &pgtbl_cfg); } diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index 7c9ea9d7874a..7439ee7fdcdb 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -762,7 +762,7 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie) if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_NS | IO_PGTABLE_QUIRK_NON_STRICT | IO_PGTABLE_QUIRK_ARM_TTBR1 | - IO_PGTABLE_QUIRK_ARM_OUTER_WBWA)) + IO_PGTABLE_QUIRK_PTW_LLC)) return NULL; data = arm_lpae_alloc_pgtable(cfg); @@ -774,12 +774,12 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie) tcr->sh = ARM_LPAE_TCR_SH_IS; tcr->irgn = ARM_LPAE_TCR_RGN_WBWA; tcr->orgn = ARM_LPAE_TCR_RGN_WBWA; - if (cfg->quirks & IO_PGTABLE_QUIRK_ARM_OUTER_WBWA) + if (cfg->quirks & IO_PGTABLE_QUIRK_PTW_LLC) goto out_free_data; } else { tcr->sh = ARM_LPAE_TCR_SH_OS; tcr->irgn = ARM_LPAE_TCR_RGN_NC; - if (!(cfg->quirks & IO_PGTABLE_QUIRK_ARM_OUTER_WBWA)) + if (!(cfg->quirks & IO_PGTABLE_QUIRK_PTW_LLC)) tcr->orgn = 
ARM_LPAE_TCR_RGN_NC; else tcr->orgn = ARM_LPAE_TCR_RGN_WBWA; diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h index fb4d5a763e0c..6f996a817441 100644 --- a/include/linux/io-pgtable.h +++ b/include/linux/io-pgtable.h @@ -87,8 +87,8 @@ struct io_pgtable_cfg { * IO_PGTABLE_QUIRK_ARM_TTBR1: (ARM LPAE format) Configure the table * for use in the upper half of a split address space. * -* IO_PGTABLE_QUIRK_ARM_OUTER_WBWA: Override the outer-cacheability -* attributes set in the TCR for a non-coherent page-table walker. +* IO_PGTABLE_QUIRK_PTW_LLC: Override the outer-cacheability attributes +* set in the TCR for a non-coherent page-table walker. */ #define IO_PGTABLE_QUIRK_ARM_NS BIT(0) #define IO_PGTABLE_QUIRK_NO_PERMS BIT(1) @@ -96,7 +96,7 @@ struct io_pgtable_cfg { #define IO_PGTABLE_QUIRK_ARM_MTK_EXTBIT(3) #define IO_PGTABLE_QUIRK_NON_STRICT BIT(4) #define IO_PGTABLE_QUIRK_ARM_TTBR1 BIT(5) - #define IO_PGTABLE_QUIRK_ARM_OUTER_WBWA BIT(6) + #define IO_PGTABLE_QUIRK_PTW_LLCBIT(6) unsigned long quirks; unsigned long pgsize_bitmap; unsigned intias; -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH 0/3] iommu/drm/msm: Allow non-coherent masters to use system cache
commit ecd7274fb4cd ("iommu: Remove unused IOMMU_SYS_CACHE_ONLY flag") removed unused IOMMU_SYS_CACHE_ONLY prot flag and along with it went the memory type setting required for the non-coherent masters to use system cache. Now that system cache support for GPU is added, we will need to set the right PTE attribute for GPU buffers to be sys cached. Without this, the system cache lines are not allocated for GPU. So the patches in this series introduces a new prot flag IOMMU_LLC, renames IO_PGTABLE_QUIRK_ARM_OUTER_WBWA to IO_PGTABLE_QUIRK_PTW_LLC and makes GPU the user of this protection flag. The series slightly depends on following 2 patches posted earlier and is based on msm-next branch: * https://lore.kernel.org/patchwork/patch/1363008/ * https://lore.kernel.org/patchwork/patch/1363010/ Sai Prakash Ranjan (3): iommu/io-pgtable: Rename last-level cache quirk to IO_PGTABLE_QUIRK_PTW_LLC iommu/io-pgtable-arm: Add IOMMU_LLC page protection flag drm/msm: Use IOMMU_LLC page protection flag to map gpu buffers drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 3 +++ drivers/gpu/drm/msm/adreno/adreno_gpu.c | 2 +- drivers/gpu/drm/msm/msm_iommu.c | 3 +++ drivers/gpu/drm/msm/msm_mmu.h | 4 drivers/iommu/io-pgtable-arm.c | 9 ++--- include/linux/io-pgtable.h | 6 +++--- include/linux/iommu.h | 6 ++ 7 files changed, 26 insertions(+), 7 deletions(-) base-commit: 00fd44a1a4700718d5d962432b55c09820f7e709 -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH 0/5] Optimize iommu_map_sg() performance
On 2021-01-11 11:52, Sai Prakash Ranjan wrote: Hi Isaac, I gave this series a go on chromebook and saw these warnings and several device probe failures, logs attached below: WARN corresponds to this code in arm_lpae_map_by_pgsize() if (WARN_ON(iaext || (paddr + size) >> cfg->oas)) return -ERANGE; Logs: [2.411391] [ cut here ] [2.416149] WARNING: CPU: 6 PID: 56 at drivers/iommu/io-pgtable-arm.c:492 arm_lpae_map_sg+0x234/0x248 [2.425606] Modules linked in: [2.428749] CPU: 6 PID: 56 Comm: kworker/6:1 Not tainted 5.10.5 #970 [2.440287] Workqueue: events deferred_probe_work_func [2.445563] pstate: 20c9 (nzCv daif +PAN +UAO -TCO BTYPE=--) [2.451726] pc : arm_lpae_map_sg+0x234/0x248 [2.456112] lr : arm_lpae_map_sg+0xe0/0x248 [2.460410] sp : ffc010513750 [2.463820] x29: ffc010513790 x28: ffb943332000 [2.469281] x27: 000ff000 x26: ffb943d14900 [2.474738] x25: 1000 x24: 000103465000 [2.480196] x23: 0001 x22: 000103466000 [2.485645] x21: 0003 x20: 0a20 [2.491103] x19: ffc010513850 x18: 0001 [2.496562] x17: 0002 x16: [2.502021] x15: x14: [2.507479] x13: 0001 x12: [2.512928] x11: 0010 x10: [2.518385] x9 : 0001 x8 : 40201000 [2.523844] x7 : 0a20 x6 : ffb943463000 [2.529302] x5 : 0003 x4 : 1000 [2.534760] x3 : 0001 x2 : ffb941f605a0 [2.540219] x1 : 0003 x0 : 0e40 [2.545679] Call trace: [2.548196] arm_lpae_map_sg+0x234/0x248 [2.552225] arm_smmu_map_sg+0x80/0xc4 [2.556078] __iommu_map_sg+0x6c/0x188 [2.559931] iommu_map_sg_atomic+0x18/0x20 [2.564144] iommu_dma_alloc_remap+0x26c/0x34c [2.568703] iommu_dma_alloc+0x9c/0x268 [2.572647] dma_alloc_attrs+0x88/0xfc [2.576503] gsi_ring_alloc+0x50/0x144 [2.580356] gsi_init+0x2c4/0x5c4 [2.583766] ipa_probe+0x14c/0x2b4 [2.587263] platform_drv_probe+0x94/0xb4 [2.591377] really_probe+0x138/0x348 [2.595145] driver_probe_device+0x80/0xb8 [2.599358] __device_attach_driver+0x90/0xa8 [2.603829] bus_for_each_drv+0x84/0xcc [2.607772] __device_attach+0xc0/0x148 [2.611713] device_initial_probe+0x18/0x20 [2.616012] bus_probe_device+0x38/0x94 [2.619953] 
deferred_probe_work_func+0x78/0xb0 [2.624611] process_one_work+0x210/0x3dc [2.628726] worker_thread+0x284/0x3e0 [2.632578] kthread+0x148/0x1a8 [2.635891] ret_from_fork+0x10/0x18 [2.639562] ---[ end trace 9bac18cad6a9862e ]--- [2.644414] ipa 1e4.ipa: error -12 allocating channel 0 event ring [2.651656] ipa: probe of 1e4.ipa failed with error -12 [2.660072] dwc3 a60.dwc3: Adding to iommu group 8 [2.668632] xhci-hcd xhci-hcd.13.auto: xHCI Host Controller [2.674680] xhci-hcd xhci-hcd.13.auto: new USB bus registered, assigned bus number 1 ... Isaac provided a fix which he will post as v2 and no warnings were observed with that fix. Tested-by: Sai Prakash Ranjan Thanks, Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH 0/5] Optimize iommu_map_sg() performance
Hi Isaac, On 2021-01-09 07:20, Isaac J. Manjarres wrote: > The iommu_map_sg() code currently iterates through the given > scatter-gather list, and in the worst case, invokes iommu_map() > for each element in the scatter-gather list, which calls into > the IOMMU driver through an indirect call. For an IOMMU driver > that uses a format supported by the io-pgtable code, the IOMMU > driver will then call into the io-pgtable code to map the chunk. > > Jumping between the IOMMU core code, the IOMMU driver, and the > io-pgtable code and back for each element in a scatter-gather list > is not efficient. > > Instead, add a map_sg() hook in both the IOMMU driver ops and the > io-pgtable ops. iommu_map_sg() can then call into the IOMMU driver's > map_sg() hook with the entire scatter-gather list, which can call > into the io-pgtable map_sg() hook, which can process the entire > scatter-gather list, signficantly reducing the number of indirect > calls, and jumps between these layers, boosting performance. > > On a system that uses the ARM SMMU driver, and the ARM LPAE format, > the current implementation of iommu_map_sg() yields the following > latencies for mapping scatter-gather lists of various sizes. These > latencies are calculated by repeating the mapping operation 10 times: > > sizeiommu_map_sg latency > 4K0.624 us > 64K9.468 us > 1M 122.557 us > 2M 239.807 us > 12M 1435.979 us > 24M 2884.968 us > 32M 3832.979 us > > On the same system, the proposed modifications yield the following > results: > > sizeiommu_map_sg latency > 4K3.645 us > 64K4.198 us > 1M 11.010 us > 2M 17.125 us > 12M 82.416 us > 24M 158.677 us > 32M 210.468 us > > The procedure for collecting the iommu_map_sg latencies is > the same in both experiments. Clearly, reducing the jumps > between the different layers in the IOMMU code offers a > signficant performance boost in iommu_map_sg() latency. 
> I gave this series a go on chromebook and saw these warnings and several device probe failures, logs attached below: WARN corresponds to this code in arm_lpae_map_by_pgsize() if (WARN_ON(iaext || (paddr + size) >> cfg->oas)) return -ERANGE; Logs: [2.411391] [ cut here ] [2.416149] WARNING: CPU: 6 PID: 56 at drivers/iommu/io-pgtable-arm.c:492 arm_lpae_map_sg+0x234/0x248 [2.425606] Modules linked in: [2.428749] CPU: 6 PID: 56 Comm: kworker/6:1 Not tainted 5.10.5 #970 [2.440287] Workqueue: events deferred_probe_work_func [2.445563] pstate: 20c9 (nzCv daif +PAN +UAO -TCO BTYPE=--) [2.451726] pc : arm_lpae_map_sg+0x234/0x248 [2.456112] lr : arm_lpae_map_sg+0xe0/0x248 [2.460410] sp : ffc010513750 [2.463820] x29: ffc010513790 x28: ffb943332000 [2.469281] x27: 000ff000 x26: ffb943d14900 [2.474738] x25: 1000 x24: 000103465000 [2.480196] x23: 0001 x22: 000103466000 [2.485645] x21: 0003 x20: 0a20 [2.491103] x19: ffc010513850 x18: 0001 [2.496562] x17: 0002 x16: [2.502021] x15: x14: [2.507479] x13: 0001 x12: [2.512928] x11: 0010 x10: [2.518385] x9 : 0001 x8 : 40201000 [2.523844] x7 : 0a20 x6 : ffb943463000 [2.529302] x5 : 0003 x4 : 1000 [2.534760] x3 : 0001 x2 : ffb941f605a0 [2.540219] x1 : 0003 x0 : 0e40 [2.545679] Call trace: [2.548196] arm_lpae_map_sg+0x234/0x248 [2.552225] arm_smmu_map_sg+0x80/0xc4 [2.556078] __iommu_map_sg+0x6c/0x188 [2.559931] iommu_map_sg_atomic+0x18/0x20 [2.564144] iommu_dma_alloc_remap+0x26c/0x34c [2.568703] iommu_dma_alloc+0x9c/0x268 [2.572647] dma_alloc_attrs+0x88/0xfc [2.576503] gsi_ring_alloc+0x50/0x144 [2.580356] gsi_init+0x2c4/0x5c4 [2.583766] ipa_probe+0x14c/0x2b4 [2.587263] platform_drv_probe+0x94/0xb4 [2.591377] really_probe+0x138/0x348 [2.595145] driver_probe_device+0x80/0xb8 [2.599358] __device_attach_driver+0x90/0xa8 [2.603829] bus_for_each_drv+0x84/0xcc [2.607772] __device_attach+0xc0/0x148 [2.611713] device_initial_probe+0x18/0x20 [2.616012] bus_probe_device+0x38/0x94 [2.619953] deferred_probe_work_func+0x78/0xb0 [2.624611] 
process_one_work+0x210/0x3dc [2.628726] worker_thread+0x284/0x3e0 [2.632578] kthread+0x148/0x1a8 [2.635891] ret_from_fork+0x10/0x18 [2.639562] ---[ end trace 9bac18cad6a9862e ]--- [2.644414] ipa 1e4.ipa: error -12 allocating channel 0 event ring [2.651656
Re: [PATCH] iommu/io-pgtable-arm: Allow non-coherent masters to use system cache
On 2021-01-08 23:48, Will Deacon wrote: On Fri, Jan 08, 2021 at 11:17:25AM +0530, Sai Prakash Ranjan wrote: On 2021-01-07 22:27, isa...@codeaurora.org wrote: > On 2021-01-06 03:56, Will Deacon wrote: > > On Thu, Dec 24, 2020 at 12:10:07PM +0530, Sai Prakash Ranjan wrote: > > > commit ecd7274fb4cd ("iommu: Remove unused IOMMU_SYS_CACHE_ONLY > > > flag") > > > removed unused IOMMU_SYS_CACHE_ONLY prot flag and along with it went > > > the memory type setting required for the non-coherent masters to use > > > system cache. Now that system cache support for GPU is added, we will > > > need to mark the memory as normal sys-cached for GPU to use > > > system cache. > > > Without this, the system cache lines are not allocated for GPU. > > > We use > > > the IO_PGTABLE_QUIRK_ARM_OUTER_WBWA quirk instead of a page > > > protection > > > flag as the flag cannot be exposed via DMA api because of no in-tree > > > users. > > > > > > Signed-off-by: Sai Prakash Ranjan > > > --- > > > drivers/iommu/io-pgtable-arm.c | 3 +++ > > > 1 file changed, 3 insertions(+) > > > > > > diff --git a/drivers/iommu/io-pgtable-arm.c > > > b/drivers/iommu/io-pgtable-arm.c > > > index 7c9ea9d7874a..3fb7de8304a2 100644 > > > --- a/drivers/iommu/io-pgtable-arm.c > > > +++ b/drivers/iommu/io-pgtable-arm.c > > > @@ -415,6 +415,9 @@ static arm_lpae_iopte > > > arm_lpae_prot_to_pte(struct arm_lpae_io_pgtable *data, > > > else if (prot & IOMMU_CACHE) > > > pte |= (ARM_LPAE_MAIR_ATTR_IDX_CACHE > > > << ARM_LPAE_PTE_ATTRINDX_SHIFT); > > > +else if (data->iop.cfg.quirks & IO_PGTABLE_QUIRK_ARM_OUTER_WBWA) > > > +pte |= (ARM_LPAE_MAIR_ATTR_IDX_INC_OCACHE > > > +<< ARM_LPAE_PTE_ATTRINDX_SHIFT); > > > } > > > While this approach of enabling system cache globally for both page > tables and other buffers > works for the GPU usecase, this isn't ideal for other clients that use > system cache. 
For example, > video clients only want to cache a subset of their buffers in the > system cache, due to the sizing constraint > imposed by how much of the system cache they can use. So, it would be > ideal to have > a way of expressing the desire to use the system cache on a per-buffer > basis. Additionally, > our video clients use the DMA layer, and since the requirement is for > caching in the system cache > to be a per buffer attribute, it seems like we would have to have a > DMA attribute to express > this on a per-buffer basis. > I did bring this up initially [1], also where is this video client in upstream? AFAIK, only system cache user in upstream is GPU. We cannot add any DMA attribute unless there is any user upstream as per [2], so when the support for such a client is added, wouldn't ((data->iop.cfg.quirks & IO_PGTABLE_QUIRK_ARM_OUTER_WBWA) || PROT_FLAG) work? Hmm, I think this is another case where we need to separate out the page-table walker attributes from the access attributes. Currently, IO_PGTABLE_QUIRK_ARM_OUTER_WBWA applies _only_ to the page-table walker and I don't think it makes any sense for that to be per-buffer (how would you even manage that?). However, if we want to extend this to data accesses and we know that there are valid use-cases where this should be per-buffer, then shoe-horning it in with the walker quirk does not feel like the best thing to do. As a starting point, we could: 1. Rename IO_PGTABLE_QUIRK_ARM_OUTER_WBWA to IO_PGTABLE_QUIRK_PTW_LLC 2. Add a new prot flag IOMMU_LLC 3. Have the GPU pass the new prot for its buffer mappings This looks good to me, I will work on this and post something soon. Does that work? One thing I'm not sure about is whether IOMMU_CACHE should imply IOMMU_LLC, or whether there is a use-case for inner-cacheable, outer non-cacheable mappings for a coherent device. Have you ever seen that sort of thing before? I don't think there is such a usecase as Isaac mentioned. 
Thanks, Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH] iommu/io-pgtable-arm: Allow non-coherent masters to use system cache
On 2021-01-08 23:39, isa...@codeaurora.org wrote: On 2021-01-07 21:47, Sai Prakash Ranjan wrote: On 2021-01-07 22:27, isa...@codeaurora.org wrote: On 2021-01-06 03:56, Will Deacon wrote: On Thu, Dec 24, 2020 at 12:10:07PM +0530, Sai Prakash Ranjan wrote: commit ecd7274fb4cd ("iommu: Remove unused IOMMU_SYS_CACHE_ONLY flag") removed unused IOMMU_SYS_CACHE_ONLY prot flag and along with it went the memory type setting required for the non-coherent masters to use system cache. Now that system cache support for GPU is added, we will need to mark the memory as normal sys-cached for GPU to use system cache. Without this, the system cache lines are not allocated for GPU. We use the IO_PGTABLE_QUIRK_ARM_OUTER_WBWA quirk instead of a page protection flag as the flag cannot be exposed via DMA api because of no in-tree users. Signed-off-by: Sai Prakash Ranjan --- drivers/iommu/io-pgtable-arm.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index 7c9ea9d7874a..3fb7de8304a2 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -415,6 +415,9 @@ static arm_lpae_iopte arm_lpae_prot_to_pte(struct arm_lpae_io_pgtable *data, else if (prot & IOMMU_CACHE) pte |= (ARM_LPAE_MAIR_ATTR_IDX_CACHE << ARM_LPAE_PTE_ATTRINDX_SHIFT); + else if (data->iop.cfg.quirks & IO_PGTABLE_QUIRK_ARM_OUTER_WBWA) + pte |= (ARM_LPAE_MAIR_ATTR_IDX_INC_OCACHE + << ARM_LPAE_PTE_ATTRINDX_SHIFT); } While this approach of enabling system cache globally for both page tables and other buffers works for the GPU usecase, this isn't ideal for other clients that use system cache. For example, video clients only want to cache a subset of their buffers in the system cache, due to the sizing constraint imposed by how much of the system cache they can use. So, it would be ideal to have a way of expressing the desire to use the system cache on a per-buffer basis. 
Additionally, our video clients use the DMA layer, and since the requirement is for caching in the system cache to be a per buffer attribute, it seems like we would have to have a DMA attribute to express this on a per-buffer basis. I did bring this up initially [1], also where is this video client in upstream? AFAIK, only system cache user in upstream is GPU. We cannot add any DMA attribute unless there is any user upstream Right, there wouldn't be an upstream user, which would be problematic, but I was thinking of having it so that when video or any of our other clients that use this attribute on a per buffer basis upstreams their code, it's not too much of a stretch to add the support. Agreed. as per [2], so when the support for such a client is added, wouldn't ((data->iop.cfg.quirks & IO_PGTABLE_QUIRK_ARM_OUTER_WBWA) || PROT_FLAG) work? I don't think that will work, because we currently have clients who use the system cache as follows: -cache only page tables in the system cache -cache only data buffers in the system cache -cache both page tables and all buffers in the system cache -cache both page tables and some buffers in the system cache The approach you're suggesting doesn't allow for the last case, as caching the page tables in the system cache involves setting IO_PGTABLE_QUIRK_ARM_OUTER_WBWA, so we will end up losing the flexibility to cache some data buffers in the system cache. Ah yes, you are right, I believe Jordan mentioned the same [1]. [1] https://lore.kernel.org/lkml/20200709161352.gc21...@jcrouse1-lnx.qualcomm.com/ Ideally, the page table quirk would drive the settings for the TCR, and the prot flag drives the PTE for the mapping, as is done with the page table walker being dma-coherent, while buffers are mapped as cacheable based on IOMMU_CACHE. Thoughts? Right, mixing the two is not correct. Will's suggestion for a new prot flag sounds good to me, I will work on that. Thanks, Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. 
is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH] iommu/io-pgtable-arm: Allow non-coherent masters to use system cache
On 2021-01-07 22:27, isa...@codeaurora.org wrote: On 2021-01-06 03:56, Will Deacon wrote: On Thu, Dec 24, 2020 at 12:10:07PM +0530, Sai Prakash Ranjan wrote: commit ecd7274fb4cd ("iommu: Remove unused IOMMU_SYS_CACHE_ONLY flag") removed unused IOMMU_SYS_CACHE_ONLY prot flag and along with it went the memory type setting required for the non-coherent masters to use system cache. Now that system cache support for GPU is added, we will need to mark the memory as normal sys-cached for GPU to use system cache. Without this, the system cache lines are not allocated for GPU. We use the IO_PGTABLE_QUIRK_ARM_OUTER_WBWA quirk instead of a page protection flag as the flag cannot be exposed via DMA api because of no in-tree users. Signed-off-by: Sai Prakash Ranjan --- drivers/iommu/io-pgtable-arm.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index 7c9ea9d7874a..3fb7de8304a2 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -415,6 +415,9 @@ static arm_lpae_iopte arm_lpae_prot_to_pte(struct arm_lpae_io_pgtable *data, else if (prot & IOMMU_CACHE) pte |= (ARM_LPAE_MAIR_ATTR_IDX_CACHE << ARM_LPAE_PTE_ATTRINDX_SHIFT); + else if (data->iop.cfg.quirks & IO_PGTABLE_QUIRK_ARM_OUTER_WBWA) + pte |= (ARM_LPAE_MAIR_ATTR_IDX_INC_OCACHE + << ARM_LPAE_PTE_ATTRINDX_SHIFT); } While this approach of enabling system cache globally for both page tables and other buffers works for the GPU usecase, this isn't ideal for other clients that use system cache. For example, video clients only want to cache a subset of their buffers in the system cache, due to the sizing constraint imposed by how much of the system cache they can use. So, it would be ideal to have a way of expressing the desire to use the system cache on a per-buffer basis. 
Additionally, our video clients use the DMA layer, and since the requirement is for caching in the system cache to be a per buffer attribute, it seems like we would have to have a DMA attribute to express this on a per-buffer basis. I did bring this up initially [1], also where is this video client in upstream? AFAIK, only system cache user in upstream is GPU. We cannot add any DMA attribute unless there is any user upstream as per [2], so when the support for such a client is added, wouldn't ((data->iop.cfg.quirks & IO_PGTABLE_QUIRK_ARM_OUTER_WBWA) || PROT_FLAG) work? [1] https://lore.kernel.org/dri-devel/ecfda7ca80f6d7b4ff3d89b8758f4...@codeaurora.org/ [2] https://lore.kernel.org/linux-iommu/20191026053026.ga14...@lst.de/T/ Thanks, Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH] iommu/io-pgtable-arm: Allow non-coherent masters to use system cache
Hi Will, On 2021-01-06 17:26, Will Deacon wrote: On Thu, Dec 24, 2020 at 12:10:07PM +0530, Sai Prakash Ranjan wrote: commit ecd7274fb4cd ("iommu: Remove unused IOMMU_SYS_CACHE_ONLY flag") removed unused IOMMU_SYS_CACHE_ONLY prot flag and along with it went the memory type setting required for the non-coherent masters to use system cache. Now that system cache support for GPU is added, we will need to mark the memory as normal sys-cached for GPU to use system cache. Without this, the system cache lines are not allocated for GPU. We use the IO_PGTABLE_QUIRK_ARM_OUTER_WBWA quirk instead of a page protection flag as the flag cannot be exposed via DMA api because of no in-tree users. Signed-off-by: Sai Prakash Ranjan --- drivers/iommu/io-pgtable-arm.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index 7c9ea9d7874a..3fb7de8304a2 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -415,6 +415,9 @@ static arm_lpae_iopte arm_lpae_prot_to_pte(struct arm_lpae_io_pgtable *data, else if (prot & IOMMU_CACHE) pte |= (ARM_LPAE_MAIR_ATTR_IDX_CACHE << ARM_LPAE_PTE_ATTRINDX_SHIFT); + else if (data->iop.cfg.quirks & IO_PGTABLE_QUIRK_ARM_OUTER_WBWA) + pte |= (ARM_LPAE_MAIR_ATTR_IDX_INC_OCACHE + << ARM_LPAE_PTE_ATTRINDX_SHIFT); } drivers/iommu/io-pgtable.c currently documents this quirk as applying only to the page-table walker. Given that we only have one user at the moment, I think it's ok to change that, but please update the comment. Sure, how about this change in comment: * IO_PGTABLE_QUIRK_ARM_OUTER_WBWA: Override the outer-cacheability -* attributes set in the TCR for a non-coherent page-table walker. +* attributes set in the TCR for a non-coherent page-table walker +* and also to set the correct cacheability attributes to use an +* outer level of cache for non-coherent masters. 
We also need to decide on whether we want to allow the quirk to be passed if the coherency of the page-table walker differs from the DMA device, since we have these combinations: Coherent walker?IOMMU_CACHE IO_PGTABLE_QUIRK_ARM_OUTER_WBWA 0: N 0 0 1: N 0 1 2: N 1 0 3: N 1 1 4: Y 0 0 5: Y 0 1 6: Y 1 0 7: Y 1 1 Some of them are obviously bogus, such as (7), but I don't know what to do about cases such as (3) and (5). I thought this was already decided when IOMMU_SYS_CACHE_ONLY prot flag was added in this same location [1]. dma-coherent masters can use the normal cached memory type to use the system cache and non dma-coherent masters willing to use system cache should use normal sys-cached memory type with this quirk. [1] https://lore.kernel.org/linux-arm-msm/20190516093020.18028-1-vivek.gau...@codeaurora.org/ Thanks, Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH] iommu/io-pgtable-arm: Allow non-coherent masters to use system cache
commit ecd7274fb4cd ("iommu: Remove unused IOMMU_SYS_CACHE_ONLY flag") removed unused IOMMU_SYS_CACHE_ONLY prot flag and along with it went the memory type setting required for the non-coherent masters to use system cache. Now that system cache support for GPU is added, we will need to mark the memory as normal sys-cached for GPU to use system cache. Without this, the system cache lines are not allocated for GPU. We use the IO_PGTABLE_QUIRK_ARM_OUTER_WBWA quirk instead of a page protection flag as the flag cannot be exposed via DMA api because of no in-tree users. Signed-off-by: Sai Prakash Ranjan --- drivers/iommu/io-pgtable-arm.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index 7c9ea9d7874a..3fb7de8304a2 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -415,6 +415,9 @@ static arm_lpae_iopte arm_lpae_prot_to_pte(struct arm_lpae_io_pgtable *data, else if (prot & IOMMU_CACHE) pte |= (ARM_LPAE_MAIR_ATTR_IDX_CACHE << ARM_LPAE_PTE_ATTRINDX_SHIFT); + else if (data->iop.cfg.quirks & IO_PGTABLE_QUIRK_ARM_OUTER_WBWA) + pte |= (ARM_LPAE_MAIR_ATTR_IDX_INC_OCACHE + << ARM_LPAE_PTE_ATTRINDX_SHIFT); } if (prot & IOMMU_CACHE) -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCHv10 8/9] iommu: arm-smmu-impl: Use table to list QCOM implementations
Use table and of_match_node() to match qcom implementation instead of multiple of_device_compatible() calls for each QCOM SMMU implementation. Signed-off-by: Sai Prakash Ranjan Acked-by: Will Deacon --- drivers/iommu/arm/arm-smmu/arm-smmu-impl.c | 9 + drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 21 - drivers/iommu/arm/arm-smmu/arm-smmu.h | 1 - 3 files changed, 17 insertions(+), 14 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c b/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c index 7fed89c9d18a..26e2734eb4d7 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c @@ -214,14 +214,7 @@ struct arm_smmu_device *arm_smmu_impl_init(struct arm_smmu_device *smmu) if (of_device_is_compatible(np, "nvidia,tegra194-smmu")) return nvidia_smmu_impl_init(smmu); - if (of_device_is_compatible(np, "qcom,sdm845-smmu-500") || - of_device_is_compatible(np, "qcom,sc7180-smmu-500") || - of_device_is_compatible(np, "qcom,sm8150-smmu-500") || - of_device_is_compatible(np, "qcom,sm8250-smmu-500")) - return qcom_smmu_impl_init(smmu); - - if (of_device_is_compatible(smmu->dev->of_node, "qcom,adreno-smmu")) - return qcom_adreno_smmu_impl_init(smmu); + smmu = qcom_smmu_impl_init(smmu); if (of_device_is_compatible(np, "marvell,ap806-smmu-500")) smmu->impl = &mrvl_mmu500_impl; diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c index d0636c803a36..add1859b2899 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c @@ -318,12 +318,23 @@ static struct arm_smmu_device *qcom_smmu_create(struct arm_smmu_device *smmu, return &qsmmu->smmu; } +static const struct of_device_id __maybe_unused qcom_smmu_impl_of_match[] = { + { .compatible = "qcom,sc7180-smmu-500" }, + { .compatible = "qcom,sdm845-smmu-500" }, + { .compatible = "qcom,sm8150-smmu-500" }, + { .compatible = "qcom,sm8250-smmu-500" }, + { } +}; + struct arm_smmu_device 
*qcom_smmu_impl_init(struct arm_smmu_device *smmu) { - return qcom_smmu_create(smmu, &qcom_smmu_impl); -} + const struct device_node *np = smmu->dev->of_node; -struct arm_smmu_device *qcom_adreno_smmu_impl_init(struct arm_smmu_device *smmu) -{ - return qcom_smmu_create(smmu, &qcom_adreno_smmu_impl); + if (of_match_node(qcom_smmu_impl_of_match, np)) + return qcom_smmu_create(smmu, &qcom_smmu_impl); + + if (of_device_is_compatible(np, "qcom,adreno-smmu")) + return qcom_smmu_create(smmu, &qcom_adreno_smmu_impl); + + return smmu; } diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.h b/drivers/iommu/arm/arm-smmu/arm-smmu.h index cb7ca3a444c9..d2a2d1bc58ba 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.h +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.h @@ -523,7 +523,6 @@ static inline void arm_smmu_writeq(struct arm_smmu_device *smmu, int page, struct arm_smmu_device *arm_smmu_impl_init(struct arm_smmu_device *smmu); struct arm_smmu_device *nvidia_smmu_impl_init(struct arm_smmu_device *smmu); struct arm_smmu_device *qcom_smmu_impl_init(struct arm_smmu_device *smmu); -struct arm_smmu_device *qcom_adreno_smmu_impl_init(struct arm_smmu_device *smmu); void arm_smmu_write_context_bank(struct arm_smmu_device *smmu, int idx); int arm_mmu500_reset(struct arm_smmu_device *smmu); -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCHv10 7/9] drm/msm/a6xx: Add support for using system cache on MMU500 based targets
From: Jordan Crouse GPU targets with an MMU-500 attached have a slightly different process for enabling system cache. Use the compatible string on the IOMMU phandle to see if an MMU-500 is attached and modify the programming sequence accordingly. Signed-off-by: Jordan Crouse Signed-off-by: Sai Prakash Ranjan --- drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 46 +-- drivers/gpu/drm/msm/adreno/a6xx_gpu.h | 1 + 2 files changed, 37 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c index 95c98c642876..3f8b92da8cba 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c @@ -1042,6 +1042,8 @@ static void a6xx_llc_deactivate(struct a6xx_gpu *a6xx_gpu) static void a6xx_llc_activate(struct a6xx_gpu *a6xx_gpu) { + struct adreno_gpu *adreno_gpu = &a6xx_gpu->base; + struct msm_gpu *gpu = &adreno_gpu->base; u32 cntl1_regval = 0; if (IS_ERR(a6xx_gpu->llc_mmio)) @@ -1055,11 +1057,17 @@ static void a6xx_llc_activate(struct a6xx_gpu *a6xx_gpu) (gpu_scid << 15) | (gpu_scid << 20); } + /* +* For targets with a MMU500, activate the slice but don't program the +* register. The XBL will take care of that. 
+*/ if (!llcc_slice_activate(a6xx_gpu->htw_llc_slice)) { - u32 gpuhtw_scid = llcc_get_slice_id(a6xx_gpu->htw_llc_slice); + if (!a6xx_gpu->have_mmu500) { + u32 gpuhtw_scid = llcc_get_slice_id(a6xx_gpu->htw_llc_slice); - gpuhtw_scid &= 0x1f; - cntl1_regval |= FIELD_PREP(GENMASK(29, 25), gpuhtw_scid); + gpuhtw_scid &= 0x1f; + cntl1_regval |= FIELD_PREP(GENMASK(29, 25), gpuhtw_scid); + } } if (cntl1_regval) { @@ -1067,13 +1075,20 @@ static void a6xx_llc_activate(struct a6xx_gpu *a6xx_gpu) * Program the slice IDs for the various GPU blocks and GPU MMU * pagetables */ - a6xx_llc_write(a6xx_gpu, REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_1, cntl1_regval); - - /* -* Program cacheability overrides to not allocate cache lines on -* a write miss -*/ - a6xx_llc_rmw(a6xx_gpu, REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_0, 0xF, 0x03); + if (a6xx_gpu->have_mmu500) + gpu_rmw(gpu, REG_A6XX_GBIF_SCACHE_CNTL1, GENMASK(24, 0), + cntl1_regval); + else { + a6xx_llc_write(a6xx_gpu, + REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_1, cntl1_regval); + + /* +* Program cacheability overrides to not allocate cache +* lines on a write miss +*/ + a6xx_llc_rmw(a6xx_gpu, + REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_0, 0xF, 0x03); + } } } @@ -1086,10 +1101,21 @@ static void a6xx_llc_slices_destroy(struct a6xx_gpu *a6xx_gpu) static void a6xx_llc_slices_init(struct platform_device *pdev, struct a6xx_gpu *a6xx_gpu) { + struct device_node *phandle; + a6xx_gpu->llc_mmio = msm_ioremap(pdev, "cx_mem", "gpu_cx"); if (IS_ERR(a6xx_gpu->llc_mmio)) return; + /* +* There is a different programming path for targets with an mmu500 +* attached, so detect if that is the case +*/ + phandle = of_parse_phandle(pdev->dev.of_node, "iommus", 0); + a6xx_gpu->have_mmu500 = (phandle && + of_device_is_compatible(phandle, "arm,mmu-500")); + of_node_put(phandle); + a6xx_gpu->llc_slice = llcc_slice_getd(LLCC_GPU); a6xx_gpu->htw_llc_slice = llcc_slice_getd(LLCC_GPUHTW); diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h 
index 9e6079af679c..e793d329e77b 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h @@ -32,6 +32,7 @@ struct a6xx_gpu { void __iomem *llc_mmio; void *llc_slice; void *htw_llc_slice; + bool have_mmu500; }; #define to_a6xx_gpu(x) container_of(x, struct a6xx_gpu, base) -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCHv10 9/9] iommu: arm-smmu-impl: Add a space before open parenthesis
Fix the checkpatch warning for space required before the open parenthesis. Signed-off-by: Sai Prakash Ranjan Acked-by: Will Deacon --- drivers/iommu/arm/arm-smmu/arm-smmu-impl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c b/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c index 26e2734eb4d7..136872e77195 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c @@ -12,7 +12,7 @@ static int arm_smmu_gr0_ns(int offset) { - switch(offset) { + switch (offset) { case ARM_SMMU_GR0_sCR0: case ARM_SMMU_GR0_sACR: case ARM_SMMU_GR0_sGFSR: -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCHv10 0/9] System Cache support for GPU and required SMMU support
Some hardware variants contain a system cache or the last level cache(llc). This cache is typically a large block which is shared by multiple clients on the SOC. GPU uses the system cache to cache both the GPU data buffers(like textures) as well the SMMU pagetables. This helps with improved render performance as well as lower power consumption by reducing the bus traffic to the system memory. The system cache architecture allows the cache to be split into slices which then be used by multiple SOC clients. This patch series is an effort to enable and use two of those slices preallocated for the GPU, one for the GPU data buffers and another for the GPU SMMU hardware pagetables. Patch 1 - Patch 7 adds system cache support in SMMU and GPU driver. Patch 8 and 9 are minor cleanups for arm-smmu impl. Changes in v10: * Fix non-strict mode domain attr handling (Will) * Split the domain attribute patch into two (Will) Changes in v9: * Change name from domain_attr_io_pgtbl_cfg to io_pgtable_domain_attr (Will) * Modify comment for the quirk as suggested (Will) * Compare with IO_PGTABLE_QUIRK_NON_STRICT for non-strict mode (Will) Changes in v8: * Introduce a generic domain attribute for pagetable config (Will) * Rename quirk to more generic IO_PGTABLE_QUIRK_ARM_OUTER_WBWA (Will) * Move non-strict mode to use new struct domain_attr_io_pgtbl_config (Will) Changes in v7: * Squash Jordan's patch to support MMU500 targets * Rebase on top of for-joerg/arm-smmu/updates and Jordan's short series for adreno-smmu impl Changes in v6: * Move table to arm-smmu-qcom (Robin) Changes in v5: * Drop cleanup of blank lines since it was intentional (Robin) * Rebase again on top of msm-next-pgtables as it moves pretty fast Changes in v4: * Drop IOMMU_SYS_CACHE prot flag * Rebase on top of https://gitlab.freedesktop.org/drm/msm/-/tree/msm-next-pgtables Changes in v3: * Fix domain attribute setting to before iommu_attach_device() * Fix few code style and checkpatch warnings * Rebase on top of 
Jordan's latest split pagetables and per-instance pagetables support Changes in v2: * Addressed review comments and rebased on top of Jordan's split pagetables series Jordan Crouse (1): drm/msm/a6xx: Add support for using system cache on MMU500 based targets Sai Prakash Ranjan (6): iommu/io-pgtable: Add a domain attribute for pagetable configuration iommu/io-pgtable-arm: Add support to use system cache iommu/arm-smmu: Add support for pagetable config domain attribute iommu/arm-smmu: Move non-strict mode to use io_pgtable_domain_attr iommu: arm-smmu-impl: Use table to list QCOM implementations iommu: arm-smmu-impl: Add a space before open parenthesis Sharat Masetty (2): drm/msm: rearrange the gpu_rmw() function drm/msm/a6xx: Add support for using system cache(LLC) drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 109 + drivers/gpu/drm/msm/adreno/a6xx_gpu.h | 5 + drivers/gpu/drm/msm/adreno/adreno_gpu.c| 17 drivers/gpu/drm/msm/msm_drv.c | 8 ++ drivers/gpu/drm/msm/msm_drv.h | 1 + drivers/gpu/drm/msm/msm_gpu.h | 5 +- drivers/iommu/arm/arm-smmu/arm-smmu-impl.c | 11 +-- drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 21 +++- drivers/iommu/arm/arm-smmu/arm-smmu.c | 33 ++- drivers/iommu/arm/arm-smmu/arm-smmu.h | 3 +- drivers/iommu/io-pgtable-arm.c | 10 +- include/linux/io-pgtable.h | 8 ++ include/linux/iommu.h | 1 + 13 files changed, 205 insertions(+), 27 deletions(-) base-commit: a29bbb0861f487a5e144dc997a9f71a36c7a2404 -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCHv10 5/9] drm/msm: rearrange the gpu_rmw() function
From: Sharat Masetty The register read-modify-write construct is generic enough that it can be used by other subsystems as needed, create a more generic rmw() function and have the gpu_rmw() use this new function. Signed-off-by: Sharat Masetty Reviewed-by: Jordan Crouse Signed-off-by: Sai Prakash Ranjan --- drivers/gpu/drm/msm/msm_drv.c | 8 drivers/gpu/drm/msm/msm_drv.h | 1 + drivers/gpu/drm/msm/msm_gpu.h | 5 + 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c index 49685571dc0e..a1e22b974b77 100644 --- a/drivers/gpu/drm/msm/msm_drv.c +++ b/drivers/gpu/drm/msm/msm_drv.c @@ -180,6 +180,14 @@ u32 msm_readl(const void __iomem *addr) return val; } +void msm_rmw(void __iomem *addr, u32 mask, u32 or) +{ + u32 val = msm_readl(addr); + + val &= ~mask; + msm_writel(val | or, addr); +} + struct msm_vblank_work { struct work_struct work; int crtc_id; diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h index b9dd8f8f4887..655b3b0424a1 100644 --- a/drivers/gpu/drm/msm/msm_drv.h +++ b/drivers/gpu/drm/msm/msm_drv.h @@ -478,6 +478,7 @@ void __iomem *msm_ioremap_quiet(struct platform_device *pdev, const char *name, const char *dbgname); void msm_writel(u32 data, void __iomem *addr); u32 msm_readl(const void __iomem *addr); +void msm_rmw(void __iomem *addr, u32 mask, u32 or); struct msm_gpu_submitqueue; int msm_submitqueue_init(struct drm_device *drm, struct msm_file_private *ctx); diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h index 6c9e1fdc1a76..b2b419277953 100644 --- a/drivers/gpu/drm/msm/msm_gpu.h +++ b/drivers/gpu/drm/msm/msm_gpu.h @@ -246,10 +246,7 @@ static inline u32 gpu_read(struct msm_gpu *gpu, u32 reg) static inline void gpu_rmw(struct msm_gpu *gpu, u32 reg, u32 mask, u32 or) { - uint32_t val = gpu_read(gpu, reg); - - val &= ~mask; - gpu_write(gpu, reg, val | or); + msm_rmw(gpu->mmio + (reg << 2), mask, or); } static inline u64 gpu_read64(struct 
msm_gpu *gpu, u32 lo, u32 hi) -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCHv10 6/9] drm/msm/a6xx: Add support for using system cache(LLC)
From: Sharat Masetty The last level system cache can be partitioned to 32 different slices of which GPU has two slices preallocated. One slice is used for caching GPU buffers and the other slice is used for caching the GPU SMMU pagetables. This talks to the core system cache driver to acquire the slice handles, configure the SCID's to those slices and activates and deactivates the slices upon GPU power collapse and restore. Some support from the IOMMU driver is also needed to make use of the system cache to set the right TCR attributes. GPU then has the ability to override a few cacheability parameters which it does to override write-allocate to write-no-allocate as the GPU hardware does not benefit much from it. DOMAIN_ATTR_IO_PGTABLE_CFG is another domain level attribute used by the IOMMU driver for pagetable configuration which will be used to set a quirk initially to set the right attributes to cache the hardware pagetables into the system cache. Signed-off-by: Sharat Masetty [saiprakash.ranjan: fix to set attr before device attach to iommu and rebase] Signed-off-by: Sai Prakash Ranjan --- drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 83 + drivers/gpu/drm/msm/adreno/a6xx_gpu.h | 4 ++ drivers/gpu/drm/msm/adreno/adreno_gpu.c | 17 + 3 files changed, 104 insertions(+) diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c index 948f3656c20c..95c98c642876 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c @@ -8,7 +8,9 @@ #include "a6xx_gpu.h" #include "a6xx_gmu.xml.h" +#include #include +#include #define GPU_PAS_ID 13 @@ -1022,6 +1024,79 @@ static irqreturn_t a6xx_irq(struct msm_gpu *gpu) return IRQ_HANDLED; } +static void a6xx_llc_rmw(struct a6xx_gpu *a6xx_gpu, u32 reg, u32 mask, u32 or) +{ + return msm_rmw(a6xx_gpu->llc_mmio + (reg << 2), mask, or); +} + +static void a6xx_llc_write(struct a6xx_gpu *a6xx_gpu, u32 reg, u32 value) +{ + return msm_writel(value, a6xx_gpu->llc_mmio + (reg << 2)); 
+} + +static void a6xx_llc_deactivate(struct a6xx_gpu *a6xx_gpu) +{ + llcc_slice_deactivate(a6xx_gpu->llc_slice); + llcc_slice_deactivate(a6xx_gpu->htw_llc_slice); +} + +static void a6xx_llc_activate(struct a6xx_gpu *a6xx_gpu) +{ + u32 cntl1_regval = 0; + + if (IS_ERR(a6xx_gpu->llc_mmio)) + return; + + if (!llcc_slice_activate(a6xx_gpu->llc_slice)) { + u32 gpu_scid = llcc_get_slice_id(a6xx_gpu->llc_slice); + + gpu_scid &= 0x1f; + cntl1_regval = (gpu_scid << 0) | (gpu_scid << 5) | (gpu_scid << 10) | + (gpu_scid << 15) | (gpu_scid << 20); + } + + if (!llcc_slice_activate(a6xx_gpu->htw_llc_slice)) { + u32 gpuhtw_scid = llcc_get_slice_id(a6xx_gpu->htw_llc_slice); + + gpuhtw_scid &= 0x1f; + cntl1_regval |= FIELD_PREP(GENMASK(29, 25), gpuhtw_scid); + } + + if (cntl1_regval) { + /* +* Program the slice IDs for the various GPU blocks and GPU MMU +* pagetables +*/ + a6xx_llc_write(a6xx_gpu, REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_1, cntl1_regval); + + /* +* Program cacheability overrides to not allocate cache lines on +* a write miss +*/ + a6xx_llc_rmw(a6xx_gpu, REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_0, 0xF, 0x03); + } +} + +static void a6xx_llc_slices_destroy(struct a6xx_gpu *a6xx_gpu) +{ + llcc_slice_putd(a6xx_gpu->llc_slice); + llcc_slice_putd(a6xx_gpu->htw_llc_slice); +} + +static void a6xx_llc_slices_init(struct platform_device *pdev, + struct a6xx_gpu *a6xx_gpu) +{ + a6xx_gpu->llc_mmio = msm_ioremap(pdev, "cx_mem", "gpu_cx"); + if (IS_ERR(a6xx_gpu->llc_mmio)) + return; + + a6xx_gpu->llc_slice = llcc_slice_getd(LLCC_GPU); + a6xx_gpu->htw_llc_slice = llcc_slice_getd(LLCC_GPUHTW); + + if (IS_ERR(a6xx_gpu->llc_slice) && IS_ERR(a6xx_gpu->htw_llc_slice)) + a6xx_gpu->llc_mmio = ERR_PTR(-EINVAL); +} + static int a6xx_pm_resume(struct msm_gpu *gpu) { struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); @@ -1038,6 +1113,8 @@ static int a6xx_pm_resume(struct msm_gpu *gpu) msm_gpu_resume_devfreq(gpu); + a6xx_llc_activate(a6xx_gpu); + return 0; } @@ -1048,6 +1125,8 @@ static int 
a6xx_pm_suspend(struct msm_gpu *gpu) trace_msm_gpu_suspend(0); + a6xx_llc_deactivate(a6xx_gpu); + devfreq_suspend_device(gpu->devfreq.devfreq); return a6xx_gmu_stop(a6xx_gpu); @@ -1091,6 +1170,8 @@ static void a6xx_destroy(struct msm_gpu *gpu) d
[PATCHv10 2/9] iommu/io-pgtable-arm: Add support to use system cache
Add a quirk IO_PGTABLE_QUIRK_ARM_OUTER_WBWA to override the outer-cacheability attributes set in the TCR for a non-coherent page table walker when using system cache. Signed-off-by: Sai Prakash Ranjan --- drivers/iommu/io-pgtable-arm.c | 10 -- include/linux/io-pgtable.h | 4 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index a7a9bc08dcd1..7c9ea9d7874a 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -761,7 +761,8 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie) if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_NS | IO_PGTABLE_QUIRK_NON_STRICT | - IO_PGTABLE_QUIRK_ARM_TTBR1)) + IO_PGTABLE_QUIRK_ARM_TTBR1 | + IO_PGTABLE_QUIRK_ARM_OUTER_WBWA)) return NULL; data = arm_lpae_alloc_pgtable(cfg); @@ -773,10 +774,15 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie) tcr->sh = ARM_LPAE_TCR_SH_IS; tcr->irgn = ARM_LPAE_TCR_RGN_WBWA; tcr->orgn = ARM_LPAE_TCR_RGN_WBWA; + if (cfg->quirks & IO_PGTABLE_QUIRK_ARM_OUTER_WBWA) + goto out_free_data; } else { tcr->sh = ARM_LPAE_TCR_SH_OS; tcr->irgn = ARM_LPAE_TCR_RGN_NC; - tcr->orgn = ARM_LPAE_TCR_RGN_NC; + if (!(cfg->quirks & IO_PGTABLE_QUIRK_ARM_OUTER_WBWA)) + tcr->orgn = ARM_LPAE_TCR_RGN_NC; + else + tcr->orgn = ARM_LPAE_TCR_RGN_WBWA; } tg1 = cfg->quirks & IO_PGTABLE_QUIRK_ARM_TTBR1; diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h index 215fd9d69540..fb4d5a763e0c 100644 --- a/include/linux/io-pgtable.h +++ b/include/linux/io-pgtable.h @@ -86,6 +86,9 @@ struct io_pgtable_cfg { * * IO_PGTABLE_QUIRK_ARM_TTBR1: (ARM LPAE format) Configure the table * for use in the upper half of a split address space. +* +* IO_PGTABLE_QUIRK_ARM_OUTER_WBWA: Override the outer-cacheability +* attributes set in the TCR for a non-coherent page-table walker. 
*/ #define IO_PGTABLE_QUIRK_ARM_NS BIT(0) #define IO_PGTABLE_QUIRK_NO_PERMS BIT(1) @@ -93,6 +96,7 @@ struct io_pgtable_cfg { #define IO_PGTABLE_QUIRK_ARM_MTK_EXT BIT(3) #define IO_PGTABLE_QUIRK_NON_STRICT BIT(4) #define IO_PGTABLE_QUIRK_ARM_TTBR1 BIT(5) + #define IO_PGTABLE_QUIRK_ARM_OUTER_WBWA BIT(6) unsigned long quirks; unsigned long pgsize_bitmap; unsigned int ias; -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCHv10 4/9] iommu/arm-smmu: Move non-strict mode to use io_pgtable_domain_attr
Now that we have a struct io_pgtable_domain_attr with quirks, use that for non_strict mode as well thereby removing the need for more members of arm_smmu_domain in the future. Signed-off-by: Sai Prakash Ranjan --- drivers/iommu/arm/arm-smmu/arm-smmu.c | 15 +-- drivers/iommu/arm/arm-smmu/arm-smmu.h | 1 - 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index 4b9b10fe50ed..d8979bb71fc0 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -786,9 +786,6 @@ static int arm_smmu_init_domain_context(struct iommu_domain *domain, goto out_clear_smmu; } - if (smmu_domain->non_strict) - pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_NON_STRICT; - if (smmu_domain->pgtbl_cfg.quirks) pgtbl_cfg.quirks |= smmu_domain->pgtbl_cfg.quirks; @@ -1526,9 +1523,12 @@ static int arm_smmu_domain_get_attr(struct iommu_domain *domain, break; case IOMMU_DOMAIN_DMA: switch (attr) { - case DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE: - *(int *)data = smmu_domain->non_strict; + case DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE: { + bool non_strict = smmu_domain->pgtbl_cfg.quirks & + IO_PGTABLE_QUIRK_NON_STRICT; + *(int *)data = non_strict; return 0; + } default: return -ENODEV; } @@ -1578,7 +1578,10 @@ static int arm_smmu_domain_set_attr(struct iommu_domain *domain, case IOMMU_DOMAIN_DMA: switch (attr) { case DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE: - smmu_domain->non_strict = *(int *)data; + if (*(int *)data) + smmu_domain->pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_NON_STRICT; + else + smmu_domain->pgtbl_cfg.quirks &= ~IO_PGTABLE_QUIRK_NON_STRICT; break; default: ret = -ENODEV; diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.h b/drivers/iommu/arm/arm-smmu/arm-smmu.h index bb5a419f240f..cb7ca3a444c9 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.h +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.h @@ -368,7 +368,6 @@ struct arm_smmu_domain { const struct iommu_flush_ops*flush_ops; struct arm_smmu_cfg cfg; enum 
arm_smmu_domain_stage stage; - bool non_strict; struct mutex init_mutex; /* Protects smmu pointer */ spinlock_t cb_lock; /* Serialises ATS1* ops and TLB syncs */ struct iommu_domain domain; -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCHv10 3/9] iommu/arm-smmu: Add support for pagetable config domain attribute
Add support for domain attribute DOMAIN_ATTR_IO_PGTABLE_CFG to get/set pagetable configuration data which initially will be used to set quirks and later can be extended to include other pagetable configuration data. Signed-off-by: Sai Prakash Ranjan --- drivers/iommu/arm/arm-smmu/arm-smmu.c | 20 drivers/iommu/arm/arm-smmu/arm-smmu.h | 1 + 2 files changed, 21 insertions(+) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index 0f28a8614da3..4b9b10fe50ed 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -789,6 +789,9 @@ static int arm_smmu_init_domain_context(struct iommu_domain *domain, if (smmu_domain->non_strict) pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_NON_STRICT; + if (smmu_domain->pgtbl_cfg.quirks) + pgtbl_cfg.quirks |= smmu_domain->pgtbl_cfg.quirks; + pgtbl_ops = alloc_io_pgtable_ops(fmt, &pgtbl_cfg, smmu_domain); if (!pgtbl_ops) { ret = -ENOMEM; @@ -1511,6 +1514,12 @@ static int arm_smmu_domain_get_attr(struct iommu_domain *domain, case DOMAIN_ATTR_NESTING: *(int *)data = (smmu_domain->stage == ARM_SMMU_DOMAIN_NESTED); return 0; + case DOMAIN_ATTR_IO_PGTABLE_CFG: { + struct io_pgtable_domain_attr *pgtbl_cfg = data; + *pgtbl_cfg = smmu_domain->pgtbl_cfg; + + return 0; + } default: return -ENODEV; } @@ -1551,6 +1560,17 @@ static int arm_smmu_domain_set_attr(struct iommu_domain *domain, else smmu_domain->stage = ARM_SMMU_DOMAIN_S1; break; + case DOMAIN_ATTR_IO_PGTABLE_CFG: { + struct io_pgtable_domain_attr *pgtbl_cfg = data; + + if (smmu_domain->smmu) { + ret = -EPERM; + goto out_unlock; + } + + smmu_domain->pgtbl_cfg = *pgtbl_cfg; + break; + } default: ret = -ENODEV; } diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.h b/drivers/iommu/arm/arm-smmu/arm-smmu.h index 04288b6fc619..bb5a419f240f 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.h +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.h @@ -364,6 +364,7 @@ enum arm_smmu_domain_stage { struct arm_smmu_domain { struct 
arm_smmu_device *smmu; struct io_pgtable_ops *pgtbl_ops; + struct io_pgtable_domain_attr pgtbl_cfg; const struct iommu_flush_ops*flush_ops; struct arm_smmu_cfg cfg; enum arm_smmu_domain_stage stage; -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu