[PATCHv3] iommu/arm-smmu-qcom: Add debug support for TLB sync timeouts
TLB sync timeouts can be due to various reasons such as TBU power down or pending TCU/TBU invalidation/sync and so on. Debugging these often require dumping of some implementation defined registers to know the status of TBU/TCU operations and some of these registers are not accessible in non-secure world such as from kernel and requires SMC calls to read them in the secure world. So, add this debug support to dump implementation defined registers for TLB sync timeout issues. Signed-off-by: Sai Prakash Ranjan --- Changes in v3: * Move this debug feature to arm-smmu-qcom-debug.c (Will Deacon). * Keep single ratelimit state and remove local variable (Robin). Changes in v2: * Use scm call consistently so that it works on older chipsets where some of these regs are secure registers. * Add device specific data to get the implementation defined register offsets. --- drivers/iommu/Kconfig | 10 ++ drivers/iommu/arm/arm-smmu/Makefile | 1 + .../iommu/arm/arm-smmu/arm-smmu-qcom-debug.c | 142 ++ drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c| 32 +++- drivers/iommu/arm/arm-smmu/arm-smmu-qcom.h| 28 drivers/iommu/arm/arm-smmu/arm-smmu.c | 6 +- drivers/iommu/arm/arm-smmu/arm-smmu.h | 1 + 7 files changed, 211 insertions(+), 9 deletions(-) create mode 100644 drivers/iommu/arm/arm-smmu/arm-smmu-qcom-debug.c create mode 100644 drivers/iommu/arm/arm-smmu/arm-smmu-qcom.h diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index c79a0df090c0..5c5cb5bee8b6 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -363,6 +363,16 @@ config ARM_SMMU_QCOM When running on a Qualcomm platform that has the custom variant of the ARM SMMU, this needs to be built into the SMMU driver. +config ARM_SMMU_QCOM_DEBUG + bool "ARM SMMU QCOM implementation defined debug support" + depends on ARM_SMMU_QCOM + help + Support for implementation specific debug features in ARM SMMU + hardware found in QTI platforms. 
+ + Say Y here to enable debug for issues such as TLB sync timeouts + which require implementation defined register dumps. + config ARM_SMMU_V3 tristate "ARM Ltd. System MMU Version 3 (SMMUv3) Support" depends on ARM64 diff --git a/drivers/iommu/arm/arm-smmu/Makefile b/drivers/iommu/arm/arm-smmu/Makefile index b0cc01aa20c9..2a5a95e8e3f9 100644 --- a/drivers/iommu/arm/arm-smmu/Makefile +++ b/drivers/iommu/arm/arm-smmu/Makefile @@ -3,3 +3,4 @@ obj-$(CONFIG_QCOM_IOMMU) += qcom_iommu.o obj-$(CONFIG_ARM_SMMU) += arm_smmu.o arm_smmu-objs += arm-smmu.o arm-smmu-impl.o arm-smmu-nvidia.o arm_smmu-$(CONFIG_ARM_SMMU_QCOM) += arm-smmu-qcom.o +arm_smmu-$(CONFIG_ARM_SMMU_QCOM_DEBUG) += arm-smmu-qcom-debug.o diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom-debug.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom-debug.c new file mode 100644 index ..6eed8e67a0ca --- /dev/null +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom-debug.c @@ -0,0 +1,142 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2022 Qualcomm Innovation Center, Inc. All rights reserved. 
+ */ + +#include +#include +#include + +#include "arm-smmu.h" +#include "arm-smmu-qcom.h" + +enum qcom_smmu_impl_reg_offset { + QCOM_SMMU_TBU_PWR_STATUS, + QCOM_SMMU_STATS_SYNC_INV_TBU_ACK, + QCOM_SMMU_MMU2QSS_AND_SAFE_WAIT_CNTR, +}; + +struct qcom_smmu_config { + const u32 *reg_offset; +}; + +void qcom_smmu_tlb_sync_debug(struct arm_smmu_device *smmu) +{ + int ret; + u32 tbu_pwr_status, sync_inv_ack, sync_inv_progress; + struct qcom_smmu *qsmmu = container_of(smmu, struct qcom_smmu, smmu); + const struct qcom_smmu_config *cfg; + static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + + if (__ratelimit(&rs)) { + dev_err(smmu->dev, "TLB sync timed out -- SMMU may be deadlocked\n"); + + cfg = qsmmu->cfg; + if (!cfg) + return; + + ret = qcom_scm_io_readl(smmu->ioaddr + cfg->reg_offset[QCOM_SMMU_TBU_PWR_STATUS], + &tbu_pwr_status); + if (ret) + dev_err(smmu->dev, + "Failed to read TBU power status: %d\n", ret); + + ret = qcom_scm_io_readl(smmu->ioaddr + cfg->reg_offset[QCOM_SMMU_STATS_SYNC_INV_TBU_ACK], + &sync_inv_ack); + if (ret) + dev_err(smmu->dev, + "Failed to read TBU sync/inv ack status: %d\n", ret); + + ret = qcom_scm_io_readl(smmu->ioaddr + cfg->reg_offset[QCOM_SMMU_MMU2QSS_AND_SAFE_WAIT_CNTR], +
Re: [PATCHv2] iommu/arm-smmu-qcom: Add debug support for TLB sync timeouts
Hi Robin, On 7/6/2022 10:15 PM, Robin Murphy wrote: On 2022-05-26 05:14, Sai Prakash Ranjan wrote: TLB sync timeouts can be due to various reasons such as TBU power down or pending TCU/TBU invalidation/sync and so on. Debugging these often require dumping of some implementation defined registers to know the status of TBU/TCU operations and some of these registers are not accessible in non-secure world such as from kernel and requires SMC calls to read them in the secure world. So, add this debug support to dump implementation defined registers for TLB sync timeout issues. Signed-off-by: Sai Prakash Ranjan --- Changes in v2: * Use scm call consistently so that it works on older chipsets where some of these regs are secure registers. * Add device specific data to get the implementation defined register offsets. --- drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 161 ++--- drivers/iommu/arm/arm-smmu/arm-smmu.c | 2 + drivers/iommu/arm/arm-smmu/arm-smmu.h | 1 + 3 files changed, 146 insertions(+), 18 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c index 7820711c4560..bb68aa85b28b 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c @@ -5,13 +5,27 @@ #include #include +#include #include #include #include "arm-smmu.h" +#define QCOM_DUMMY_VAL -1 + +enum qcom_smmu_impl_reg_offset { + QCOM_SMMU_TBU_PWR_STATUS, + QCOM_SMMU_STATS_SYNC_INV_TBU_ACK, + QCOM_SMMU_MMU2QSS_AND_SAFE_WAIT_CNTR, +}; + +struct qcom_smmu_config { + const u32 *reg_offset; +}; + struct qcom_smmu { struct arm_smmu_device smmu; + const struct qcom_smmu_config *cfg; bool bypass_quirk; u8 bypass_cbndx; u32 stall_enabled; @@ -22,6 +36,56 @@ static struct qcom_smmu *to_qcom_smmu(struct arm_smmu_device *smmu) return container_of(smmu, struct qcom_smmu, smmu); } +static void qcom_smmu_tlb_sync(struct arm_smmu_device *smmu, int page, + int sync, int status) +{ + int ret; + unsigned int spin_cnt, delay; 
+ u32 reg, tbu_pwr_status, sync_inv_ack, sync_inv_progress; + struct qcom_smmu *qsmmu = to_qcom_smmu(smmu); + const struct qcom_smmu_config *cfg; + + arm_smmu_writel(smmu, page, sync, QCOM_DUMMY_VAL); + for (delay = 1; delay < TLB_LOOP_TIMEOUT; delay *= 2) { + for (spin_cnt = TLB_SPIN_COUNT; spin_cnt > 0; spin_cnt--) { + reg = arm_smmu_readl(smmu, page, status); + if (!(reg & ARM_SMMU_sTLBGSTATUS_GSACTIVE)) + return; + cpu_relax(); + } + udelay(delay); + } + + dev_err_ratelimited(smmu->dev, + "TLB sync timed out -- SMMU may be deadlocked\n"); Maybe consider a single ratelimit state for the whole function so all the output stays together. If things go sufficiently wrong, mixed up bits of partial output from different events may be misleadingly unhelpful (and at the very least it'll be up to 5x more effective at the intent of limiting log spam). Right, makes sense. Will change it. + cfg = qsmmu->cfg; + if (!cfg) + return; + + ret = qcom_scm_io_readl(smmu->ioaddr + cfg->reg_offset[QCOM_SMMU_TBU_PWR_STATUS], + &tbu_pwr_status); + if (ret) + dev_err_ratelimited(smmu->dev, + "Failed to read TBU power status: %d\n", ret); + + ret = qcom_scm_io_readl(smmu->ioaddr + cfg->reg_offset[QCOM_SMMU_STATS_SYNC_INV_TBU_ACK], + &sync_inv_ack); + if (ret) + dev_err_ratelimited(smmu->dev, + "Failed to read TBU sync/inv ack status: %d\n", ret); + + ret = qcom_scm_io_readl(smmu->ioaddr + cfg->reg_offset[QCOM_SMMU_MMU2QSS_AND_SAFE_WAIT_CNTR], + &sync_inv_progress); + if (ret) + dev_err_ratelimited(smmu->dev, + "Failed to read TCU syn/inv progress: %d\n", ret); + + dev_err_ratelimited(smmu->dev, + "TBU: power_status %#x sync_inv_ack %#x sync_inv_progress %#x\n", + tbu_pwr_status, sync_inv_ack, sync_inv_progress); +} + static void qcom_adreno_smmu_write_sctlr(struct arm_smmu_device *smmu, int idx, u32 reg) { @@ -374,6 +438,7 @@ static const struct arm_smmu_impl qcom_smmu_impl = { .def_domain_type = qcom_smmu_def_domain_type, .reset = qcom_smmu500_reset, .write_s2cr = qcom_smmu_write_s2cr, + 
.tlb_sync = qcom_smmu_tlb_sync, }; static const struct arm_smmu_impl qcom_adreno_smmu_impl = { @@ -382,12 +447,84 @@ static const struct arm_smmu_impl qcom_adreno_smmu_impl = { .reset = qcom_smmu500_reset, .alloc_context_bank = qcom_adreno_smmu_alloc_context_bank, .write_sctlr = qcom_adreno_smmu_write_sctlr, + .tlb_sync = qcom_smmu_t
Re: [PATCHv2] iommu/arm-smmu-qcom: Add debug support for TLB sync timeouts
On 7/6/2022 5:26 PM, Will Deacon wrote: On Thu, May 26, 2022 at 09:44:03AM +0530, Sai Prakash Ranjan wrote: TLB sync timeouts can be due to various reasons such as TBU power down or pending TCU/TBU invalidation/sync and so on. Debugging these often require dumping of some implementation defined registers to know the status of TBU/TCU operations and some of these registers are not accessible in non-secure world such as from kernel and requires SMC calls to read them in the secure world. So, add this debug support to dump implementation defined registers for TLB sync timeout issues. Signed-off-by: Sai Prakash Ranjan --- Changes in v2: * Use scm call consistently so that it works on older chipsets where some of these regs are secure registers. * Add device specific data to get the implementation defined register offsets. --- drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 161 ++--- drivers/iommu/arm/arm-smmu/arm-smmu.c | 2 + drivers/iommu/arm/arm-smmu/arm-smmu.h | 1 + 3 files changed, 146 insertions(+), 18 deletions(-) If this is useful to you, then I suppose it's something we could support, however I'm pretty worried about our ability to maintain/scale this stuff as it is extended to support additional SoCs and other custom debugging features. Perhaps you could stick it all in arm-smmu-qcom-debug.c and have a new config option for that, so at least it's even further out of the way? Will Sounds good to me, will do that. Thanks, Sai ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCHv2] iommu/arm-smmu-qcom: Add debug support for TLB sync timeouts
On 6/23/2022 11:32 AM, Sai Prakash Ranjan wrote: On 5/26/2022 9:44 AM, Sai Prakash Ranjan wrote: TLB sync timeouts can be due to various reasons such as TBU power down or pending TCU/TBU invalidation/sync and so on. Debugging these often require dumping of some implementation defined registers to know the status of TBU/TCU operations and some of these registers are not accessible in non-secure world such as from kernel and requires SMC calls to read them in the secure world. So, add this debug support to dump implementation defined registers for TLB sync timeout issues. Signed-off-by: Sai Prakash Ranjan --- Changes in v2: * Use scm call consistently so that it works on older chipsets where some of these regs are secure registers. * Add device specific data to get the implementation defined register offsets. --- drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 161 ++--- drivers/iommu/arm/arm-smmu/arm-smmu.c | 2 + drivers/iommu/arm/arm-smmu/arm-smmu.h | 1 + 3 files changed, 146 insertions(+), 18 deletions(-) Any comments on this patch? Gentle Ping !! Thanks, Sai ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCHv2] iommu/arm-smmu-qcom: Add debug support for TLB sync timeouts
On 5/26/2022 9:44 AM, Sai Prakash Ranjan wrote: TLB sync timeouts can be due to various reasons such as TBU power down or pending TCU/TBU invalidation/sync and so on. Debugging these often require dumping of some implementation defined registers to know the status of TBU/TCU operations and some of these registers are not accessible in non-secure world such as from kernel and requires SMC calls to read them in the secure world. So, add this debug support to dump implementation defined registers for TLB sync timeout issues. Signed-off-by: Sai Prakash Ranjan --- Changes in v2: * Use scm call consistently so that it works on older chipsets where some of these regs are secure registers. * Add device specific data to get the implementation defined register offsets. --- drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 161 ++--- drivers/iommu/arm/arm-smmu/arm-smmu.c | 2 + drivers/iommu/arm/arm-smmu/arm-smmu.h | 1 + 3 files changed, 146 insertions(+), 18 deletions(-) Any comments on this patch? Thanks, Sai ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCHv2] iommu/arm-smmu-qcom: Add debug support for TLB sync timeouts
Hi Vincent, On 6/9/2022 2:52 AM, Vincent Knecht wrote: Le jeudi 26 mai 2022 à 09:44 +0530, Sai Prakash Ranjan a écrit : TLB sync timeouts can be due to various reasons such as TBU power down or pending TCU/TBU invalidation/sync and so on. Debugging these often require dumping of some implementation defined registers to know the status of TBU/TCU operations and some of these registers are not accessible in non-secure world such as from kernel and requires SMC calls to read them in the secure world. So, add this debug support to dump implementation defined registers for TLB sync timeout issues. Signed-off-by: Sai Prakash Ranjan --- Changes in v2: * Use scm call consistently so that it works on older chipsets where some of these regs are secure registers. * Add device specific data to get the implementation defined register offsets. --- drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 161 ++--- drivers/iommu/arm/arm-smmu/arm-smmu.c | 2 + drivers/iommu/arm/arm-smmu/arm-smmu.h | 1 + 3 files changed, 146 insertions(+), 18 deletions(-) Hi Sai, and thanks for this patch ! I've encountered TLB sync timeouts with msm8939 SoC recently. What would be needed to add to this patch so this SoC is supported ? Like, where could one check the values to be used in an equivalent of qcom_smmu_impl0_reg_offset values for this SoC (if any change needed) ? Current values are not found by simply greping in downstream/vendor dtsi/dts files... These are implementation defined registers and some might not be present on older SoCs and sometimes they don't add this support in downstream kernels even if the registers are present. I looked up the IP doc for msm8939 and I could find only TBU_PWR_STATUS register and you can use the same offset for it as given in this patch. Thanks, Sai ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCHv2] iommu/arm-smmu-qcom: Add debug support for TLB sync timeouts
TLB sync timeouts can be due to various reasons such as TBU power down or pending TCU/TBU invalidation/sync and so on. Debugging these often require dumping of some implementation defined registers to know the status of TBU/TCU operations and some of these registers are not accessible in non-secure world such as from kernel and requires SMC calls to read them in the secure world. So, add this debug support to dump implementation defined registers for TLB sync timeout issues. Signed-off-by: Sai Prakash Ranjan --- Changes in v2: * Use scm call consistently so that it works on older chipsets where some of these regs are secure registers. * Add device specific data to get the implementation defined register offsets. --- drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 161 ++--- drivers/iommu/arm/arm-smmu/arm-smmu.c | 2 + drivers/iommu/arm/arm-smmu/arm-smmu.h | 1 + 3 files changed, 146 insertions(+), 18 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c index 7820711c4560..bb68aa85b28b 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c @@ -5,13 +5,27 @@ #include #include +#include #include #include #include "arm-smmu.h" +#define QCOM_DUMMY_VAL -1 + +enum qcom_smmu_impl_reg_offset { + QCOM_SMMU_TBU_PWR_STATUS, + QCOM_SMMU_STATS_SYNC_INV_TBU_ACK, + QCOM_SMMU_MMU2QSS_AND_SAFE_WAIT_CNTR, +}; + +struct qcom_smmu_config { + const u32 *reg_offset; +}; + struct qcom_smmu { struct arm_smmu_device smmu; + const struct qcom_smmu_config *cfg; bool bypass_quirk; u8 bypass_cbndx; u32 stall_enabled; @@ -22,6 +36,56 @@ static struct qcom_smmu *to_qcom_smmu(struct arm_smmu_device *smmu) return container_of(smmu, struct qcom_smmu, smmu); } +static void qcom_smmu_tlb_sync(struct arm_smmu_device *smmu, int page, + int sync, int status) +{ + int ret; + unsigned int spin_cnt, delay; + u32 reg, tbu_pwr_status, sync_inv_ack, sync_inv_progress; + struct qcom_smmu *qsmmu = 
to_qcom_smmu(smmu); + const struct qcom_smmu_config *cfg; + + arm_smmu_writel(smmu, page, sync, QCOM_DUMMY_VAL); + for (delay = 1; delay < TLB_LOOP_TIMEOUT; delay *= 2) { + for (spin_cnt = TLB_SPIN_COUNT; spin_cnt > 0; spin_cnt--) { + reg = arm_smmu_readl(smmu, page, status); + if (!(reg & ARM_SMMU_sTLBGSTATUS_GSACTIVE)) + return; + cpu_relax(); + } + udelay(delay); + } + + dev_err_ratelimited(smmu->dev, + "TLB sync timed out -- SMMU may be deadlocked\n"); + + cfg = qsmmu->cfg; + if (!cfg) + return; + + ret = qcom_scm_io_readl(smmu->ioaddr + cfg->reg_offset[QCOM_SMMU_TBU_PWR_STATUS], + &tbu_pwr_status); + if (ret) + dev_err_ratelimited(smmu->dev, + "Failed to read TBU power status: %d\n", ret); + + ret = qcom_scm_io_readl(smmu->ioaddr + cfg->reg_offset[QCOM_SMMU_STATS_SYNC_INV_TBU_ACK], + &sync_inv_ack); + if (ret) + dev_err_ratelimited(smmu->dev, + "Failed to read TBU sync/inv ack status: %d\n", ret); + + ret = qcom_scm_io_readl(smmu->ioaddr + cfg->reg_offset[QCOM_SMMU_MMU2QSS_AND_SAFE_WAIT_CNTR], + &sync_inv_progress); + if (ret) + dev_err_ratelimited(smmu->dev, + "Failed to read TCU syn/inv progress: %d\n", ret); + + dev_err_ratelimited(smmu->dev, + "TBU: power_status %#x sync_inv_ack %#x sync_inv_progress %#x\n", + tbu_pwr_status, sync_inv_ack, sync_inv_progress); +} + static void qcom_adreno_smmu_write_sctlr(struct arm_smmu_device *smmu, int idx, u32 reg) { @@ -374,6 +438,7 @@ static const struct arm_smmu_impl qcom_smmu_impl = { .def_domain_type = qcom_smmu_def_domain_type, .reset = qcom_smmu500_reset, .write_s2cr = qcom_smmu_write_s2cr, + .tlb_sync = qcom_smmu_tlb_sync, }; static const struct arm_smmu_impl qcom_adreno_smmu_impl = { @@ -382,12 +447,84 @@ static const struct arm_smmu_impl qcom_adreno_smmu_impl = { .reset = qcom_smmu500_reset, .alloc_context_bank = qcom_adreno_smmu_alloc_context_bank, .write_sctlr = qcom_adreno_smmu_write_sctlr, + .tlb_sync = qcom_smmu_tlb_sync, +}; + +/* Implementation Defined Register Space 0 register offsets */ +static const 
u32 qcom_smmu_impl0_reg_offset[]
Re: [PATCH] iommu/arm-smmu-qcom: Add debug support for TLB sync timeouts
On 5/23/2022 10:48 PM, Sai Prakash Ranjan wrote: TLB sync timeouts can be due to various reasons such as TBU power down or pending TCU/TBU invalidation/sync and so on. Debugging these often require dumping of some implementation defined registers to know the status of TBU/TCU operations and some of these registers are not accessible in non-secure world such as from kernel and requires SMC calls to read them in the secure world. So, add this debug support to dump implementation defined registers for TLB sync timeout issues. Signed-off-by: Sai Prakash Ranjan --- drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 50 ++ drivers/iommu/arm/arm-smmu/arm-smmu.c | 2 + drivers/iommu/arm/arm-smmu/arm-smmu.h | 4 ++ 3 files changed, 56 insertions(+) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c index 7820711c4560..22e9a0085475 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c @@ -5,11 +5,19 @@ #include #include +#include #include #include #include "arm-smmu.h" +#define QCOM_DUMMY_VAL -1 + +/* Implementation Defined Register Space 0 registers */ +#define QCOM_SMMU_STATS_SYNC_INV_TBU_ACK 0x5dc +#define QCOM_SMMU_TBU_PWR_STATUS 0x2204 +#define QCOM_SMMU_MMU2QSS_AND_SAFE_WAIT_CNTR 0x2670 + struct qcom_smmu { struct arm_smmu_device smmu; bool bypass_quirk; @@ -22,6 +30,46 @@ static struct qcom_smmu *to_qcom_smmu(struct arm_smmu_device *smmu) return container_of(smmu, struct qcom_smmu, smmu); } +static void qcom_smmu_tlb_sync(struct arm_smmu_device *smmu, int page, + int sync, int status) +{ + u32 sync_inv_ack, sync_inv_progress, tbu_pwr_status; + unsigned int spin_cnt, delay; + u32 reg; + int ret; + + arm_smmu_writel(smmu, page, sync, QCOM_DUMMY_VAL); + for (delay = 1; delay < TLB_LOOP_TIMEOUT; delay *= 2) { + for (spin_cnt = TLB_SPIN_COUNT; spin_cnt > 0; spin_cnt--) { + reg = arm_smmu_readl(smmu, page, status); + if (!(reg & ARM_SMMU_sTLBGSTATUS_GSACTIVE)) + return; + 
cpu_relax(); + } + udelay(delay); + } + + sync_inv_ack = arm_smmu_readl(smmu, ARM_SMMU_IMPL_DEF0, + QCOM_SMMU_STATS_SYNC_INV_TBU_ACK); Sorry, this doesn't work always, looks like on earlier chipsets this is a secure register and reading it from non-secure world would probably blow. Also this register can be in other implementation defined space for different chipsets. So I think we can use SCM call here and have a device specific data based on already existing compatible for QCOM SoCs to identify IMP_DEF space used. + ret = qcom_scm_io_readl(smmu->ioaddr + QCOM_SMMU_TBU_PWR_STATUS, + &tbu_pwr_status); + if (ret) + dev_err_ratelimited(smmu->dev, + "Failed to read TBU power status: %d\n", ret); + + ret = qcom_scm_io_readl(smmu->ioaddr + QCOM_SMMU_MMU2QSS_AND_SAFE_WAIT_CNTR, + &sync_inv_progress); + if (ret) + dev_err_ratelimited(smmu->dev, + "Failed to read SAFE WAIT counter: %d\n", ret); + + dev_err_ratelimited(smmu->dev, + "TLB sync timed out -- SMMU may be deadlocked\n" + "TBU: sync_inv_ack %#x power_status %#x sync_inv_progress %#x\n", + sync_inv_ack, tbu_pwr_status, sync_inv_progress); +} + static void qcom_adreno_smmu_write_sctlr(struct arm_smmu_device *smmu, int idx, u32 reg) { @@ -374,6 +422,7 @@ static const struct arm_smmu_impl qcom_smmu_impl = { .def_domain_type = qcom_smmu_def_domain_type, .reset = qcom_smmu500_reset, .write_s2cr = qcom_smmu_write_s2cr, + .tlb_sync = qcom_smmu_tlb_sync, }; static const struct arm_smmu_impl qcom_adreno_smmu_impl = { @@ -382,6 +431,7 @@ static const struct arm_smmu_impl qcom_adreno_smmu_impl = { .reset = qcom_smmu500_reset, .alloc_context_bank = qcom_adreno_smmu_alloc_context_bank, .write_sctlr = qcom_adreno_smmu_write_sctlr, + .tlb_sync = qcom_smmu_tlb_sync, }; static struct arm_smmu_device *qcom_smmu_create(struct arm_smmu_device *smmu, diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index 2ed3594f384e..4c5b51109835 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ 
b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -2099,6 +2099,8 @@ static int arm_smmu_device_probe(struct platform_device *pdev) if (IS_ERR(smmu->base))
[PATCH] iommu/arm-smmu-qcom: Add debug support for TLB sync timeouts
TLB sync timeouts can be due to various reasons such as TBU power down or pending TCU/TBU invalidation/sync and so on. Debugging these often require dumping of some implementation defined registers to know the status of TBU/TCU operations and some of these registers are not accessible in non-secure world such as from kernel and requires SMC calls to read them in the secure world. So, add this debug support to dump implementation defined registers for TLB sync timeout issues. Signed-off-by: Sai Prakash Ranjan --- drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 50 ++ drivers/iommu/arm/arm-smmu/arm-smmu.c | 2 + drivers/iommu/arm/arm-smmu/arm-smmu.h | 4 ++ 3 files changed, 56 insertions(+) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c index 7820711c4560..22e9a0085475 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c @@ -5,11 +5,19 @@ #include #include +#include #include #include #include "arm-smmu.h" +#define QCOM_DUMMY_VAL -1 + +/* Implementation Defined Register Space 0 registers */ +#define QCOM_SMMU_STATS_SYNC_INV_TBU_ACK 0x5dc +#define QCOM_SMMU_TBU_PWR_STATUS 0x2204 +#define QCOM_SMMU_MMU2QSS_AND_SAFE_WAIT_CNTR 0x2670 + struct qcom_smmu { struct arm_smmu_device smmu; bool bypass_quirk; @@ -22,6 +30,46 @@ static struct qcom_smmu *to_qcom_smmu(struct arm_smmu_device *smmu) return container_of(smmu, struct qcom_smmu, smmu); } +static void qcom_smmu_tlb_sync(struct arm_smmu_device *smmu, int page, + int sync, int status) +{ + u32 sync_inv_ack, sync_inv_progress, tbu_pwr_status; + unsigned int spin_cnt, delay; + u32 reg; + int ret; + + arm_smmu_writel(smmu, page, sync, QCOM_DUMMY_VAL); + for (delay = 1; delay < TLB_LOOP_TIMEOUT; delay *= 2) { + for (spin_cnt = TLB_SPIN_COUNT; spin_cnt > 0; spin_cnt--) { + reg = arm_smmu_readl(smmu, page, status); + if (!(reg & ARM_SMMU_sTLBGSTATUS_GSACTIVE)) + return; + cpu_relax(); + } + udelay(delay); + } + + sync_inv_ack = 
arm_smmu_readl(smmu, ARM_SMMU_IMPL_DEF0, + QCOM_SMMU_STATS_SYNC_INV_TBU_ACK); + + ret = qcom_scm_io_readl(smmu->ioaddr + QCOM_SMMU_TBU_PWR_STATUS, + &tbu_pwr_status); + if (ret) + dev_err_ratelimited(smmu->dev, + "Failed to read TBU power status: %d\n", ret); + + ret = qcom_scm_io_readl(smmu->ioaddr + QCOM_SMMU_MMU2QSS_AND_SAFE_WAIT_CNTR, + &sync_inv_progress); + if (ret) + dev_err_ratelimited(smmu->dev, + "Failed to read SAFE WAIT counter: %d\n", ret); + + dev_err_ratelimited(smmu->dev, + "TLB sync timed out -- SMMU may be deadlocked\n" + "TBU: sync_inv_ack %#x power_status %#x sync_inv_progress %#x\n", + sync_inv_ack, tbu_pwr_status, sync_inv_progress); +} + static void qcom_adreno_smmu_write_sctlr(struct arm_smmu_device *smmu, int idx, u32 reg) { @@ -374,6 +422,7 @@ static const struct arm_smmu_impl qcom_smmu_impl = { .def_domain_type = qcom_smmu_def_domain_type, .reset = qcom_smmu500_reset, .write_s2cr = qcom_smmu_write_s2cr, + .tlb_sync = qcom_smmu_tlb_sync, }; static const struct arm_smmu_impl qcom_adreno_smmu_impl = { @@ -382,6 +431,7 @@ static const struct arm_smmu_impl qcom_adreno_smmu_impl = { .reset = qcom_smmu500_reset, .alloc_context_bank = qcom_adreno_smmu_alloc_context_bank, .write_sctlr = qcom_adreno_smmu_write_sctlr, + .tlb_sync = qcom_smmu_tlb_sync, }; static struct arm_smmu_device *qcom_smmu_create(struct arm_smmu_device *smmu, diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index 2ed3594f384e..4c5b51109835 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -2099,6 +2099,8 @@ static int arm_smmu_device_probe(struct platform_device *pdev) if (IS_ERR(smmu->base)) return PTR_ERR(smmu->base); ioaddr = res->start; + smmu->ioaddr = ioaddr; + /* * The resource size should effectively match the value of SMMU_TOP; * stash that temporarily until we know PAGESIZE to validate it with. 
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.h b/drivers/iommu/arm/arm-smmu/arm-smmu.h index 2b9b42fb6f30..8cf6567d970f 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.h +++ b/drivers/iommu/arm/arm-smmu/arm-s
Re: [PATCHv4] iommu/arm-smmu: Optimize ->tlb_flush_walk() for qcom implementation
On 2021-08-11 21:23, Robin Murphy wrote: On 2021-08-11 11:30, Will Deacon wrote: On Wed, Aug 11, 2021 at 11:37:25AM +0530, Sai Prakash Ranjan wrote: diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index f7da8953afbe..3904b598e0f9 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -327,9 +327,16 @@ static void arm_smmu_tlb_inv_range_s2(unsigned long iova, size_t size, static void arm_smmu_tlb_inv_walk_s1(unsigned long iova, size_t size, size_t granule, void *cookie) { - arm_smmu_tlb_inv_range_s1(iova, size, granule, cookie, - ARM_SMMU_CB_S1_TLBIVA); - arm_smmu_tlb_sync_context(cookie); + struct arm_smmu_domain *smmu_domain = cookie; + struct arm_smmu_cfg *cfg = &smmu_domain->cfg; + + if (cfg->flush_walk_prefer_tlbiasid) { + arm_smmu_tlb_inv_context_s1(cookie); Hmm, this introduces an unconditional wmb() if tlbiasid is preferred. I think that should be predicated on ARM_SMMU_FEAT_COHERENT_WALK like it is for the by-VA ops. Worth doing as a separate patch. + } else { + arm_smmu_tlb_inv_range_s1(iova, size, granule, cookie, + ARM_SMMU_CB_S1_TLBIVA); + arm_smmu_tlb_sync_context(cookie); + } } static void arm_smmu_tlb_add_page_s1(struct iommu_iotlb_gather *gather, @@ -765,8 +772,10 @@ static int arm_smmu_init_domain_context(struct iommu_domain *domain, .iommu_dev = smmu->dev, }; - if (!iommu_get_dma_strict(domain)) + if (!iommu_get_dma_strict(domain)) { pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_NON_STRICT; + cfg->flush_walk_prefer_tlbiasid = true; This is going to interact badly with Robin's series to allow dynamic transition to non-strict mode, as we don't have a mechanism to switch over to the by-ASID behaviour. Yes, it should _work_, but it's ugly having different TLBI behaviour just because of the how the domain became non-strict. Robin -- I think this originated from your idea at [1]. 
Any idea how to make it work with your other series, or shall we drop this part for now and leave the TLB invalidation behaviour the same for now? Yeah, I'd say drop it - I'm currently half an hour into a first attempt at removing io_pgtable_tlb_flush_walk() entirely, which would make it moot for non-strict anyway. I have dropped it and sent a v5. Thanks, Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCHv5] iommu/arm-smmu: Optimize ->tlb_flush_walk() for qcom implementation
Currently for iommu_unmap() of large scatter-gather list with page size elements, the majority of time is spent in flushing of partial walks in __arm_lpae_unmap() which is a VA based TLB invalidation invalidating page-by-page on iommus like arm-smmu-v2 (TLBIVA). For example: to unmap a 32MB scatter-gather list with page size elements (8192 entries), there are 16->2MB buffer unmaps based on the pgsize (2MB for 4K granule) and each of 2MB will further result in 512 TLBIVAs (2MB/4K) resulting in a total of 8192 TLBIVAs (512*16) for 16->2MB causing a huge overhead. On qcom implementation, there are several performance improvements for TLB cache invalidations in HW like wait-for-safe (for realtime clients such as camera and display) and few others to allow for cache lookups/updates when TLBI is in progress for the same context bank. So the cost of over-invalidation is less compared to the unmap latency on several usecases like camera which deals with large buffers. So, ASID based TLB invalidations (TLBIASID) can be used to invalidate the entire context for partial walk flush thereby improving the unmap latency. For this example of 32MB scatter-gather list unmap, this change results in just 16 ASID based TLB invalidations (TLBIASIDs) as opposed to 8192 TLBIVAs thereby increasing the performance of unmaps drastically. 
Test on QTI SM8150 SoC for 10 iterations of iommu_{map_sg}/unmap: (average over 10 iterations) Before this optimization: sizeiommu_map_sg iommu_unmap 4K2.067 us 1.854 us 64K9.598 us 8.802 us 1M 148.890 us 130.718 us 2M 305.864 us67.291 us 12M 1793.604 us 390.838 us 16M 2386.848 us 518.187 us 24M 3563.296 us 775.989 us 32M 4747.171 us 1033.364 us After this optimization: sizeiommu_map_sg iommu_unmap 4K1.723 us 1.765 us 64K9.880 us 8.869 us 1M 155.364 us 135.223 us 2M 303.906 us 5.385 us 12M 1786.557 us21.250 us 16M 2391.890 us27.437 us 24M 3570.895 us39.937 us 32M 4755.234 us51.797 us Real world data also shows big difference in unmap performance as below: There were reports of camera frame drops because of high overhead in iommu unmap without this optimization because of frequent unmaps issued by camera of about 100MB/s taking more than 100ms thereby causing frame drops. Signed-off-by: Sai Prakash Ranjan --- Changes in v5: * Drop non-strict mode change as it will conflict with Robin's series Changes in v4: * Use a flag in struct arm_smmu_cfg to prefer TLBIASID (Will) Changes in v3: * Move the logic to arm-smmu driver from io-pgtable (Robin) * Use a new set of iommu_flush_ops->arm_smmu_s1_tlb_impl_ops and use it for qcom impl Changes in v2: * Add a quirk to choose tlb_flush_all in partial walk flush * Set the quirk for QTI SoC implementation --- drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 11 +++ drivers/iommu/arm/arm-smmu/arm-smmu.c | 13 ++--- drivers/iommu/arm/arm-smmu/arm-smmu.h | 1 + 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c index 9b9d13ec5a88..55690af1b25d 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c @@ -193,6 +193,8 @@ static int qcom_adreno_smmu_init_context(struct arm_smmu_domain *smmu_domain, { struct adreno_smmu_priv *priv; + smmu_domain->cfg.flush_walk_prefer_tlbiasid = true; + /* Only 
enable split pagetables for the GPU device (SID 0) */ if (!qcom_adreno_smmu_is_gpu_device(dev)) return 0; @@ -235,6 +237,14 @@ static const struct of_device_id qcom_smmu_client_of_match[] __maybe_unused = { { } }; +static int qcom_smmu_init_context(struct arm_smmu_domain *smmu_domain, + struct io_pgtable_cfg *pgtbl_cfg, struct device *dev) +{ + smmu_domain->cfg.flush_walk_prefer_tlbiasid = true; + + return 0; +} + static int qcom_smmu_cfg_probe(struct arm_smmu_device *smmu) { unsigned int last_s2cr = ARM_SMMU_GR0_S2CR(smmu->num_mapping_groups - 1); @@ -358,6 +368,7 @@ static int qcom_smmu500_reset(struct arm_smmu_device *smmu) } static const struct arm_smmu_impl qcom_smmu_impl = { + .init_context = qcom_smmu_init_context, .cfg_probe = qcom_smmu_cfg_probe, .def_domain_type = qcom_smmu_def_domain_type, .reset = qcom_smmu500_reset, diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index f7da8953afbe..67b660b0551d 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -327,9
Re: [PATCHv4] iommu/arm-smmu: Optimize ->tlb_flush_walk() for qcom implementation
On 2021-08-11 16:00, Will Deacon wrote: On Wed, Aug 11, 2021 at 11:37:25AM +0530, Sai Prakash Ranjan wrote: diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index f7da8953afbe..3904b598e0f9 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -327,9 +327,16 @@ static void arm_smmu_tlb_inv_range_s2(unsigned long iova, size_t size, static void arm_smmu_tlb_inv_walk_s1(unsigned long iova, size_t size, size_t granule, void *cookie) { - arm_smmu_tlb_inv_range_s1(iova, size, granule, cookie, - ARM_SMMU_CB_S1_TLBIVA); - arm_smmu_tlb_sync_context(cookie); + struct arm_smmu_domain *smmu_domain = cookie; + struct arm_smmu_cfg *cfg = &smmu_domain->cfg; + + if (cfg->flush_walk_prefer_tlbiasid) { + arm_smmu_tlb_inv_context_s1(cookie); Hmm, this introduces an unconditional wmb() if tlbiasid is preferred. I think that should be predicated on ARM_SMMU_FEAT_COHERENT_WALK like it is for the by-VA ops. Worth doing as a separate patch. Ok I will keep this as-is for now then. + } else { + arm_smmu_tlb_inv_range_s1(iova, size, granule, cookie, + ARM_SMMU_CB_S1_TLBIVA); + arm_smmu_tlb_sync_context(cookie); + } } static void arm_smmu_tlb_add_page_s1(struct iommu_iotlb_gather *gather, @@ -765,8 +772,10 @@ static int arm_smmu_init_domain_context(struct iommu_domain *domain, .iommu_dev = smmu->dev, }; - if (!iommu_get_dma_strict(domain)) + if (!iommu_get_dma_strict(domain)) { pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_NON_STRICT; + cfg->flush_walk_prefer_tlbiasid = true; This is going to interact badly with Robin's series to allow dynamic transition to non-strict mode, as we don't have a mechanism to switch over to the by-ASID behaviour. Yes, it should _work_, but it's ugly having different TLBI behaviour just because of the how the domain became non-strict. Robin -- I think this originated from your idea at [1]. 
Any idea how to make it work with your other series, or shall we drop this part for now and leave the TLB invalidation behaviour the same for now? Will [1] https://lore.kernel.org/r/da62ff1c-9b49-34d3-69a1-1a674e4a3...@arm.com Right, I think we can drop this non-strict change for now because it also makes it a pain to backport it to 5.4/5.10 kernels because of large number of changes in dma apis in recent kernels. I will let you and Robin decide if it's ok to drop this change and introduce it later with a different patch. Thanks, Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCHv3] iommu/arm-smmu: Optimize ->tlb_flush_walk() for qcom implementation
On 2021-08-10 23:38, Will Deacon wrote: On Tue, Aug 03, 2021 at 11:09:17AM +0530, Sai Prakash Ranjan wrote: On 2021-08-02 21:13, Will Deacon wrote: > On Wed, Jun 23, 2021 at 07:12:01PM +0530, Sai Prakash Ranjan wrote: > > diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c > > b/drivers/iommu/arm/arm-smmu/arm-smmu.c > > index d3c6f54110a5..f3845e822565 100644 > > --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c > > +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c > > @@ -341,6 +341,12 @@ static void arm_smmu_tlb_add_page_s1(struct > > iommu_iotlb_gather *gather, > > ARM_SMMU_CB_S1_TLBIVAL); > > } > > > > +static void arm_smmu_tlb_inv_walk_impl_s1(unsigned long iova, > > size_t size, > > + size_t granule, void *cookie) > > +{ > > + arm_smmu_tlb_inv_context_s1(cookie); > > +} > > + > > static void arm_smmu_tlb_inv_walk_s2(unsigned long iova, size_t size, > >size_t granule, void *cookie) > > { > > @@ -388,6 +394,12 @@ static const struct iommu_flush_ops > > arm_smmu_s1_tlb_ops = { > > .tlb_add_page = arm_smmu_tlb_add_page_s1, > > }; > > > > +const struct iommu_flush_ops arm_smmu_s1_tlb_impl_ops = { > > + .tlb_flush_all = arm_smmu_tlb_inv_context_s1, > > + .tlb_flush_walk = arm_smmu_tlb_inv_walk_impl_s1, > > + .tlb_add_page = arm_smmu_tlb_add_page_s1, > > +}; > > Hmm, dunno about this. Wouldn't it be a lot cleaner if the > tlb_flush_walk > callbacks just did the right thing based on the smmu_domain (maybe in > the > arm_smmu_cfg?) rather than having an entirely new set of ops just > because > they're const and you can't overide the bit you want? > > I don't think there's really an awful lot qcom-specific about the > principle > here -- there's a trade-off between over-invalidation and invalidation > latency. That happens on the CPU as well. > Sorry didn't understand, based on smmu_domain what? How do we make this implementation specific? Do you mean something like a quirk? 
The reason we didn't make this common was because nvidia folks weren't so happy with that, you can find the discussion in this thread [1]. [1] https://lore.kernel.org/lkml/20210609145315.25750-1-saiprakash.ran...@codeaurora.org/ The ->tlb_flush_walk() callbacks take a 'void *cookie' which, for this driver, is a 'struct arm_smmu_domain *'. From that, you can get to the 'struct arm_smmu_cfg' which could have something as coarse as: boolflush_walk_prefer_tlbiasid; which you can set when you initialise the domain (maybe in the ->init_context callback?). It shouldn't affect anybody else. Ah ok, you meant a new flag in arm_smmu_cfg, right getting it from cookie is no big deal but nonetheless thanks for detailing it. I have made the changes and sent a v4 after testing. Thanks, Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCHv4] iommu/arm-smmu: Optimize ->tlb_flush_walk() for qcom implementation
Currently for iommu_unmap() of large scatter-gather list with page size elements, the majority of time is spent in flushing of partial walks in __arm_lpae_unmap() which is a VA based TLB invalidation invalidating page-by-page on iommus like arm-smmu-v2 (TLBIVA). For example: to unmap a 32MB scatter-gather list with page size elements (8192 entries), there are 16 2MB buffer unmaps based on the pgsize (2MB for 4K granule) and each of 2MB will further result in 512 TLBIVAs (2MB/4K) resulting in a total of 8192 TLBIVAs (512*16) for 16 2MB causing a huge overhead. On qcom implementation, there are several performance improvements for TLB cache invalidations in HW like wait-for-safe (for realtime clients such as camera and display) and a few others to allow for cache lookups/updates when TLBI is in progress for the same context bank. So the cost of over-invalidation is less compared to the unmap latency on several usecases like camera which deals with large buffers. So, ASID based TLB invalidations (TLBIASID) can be used to invalidate the entire context for partial walk flush thereby improving the unmap latency. Non-strict mode can use this by default for all platforms given it's all about over-invalidation saving time on individual unmaps and non-deterministic generally. For this example of 32MB scatter-gather list unmap, this change results in just 16 ASID based TLB invalidations (TLBIASIDs) as opposed to 8192 TLBIVAs thereby increasing the performance of unmaps drastically. 
Test on QTI SM8150 SoC for 10 iterations of iommu_{map_sg}/unmap: (average over 10 iterations) Before this optimization: sizeiommu_map_sg iommu_unmap 4K2.067 us 1.854 us 64K9.598 us 8.802 us 1M 148.890 us 130.718 us 2M 305.864 us67.291 us 12M 1793.604 us 390.838 us 16M 2386.848 us 518.187 us 24M 3563.296 us 775.989 us 32M 4747.171 us 1033.364 us After this optimization: sizeiommu_map_sg iommu_unmap 4K1.723 us 1.765 us 64K9.880 us 8.869 us 1M 155.364 us 135.223 us 2M 303.906 us 5.385 us 12M 1786.557 us21.250 us 16M 2391.890 us27.437 us 24M 3570.895 us39.937 us 32M 4755.234 us51.797 us Real world data also shows big difference in unmap performance as below: There were reports of camera frame drops because of high overhead in iommu unmap without this optimization because of frequent unmaps issued by camera of about 100MB/s taking more than 100ms thereby causing frame drops. Signed-off-by: Sai Prakash Ranjan --- Changes in v4: * Use a flag in struct arm_smmu_cfg to prefer TLBIASID (Will) Changes in v3: * Move the logic to arm-smmu driver from io-pgtable (Robin) * Use a new set of iommu_flush_ops->arm_smmu_s1_tlb_impl_ops and use it for qcom impl Changes in v2: * Add a quirk to choose tlb_flush_all in partial walk flush * Set the quirk for QTI SoC implementation --- drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 11 +++ drivers/iommu/arm/arm-smmu/arm-smmu.c | 17 + drivers/iommu/arm/arm-smmu/arm-smmu.h | 1 + 3 files changed, 25 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c index 9b9d13ec5a88..55690af1b25d 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c @@ -193,6 +193,8 @@ static int qcom_adreno_smmu_init_context(struct arm_smmu_domain *smmu_domain, { struct adreno_smmu_priv *priv; + smmu_domain->cfg.flush_walk_prefer_tlbiasid = true; + /* Only enable split pagetables for the GPU device (SID 0) */ if 
(!qcom_adreno_smmu_is_gpu_device(dev)) return 0; @@ -235,6 +237,14 @@ static const struct of_device_id qcom_smmu_client_of_match[] __maybe_unused = { { } }; +static int qcom_smmu_init_context(struct arm_smmu_domain *smmu_domain, + struct io_pgtable_cfg *pgtbl_cfg, struct device *dev) +{ + smmu_domain->cfg.flush_walk_prefer_tlbiasid = true; + + return 0; +} + static int qcom_smmu_cfg_probe(struct arm_smmu_device *smmu) { unsigned int last_s2cr = ARM_SMMU_GR0_S2CR(smmu->num_mapping_groups - 1); @@ -358,6 +368,7 @@ static int qcom_smmu500_reset(struct arm_smmu_device *smmu) } static const struct arm_smmu_impl qcom_smmu_impl = { + .init_context = qcom_smmu_init_context, .cfg_probe = qcom_smmu_cfg_probe, .def_domain_type = qcom_smmu_def_domain_type, .reset = qcom_smmu500_reset, diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index f7da8953afbe..3904b598e0f9 100644 --- a/drivers/iommu/arm/a
Re: [Freedreno] [PATCH 0/3] iommu/drm/msm: Allow non-coherent masters to use system cache
On 2021-08-10 14:46, Will Deacon wrote: On Mon, Aug 09, 2021 at 11:17:40PM +0530, Sai Prakash Ranjan wrote: On 2021-08-09 23:10, Will Deacon wrote: > On Mon, Aug 09, 2021 at 10:18:21AM -0700, Rob Clark wrote: > > On Mon, Aug 9, 2021 at 10:05 AM Will Deacon wrote: > > > On Mon, Aug 09, 2021 at 09:57:08AM -0700, Rob Clark wrote: > > > > But I suppose we could call it instead IOMMU_QCOM_LLC or something > > > > like that to make it more clear that it is not necessarily something > > > > that would work with a different outer level cache implementation? > > > > > > ... or we could just deal with the problem so that other people can reuse > > > the code. I haven't really understood the reluctance to solve this properly. > > > > > > Am I missing some reason this isn't solvable? > > > > Oh, was there another way to solve it (other than foregoing setting > > INC_OCACHE in the pgtables)? Maybe I misunderstood, is there a > > corresponding setting on the MMU pgtables side of things? > > Right -- we just need to program the CPU's MMU with the matching memory > attributes! It's a bit more fiddly if you're just using ioremap_wc() > though, as it's usually the DMA API which handles the attributes under > the > hood. > > Anyway, sorry, I should've said that explicitly earlier on. We've done > this > sort of thing in the Android tree so I assumed Sai knew what needed to > be > done and then I didn't think to explain to you :( > Right I was aware of that but even in the android tree there is no user :) I'm assuming there are vendor modules using it there, otherwise we wouldn't have been asked to put it in. Since you work at Qualcomm, maybe you could talk to your colleagues (Isaac and Patrick) directly? Right I will check with them regarding the vendor modules in android. I think we can't have a new memory type without any user right in upstream like android tree? Correct. But I don't think we should be adding IOMMU_* anything upstream if we don't have a user. 
Agreed, once we have the fix for GPU crash I can continue further on using this properly. Thanks, Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH] iommu/arm-smmu: Add clk_bulk_{prepare/unprepare} to system pm callbacks
On 2021-08-03 11:36, Sai Prakash Ranjan wrote: On 2021-08-02 21:42, Will Deacon wrote: On Tue, Jul 27, 2021 at 03:03:22PM +0530, Sai Prakash Ranjan wrote: Some clocks for SMMU can have parent as XO such as gpu_cc_hub_cx_int_clk of GPU SMMU in QTI SC7280 SoC and in order to enter deep sleep states in such cases, we would need to drop the XO clock vote in unprepare call and this unprepare callback for XO is in RPMh (Resource Power Manager-Hardened) clock driver which controls RPMh managed clock resources for new QTI SoCs and is a blocking call. Given we cannot have a sleeping calls such as clk_bulk_prepare() and clk_bulk_unprepare() in arm-smmu runtime pm callbacks since the iommu operations like map and unmap can be in atomic context and are in fast path, add this prepare and unprepare call to drop the XO vote only for system pm callbacks since it is not a fast path and we expect the system to enter deep sleep states with system pm as opposed to runtime pm. This is a similar sequence of clock requests (prepare,enable and disable,unprepare) in arm-smmu probe and remove. Signed-off-by: Sai Prakash Ranjan Co-developed-by: Rajendra Nayak Signed-off-by: Rajendra Nayak --- drivers/iommu/arm/arm-smmu/arm-smmu.c | 20 ++-- 1 file changed, 18 insertions(+), 2 deletions(-) [+Rob] How does this work with that funny GPU which writes to the SMMU registers directly? Does the SMMU need to remain independently clocked for that to work or is it all in the same clock domain? As Rob mentioned, device link should take care of all the dependencies between SMMU and its consumers. But not sure how the question relates to this patch as this change is for system pm and not runtime pm, so it is exactly the sequence of SMMU probe/remove which if works currently for that GPU SMMU, then it should work just fine for system suspend and resume as well. 
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index d3c6f54110a5..9561ba4c5d39 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -2277,6 +2277,13 @@ static int __maybe_unused arm_smmu_runtime_suspend(struct device *dev) static int __maybe_unused arm_smmu_pm_resume(struct device *dev) { + int ret; + struct arm_smmu_device *smmu = dev_get_drvdata(dev); + + ret = clk_bulk_prepare(smmu->num_clks, smmu->clks); + if (ret) + return ret; + if (pm_runtime_suspended(dev)) return 0; If we subsequently fail to enable the clks in arm_smmu_runtime_resume() should we unprepare them again? If we are unable to turn on the clks then its fatal and we will not live for long. Nonetheless, it won't hurt to unprepare if clk enable fails as that is the correct thing anyway, so I have added it and sent a v2. Thanks, Sai Will @@ -2285,10 +2292,19 @@ static int __maybe_unused arm_smmu_pm_resume(struct device *dev) static int __maybe_unused arm_smmu_pm_suspend(struct device *dev) { + int ret = 0; + struct arm_smmu_device *smmu = dev_get_drvdata(dev); + if (pm_runtime_suspended(dev)) - return 0; + goto clk_unprepare; - return arm_smmu_runtime_suspend(dev); + ret = arm_smmu_runtime_suspend(dev); + if (ret) + return ret; + +clk_unprepare: + clk_bulk_unprepare(smmu->num_clks, smmu->clks); + return ret; } static const struct dev_pm_ops arm_smmu_pm_ops = { -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCHv2] iommu/arm-smmu: Add clk_bulk_{prepare/unprepare} to system pm callbacks
Some clocks for SMMU can have parent as XO such as gpu_cc_hub_cx_int_clk of GPU SMMU in QTI SC7280 SoC and in order to enter deep sleep states in such cases, we would need to drop the XO clock vote in unprepare call and this unprepare callback for XO is in RPMh (Resource Power Manager-Hardened) clock driver which controls RPMh managed clock resources for new QTI SoCs. Given we cannot have a sleeping calls such as clk_bulk_prepare() and clk_bulk_unprepare() in arm-smmu runtime pm callbacks since the iommu operations like map and unmap can be in atomic context and are in fast path, add this prepare and unprepare call to drop the XO vote only for system pm callbacks since it is not a fast path and we expect the system to enter deep sleep states with system pm as opposed to runtime pm. This is a similar sequence of clock requests (prepare,enable and disable,unprepare) in arm-smmu probe and remove. Signed-off-by: Sai Prakash Ranjan Co-developed-by: Rajendra Nayak Signed-off-by: Rajendra Nayak --- Changes in v2: * Add clk unprepare when clk enable fails in resume (Will) --- drivers/iommu/arm/arm-smmu/arm-smmu.c | 26 +++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index d3c6f54110a5..da8ef9d82d79 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -2277,18 +2277,38 @@ static int __maybe_unused arm_smmu_runtime_suspend(struct device *dev) static int __maybe_unused arm_smmu_pm_resume(struct device *dev) { + int ret; + struct arm_smmu_device *smmu = dev_get_drvdata(dev); + + ret = clk_bulk_prepare(smmu->num_clks, smmu->clks); + if (ret) + return ret; + if (pm_runtime_suspended(dev)) return 0; - return arm_smmu_runtime_resume(dev); + ret = arm_smmu_runtime_resume(dev); + if (ret) + clk_bulk_unprepare(smmu->num_clks, smmu->clks); + + return ret; } static int __maybe_unused arm_smmu_pm_suspend(struct device *dev) { + int ret = 0; + 
struct arm_smmu_device *smmu = dev_get_drvdata(dev); + if (pm_runtime_suspended(dev)) - return 0; + goto clk_unprepare; - return arm_smmu_runtime_suspend(dev); + ret = arm_smmu_runtime_suspend(dev); + if (ret) + return ret; + +clk_unprepare: + clk_bulk_unprepare(smmu->num_clks, smmu->clks); + return ret; } static const struct dev_pm_ops arm_smmu_pm_ops = { -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [Freedreno] [PATCH 0/3] iommu/drm/msm: Allow non-coherent masters to use system cache
On 2021-08-10 00:00, Rob Clark wrote: On Mon, Aug 9, 2021 at 11:11 AM Sai Prakash Ranjan wrote: On 2021-08-09 23:37, Rob Clark wrote: > On Mon, Aug 9, 2021 at 10:47 AM Sai Prakash Ranjan > wrote: >> >> On 2021-08-09 23:10, Will Deacon wrote: >> > On Mon, Aug 09, 2021 at 10:18:21AM -0700, Rob Clark wrote: >> >> On Mon, Aug 9, 2021 at 10:05 AM Will Deacon wrote: >> >> > >> >> > On Mon, Aug 09, 2021 at 09:57:08AM -0700, Rob Clark wrote: >> >> > > On Mon, Aug 9, 2021 at 7:56 AM Will Deacon wrote: >> >> > > > On Mon, Aug 02, 2021 at 06:36:04PM -0700, Rob Clark wrote: >> >> > > > > On Mon, Aug 2, 2021 at 8:14 AM Will Deacon wrote: >> >> > > > > > On Mon, Aug 02, 2021 at 08:08:07AM -0700, Rob Clark wrote: >> >> > > > > > > On Mon, Aug 2, 2021 at 3:55 AM Will Deacon wrote: >> >> > > > > > > > On Thu, Jul 29, 2021 at 10:08:22AM +0530, Sai Prakash Ranjan wrote: >> >> > > > > > > > > On 2021-07-28 19:30, Georgi Djakov wrote: >> >> > > > > > > > > > On Mon, Jan 11, 2021 at 07:45:02PM +0530, Sai Prakash Ranjan wrote: >> >> > > > > > > > > > > commit ecd7274fb4cd ("iommu: Remove unused IOMMU_SYS_CACHE_ONLY flag") >> >> > > > > > > > > > > removed unused IOMMU_SYS_CACHE_ONLY prot flag and along with it went >> >> > > > > > > > > > > the memory type setting required for the non-coherent masters to use >> >> > > > > > > > > > > system cache. Now that system cache support for GPU is added, we will >> >> > > > > > > > > > > need to set the right PTE attribute for GPU buffers to be sys cached. >> >> > > > > > > > > > > Without this, the system cache lines are not allocated for GPU. >> >> > > > > > > > > > > >> >> > > > > > > > > > > So the patches in this series introduces a new prot flag IOMMU_LLC, >> >> > > > > > > > > > > renames IO_PGTABLE_QUIRK_ARM_OUTER_WBWA to IO_PGTABLE_QUIRK_PTW_LLC >> >> > > > > > > > > > > and makes GPU the user of this protection flag. >> >> > > > > > > > > > >> >> > > > > > > > > > Thank you for the patchset! 
Are you planning to refresh it, as it does >> >> > > > > > > > > > not apply anymore? >> >> > > > > > > > > > >> >> > > > > > > > > >> >> > > > > > > > > I was waiting on Will's reply [1]. If there are no changes needed, then >> >> > > > > > > > > I can repost the patch. >> >> > > > > > > > >> >> > > > > > > > I still think you need to handle the mismatched alias, no? You're adding >> >> > > > > > > > a new memory type to the SMMU which doesn't exist on the CPU side. That >> >> > > > > > > > can't be right. >> >> > > > > > > > >> >> > > > > > > >> >> > > > > > > Just curious, and maybe this is a dumb question, but what is your >> >> > > > > > > concern about mismatched aliases? I mean the cache hierarchy on the >> >> > > > > > > GPU device side (anything beyond the LLC) is pretty different and >> >> > > > > > > doesn't really care about the smmu pgtable attributes.. >> >> > > > > > >> >> > > > > > If the CPU accesses a shared buffer with different attributes to those which >> >> > > > > > the device is using then you fall into the "mismatched memory attributes" >> >> > > > > > part of the Arm architecture. It's reasonably unforgiving (you should go and >> >> > > > > > read it) and in some cases can apply to speculative accesses as well, but >> >> > > > > > the end result is typically loss of coherency. >> >> > > > > >> >> > > > > Ok, I might have a few other sections to read first to decipher the >> >> > >
Re: [Freedreno] [PATCH 0/3] iommu/drm/msm: Allow non-coherent masters to use system cache
On 2021-08-09 23:37, Rob Clark wrote: On Mon, Aug 9, 2021 at 10:47 AM Sai Prakash Ranjan wrote: On 2021-08-09 23:10, Will Deacon wrote: > On Mon, Aug 09, 2021 at 10:18:21AM -0700, Rob Clark wrote: >> On Mon, Aug 9, 2021 at 10:05 AM Will Deacon wrote: >> > >> > On Mon, Aug 09, 2021 at 09:57:08AM -0700, Rob Clark wrote: >> > > On Mon, Aug 9, 2021 at 7:56 AM Will Deacon wrote: >> > > > On Mon, Aug 02, 2021 at 06:36:04PM -0700, Rob Clark wrote: >> > > > > On Mon, Aug 2, 2021 at 8:14 AM Will Deacon wrote: >> > > > > > On Mon, Aug 02, 2021 at 08:08:07AM -0700, Rob Clark wrote: >> > > > > > > On Mon, Aug 2, 2021 at 3:55 AM Will Deacon wrote: >> > > > > > > > On Thu, Jul 29, 2021 at 10:08:22AM +0530, Sai Prakash Ranjan wrote: >> > > > > > > > > On 2021-07-28 19:30, Georgi Djakov wrote: >> > > > > > > > > > On Mon, Jan 11, 2021 at 07:45:02PM +0530, Sai Prakash Ranjan wrote: >> > > > > > > > > > > commit ecd7274fb4cd ("iommu: Remove unused IOMMU_SYS_CACHE_ONLY flag") >> > > > > > > > > > > removed unused IOMMU_SYS_CACHE_ONLY prot flag and along with it went >> > > > > > > > > > > the memory type setting required for the non-coherent masters to use >> > > > > > > > > > > system cache. Now that system cache support for GPU is added, we will >> > > > > > > > > > > need to set the right PTE attribute for GPU buffers to be sys cached. >> > > > > > > > > > > Without this, the system cache lines are not allocated for GPU. >> > > > > > > > > > > >> > > > > > > > > > > So the patches in this series introduces a new prot flag IOMMU_LLC, >> > > > > > > > > > > renames IO_PGTABLE_QUIRK_ARM_OUTER_WBWA to IO_PGTABLE_QUIRK_PTW_LLC >> > > > > > > > > > > and makes GPU the user of this protection flag. >> > > > > > > > > > >> > > > > > > > > > Thank you for the patchset! Are you planning to refresh it, as it does >> > > > > > > > > > not apply anymore? >> > > > > > > > > > >> > > > > > > > > >> > > > > > > > > I was waiting on Will's reply [1]. 
If there are no changes needed, then >> > > > > > > > > I can repost the patch. >> > > > > > > > >> > > > > > > > I still think you need to handle the mismatched alias, no? You're adding >> > > > > > > > a new memory type to the SMMU which doesn't exist on the CPU side. That >> > > > > > > > can't be right. >> > > > > > > > >> > > > > > > >> > > > > > > Just curious, and maybe this is a dumb question, but what is your >> > > > > > > concern about mismatched aliases? I mean the cache hierarchy on the >> > > > > > > GPU device side (anything beyond the LLC) is pretty different and >> > > > > > > doesn't really care about the smmu pgtable attributes.. >> > > > > > >> > > > > > If the CPU accesses a shared buffer with different attributes to those which >> > > > > > the device is using then you fall into the "mismatched memory attributes" >> > > > > > part of the Arm architecture. It's reasonably unforgiving (you should go and >> > > > > > read it) and in some cases can apply to speculative accesses as well, but >> > > > > > the end result is typically loss of coherency. >> > > > > >> > > > > Ok, I might have a few other sections to read first to decipher the >> > > > > terminology.. >> > > > > >> > > > > But my understanding of LLC is that it looks just like system memory >> > > > > to the CPU and GPU (I think that would make it "the point of >> > > > > coherence" between the GPU and CPU?) If that is true, shouldn't it be >> > > > > invisible from the point of view of different CPU mapping options? >> > > > >> > > > You could certainly build a system where mismatched attributes do
Re: [Freedreno] [PATCH 0/3] iommu/drm/msm: Allow non-coherent masters to use system cache
On 2021-08-09 23:10, Will Deacon wrote: On Mon, Aug 09, 2021 at 10:18:21AM -0700, Rob Clark wrote: On Mon, Aug 9, 2021 at 10:05 AM Will Deacon wrote: > > On Mon, Aug 09, 2021 at 09:57:08AM -0700, Rob Clark wrote: > > On Mon, Aug 9, 2021 at 7:56 AM Will Deacon wrote: > > > On Mon, Aug 02, 2021 at 06:36:04PM -0700, Rob Clark wrote: > > > > On Mon, Aug 2, 2021 at 8:14 AM Will Deacon wrote: > > > > > On Mon, Aug 02, 2021 at 08:08:07AM -0700, Rob Clark wrote: > > > > > > On Mon, Aug 2, 2021 at 3:55 AM Will Deacon wrote: > > > > > > > On Thu, Jul 29, 2021 at 10:08:22AM +0530, Sai Prakash Ranjan wrote: > > > > > > > > On 2021-07-28 19:30, Georgi Djakov wrote: > > > > > > > > > On Mon, Jan 11, 2021 at 07:45:02PM +0530, Sai Prakash Ranjan wrote: > > > > > > > > > > commit ecd7274fb4cd ("iommu: Remove unused IOMMU_SYS_CACHE_ONLY flag") > > > > > > > > > > removed unused IOMMU_SYS_CACHE_ONLY prot flag and along with it went > > > > > > > > > > the memory type setting required for the non-coherent masters to use > > > > > > > > > > system cache. Now that system cache support for GPU is added, we will > > > > > > > > > > need to set the right PTE attribute for GPU buffers to be sys cached. > > > > > > > > > > Without this, the system cache lines are not allocated for GPU. > > > > > > > > > > > > > > > > > > > > So the patches in this series introduces a new prot flag IOMMU_LLC, > > > > > > > > > > renames IO_PGTABLE_QUIRK_ARM_OUTER_WBWA to IO_PGTABLE_QUIRK_PTW_LLC > > > > > > > > > > and makes GPU the user of this protection flag. > > > > > > > > > > > > > > > > > > Thank you for the patchset! Are you planning to refresh it, as it does > > > > > > > > > not apply anymore? > > > > > > > > > > > > > > > > > > > > > > > > > I was waiting on Will's reply [1]. If there are no changes needed, then > > > > > > > > I can repost the patch. > > > > > > > > > > > > > > I still think you need to handle the mismatched alias, no? 
You're adding > > > > > > > a new memory type to the SMMU which doesn't exist on the CPU side. That > > > > > > > can't be right. > > > > > > > > > > > > > > > > > > > Just curious, and maybe this is a dumb question, but what is your > > > > > > concern about mismatched aliases? I mean the cache hierarchy on the > > > > > > GPU device side (anything beyond the LLC) is pretty different and > > > > > > doesn't really care about the smmu pgtable attributes.. > > > > > > > > > > If the CPU accesses a shared buffer with different attributes to those which > > > > > the device is using then you fall into the "mismatched memory attributes" > > > > > part of the Arm architecture. It's reasonably unforgiving (you should go and > > > > > read it) and in some cases can apply to speculative accesses as well, but > > > > > the end result is typically loss of coherency. > > > > > > > > Ok, I might have a few other sections to read first to decipher the > > > > terminology.. > > > > > > > > But my understanding of LLC is that it looks just like system memory > > > > to the CPU and GPU (I think that would make it "the point of > > > > coherence" between the GPU and CPU?) If that is true, shouldn't it be > > > > invisible from the point of view of different CPU mapping options? > > > > > > You could certainly build a system where mismatched attributes don't cause > > > loss of coherence, but as it's not guaranteed by the architecture and the > > > changes proposed here affect APIs which are exposed across SoCs, then I > > > don't think it helps much. > > > > > > > Hmm, the description of the new mapping flag is that it applies only > > to transparent outer level cache: > > > > +/* > > + * Non-coherent masters can use this page protection flag to set cacheable > > + * memory attributes for only a transparent outer level of cache, also known a
Re: [PATCH] iommu/arm-smmu: Add clk_bulk_{prepare/unprepare} to system pm callbacks
On 2021-08-02 21:42, Will Deacon wrote: On Tue, Jul 27, 2021 at 03:03:22PM +0530, Sai Prakash Ranjan wrote: Some clocks for SMMU can have parent as XO such as gpu_cc_hub_cx_int_clk of GPU SMMU in QTI SC7280 SoC and in order to enter deep sleep states in such cases, we would need to drop the XO clock vote in unprepare call and this unprepare callback for XO is in RPMh (Resource Power Manager-Hardened) clock driver which controls RPMh managed clock resources for new QTI SoCs and is a blocking call. Given we cannot have a sleeping calls such as clk_bulk_prepare() and clk_bulk_unprepare() in arm-smmu runtime pm callbacks since the iommu operations like map and unmap can be in atomic context and are in fast path, add this prepare and unprepare call to drop the XO vote only for system pm callbacks since it is not a fast path and we expect the system to enter deep sleep states with system pm as opposed to runtime pm. This is a similar sequence of clock requests (prepare,enable and disable,unprepare) in arm-smmu probe and remove. Signed-off-by: Sai Prakash Ranjan Co-developed-by: Rajendra Nayak Signed-off-by: Rajendra Nayak --- drivers/iommu/arm/arm-smmu/arm-smmu.c | 20 ++-- 1 file changed, 18 insertions(+), 2 deletions(-) [+Rob] How does this work with that funny GPU which writes to the SMMU registers directly? Does the SMMU need to remain independently clocked for that to work or is it all in the same clock domain? As Rob mentioned, device link should take care of all the dependencies between SMMU and its consumers. But not sure how the question relates to this patch as this change is for system pm and not runtime pm, so it is exactly the sequence of SMMU probe/remove which if works currently for that GPU SMMU, then it should work just fine for system suspend and resume as well. 
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index d3c6f54110a5..9561ba4c5d39 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -2277,6 +2277,13 @@ static int __maybe_unused arm_smmu_runtime_suspend(struct device *dev) static int __maybe_unused arm_smmu_pm_resume(struct device *dev) { + int ret; + struct arm_smmu_device *smmu = dev_get_drvdata(dev); + + ret = clk_bulk_prepare(smmu->num_clks, smmu->clks); + if (ret) + return ret; + if (pm_runtime_suspended(dev)) return 0; If we subsequently fail to enable the clks in arm_smmu_runtime_resume() should we unprepare them again? If we are unable to turn on the clks then its fatal and we will not live for long. Thanks, Sai Will @@ -2285,10 +2292,19 @@ static int __maybe_unused arm_smmu_pm_resume(struct device *dev) static int __maybe_unused arm_smmu_pm_suspend(struct device *dev) { + int ret = 0; + struct arm_smmu_device *smmu = dev_get_drvdata(dev); + if (pm_runtime_suspended(dev)) - return 0; + goto clk_unprepare; - return arm_smmu_runtime_suspend(dev); + ret = arm_smmu_runtime_suspend(dev); + if (ret) + return ret; + +clk_unprepare: + clk_bulk_unprepare(smmu->num_clks, smmu->clks); + return ret; } static const struct dev_pm_ops arm_smmu_pm_ops = { -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCHv3] iommu/arm-smmu: Optimize ->tlb_flush_walk() for qcom implementation
On 2021-08-02 21:13, Will Deacon wrote: On Wed, Jun 23, 2021 at 07:12:01PM +0530, Sai Prakash Ranjan wrote: Currently for iommu_unmap() of large scatter-gather list with page size elements, the majority of time is spent in flushing of partial walks in __arm_lpae_unmap() which is a VA based TLB invalidation invalidating page-by-page on iommus like arm-smmu-v2 (TLBIVA). For example: to unmap a 32MB scatter-gather list with page size elements (8192 entries), there are 16->2MB buffer unmaps based on the pgsize (2MB for 4K granule) and each of 2MB will further result in 512 TLBIVAs (2MB/4K) resulting in a total of 8192 TLBIVAs (512*16) for 16->2MB causing a huge overhead. On qcom implementation, there are several performance improvements for TLB cache invalidations in HW like wait-for-safe (for realtime clients such as camera and display) and few others to allow for cache lookups/updates when TLBI is in progress for the same context bank. So the cost of over-invalidation is less compared to the unmap latency on several usecases like camera which deals with large buffers. So, ASID based TLB invalidations (TLBIASID) can be used to invalidate the entire context for partial walk flush thereby improving the unmap latency. Non-strict mode can use this by default for all platforms given its all about over-invalidation saving time on individual unmaps and non-deterministic generally. For this example of 32MB scatter-gather list unmap, this change results in just 16 ASID based TLB invalidations (TLBIASIDs) as opposed to 8192 TLBIVAs thereby increasing the performance of unmaps drastically. 
Test on QTI SM8150 SoC for 10 iterations of iommu_{map_sg}/unmap: (average over 10 iterations) Before this optimization: size iommu_map_sg iommu_unmap 4K 2.067 us 1.854 us 64K 9.598 us 8.802 us 1M 148.890 us 130.718 us 2M 305.864 us 67.291 us 12M 1793.604 us 390.838 us 16M 2386.848 us 518.187 us 24M 3563.296 us 775.989 us 32M 4747.171 us 1033.364 us After this optimization: size iommu_map_sg iommu_unmap 4K 1.723 us 1.765 us 64K 9.880 us 8.869 us 1M 155.364 us 135.223 us 2M 303.906 us 5.385 us 12M 1786.557 us 21.250 us 16M 2391.890 us 27.437 us 24M 3570.895 us 39.937 us 32M 4755.234 us 51.797 us This is further reduced once the map/unmap_pages() support gets in which will result in just 1 TLBIASID as compared to 16 TLBIASIDs. Real world data also shows big difference in unmap performance as below: There were reports of camera frame drops because of high overhead in iommu unmap without this optimization because of frequent unmaps issued by camera of about 100MB/s taking more than 100ms thereby causing frame drops. 
Signed-off-by: Sai Prakash Ranjan --- Changes in v3: * Move the logic to arm-smmu driver from io-pgtable (Robin) * Use a new set of iommu_flush_ops->arm_smmu_s1_tlb_impl_ops and use it for qcom impl Changes in v2: * Add a quirk to choose tlb_flush_all in partial walk flush * Set the quirk for QTI SoC implementation --- drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 13 + drivers/iommu/arm/arm-smmu/arm-smmu.c | 17 - 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c index 7771d40176de..218c71465819 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c @@ -10,6 +10,8 @@ #include "arm-smmu.h" +extern const struct iommu_flush_ops arm_smmu_s1_tlb_impl_ops; + struct qcom_smmu { struct arm_smmu_device smmu; bool bypass_quirk; @@ -146,6 +148,8 @@ static int qcom_adreno_smmu_init_context(struct arm_smmu_domain *smmu_domain, { struct adreno_smmu_priv *priv; + pgtbl_cfg->tlb = &arm_smmu_s1_tlb_impl_ops; + /* Only enable split pagetables for the GPU device (SID 0) */ if (!qcom_adreno_smmu_is_gpu_device(dev)) return 0; @@ -185,6 +189,14 @@ static const struct of_device_id qcom_smmu_client_of_match[] __maybe_unused = { { } }; +static int qcom_smmu_init_context(struct arm_smmu_domain *smmu_domain, + struct io_pgtable_cfg *pgtbl_cfg, struct device *dev) +{ + pgtbl_cfg->tlb = &arm_smmu_s1_tlb_impl_ops; + + return 0; +} + static int qcom_smmu_cfg_probe(struct arm_smmu_device *smmu) { unsigned int last_s2cr = ARM_SMMU_GR0_S2CR(smmu->num_mapping_groups - 1); @@ -308,6 +320,7 @@ static int qcom_smmu500_reset(struct arm_smmu_device *smmu) } static const struct arm_smmu_impl qcom_smmu_impl = { + .i
Re: [PATCH 0/3] iommu/drm/msm: Allow non-coherent masters to use system cache
Hi Georgi, On 2021-07-28 19:30, Georgi Djakov wrote: On Mon, Jan 11, 2021 at 07:45:02PM +0530, Sai Prakash Ranjan wrote: commit ecd7274fb4cd ("iommu: Remove unused IOMMU_SYS_CACHE_ONLY flag") removed unused IOMMU_SYS_CACHE_ONLY prot flag and along with it went the memory type setting required for the non-coherent masters to use system cache. Now that system cache support for GPU is added, we will need to set the right PTE attribute for GPU buffers to be sys cached. Without this, the system cache lines are not allocated for GPU. So the patches in this series introduces a new prot flag IOMMU_LLC, renames IO_PGTABLE_QUIRK_ARM_OUTER_WBWA to IO_PGTABLE_QUIRK_PTW_LLC and makes GPU the user of this protection flag. Hi Sai, Thank you for the patchset! Are you planning to refresh it, as it does not apply anymore? I was waiting on Will's reply [1]. If there are no changes needed, then I can repost the patch. [1] https://lore.kernel.org/lkml/21239ba603d0bdc4e4c696588a905...@codeaurora.org/ Thanks, Sai The series slightly depends on following 2 patches posted earlier and is based on msm-next branch: * https://lore.kernel.org/patchwork/patch/1363008/ * https://lore.kernel.org/patchwork/patch/1363010/ Sai Prakash Ranjan (3): iommu/io-pgtable: Rename last-level cache quirk to IO_PGTABLE_QUIRK_PTW_LLC iommu/io-pgtable-arm: Add IOMMU_LLC page protection flag drm/msm: Use IOMMU_LLC page protection flag to map gpu buffers drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 3 +++ drivers/gpu/drm/msm/adreno/adreno_gpu.c | 2 +- drivers/gpu/drm/msm/msm_iommu.c | 3 +++ drivers/gpu/drm/msm/msm_mmu.h | 4 drivers/iommu/io-pgtable-arm.c | 9 ++--- include/linux/io-pgtable.h | 6 +++--- include/linux/iommu.h | 6 ++ 7 files changed, 26 insertions(+), 7 deletions(-) base-commit: 00fd44a1a4700718d5d962432b55c09820f7e709 -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. 
is a member of Code Aurora Forum, hosted by The Linux Foundation -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH] iommu/arm-smmu: Add clk_bulk_{prepare/unprepare} to system pm callbacks
Hi Robin, On 2021-07-27 16:03, Robin Murphy wrote: On 2021-07-27 11:25, Robin Murphy wrote: On 2021-07-27 10:33, Sai Prakash Ranjan wrote: Some clocks for SMMU can have parent as XO such as gpu_cc_hub_cx_int_clk of GPU SMMU in QTI SC7280 SoC and in order to enter deep sleep states in such cases, we would need to drop the XO clock vote in unprepare call and this unprepare callback for XO is in RPMh (Resource Power Manager-Hardened) clock driver which controls RPMh managed clock resources for new QTI SoCs and is a blocking call. Given we cannot have a sleeping calls such as clk_bulk_prepare() and clk_bulk_unprepare() in arm-smmu runtime pm callbacks since the iommu operations like map and unmap can be in atomic context and are in fast path, add this prepare and unprepare call to drop the XO vote only for system pm callbacks since it is not a fast path and we expect the system to enter deep sleep states with system pm as opposed to runtime pm. This is a similar sequence of clock requests (prepare,enable and disable,unprepare) in arm-smmu probe and remove. Nope. We call arm_smmu_rpm_get(), which may resume the device, from atomic contexts. clk_prepare() may sleep. This doesn't work. Urgh, or maybe I skimmed the commit message too lightly *and* managed to totally misread the patch, sorry :( I'll wake up some more and try again later... No worries, we took our time looking through that many times before posting this :) Thanks, Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH] iommu/arm-smmu: Add clk_bulk_{prepare/unprepare} to system pm callbacks
Some clocks for SMMU can have parent as XO such as gpu_cc_hub_cx_int_clk of GPU SMMU in QTI SC7280 SoC and in order to enter deep sleep states in such cases, we would need to drop the XO clock vote in unprepare call and this unprepare callback for XO is in RPMh (Resource Power Manager-Hardened) clock driver which controls RPMh managed clock resources for new QTI SoCs and is a blocking call. Given we cannot have a sleeping calls such as clk_bulk_prepare() and clk_bulk_unprepare() in arm-smmu runtime pm callbacks since the iommu operations like map and unmap can be in atomic context and are in fast path, add this prepare and unprepare call to drop the XO vote only for system pm callbacks since it is not a fast path and we expect the system to enter deep sleep states with system pm as opposed to runtime pm. This is a similar sequence of clock requests (prepare,enable and disable,unprepare) in arm-smmu probe and remove. Signed-off-by: Sai Prakash Ranjan Co-developed-by: Rajendra Nayak Signed-off-by: Rajendra Nayak --- drivers/iommu/arm/arm-smmu/arm-smmu.c | 20 ++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index d3c6f54110a5..9561ba4c5d39 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -2277,6 +2277,13 @@ static int __maybe_unused arm_smmu_runtime_suspend(struct device *dev) static int __maybe_unused arm_smmu_pm_resume(struct device *dev) { + int ret; + struct arm_smmu_device *smmu = dev_get_drvdata(dev); + + ret = clk_bulk_prepare(smmu->num_clks, smmu->clks); + if (ret) + return ret; + if (pm_runtime_suspended(dev)) return 0; @@ -2285,10 +2292,19 @@ static int __maybe_unused arm_smmu_pm_resume(struct device *dev) static int __maybe_unused arm_smmu_pm_suspend(struct device *dev) { + int ret = 0; + struct arm_smmu_device *smmu = dev_get_drvdata(dev); + if (pm_runtime_suspended(dev)) - return 0; + goto clk_unprepare; - return 
arm_smmu_runtime_suspend(dev); + ret = arm_smmu_runtime_suspend(dev); + if (ret) + return ret; + +clk_unprepare: + clk_bulk_unprepare(smmu->num_clks, smmu->clks); + return ret; } static const struct dev_pm_ops arm_smmu_pm_ops = { -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCHv3] iommu/arm-smmu: Optimize ->tlb_flush_walk() for qcom implementation
Hi Robin, Will, On 2021-07-12 09:39, Sai Prakash Ranjan wrote: Hi Robin, On 2021-06-23 19:12, Sai Prakash Ranjan wrote: Currently for iommu_unmap() of large scatter-gather list with page size elements, the majority of time is spent in flushing of partial walks in __arm_lpae_unmap() which is a VA based TLB invalidation invalidating page-by-page on iommus like arm-smmu-v2 (TLBIVA). For example: to unmap a 32MB scatter-gather list with page size elements (8192 entries), there are 16->2MB buffer unmaps based on the pgsize (2MB for 4K granule) and each of 2MB will further result in 512 TLBIVAs (2MB/4K) resulting in a total of 8192 TLBIVAs (512*16) for 16->2MB causing a huge overhead. On qcom implementation, there are several performance improvements for TLB cache invalidations in HW like wait-for-safe (for realtime clients such as camera and display) and few others to allow for cache lookups/updates when TLBI is in progress for the same context bank. So the cost of over-invalidation is less compared to the unmap latency on several usecases like camera which deals with large buffers. So, ASID based TLB invalidations (TLBIASID) can be used to invalidate the entire context for partial walk flush thereby improving the unmap latency. Non-strict mode can use this by default for all platforms given its all about over-invalidation saving time on individual unmaps and non-deterministic generally. For this example of 32MB scatter-gather list unmap, this change results in just 16 ASID based TLB invalidations (TLBIASIDs) as opposed to 8192 TLBIVAs thereby increasing the performance of unmaps drastically. 
Test on QTI SM8150 SoC for 10 iterations of iommu_{map_sg}/unmap: (average over 10 iterations) Before this optimization: size iommu_map_sg iommu_unmap 4K 2.067 us 1.854 us 64K 9.598 us 8.802 us 1M 148.890 us 130.718 us 2M 305.864 us 67.291 us 12M 1793.604 us 390.838 us 16M 2386.848 us 518.187 us 24M 3563.296 us 775.989 us 32M 4747.171 us 1033.364 us After this optimization: size iommu_map_sg iommu_unmap 4K 1.723 us 1.765 us 64K 9.880 us 8.869 us 1M 155.364 us 135.223 us 2M 303.906 us 5.385 us 12M 1786.557 us 21.250 us 16M 2391.890 us 27.437 us 24M 3570.895 us 39.937 us 32M 4755.234 us 51.797 us This is further reduced once the map/unmap_pages() support gets in which will result in just 1 TLBIASID as compared to 16 TLBIASIDs. Real world data also shows big difference in unmap performance as below: There were reports of camera frame drops because of high overhead in iommu unmap without this optimization because of frequent unmaps issued by camera of about 100MB/s taking more than 100ms thereby causing frame drops. 
Signed-off-by: Sai Prakash Ranjan --- Changes in v3: * Move the logic to arm-smmu driver from io-pgtable (Robin) * Use a new set of iommu_flush_ops->arm_smmu_s1_tlb_impl_ops and use it for qcom impl Changes in v2: * Add a quirk to choose tlb_flush_all in partial walk flush * Set the quirk for QTI SoC implementation --- drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 13 + drivers/iommu/arm/arm-smmu/arm-smmu.c | 17 - 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c index 7771d40176de..218c71465819 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c @@ -10,6 +10,8 @@ #include "arm-smmu.h" +extern const struct iommu_flush_ops arm_smmu_s1_tlb_impl_ops; + struct qcom_smmu { struct arm_smmu_device smmu; bool bypass_quirk; @@ -146,6 +148,8 @@ static int qcom_adreno_smmu_init_context(struct arm_smmu_domain *smmu_domain, { struct adreno_smmu_priv *priv; + pgtbl_cfg->tlb = &arm_smmu_s1_tlb_impl_ops; + /* Only enable split pagetables for the GPU device (SID 0) */ if (!qcom_adreno_smmu_is_gpu_device(dev)) return 0; @@ -185,6 +189,14 @@ static const struct of_device_id qcom_smmu_client_of_match[] __maybe_unused = { { } }; +static int qcom_smmu_init_context(struct arm_smmu_domain *smmu_domain, + struct io_pgtable_cfg *pgtbl_cfg, struct device *dev) +{ + pgtbl_cfg->tlb = &arm_smmu_s1_tlb_impl_ops; + + return 0; +} + static int qcom_smmu_cfg_probe(struct arm_smmu_device *smmu) { unsigned int last_s2cr = ARM_SMMU_GR0_S2CR(smmu->num_mapping_groups - 1); @@ -308,6 +320,7 @@ static int qcom_smmu500_reset(struct arm_smmu_device *smmu) } static const struct arm_smmu_impl qcom
Re: [PATCHv3] iommu/arm-smmu: Optimize ->tlb_flush_walk() for qcom implementation
Hi Robin, On 2021-06-23 19:12, Sai Prakash Ranjan wrote: Currently for iommu_unmap() of large scatter-gather list with page size elements, the majority of time is spent in flushing of partial walks in __arm_lpae_unmap() which is a VA based TLB invalidation invalidating page-by-page on iommus like arm-smmu-v2 (TLBIVA). For example: to unmap a 32MB scatter-gather list with page size elements (8192 entries), there are 16->2MB buffer unmaps based on the pgsize (2MB for 4K granule) and each of 2MB will further result in 512 TLBIVAs (2MB/4K) resulting in a total of 8192 TLBIVAs (512*16) for 16->2MB causing a huge overhead. On qcom implementation, there are several performance improvements for TLB cache invalidations in HW like wait-for-safe (for realtime clients such as camera and display) and few others to allow for cache lookups/updates when TLBI is in progress for the same context bank. So the cost of over-invalidation is less compared to the unmap latency on several usecases like camera which deals with large buffers. So, ASID based TLB invalidations (TLBIASID) can be used to invalidate the entire context for partial walk flush thereby improving the unmap latency. Non-strict mode can use this by default for all platforms given its all about over-invalidation saving time on individual unmaps and non-deterministic generally. For this example of 32MB scatter-gather list unmap, this change results in just 16 ASID based TLB invalidations (TLBIASIDs) as opposed to 8192 TLBIVAs thereby increasing the performance of unmaps drastically. 
Test on QTI SM8150 SoC for 10 iterations of iommu_{map_sg}/unmap: (average over 10 iterations) Before this optimization: size iommu_map_sg iommu_unmap 4K 2.067 us 1.854 us 64K 9.598 us 8.802 us 1M 148.890 us 130.718 us 2M 305.864 us 67.291 us 12M 1793.604 us 390.838 us 16M 2386.848 us 518.187 us 24M 3563.296 us 775.989 us 32M 4747.171 us 1033.364 us After this optimization: size iommu_map_sg iommu_unmap 4K 1.723 us 1.765 us 64K 9.880 us 8.869 us 1M 155.364 us 135.223 us 2M 303.906 us 5.385 us 12M 1786.557 us 21.250 us 16M 2391.890 us 27.437 us 24M 3570.895 us 39.937 us 32M 4755.234 us 51.797 us This is further reduced once the map/unmap_pages() support gets in which will result in just 1 TLBIASID as compared to 16 TLBIASIDs. Real world data also shows big difference in unmap performance as below: There were reports of camera frame drops because of high overhead in iommu unmap without this optimization because of frequent unmaps issued by camera of about 100MB/s taking more than 100ms thereby causing frame drops. 
Signed-off-by: Sai Prakash Ranjan --- Changes in v3: * Move the logic to arm-smmu driver from io-pgtable (Robin) * Use a new set of iommu_flush_ops->arm_smmu_s1_tlb_impl_ops and use it for qcom impl Changes in v2: * Add a quirk to choose tlb_flush_all in partial walk flush * Set the quirk for QTI SoC implementation --- drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 13 + drivers/iommu/arm/arm-smmu/arm-smmu.c | 17 - 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c index 7771d40176de..218c71465819 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c @@ -10,6 +10,8 @@ #include "arm-smmu.h" +extern const struct iommu_flush_ops arm_smmu_s1_tlb_impl_ops; + struct qcom_smmu { struct arm_smmu_device smmu; bool bypass_quirk; @@ -146,6 +148,8 @@ static int qcom_adreno_smmu_init_context(struct arm_smmu_domain *smmu_domain, { struct adreno_smmu_priv *priv; + pgtbl_cfg->tlb = &arm_smmu_s1_tlb_impl_ops; + /* Only enable split pagetables for the GPU device (SID 0) */ if (!qcom_adreno_smmu_is_gpu_device(dev)) return 0; @@ -185,6 +189,14 @@ static const struct of_device_id qcom_smmu_client_of_match[] __maybe_unused = { { } }; +static int qcom_smmu_init_context(struct arm_smmu_domain *smmu_domain, + struct io_pgtable_cfg *pgtbl_cfg, struct device *dev) +{ + pgtbl_cfg->tlb = &arm_smmu_s1_tlb_impl_ops; + + return 0; +} + static int qcom_smmu_cfg_probe(struct arm_smmu_device *smmu) { unsigned int last_s2cr = ARM_SMMU_GR0_S2CR(smmu->num_mapping_groups - 1); @@ -308,6 +320,7 @@ static int qcom_smmu500_reset(struct arm_smmu_device *smmu) } static const struct arm_smmu_impl qcom_smmu_impl = { + .init_context = qcom_smmu_init_context, .cfg_probe = qcom_smmu
Re: [PATCH 2/3] iommu/io-pgtable-arm: Add IOMMU_LLC page protection flag
Hi Will, On 2021-03-25 23:03, Will Deacon wrote: On Tue, Mar 09, 2021 at 12:10:44PM +0530, Sai Prakash Ranjan wrote: On 2021-02-05 17:38, Sai Prakash Ranjan wrote: > On 2021-02-04 03:16, Will Deacon wrote: > > On Tue, Feb 02, 2021 at 11:56:27AM +0530, Sai Prakash Ranjan wrote: > > > On 2021-02-01 23:50, Jordan Crouse wrote: > > > > On Mon, Feb 01, 2021 at 08:20:44AM -0800, Rob Clark wrote: > > > > > On Mon, Feb 1, 2021 at 3:16 AM Will Deacon wrote: > > > > > > On Fri, Jan 29, 2021 at 03:12:59PM +0530, Sai Prakash Ranjan wrote: > > > > > > > On 2021-01-29 14:35, Will Deacon wrote: > > > > > > > > On Mon, Jan 11, 2021 at 07:45:04PM +0530, Sai Prakash Ranjan wrote: > > > > > > > > > +#define IOMMU_LLC(1 << 6) > > > > > > > > > > > > > > > > On reflection, I'm a bit worried about exposing this because I think it > > > > > > > > will > > > > > > > > introduce a mismatched virtual alias with the CPU (we don't even have a > > > > > > > > MAIR > > > > > > > > set up for this memory type). Now, we also have that issue for the PTW, > > > > > > > > but > > > > > > > > since we always use cache maintenance (i.e. the streaming API) for > > > > > > > > publishing the page-tables to a non-coheren walker, it works out. > > > > > > > > However, > > > > > > > > if somebody expects IOMMU_LLC to be coherent with a DMA API coherent > > > > > > > > allocation, then they're potentially in for a nasty surprise due to the > > > > > > > > mismatched outer-cacheability attributes. > > > > > > > > > > > > > > > > > > > > > > Can't we add the syscached memory type similar to what is done on android? > > > > > > > > > > > > Maybe. How does the GPU driver map these things on the CPU side? 
> > > > > > > > > > Currently we use writecombine mappings for everything, although there > > > > > are some cases that we'd like to use cached (but have not merged > > > > > patches that would give userspace a way to flush/invalidate) > > > > > > > > > > > > > LLC/system cache doesn't have a relationship with the CPU cache. Its > > > > just a > > > > little accelerator that sits on the connection from the GPU to DDR and > > > > caches > > > > accesses. The hint that Sai is suggesting is used to mark the buffers as > > > > 'no-write-allocate' to prevent GPU write operations from being cached in > > > > the LLC > > > > which a) isn't interesting and b) takes up cache space for read > > > > operations. > > > > > > > > Its easiest to think of the LLC as a bonus accelerator that has no cost > > > > for > > > > us to use outside of the unfortunate per buffer hint. > > > > > > > > We do have to worry about the CPU cache w.r.t I/O coherency (which is a > > > > different hint) and in that case we have all of concerns that Will > > > > identified. > > > > > > > > > > For mismatched outer cacheability attributes which Will > > > mentioned, I was > > > referring to [1] in android kernel. > > > > I've lost track of the conversation here :/ > > > > When the GPU has a buffer mapped with IOMMU_LLC, is the buffer also > > mapped > > into the CPU and with what attributes? Rob said "writecombine for > > everything" -- does that mean ioremap_wc() / MEMREMAP_WC? > > > > Rob answered this. > > > Finally, we need to be careful when we use the word "hint" as > > "allocation > > hint" has a specific meaning in the architecture, and if we only > > mismatch on > > those then we're actually ok. But I think IOMMU_LLC is more than > > just a > > hint, since it actually drives eviction policy (i.e. it enables > > writeback). > > > > Sorry for the pedantry, but I just want to make sure we're all talking > > about the same things! 
> > > > Sorry for the confusion which probably was caused by my mentioning of > android, NWA(no write allocate) is an allocation hint which we can > ignore > for now as it is not introduced yet in upstream. >
Re: [PATCHv2 1/3] iommu/io-pgtable: Add a quirk to use tlb_flush_all() for partial walk flush
Hi Robin, On 2021-06-23 00:07, Robin Murphy wrote: On 2021-06-22 15:27, Sai Prakash Ranjan wrote: Hi Robin, On 2021-06-22 17:41, Robin Murphy wrote: On 2021-06-22 08:11, Sai Prakash Ranjan wrote: Hi Robin, On 2021-06-21 21:15, Robin Murphy wrote: On 2021-06-18 03:51, Sai Prakash Ranjan wrote: Add a quirk IO_PGTABLE_QUIRK_TLB_INV_ALL to invalidate entire context with tlb_flush_all() callback in partial walk flush to improve unmap performance on select few platforms where the cost of over-invalidation is less than the unmap latency. I still think this doesn't belong anywhere near io-pgtable at all. It's a driver-internal decision how exactly it implements a non-leaf invalidation, and that may be more complex than a predetermined boolean decision. For example, I've just realised for SMMUv3 we can't invalidate multiple levels of table at once with a range command, since if we assume the whole thing is mapped at worst-case page granularity we may fail to invalidate any parts which are mapped as intermediate-level blocks. If invalidating a 1GB region (with 4KB granule) means having to fall back to 256K non-range commands, we may not want to invalidate by VA then, even though doing so for a 2MB region is still optimal. It's also quite feasible that drivers might want to do this for leaf invalidations too - if you don't like issuing 512 commands to invalidate 2MB, do you like issuing 511 commands to invalidate 2044KB? - and at that point the logic really has to be in the driver anyway. Ok I will move this to tlb_flush_walk() functions in the drivers. In the previous v1 thread, you suggested to make the choice in iommu_get_dma_strict() test, I assume you meant the test in iommu_dma_init_domain() with a flag or was it the leaf driver(ex:arm-smmu.c) test of iommu_get_dma_strict() in init_domain? Yes, I meant literally inside the same condition where we currently set "pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_NON_STRICT;" in arm_smmu_init_domain_context(). Ok got it, thanks. 
I am still a bit confused on where this flag would be? Should this be a part of struct iommu_domain? Well, if you were to rewrite the config with an alternative set of flush_ops at that point it would be implicit. For a flag, probably either in arm_smmu_domain or arm_smmu_impl. Maybe a flag would be less useful than generalising straight to a "maximum number of by-VA invalidations it's worth sending individually" threshold value? But then we would still need some flag to make this implementation specific (qcom specific for now) and this threshold would just be another condition although it would have been useful if this was generic enough. Well, for that approach I assume we could do something like special-case 0, or if it's a mutable per-domain value maybe just initialise it to SIZE_MAX or whatever such that it would never be reached in practice. Whichever way, it was meant to be implied that anything at the domain level would still be subject to final adjustment by the init_context hook. Ok that should work, so I went ahead with another set of flush_ops and posted out v3. Thanks, Sai It's clear to me what overall shape and separation of responsibility is most logical, but beyond that I don't have a particularly strong opinion on the exact implementation; I've just been chucking ideas around :) Your ideas are very informative and useful :) Thanks, Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCHv3] iommu/arm-smmu: Optimize ->tlb_flush_walk() for qcom implementation
Currently for iommu_unmap() of large scatter-gather list with page size elements, the majority of time is spent in flushing of partial walks in __arm_lpae_unmap() which is a VA based TLB invalidation invalidating page-by-page on iommus like arm-smmu-v2 (TLBIVA). For example: to unmap a 32MB scatter-gather list with page size elements (8192 entries), there are 16->2MB buffer unmaps based on the pgsize (2MB for 4K granule) and each of 2MB will further result in 512 TLBIVAs (2MB/4K) resulting in a total of 8192 TLBIVAs (512*16) for 16->2MB causing a huge overhead. On qcom implementation, there are several performance improvements for TLB cache invalidations in HW like wait-for-safe (for realtime clients such as camera and display) and few others to allow for cache lookups/updates when TLBI is in progress for the same context bank. So the cost of over-invalidation is less compared to the unmap latency on several usecases like camera which deals with large buffers. So, ASID based TLB invalidations (TLBIASID) can be used to invalidate the entire context for partial walk flush thereby improving the unmap latency. Non-strict mode can use this by default for all platforms given its all about over-invalidation saving time on individual unmaps and non-deterministic generally. For this example of 32MB scatter-gather list unmap, this change results in just 16 ASID based TLB invalidations (TLBIASIDs) as opposed to 8192 TLBIVAs thereby increasing the performance of unmaps drastically. 
Test on QTI SM8150 SoC for 10 iterations of iommu_{map_sg}/unmap: (average over 10 iterations) Before this optimization: size iommu_map_sg iommu_unmap 4K 2.067 us 1.854 us 64K 9.598 us 8.802 us 1M 148.890 us 130.718 us 2M 305.864 us 67.291 us 12M 1793.604 us 390.838 us 16M 2386.848 us 518.187 us 24M 3563.296 us 775.989 us 32M 4747.171 us 1033.364 us After this optimization: size iommu_map_sg iommu_unmap 4K 1.723 us 1.765 us 64K 9.880 us 8.869 us 1M 155.364 us 135.223 us 2M 303.906 us 5.385 us 12M 1786.557 us 21.250 us 16M 2391.890 us 27.437 us 24M 3570.895 us 39.937 us 32M 4755.234 us 51.797 us This is further reduced once the map/unmap_pages() support gets in which will result in just 1 TLBIASID as compared to 16 TLBIASIDs. Real world data also shows big difference in unmap performance as below: There were reports of camera frame drops because of high overhead in iommu unmap without this optimization because of frequent unmaps issued by camera of about 100MB/s taking more than 100ms thereby causing frame drops. 
Signed-off-by: Sai Prakash Ranjan --- Changes in v3: * Move the logic to arm-smmu driver from io-pgtable (Robin) * Use a new set of iommu_flush_ops->arm_smmu_s1_tlb_impl_ops and use it for qcom impl Changes in v2: * Add a quirk to choose tlb_flush_all in partial walk flush * Set the quirk for QTI SoC implementation --- drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 13 + drivers/iommu/arm/arm-smmu/arm-smmu.c | 17 - 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c index 7771d40176de..218c71465819 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c @@ -10,6 +10,8 @@ #include "arm-smmu.h" +extern const struct iommu_flush_ops arm_smmu_s1_tlb_impl_ops; + struct qcom_smmu { struct arm_smmu_device smmu; bool bypass_quirk; @@ -146,6 +148,8 @@ static int qcom_adreno_smmu_init_context(struct arm_smmu_domain *smmu_domain, { struct adreno_smmu_priv *priv; + pgtbl_cfg->tlb = &arm_smmu_s1_tlb_impl_ops; + /* Only enable split pagetables for the GPU device (SID 0) */ if (!qcom_adreno_smmu_is_gpu_device(dev)) return 0; @@ -185,6 +189,14 @@ static const struct of_device_id qcom_smmu_client_of_match[] __maybe_unused = { { } }; +static int qcom_smmu_init_context(struct arm_smmu_domain *smmu_domain, + struct io_pgtable_cfg *pgtbl_cfg, struct device *dev) +{ + pgtbl_cfg->tlb = &arm_smmu_s1_tlb_impl_ops; + + return 0; +} + static int qcom_smmu_cfg_probe(struct arm_smmu_device *smmu) { unsigned int last_s2cr = ARM_SMMU_GR0_S2CR(smmu->num_mapping_groups - 1); @@ -308,6 +320,7 @@ static int qcom_smmu500_reset(struct arm_smmu_device *smmu) } static const struct arm_smmu_impl qcom_smmu_impl = { + .init_context = qcom_smmu_init_context, .cfg_probe = qcom_smmu_cfg_probe, .def_domain_type = qcom_
Re: [PATCHv2 1/3] iommu/io-pgtable: Add a quirk to use tlb_flush_all() for partial walk flush
Hi Robin, On 2021-06-22 17:41, Robin Murphy wrote: On 2021-06-22 08:11, Sai Prakash Ranjan wrote: Hi Robin, On 2021-06-21 21:15, Robin Murphy wrote: On 2021-06-18 03:51, Sai Prakash Ranjan wrote: Add a quirk IO_PGTABLE_QUIRK_TLB_INV_ALL to invalidate entire context with tlb_flush_all() callback in partial walk flush to improve unmap performance on select few platforms where the cost of over-invalidation is less than the unmap latency. I still think this doesn't belong anywhere near io-pgtable at all. It's a driver-internal decision how exactly it implements a non-leaf invalidation, and that may be more complex than a predetermined boolean decision. For example, I've just realised for SMMUv3 we can't invalidate multiple levels of table at once with a range command, since if we assume the whole thing is mapped at worst-case page granularity we may fail to invalidate any parts which are mapped as intermediate-level blocks. If invalidating a 1GB region (with 4KB granule) means having to fall back to 256K non-range commands, we may not want to invalidate by VA then, even though doing so for a 2MB region is still optimal. It's also quite feasible that drivers might want to do this for leaf invalidations too - if you don't like issuing 512 commands to invalidate 2MB, do you like issuing 511 commands to invalidate 2044KB? - and at that point the logic really has to be in the driver anyway. Ok I will move this to tlb_flush_walk() functions in the drivers. In the previous v1 thread, you suggested to make the choice in iommu_get_dma_strict() test, I assume you meant the test in iommu_dma_init_domain() with a flag or was it the leaf driver(ex:arm-smmu.c) test of iommu_get_dma_strict() in init_domain? Yes, I meant literally inside the same condition where we currently set "pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_NON_STRICT;" in arm_smmu_init_domain_context(). Ok got it, thanks. I am still a bit confused on where this flag would be? Should this be a part of struct iommu_domain? 
Well, if you were to rewrite the config with an alternative set of flush_ops at that point it would be implicit. For a flag, probably either in arm_smmu_domain or arm_smmu_impl. Maybe a flag would be less useful than generalising straight to a "maximum number of by-VA invalidations it's worth sending individually" threshold value? But then we would still need some flag to make this implementation specific (qcom specific for now) and this threshold would just be another condition although it would have been useful if this was generic enough. It's clear to me what overall shape and separation of responsibility is most logical, but beyond that I don't have a particularly strong opinion on the exact implementation; I've just been chucking ideas around :) Your ideas are very informative and useful :) Thanks, Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCHv2 1/3] iommu/io-pgtable: Add a quirk to use tlb_flush_all() for partial walk flush
Hi Robin, On 2021-06-21 21:15, Robin Murphy wrote: On 2021-06-18 03:51, Sai Prakash Ranjan wrote: Add a quirk IO_PGTABLE_QUIRK_TLB_INV_ALL to invalidate entire context with tlb_flush_all() callback in partial walk flush to improve unmap performance on select few platforms where the cost of over-invalidation is less than the unmap latency. I still think this doesn't belong anywhere near io-pgtable at all. It's a driver-internal decision how exactly it implements a non-leaf invalidation, and that may be more complex than a predetermined boolean decision. For example, I've just realised for SMMUv3 we can't invalidate multiple levels of table at once with a range command, since if we assume the whole thing is mapped at worst-case page granularity we may fail to invalidate any parts which are mapped as intermediate-level blocks. If invalidating a 1GB region (with 4KB granule) means having to fall back to 256K non-range commands, we may not want to invalidate by VA then, even though doing so for a 2MB region is still optimal. It's also quite feasible that drivers might want to do this for leaf invalidations too - if you don't like issuing 512 commands to invalidate 2MB, do you like issuing 511 commands to invalidate 2044KB? - and at that point the logic really has to be in the driver anyway. Ok I will move this to tlb_flush_walk() functions in the drivers. In the previous v1 thread, you suggested to make the choice in iommu_get_dma_strict() test, I assume you meant the test in iommu_dma_init_domain() with a flag or was it the leaf driver(ex:arm-smmu.c) test of iommu_get_dma_strict() in init_domain? I am still a bit confused on where this flag would be? Should this be a part of struct iommu_domain? 
Thanks, Sai Signed-off-by: Sai Prakash Ranjan --- drivers/iommu/io-pgtable-arm.c | 3 ++- include/linux/io-pgtable.h | 5 + 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index 87def58e79b5..5d362f2214bd 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -768,7 +768,8 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie) if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_NS | IO_PGTABLE_QUIRK_NON_STRICT | IO_PGTABLE_QUIRK_ARM_TTBR1 | - IO_PGTABLE_QUIRK_ARM_OUTER_WBWA)) + IO_PGTABLE_QUIRK_ARM_OUTER_WBWA | + IO_PGTABLE_QUIRK_TLB_INV_ALL)) return NULL; data = arm_lpae_alloc_pgtable(cfg); diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h index 4d40dfa75b55..45441592a0e6 100644 --- a/include/linux/io-pgtable.h +++ b/include/linux/io-pgtable.h @@ -82,6 +82,10 @@ struct io_pgtable_cfg { * * IO_PGTABLE_QUIRK_ARM_OUTER_WBWA: Override the outer-cacheability * attributes set in the TCR for a non-coherent page-table walker. +* +* IO_PGTABLE_QUIRK_TLB_INV_ALL: Use TLBIALL/TLBIASID to invalidate +* entire context for partial walk flush to increase unmap +* performance on select few platforms. */ #define IO_PGTABLE_QUIRK_ARM_NS BIT(0) #define IO_PGTABLE_QUIRK_NO_PERMS BIT(1) @@ -89,6 +93,7 @@ struct io_pgtable_cfg { #define IO_PGTABLE_QUIRK_NON_STRICT BIT(4) #define IO_PGTABLE_QUIRK_ARM_TTBR1 BIT(5) #define IO_PGTABLE_QUIRK_ARM_OUTER_WBWA BIT(6) + #define IO_PGTABLE_QUIRK_TLB_INV_ALLBIT(7) unsigned long quirks; unsigned long pgsize_bitmap; unsigned intias; -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCHv2 2/3] iommu/io-pgtable: Optimize partial walk flush for large scatter-gather list
Hi, On 2021-06-19 03:39, Doug Anderson wrote: Hi, On Thu, Jun 17, 2021 at 7:51 PM Sai Prakash Ranjan wrote: Currently for iommu_unmap() of large scatter-gather list with page size elements, the majority of time is spent in flushing of partial walks in __arm_lpae_unmap() which is a VA based TLB invalidation invalidating page-by-page on iommus like arm-smmu-v2 (TLBIVA) which do not support range based invalidations like on arm-smmu-v3.2. For example: to unmap a 32MB scatter-gather list with page size elements (8192 entries), there are 16->2MB buffer unmaps based on the pgsize (2MB for 4K granule) and each of 2MB will further result in 512 TLBIVAs (2MB/4K) resulting in a total of 8192 TLBIVAs (512*16) for 16->2MB causing a huge overhead. So instead use tlb_flush_all() callback (TLBIALL/TLBIASID) to invalidate the entire context for partial walk flush on select few platforms where cost of over-invalidation is less than unmap latency It would probably be worth punching this description up a little bit. Elsewhere you said in more detail why this over-invalidation is less of a big deal for the Qualcomm SMMU. It's probably worth saying something like that here, too. Like this bit paraphrased from your other email: On qcom impl, we have several performance improvements for TLB cache invalidations in HW like wait-for-safe (for realtime clients such as camera and display) and few others to allow for cache lookups/updates when TLBI is in progress for the same context bank. Sure will add this info as well in the next version. using the newly introduced quirk IO_PGTABLE_QUIRK_TLB_INV_ALL. We also do this for non-strict mode given its all about over-invalidation saving time on individual unmaps and non-deterministic generally. As per usual I'm mostly clueless, but I don't quite understand why you want this new behavior for non-strict mode. To me it almost seems like the opposite? 
Specifically, non-strict mode is already outside the critical path today and so there's no need to optimize it. I'm probably not explaining myself clearly, but I guess i'm thinking: a) today for strict, unmap is in the critical path and it's important to get it out of there. Getting it out of the critical path is so important that we're willing to over-invalidate to speed up the critical path. b) today for non-strict, unmap is not in the critical path. So I would almost expect your patch to _disable_ your new feature for non-strict mappings, not auto-enable your new feature for non-strict mappings. If I'm babbling, feel free to ignore. ;-) Looking back, I guess Robin was the one that suggested the behavior you're implementing, so it's more likely he's right than I am. ;-) Thanks for taking a look. Non-strict mode is only for leaf entries and dma domains and this optimization is for non-leaf entries and is applicable for both, see __arm_lpae_unmap(). In other words, if you have iommu.strict=0 (non-strict mode) and try unmapping a large sg buffer as the problem described in the commit text, you would still go via this path in unmap and see the delay without this patch. So what Robin suggested is that, let's do this unconditionally for all users with non-strict mode as opposed to only restricting it to implementation specific in case of strict mode. Thanks, Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH] iommu/io-pgtable-arm: Optimize partial walk flush for large scatter-gather list
On 2021-06-15 17:21, Sai Prakash Ranjan wrote: > Hi Krishna, > > On 2021-06-14 23:18, Krishna Reddy wrote: >>> Right but we won't know until we profile the specific usecases or try them >>> in >>> generic workload to see if they affect the performance. Sure, over >>> invalidation is >>> a concern where multiple buffers can be mapped to same context and the cache >>> is not usable at the time for lookup and such but we don't do it for small >>> buffers >>> and only for large buffers which means thousands of TLB entry mappings in >>> which case TLBIASID is preferred (note: I mentioned the HW team >>> recommendation to use it for anything greater than 128 TLB entries) in my >>> earlier reply. And also note that we do this only for partial walk flush, >>> we are not >>> arbitrarily changing all the TLBIs to ASID based. >> >> Most of the heavy bw use cases does involve processing larger buffers. >> When the physical memory is allocated dis-contiguously at page_size >> (let's use 4KB here) >> granularity, each aligned 2MB chunks IOVA unmap would involve >> performing a TLBIASID >> as 2MB is not a leaf. Essentially, It happens all the time during >> large buffer unmaps and >> potentially impact active traffic on other large buffers. Depending on how >> much >> latency HW engines can absorb, the overflow/underflow issues for ISO >> engines can be >> sporadic and vendor specific. >> Performing TLBIASID as default for all SoCs is not a safe operation. >> > > Ok so from what I gather from this is that its not easy to test for the > negative impact and you don't have data on such yet and the behaviour is > very vendor specific. To add on qcom impl, we have several performance > improvements for TLB cache invalidations in HW like wait-for-safe(for realtime > clients such as camera and display) and few others to allow for cache > lookups/updates when TLBI is in progress for the same context bank, so atleast > we are good here. 
> >> >>> I am no camera expert but from what the camera team mentioned is that there >>> is a thread which frees memory (large unused memory buffers) periodically >>> which >>> ends up taking around 100+ms and causing some camera test failures with >>> frame drops. Parallel efforts are already being made to optimize this usage >>> of >>> thread but as I mentioned previously, this is *not a camera specific*, let's >>> say >>> someone else invokes such large unmaps, it's going to face the same issue. >> >> From the above, It doesn't look like the root cause of frame drops is >> fully understood. >> Why is 100+ms delay causing camera frame drop? Is the same thread >> submitting the buffers >> to camera after unmap is complete? If not, how is the unmap latency >> causing issue here? >> > > Ok since you are interested in the camera use case, I have requested more > details > from the camera team and will give them once they come back. However I don't > think > it's good to have unmap latency at all and that is being addressed by this > patch. > As promised, here are some more details shared by the camera team: Mapping of a framework buffer happens at the time of process request and unmapping of a framework buffer happens once the buffer is available from hardware and the result will be notified to the camera framework. * When there is a delay in unmapping of a buffer, result notification to the framework will be delayed and, based on pipeline delay depth, new requests from the framework will be delayed. * Camera stack uses internal buffer managers for internal and framework buffers. While mapping and unmapping these managers will be accessed, so it uses a common lock and hence is a blocking call. So unmapping delay will delay the mapping of a new request and lead to frame drops. Map and unmap happen in the camera service process context. There is no separate perf path to perform unmapping. In the camera stack, along with map/unmap delay, additional delays are due to HW. 
So HW should be able to get the requests in time from SW to avoid frame drops. Thanks, Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCHv2 2/3] iommu/io-pgtable: Optimize partial walk flush for large scatter-gather list
Currently for iommu_unmap() of large scatter-gather list with page size elements, the majority of time is spent in flushing of partial walks in __arm_lpae_unmap() which is a VA based TLB invalidation invalidating page-by-page on iommus like arm-smmu-v2 (TLBIVA) which do not support range based invalidations like on arm-smmu-v3.2. For example: to unmap a 32MB scatter-gather list with page size elements (8192 entries), there are 16->2MB buffer unmaps based on the pgsize (2MB for 4K granule) and each of 2MB will further result in 512 TLBIVAs (2MB/4K) resulting in a total of 8192 TLBIVAs (512*16) for 16->2MB causing a huge overhead. So instead use tlb_flush_all() callback (TLBIALL/TLBIASID) to invalidate the entire context for partial walk flush on select few platforms where cost of over-invalidation is less than unmap latency using the newly introduced quirk IO_PGTABLE_QUIRK_TLB_INV_ALL. We also do this for non-strict mode given its all about over-invalidation saving time on individual unmaps and non-deterministic generally. For this example of 32MB scatter-gather list unmap, this results in just 16 ASID based TLB invalidations (TLBIASIDs) as opposed to 8192 TLBIVAs thereby increasing the performance of unmaps drastically. Test on QTI SM8150 SoC for 10 iterations of iommu_{map_sg}/unmap: (average over 10 iterations) Before this optimization: sizeiommu_map_sg iommu_unmap 4K2.067 us 1.854 us 64K9.598 us 8.802 us 1M 148.890 us 130.718 us 2M 305.864 us67.291 us 12M 1793.604 us 390.838 us 16M 2386.848 us 518.187 us 24M 3563.296 us 775.989 us 32M 4747.171 us 1033.364 us After this optimization: sizeiommu_map_sg iommu_unmap 4K1.723 us 1.765 us 64K9.880 us 8.869 us 1M 155.364 us 135.223 us 2M 303.906 us 5.385 us 12M 1786.557 us21.250 us 16M 2391.890 us27.437 us 24M 3570.895 us39.937 us 32M 4755.234 us51.797 us This is further reduced once the map/unmap_pages() support gets in which will result in just 1 TLBIASID as compared to 16 TLBIASIDs. 
Real world data also shows big difference in unmap performance as below: There were reports of camera frame drops because of high overhead in iommu unmap without this optimization because of frequent unmaps issued by camera of about 100MB/s taking more than 100ms thereby causing frame drops. Signed-off-by: Sai Prakash Ranjan --- include/linux/io-pgtable.h | 6 ++ 1 file changed, 6 insertions(+) diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h index 45441592a0e6..fd6b30cfdbf7 100644 --- a/include/linux/io-pgtable.h +++ b/include/linux/io-pgtable.h @@ -219,6 +219,12 @@ static inline void io_pgtable_tlb_flush_walk(struct io_pgtable *iop, unsigned long iova, size_t size, size_t granule) { + if (iop->cfg.quirks & IO_PGTABLE_QUIRK_NON_STRICT || + iop->cfg.quirks & IO_PGTABLE_QUIRK_TLB_INV_ALL) { + iop->cfg.tlb->tlb_flush_all(iop->cookie); + return; + } + if (iop->cfg.tlb && iop->cfg.tlb->tlb_flush_walk) iop->cfg.tlb->tlb_flush_walk(iova, size, granule, iop->cookie); } -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCHv2 3/3] iommu/arm-smmu-qcom: Set IO_PGTABLE_QUIRK_TLB_INV_ALL for QTI SoC impl
Set the pgtable quirk IO_PGTABLE_QUIRK_TLB_INV_ALL for QTI SoC implementation to use ::tlb_flush_all() for partial walk flush to improve unmap performance. Signed-off-by: Sai Prakash Ranjan --- drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 11 +++ 1 file changed, 11 insertions(+) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c index 7771d40176de..b8ae51592d00 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c @@ -146,6 +146,8 @@ static int qcom_adreno_smmu_init_context(struct arm_smmu_domain *smmu_domain, { struct adreno_smmu_priv *priv; + pgtbl_cfg->quirks |= IO_PGTABLE_QUIRK_TLB_INV_ALL; + /* Only enable split pagetables for the GPU device (SID 0) */ if (!qcom_adreno_smmu_is_gpu_device(dev)) return 0; @@ -185,6 +187,14 @@ static const struct of_device_id qcom_smmu_client_of_match[] __maybe_unused = { { } }; +static int qcom_smmu_init_context(struct arm_smmu_domain *smmu_domain, + struct io_pgtable_cfg *pgtbl_cfg, struct device *dev) +{ + pgtbl_cfg->quirks |= IO_PGTABLE_QUIRK_TLB_INV_ALL; + + return 0; +} + static int qcom_smmu_cfg_probe(struct arm_smmu_device *smmu) { unsigned int last_s2cr = ARM_SMMU_GR0_S2CR(smmu->num_mapping_groups - 1); @@ -308,6 +318,7 @@ static int qcom_smmu500_reset(struct arm_smmu_device *smmu) } static const struct arm_smmu_impl qcom_smmu_impl = { + .init_context = qcom_smmu_init_context, .cfg_probe = qcom_smmu_cfg_probe, .def_domain_type = qcom_smmu_def_domain_type, .reset = qcom_smmu500_reset, -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCHv2 1/3] iommu/io-pgtable: Add a quirk to use tlb_flush_all() for partial walk flush
Add a quirk IO_PGTABLE_QUIRK_TLB_INV_ALL to invalidate entire context with tlb_flush_all() callback in partial walk flush to improve unmap performance on select few platforms where the cost of over-invalidation is less than the unmap latency. Signed-off-by: Sai Prakash Ranjan --- drivers/iommu/io-pgtable-arm.c | 3 ++- include/linux/io-pgtable.h | 5 + 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index 87def58e79b5..5d362f2214bd 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -768,7 +768,8 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie) if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_NS | IO_PGTABLE_QUIRK_NON_STRICT | IO_PGTABLE_QUIRK_ARM_TTBR1 | - IO_PGTABLE_QUIRK_ARM_OUTER_WBWA)) + IO_PGTABLE_QUIRK_ARM_OUTER_WBWA | + IO_PGTABLE_QUIRK_TLB_INV_ALL)) return NULL; data = arm_lpae_alloc_pgtable(cfg); diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h index 4d40dfa75b55..45441592a0e6 100644 --- a/include/linux/io-pgtable.h +++ b/include/linux/io-pgtable.h @@ -82,6 +82,10 @@ struct io_pgtable_cfg { * * IO_PGTABLE_QUIRK_ARM_OUTER_WBWA: Override the outer-cacheability * attributes set in the TCR for a non-coherent page-table walker. +* +* IO_PGTABLE_QUIRK_TLB_INV_ALL: Use TLBIALL/TLBIASID to invalidate +* entire context for partial walk flush to increase unmap +* performance on select few platforms. */ #define IO_PGTABLE_QUIRK_ARM_NS BIT(0) #define IO_PGTABLE_QUIRK_NO_PERMS BIT(1) @@ -89,6 +93,7 @@ struct io_pgtable_cfg { #define IO_PGTABLE_QUIRK_NON_STRICT BIT(4) #define IO_PGTABLE_QUIRK_ARM_TTBR1 BIT(5) #define IO_PGTABLE_QUIRK_ARM_OUTER_WBWA BIT(6) + #define IO_PGTABLE_QUIRK_TLB_INV_ALLBIT(7) unsigned long quirks; unsigned long pgsize_bitmap; unsigned intias; -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. 
is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCHv2 0/3] iommu/io-pgtable: Optimize partial walk flush for large scatter-gather list
Currently for iommu_unmap() of large scatter-gather list with page size elements, the majority of time is spent in flushing of partial walks in __arm_lpae_unmap() which is a VA based TLB invalidation invalidating page-by-page on iommus like arm-smmu-v2 (TLBIVA) which do not support range based invalidations like on arm-smmu-v3.2. For example: to unmap a 32MB scatter-gather list with page size elements (8192 entries), there are 16->2MB buffer unmaps based on the pgsize (2MB for 4K granule) and each of 2MB will further result in 512 TLBIVAs (2MB/4K) resulting in a total of 8192 TLBIVAs (512*16) for 16->2MB causing a huge overhead. So instead use tlb_flush_all() callback (TLBIALL/TLBIASID) to invalidate the entire context for partial walk flush on select few platforms where cost of over-invalidation is less than unmap latency using the newly introduced quirk IO_PGTABLE_QUIRK_TLB_INV_ALL. We also do this for non-strict mode given its all about over-invalidation saving time on individual unmaps and non-deterministic generally. For this example of 32MB scatter-gather list unmap, this results in just 16 ASID based TLB invalidations (TLBIASIDs) as opposed to 8192 TLBIVAs thereby increasing the performance of unmaps drastically. Test on QTI SM8150 SoC for 10 iterations of iommu_{map_sg}/unmap: (average over 10 iterations) Before this optimization: sizeiommu_map_sg iommu_unmap 4K2.067 us 1.854 us 64K9.598 us 8.802 us 1M 148.890 us 130.718 us 2M 305.864 us67.291 us 12M 1793.604 us 390.838 us 16M 2386.848 us 518.187 us 24M 3563.296 us 775.989 us 32M 4747.171 us 1033.364 us After this optimization: sizeiommu_map_sg iommu_unmap 4K1.723 us 1.765 us 64K9.880 us 8.869 us 1M 155.364 us 135.223 us 2M 303.906 us 5.385 us 12M 1786.557 us21.250 us 16M 2391.890 us27.437 us 24M 3570.895 us39.937 us 32M 4755.234 us51.797 us This is further reduced once the map/unmap_pages() support gets in which will result in just 1 TLBIASID as compared to 16 TLBIASIDs. 
Real world data also shows big difference in unmap performance as below: There were reports of camera frame drops because of high overhead in iommu unmap without this optimization because of frequent unmaps issued by camera of about 100MB/s taking more than 100ms thereby causing frame drops. Changes in v2: * Add a quirk to choose tlb_flush_all in partial walk flush * Set the quirk for QTI SoC implementation Sai Prakash Ranjan (3): iommu/io-pgtable: Add a quirk to use tlb_flush_all() for partial walk flush iommu/io-pgtable: Optimize partial walk flush for large scatter-gather list iommu/arm-smmu-qcom: Set IO_PGTABLE_QUIRK_TLB_INV_ALL for QTI SoC impl drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 11 +++ drivers/iommu/io-pgtable-arm.c | 3 ++- include/linux/io-pgtable.h | 11 +++ 3 files changed, 24 insertions(+), 1 deletion(-) -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH] iommu/io-pgtable-arm: Optimize partial walk flush for large scatter-gather list
Hi Krishna, On 2021-06-18 02:48, Krishna Reddy wrote: Instead of flush_ops in init_context hook, perhaps an io_pgtable quirk since this is related to tlb, probably a bad name but IO_PGTABLE_QUIRK_TLB_INV which will be set in init_context impl hook and the prev condition in io_pgtable_tlb_flush_walk() becomes something like below. Seems very minimal and neat instead of poking into tlb_flush_walk functions or touching dma strict with some flag? if (iop->cfg.quirks & IO_PGTABLE_QUIRK_NON_STRICT || iop->cfg.quirks & IO_PGTABLE_QUIRK_TLB_INV) { iop->cfg.tlb->tlb_flush_all(iop->cookie); return; } Can you name it as IO_PGTABLE_QUIRK_TLB_INV_ASID or IO_PGTABLE_QUIRK_TLB_INV_ALL_ASID? tlb_flush_all() callback implementations can use TLBIALL or TLBIASID. So having ASID in the quirk name doesn't sound right given this quirk should be generic enough to be usable on other implementations as well. Instead I will go with IO_PGTABLE_QUIRK_TLB_INV_ALL and will be happy to change if others have some other preference. Thanks, Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH] iommu/io-pgtable-arm: Optimize partial walk flush for large scatter-gather list
On 2021-06-16 12:28, Sai Prakash Ranjan wrote: Hi Robin, On 2021-06-15 19:23, Robin Murphy wrote: On 2021-06-15 12:51, Sai Prakash Ranjan wrote: ... Hi @Robin, from these discussions it seems they are not ok with the change for all SoC vendor implementations and do not have any data on such impact. As I mentioned above, on QCOM platforms we do have several optimizations in HW for TLBIs and would like to make use of it and reduce the unmap latency. What do you think, should this be made implementation specific? Yes, it sounds like there's enough uncertainty for now that this needs to be an opt-in feature. However, I still think that non-strict mode could use it generically, since that's all about over-invalidating to save time on individual unmaps - and relatively non-deterministic - already. So maybe we have a second set of iommu_flush_ops, or just a flag somewhere to control the tlb_flush_walk functions internally, and the choice can be made in the iommu_get_dma_strict() test, but also forced on all the time by your init_context hook. What do you reckon? Sounds good to me. Since you mentioned non-strict mode using it generically, can't we just set tlb_flush_all() in io_pgtable_tlb_flush_walk() like below based on quirk so that we don't need to add any check in iommu_get_dma_strict() and just force the new flush_ops in init_context hook? if (iop->cfg.quirks & IO_PGTABLE_QUIRK_NON_STRICT) { iop->cfg.tlb->tlb_flush_all(iop->cookie); return; } Instead of flush_ops in init_context hook, perhaps a io_pgtable quirk since this is related to tlb, probably a bad name but IO_PGTABLE_QUIRK_TLB_INV which will be set in init_context impl hook and the prev condition in io_pgtable_tlb_flush_walk() becomes something like below. Seems very minimal and neat instead of poking into tlb_flush_walk functions or touching dma strict with some flag? 
if (iop->cfg.quirks & IO_PGTABLE_QUIRK_NON_STRICT || iop->cfg.quirks & IO_PGTABLE_QUIRK_TLB_INV) { iop->cfg.tlb->tlb_flush_all(iop->cookie); return; } Thanks, Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH] iommu/io-pgtable-arm: Optimize partial walk flush for large scatter-gather list
Hi Robin, On 2021-06-15 19:23, Robin Murphy wrote: On 2021-06-15 12:51, Sai Prakash Ranjan wrote: ... Hi @Robin, from these discussions it seems they are not ok with the change for all SoC vendor implementations and do not have any data on such impact. As I mentioned above, on QCOM platforms we do have several optimizations in HW for TLBIs and would like to make use of it and reduce the unmap latency. What do you think, should this be made implementation specific? Yes, it sounds like there's enough uncertainty for now that this needs to be an opt-in feature. However, I still think that non-strict mode could use it generically, since that's all about over-invalidating to save time on individual unmaps - and relatively non-deterministic - already. So maybe we have a second set of iommu_flush_ops, or just a flag somewhere to control the tlb_flush_walk functions internally, and the choice can be made in the iommu_get_dma_strict() test, but also forced on all the time by your init_context hook. What do you reckon? Sounds good to me. Since you mentioned non-strict mode using it generically, can't we just set tlb_flush_all() in io_pgtable_tlb_flush_walk() like below based on quirk so that we don't need to add any check in iommu_get_dma_strict() and just force the new flush_ops in init_context hook? if (iop->cfg.quirks & IO_PGTABLE_QUIRK_NON_STRICT) { iop->cfg.tlb->tlb_flush_all(iop->cookie); return; } Thanks, Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH] iommu/io-pgtable-arm: Optimize partial walk flush for large scatter-gather list
Hi Krishna, On 2021-06-14 23:18, Krishna Reddy wrote: Right but we won't know until we profile the specific usecases or try them in generic workload to see if they affect the performance. Sure, over invalidation is a concern where multiple buffers can be mapped to same context and the cache is not usable at the time for lookup and such but we don't do it for small buffers and only for large buffers which means thousands of TLB entry mappings in which case TLBIASID is preferred (note: I mentioned the HW team recommendation to use it for anything greater than 128 TLB entries) in my earlier reply. And also note that we do this only for partial walk flush, we are not arbitrarily changing all the TLBIs to ASID based. Most of the heavy bw use cases does involve processing larger buffers. When the physical memory is allocated dis-contiguously at page_size (let's use 4KB here) granularity, each aligned 2MB chunks IOVA unmap would involve performing a TLBIASID as 2MB is not a leaf. Essentially, It happens all the time during large buffer unmaps and potentially impact active traffic on other large buffers. Depending on how much latency HW engines can absorb, the overflow/underflow issues for ISO engines can be sporadic and vendor specific. Performing TLBIASID as default for all SoCs is not a safe operation. Ok so from what I gather from this is that its not easy to test for the negative impact and you don't have data on such yet and the behaviour is very vendor specific. To add on qcom impl, we have several performance improvements for TLB cache invalidations in HW like wait-for-safe(for realtime clients such as camera and display) and few others to allow for cache lookups/updates when TLBI is in progress for the same context bank, so atleast we are good here. 
I am no camera expert but from what the camera team mentioned is that there is a thread which frees memory(large unused memory buffers) periodically which ends up taking around 100+ms and causing some camera test failures with frame drops. Parallel efforts are already being made to optimize this usage of thread but as I mentioned previously, this is *not camera specific*; let's say someone else invokes such large unmaps, it's going to face the same issue. From the above, it doesn't look like the root cause of frame drops is fully understood. Why is 100+ms delay causing camera frame drop? Is the same thread submitting the buffers to camera after unmap is complete? If not, how is the unmap latency causing issue here? Ok since you are interested in camera usecase, I have requested for more details from the camera team and will give it once they come back. However I don't think it's good to have unmap latency at all and that is being addressed by this patch. > If unmap is queued and performed on a background thread, would it > resolve the frame drops? Not sure I understand what you mean by queuing on background thread but with that or not, we still do the same number of TLBIs and hop through iommu->io-pgtable->arm-smmu to perform the unmap, so how will that help? I mean adding the unmap requests into a queue and processing them from a different thread. It is not to reduce the TLBIs. But, not to block subsequent buffer allocation, IOVA map requests, if they are being requested from same thread that is performing unmap. If unmap is already performed from a different thread, then the issue still needs to be root caused to understand it fully. Check for any serialization issues. This patch is to optimize unmap latency because of large number of mmio writes(TLBIVAs) wasting CPU cycles and not to fix camera issue which can probably be solved by parallelization.
It seems to me like you are ok with the unmap latency in general which we are not and want to avoid that latency. Hi @Robin, from these discussions it seems they are not ok with the change for all SoC vendor implementations and do not have any data on such impact. As I mentioned above, on QCOM platforms we do have several optimizations in HW for TLBIs and would like to make use of it and reduce the unmap latency. What do you think, should this be made implementation specific? Thanks, Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH] iommu/io-pgtable-arm: Optimize partial walk flush for large scatter-gather list
Hi Krishna, On 2021-06-11 22:19, Krishna Reddy wrote: Hi Sai, >> > No, the unmap latency is not just in some test case written, the >> > issue is very real and we have workloads where camera is reporting >> > frame drops because of this unmap latency in the order of 100s of milliseconds. Not exactly, this issue is not specific to camera. If you look at the numbers in the commit text, even for the test device it's the same observation. It depends on the buffer size we are unmapping which affects the number of TLBIs issued. I am not aware of any such HW side bw issues for camera specifically on QCOM devices. It is clear that reducing number of TLBIs reduces the unmap API latency. But, it is at the expense of throwing away valid tlb entries. Quantifying the impact of arbitrary invalidation of valid tlb entries at context level is not straightforward and use case dependent. The side-effects might be rare or won't be known until they are noticed. Right but we won't know until we profile the specific usecases or try them in generic workload to see if they affect the performance. Sure, over invalidation is a concern where multiple buffers can be mapped to same context and the cache is not usable at the time for lookup and such but we don't do it for small buffers and only for large buffers which means thousands of TLB entry mappings in which case TLBIASID is preferred (note: I mentioned the HW team recommendation to use it for anything greater than 128 TLB entries) in my earlier reply. And also note that we do this only for partial walk flush, we are not arbitrarily changing all the TLBIs to ASID based. Can you provide more details on how the unmap latency is causing camera to drop frames? Is unmap performed in the perf path? I am no camera expert but from what the camera team mentioned is that there is a thread which frees memory(large unused memory buffers) periodically which ends up taking around 100+ms and causing some camera test failures with frame drops. 
Parallel efforts are already being made to optimize this usage of thread but as I mentioned previously, this is *not camera specific*; let's say someone else invokes such large unmaps, it's going to face the same issue. If unmap is queued and performed on a background thread, would it resolve the frame drops? Not sure I understand what you mean by queuing on background thread but with that or not, we still do the same number of TLBIs and hop through iommu->io-pgtable->arm-smmu to perform the unmap, so how will that help? Thanks, Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH] iommu/io-pgtable-arm: Optimize partial walk flush for large scatter-gather list
Hi Krishna, On 2021-06-11 06:07, Krishna Reddy wrote: > No, the unmap latency is not just in some test case written, the issue > is very real and we have workloads where camera is reporting frame > drops because of this unmap latency in the order of 100s of milliseconds. > And hardware team recommends using ASID based invalidations for > anything larger than 128 TLB entries. So yes, we have taken note of > impacts here before going this way and hence feel more inclined to > make this qcom specific if required. Seems like the real issue here is not the unmap API latency. It should be the high number of back to back SMMU TLB invalidate register writes that is resulting in lower ISO BW to Camera and overflow. Isn't it? Even Tegra186 SoC has similar issue and HW team recommended to rate limit the number of back to back SMMU tlb invalidate registers writes. The subsequent Tegra194 SoC has a dedicated SMMU for ISO clients to avoid the impact of TLB invalidates from NISO clients on ISO BW. Not exactly, this issue is not specific to camera. If you look at the numbers in the commit text, even for the test device its the same observation. It depends on the buffer size we are unmapping which affects the number of TLBIs issue. I am not aware of any such HW side bw issues for camera specifically on QCOM devices. Thanks, Sai Thinking some more, I wonder if the Tegra folks might have an opinion to add here, given that their multiple-SMMU solution was seemingly about trying to get enough TLB and pagetable walk bandwidth in the first place? While it is good to reduce the number of tlb register writes, Flushing all TLB entries at context granularity arbitrarily can have negative impact on active traffic and BW. I don't have much data on possible impact at this point. Can the flushing at context granularity be made a quirk than performing it as default? -KR -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. 
is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH] iommu/io-pgtable-arm: Optimize partial walk flush for large scatter-gather list
Hi Robin, On 2021-06-10 20:59, Robin Murphy wrote: On 2021-06-10 12:54, Sai Prakash Ranjan wrote: Hi Robin, On 2021-06-10 17:03, Robin Murphy wrote: On 2021-06-10 10:36, Sai Prakash Ranjan wrote: Hi Robin, On 2021-06-10 14:38, Robin Murphy wrote: On 2021-06-10 06:24, Sai Prakash Ranjan wrote: Hi Robin, On 2021-06-10 00:14, Robin Murphy wrote: On 2021-06-09 15:53, Sai Prakash Ranjan wrote: Currently for iommu_unmap() of large scatter-gather list with page size elements, the majority of time is spent in flushing of partial walks in __arm_lpae_unmap() which is a VA based TLB invalidation (TLBIVA for arm-smmu). For example: to unmap a 32MB scatter-gather list with page size elements (8192 entries), there are 16->2MB buffer unmaps based on the pgsize (2MB for 4K granule) and each of 2MB will further result in 512 TLBIVAs (2MB/4K) resulting in a total of 8192 TLBIVAs (512*16) for 16->2MB causing a huge overhead. So instead use io_pgtable_tlb_flush_all() to invalidate the entire context if size (pgsize) is greater than the granule size (4K, 16K, 64K). For this example of 32MB scatter-gather list unmap, this results in just 16 ASID based TLB invalidations or tlb_flush_all() callback (TLBIASID in case of arm-smmu) as opposed to 8192 TLBIVAs thereby increasing the performance of unmaps drastically. Condition (size > granule size) is chosen for io_pgtable_tlb_flush_all() because for any granule with supported pgsizes, we will have at least 512 TLB invalidations for which tlb_flush_all() is already recommended. For example, take 4K granule with 2MB pgsize, this will result in 512 TLBIVA in partial walk flush. 
Test on QTI SM8150 SoC for 10 iterations of iommu_{map_sg}/unmap: (average over 10 iterations) Before this optimization: size iommu_map_sg iommu_unmap 4K 2.067 us 1.854 us 64K 9.598 us 8.802 us 1M 148.890 us 130.718 us 2M 305.864 us 67.291 us 12M 1793.604 us 390.838 us 16M 2386.848 us 518.187 us 24M 3563.296 us 775.989 us 32M 4747.171 us 1033.364 us After this optimization: size iommu_map_sg iommu_unmap 4K 1.723 us 1.765 us 64K 9.880 us 8.869 us 1M 155.364 us 135.223 us 2M 303.906 us 5.385 us 12M 1786.557 us 21.250 us 16M 2391.890 us 27.437 us 24M 3570.895 us 39.937 us 32M 4755.234 us 51.797 us This is further reduced once the map/unmap_pages() support gets in which will result in just 1 tlb_flush_all() as opposed to 16 tlb_flush_all(). Signed-off-by: Sai Prakash Ranjan --- drivers/iommu/io-pgtable-arm.c | 7 +-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index 87def58e79b5..c3cb9add3179 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -589,8 +589,11 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data, if (!iopte_leaf(pte, lvl, iop->fmt)) { /* Also flush any partial walks */ - io_pgtable_tlb_flush_walk(iop, iova, size, - ARM_LPAE_GRANULE(data)); + if (size > ARM_LPAE_GRANULE(data)) + io_pgtable_tlb_flush_all(iop); + else Erm, when will the above condition ever not be true? ;) Ah right, silly me :) Taking a step back, though, what about the impact to drivers other than SMMUv2? Other drivers would be msm_iommu.c, qcom_iommu.c which does the same thing as arm-smmu-v2 (page based invalidations), then there is ipmmu-vmsa.c which does tlb_flush_all() for flush walk. In particular I'm thinking of SMMUv3.2 where the whole range can be invalidated by VA in a single command anyway, so the additional penalties of TLBIALL are undesirable. 
Right, so I am thinking we can have a new generic quirk IO_PGTABLE_QUIRK_RANGE_INV to choose between range based invalidations(tlb_flush_walk) and tlb_flush_all(). In this case of arm-smmu-v3.2, we can tie up ARM_SMMU_FEAT_RANGE_INV with this quirk and have something like below, thoughts? if (iop->cfg.quirks & IO_PGTABLE_QUIRK_RANGE_INV) io_pgtable_tlb_flush_walk(iop, iova, size, ARM_LPAE_GRANULE(data)); else io_pgtable_tlb_flush_all(iop); The design here has always been that io-pgtable says *what* needs invalidating, and we left it up to the drivers to decide exactly *how*. Even though things have evolved a bit I don't think that has fundamentally changed - tlb_flush_walk is now only used in this one place (technically I suppose it could be renamed tlb_flush_table but it's not worth the churn), so driv
Re: [PATCH] iommu/io-pgtable-arm: Optimize partial walk flush for large scatter-gather list
Hi Robin, On 2021-06-10 17:03, Robin Murphy wrote: On 2021-06-10 10:36, Sai Prakash Ranjan wrote: Hi Robin, On 2021-06-10 14:38, Robin Murphy wrote: On 2021-06-10 06:24, Sai Prakash Ranjan wrote: Hi Robin, On 2021-06-10 00:14, Robin Murphy wrote: On 2021-06-09 15:53, Sai Prakash Ranjan wrote: Currently for iommu_unmap() of large scatter-gather list with page size elements, the majority of time is spent in flushing of partial walks in __arm_lpae_unmap() which is a VA based TLB invalidation (TLBIVA for arm-smmu). For example: to unmap a 32MB scatter-gather list with page size elements (8192 entries), there are 16->2MB buffer unmaps based on the pgsize (2MB for 4K granule) and each of 2MB will further result in 512 TLBIVAs (2MB/4K) resulting in a total of 8192 TLBIVAs (512*16) for 16->2MB causing a huge overhead. So instead use io_pgtable_tlb_flush_all() to invalidate the entire context if size (pgsize) is greater than the granule size (4K, 16K, 64K). For this example of 32MB scatter-gather list unmap, this results in just 16 ASID based TLB invalidations or tlb_flush_all() callback (TLBIASID in case of arm-smmu) as opposed to 8192 TLBIVAs thereby increasing the performance of unmaps drastically. Condition (size > granule size) is chosen for io_pgtable_tlb_flush_all() because for any granule with supported pgsizes, we will have at least 512 TLB invalidations for which tlb_flush_all() is already recommended. For example, take 4K granule with 2MB pgsize, this will result in 512 TLBIVA in partial walk flush. 
Test on QTI SM8150 SoC for 10 iterations of iommu_{map_sg}/unmap: (average over 10 iterations) Before this optimization: size iommu_map_sg iommu_unmap 4K 2.067 us 1.854 us 64K 9.598 us 8.802 us 1M 148.890 us 130.718 us 2M 305.864 us 67.291 us 12M 1793.604 us 390.838 us 16M 2386.848 us 518.187 us 24M 3563.296 us 775.989 us 32M 4747.171 us 1033.364 us After this optimization: size iommu_map_sg iommu_unmap 4K 1.723 us 1.765 us 64K 9.880 us 8.869 us 1M 155.364 us 135.223 us 2M 303.906 us 5.385 us 12M 1786.557 us 21.250 us 16M 2391.890 us 27.437 us 24M 3570.895 us 39.937 us 32M 4755.234 us 51.797 us This is further reduced once the map/unmap_pages() support gets in which will result in just 1 tlb_flush_all() as opposed to 16 tlb_flush_all(). Signed-off-by: Sai Prakash Ranjan --- drivers/iommu/io-pgtable-arm.c | 7 +-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index 87def58e79b5..c3cb9add3179 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -589,8 +589,11 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data, if (!iopte_leaf(pte, lvl, iop->fmt)) { /* Also flush any partial walks */ - io_pgtable_tlb_flush_walk(iop, iova, size, - ARM_LPAE_GRANULE(data)); + if (size > ARM_LPAE_GRANULE(data)) + io_pgtable_tlb_flush_all(iop); + else Erm, when will the above condition ever not be true? ;) Ah right, silly me :) Taking a step back, though, what about the impact to drivers other than SMMUv2? Other drivers would be msm_iommu.c, qcom_iommu.c which does the same thing as arm-smmu-v2 (page based invalidations), then there is ipmmu-vmsa.c which does tlb_flush_all() for flush walk. In particular I'm thinking of SMMUv3.2 where the whole range can be invalidated by VA in a single command anyway, so the additional penalties of TLBIALL are undesirable. 
Right, so I am thinking we can have a new generic quirk IO_PGTABLE_QUIRK_RANGE_INV to choose between range based invalidations(tlb_flush_walk) and tlb_flush_all(). In this case of arm-smmu-v3.2, we can tie up ARM_SMMU_FEAT_RANGE_INV with this quirk and have something like below, thoughts? if (iop->cfg.quirks & IO_PGTABLE_QUIRK_RANGE_INV) io_pgtable_tlb_flush_walk(iop, iova, size, ARM_LPAE_GRANULE(data)); else io_pgtable_tlb_flush_all(iop); The design here has always been that io-pgtable says *what* needs invalidating, and we left it up to the drivers to decide exactly *how*. Even though things have evolved a bit I don't think that has fundamentally changed - tlb_flush_walk is now only used in this one place (technically I suppose it could be renamed tlb_flush_table but it's not worth the churn), so drivers can implement their own preferred table-invalidating behaviour even more easily than choosing whether
Re: [PATCH] iommu/io-pgtable-arm: Optimize partial walk flush for large scatter-gather list
Hi Robin, On 2021-06-10 14:38, Robin Murphy wrote: On 2021-06-10 06:24, Sai Prakash Ranjan wrote: Hi Robin, On 2021-06-10 00:14, Robin Murphy wrote: On 2021-06-09 15:53, Sai Prakash Ranjan wrote: Currently for iommu_unmap() of large scatter-gather list with page size elements, the majority of time is spent in flushing of partial walks in __arm_lpae_unmap() which is a VA based TLB invalidation (TLBIVA for arm-smmu). For example: to unmap a 32MB scatter-gather list with page size elements (8192 entries), there are 16->2MB buffer unmaps based on the pgsize (2MB for 4K granule) and each of 2MB will further result in 512 TLBIVAs (2MB/4K) resulting in a total of 8192 TLBIVAs (512*16) for 16->2MB causing a huge overhead. So instead use io_pgtable_tlb_flush_all() to invalidate the entire context if size (pgsize) is greater than the granule size (4K, 16K, 64K). For this example of 32MB scatter-gather list unmap, this results in just 16 ASID based TLB invalidations or tlb_flush_all() callback (TLBIASID in case of arm-smmu) as opposed to 8192 TLBIVAs thereby increasing the performance of unmaps drastically. Condition (size > granule size) is chosen for io_pgtable_tlb_flush_all() because for any granule with supported pgsizes, we will have at least 512 TLB invalidations for which tlb_flush_all() is already recommended. For example, take 4K granule with 2MB pgsize, this will result in 512 TLBIVA in partial walk flush. 
Test on QTI SM8150 SoC for 10 iterations of iommu_{map_sg}/unmap: (average over 10 iterations) Before this optimization: size iommu_map_sg iommu_unmap 4K 2.067 us 1.854 us 64K 9.598 us 8.802 us 1M 148.890 us 130.718 us 2M 305.864 us 67.291 us 12M 1793.604 us 390.838 us 16M 2386.848 us 518.187 us 24M 3563.296 us 775.989 us 32M 4747.171 us 1033.364 us After this optimization: size iommu_map_sg iommu_unmap 4K 1.723 us 1.765 us 64K 9.880 us 8.869 us 1M 155.364 us 135.223 us 2M 303.906 us 5.385 us 12M 1786.557 us 21.250 us 16M 2391.890 us 27.437 us 24M 3570.895 us 39.937 us 32M 4755.234 us 51.797 us This is further reduced once the map/unmap_pages() support gets in which will result in just 1 tlb_flush_all() as opposed to 16 tlb_flush_all(). Signed-off-by: Sai Prakash Ranjan --- drivers/iommu/io-pgtable-arm.c | 7 +-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index 87def58e79b5..c3cb9add3179 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -589,8 +589,11 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data, if (!iopte_leaf(pte, lvl, iop->fmt)) { /* Also flush any partial walks */ - io_pgtable_tlb_flush_walk(iop, iova, size, - ARM_LPAE_GRANULE(data)); + if (size > ARM_LPAE_GRANULE(data)) + io_pgtable_tlb_flush_all(iop); + else Erm, when will the above condition ever not be true? ;) Ah right, silly me :) Taking a step back, though, what about the impact to drivers other than SMMUv2? Other drivers would be msm_iommu.c, qcom_iommu.c which does the same thing as arm-smmu-v2 (page based invalidations), then there is ipmmu-vmsa.c which does tlb_flush_all() for flush walk. In particular I'm thinking of SMMUv3.2 where the whole range can be invalidated by VA in a single command anyway, so the additional penalties of TLBIALL are undesirable. 
Right, so I am thinking we can have a new generic quirk IO_PGTABLE_QUIRK_RANGE_INV to choose between range based invalidations(tlb_flush_walk) and tlb_flush_all(). In this case of arm-smmu-v3.2, we can tie up ARM_SMMU_FEAT_RANGE_INV with this quirk and have something like below, thoughts? if (iop->cfg.quirks & IO_PGTABLE_QUIRK_RANGE_INV) io_pgtable_tlb_flush_walk(iop, iova, size, ARM_LPAE_GRANULE(data)); else io_pgtable_tlb_flush_all(iop); The design here has always been that io-pgtable says *what* needs invalidating, and we left it up to the drivers to decide exactly *how*. Even though things have evolved a bit I don't think that has fundamentally changed - tlb_flush_walk is now only used in this one place (technically I suppose it could be renamed tlb_flush_table but it's not worth the churn), so drivers can implement their own preferred table-invalidating behaviour even more easily than choosing whether to bounce a quirk through the common code or not. Consider what you've already seen for the Renesas
Re: [PATCH] iommu/io-pgtable-arm: Optimize partial walk flush for large scatter-gather list
Hi Robin, On 2021-06-10 00:14, Robin Murphy wrote: On 2021-06-09 15:53, Sai Prakash Ranjan wrote: Currently for iommu_unmap() of large scatter-gather list with page size elements, the majority of time is spent in flushing of partial walks in __arm_lpae_unmap() which is a VA based TLB invalidation (TLBIVA for arm-smmu). For example: to unmap a 32MB scatter-gather list with page size elements (8192 entries), there are 16->2MB buffer unmaps based on the pgsize (2MB for 4K granule) and each of 2MB will further result in 512 TLBIVAs (2MB/4K) resulting in a total of 8192 TLBIVAs (512*16) for 16->2MB causing a huge overhead. So instead use io_pgtable_tlb_flush_all() to invalidate the entire context if size (pgsize) is greater than the granule size (4K, 16K, 64K). For this example of 32MB scatter-gather list unmap, this results in just 16 ASID based TLB invalidations or tlb_flush_all() callback (TLBIASID in case of arm-smmu) as opposed to 8192 TLBIVAs thereby increasing the performance of unmaps drastically. Condition (size > granule size) is chosen for io_pgtable_tlb_flush_all() because for any granule with supported pgsizes, we will have at least 512 TLB invalidations for which tlb_flush_all() is already recommended. For example, take 4K granule with 2MB pgsize, this will result in 512 TLBIVA in partial walk flush. 
Test on QTI SM8150 SoC for 10 iterations of iommu_{map_sg}/unmap: (average over 10 iterations) Before this optimization: sizeiommu_map_sg iommu_unmap 4K2.067 us 1.854 us 64K9.598 us 8.802 us 1M 148.890 us 130.718 us 2M 305.864 us67.291 us 12M 1793.604 us 390.838 us 16M 2386.848 us 518.187 us 24M 3563.296 us 775.989 us 32M 4747.171 us 1033.364 us After this optimization: sizeiommu_map_sg iommu_unmap 4K1.723 us 1.765 us 64K9.880 us 8.869 us 1M 155.364 us 135.223 us 2M 303.906 us 5.385 us 12M 1786.557 us21.250 us 16M 2391.890 us27.437 us 24M 3570.895 us39.937 us 32M 4755.234 us51.797 us This is further reduced once the map/unmap_pages() support gets in which will result in just 1 tlb_flush_all() as opposed to 16 tlb_flush_all(). Signed-off-by: Sai Prakash Ranjan --- drivers/iommu/io-pgtable-arm.c | 7 +-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index 87def58e79b5..c3cb9add3179 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -589,8 +589,11 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data, if (!iopte_leaf(pte, lvl, iop->fmt)) { /* Also flush any partial walks */ - io_pgtable_tlb_flush_walk(iop, iova, size, - ARM_LPAE_GRANULE(data)); + if (size > ARM_LPAE_GRANULE(data)) + io_pgtable_tlb_flush_all(iop); + else Erm, when will the above condition ever not be true? ;) Ah right, silly me :) Taking a step back, though, what about the impact to drivers other than SMMUv2? Other drivers would be msm_iommu.c, qcom_iommu.c which does the same thing as arm-smmu-v2 (page based invalidations), then there is ipmmu-vmsa.c which does tlb_flush_all() for flush walk. In particular I'm thinking of SMMUv3.2 where the whole range can be invalidated by VA in a single command anyway, so the additional penalties of TLBIALL are undesirable. 
Right, so I am thinking we can have a new generic quirk IO_PGTABLE_QUIRK_RANGE_INV to choose between range based invalidations(tlb_flush_walk) and tlb_flush_all(). In this case of arm-smmu-v3.2, we can tie up ARM_SMMU_FEAT_RANGE_INV with this quirk and have something like below, thoughts? if (iop->cfg.quirks & IO_PGTABLE_QUIRK_RANGE_INV) io_pgtable_tlb_flush_walk(iop, iova, size, ARM_LPAE_GRANULE(data)); else io_pgtable_tlb_flush_all(iop); Thanks, Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH] iommu/io-pgtable-arm: Optimize partial walk flush for large scatter-gather list
Currently for iommu_unmap() of large scatter-gather list with page size elements, the majority of time is spent in flushing of partial walks in __arm_lpae_unmap() which is a VA based TLB invalidation (TLBIVA for arm-smmu). For example: to unmap a 32MB scatter-gather list with page size elements (8192 entries), there are 16->2MB buffer unmaps based on the pgsize (2MB for 4K granule) and each of 2MB will further result in 512 TLBIVAs (2MB/4K) resulting in a total of 8192 TLBIVAs (512*16) for 16->2MB causing a huge overhead. So instead use io_pgtable_tlb_flush_all() to invalidate the entire context if size (pgsize) is greater than the granule size (4K, 16K, 64K). For this example of 32MB scatter-gather list unmap, this results in just 16 ASID based TLB invalidations or tlb_flush_all() callback (TLBIASID in case of arm-smmu) as opposed to 8192 TLBIVAs thereby increasing the performance of unmaps drastically. Condition (size > granule size) is chosen for io_pgtable_tlb_flush_all() because for any granule with supported pgsizes, we will have at least 512 TLB invalidations for which tlb_flush_all() is already recommended. For example, take 4K granule with 2MB pgsize, this will result in 512 TLBIVA in partial walk flush. Test on QTI SM8150 SoC for 10 iterations of iommu_{map_sg}/unmap: (average over 10 iterations) Before this optimization: sizeiommu_map_sg iommu_unmap 4K2.067 us 1.854 us 64K9.598 us 8.802 us 1M 148.890 us 130.718 us 2M 305.864 us67.291 us 12M 1793.604 us 390.838 us 16M 2386.848 us 518.187 us 24M 3563.296 us 775.989 us 32M 4747.171 us 1033.364 us After this optimization: sizeiommu_map_sg iommu_unmap 4K1.723 us 1.765 us 64K9.880 us 8.869 us 1M 155.364 us 135.223 us 2M 303.906 us 5.385 us 12M 1786.557 us21.250 us 16M 2391.890 us27.437 us 24M 3570.895 us39.937 us 32M 4755.234 us51.797 us This is further reduced once the map/unmap_pages() support gets in which will result in just 1 tlb_flush_all() as opposed to 16 tlb_flush_all(). 
Signed-off-by: Sai Prakash Ranjan --- drivers/iommu/io-pgtable-arm.c | 7 +-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index 87def58e79b5..c3cb9add3179 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -589,8 +589,11 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data, if (!iopte_leaf(pte, lvl, iop->fmt)) { /* Also flush any partial walks */ - io_pgtable_tlb_flush_walk(iop, iova, size, - ARM_LPAE_GRANULE(data)); + if (size > ARM_LPAE_GRANULE(data)) + io_pgtable_tlb_flush_all(iop); + else + io_pgtable_tlb_flush_walk(iop, iova, size, + ARM_LPAE_GRANULE(data)); ptep = iopte_deref(pte, data); __arm_lpae_free_pgtable(data, lvl + 1, ptep); } else if (iop->cfg.quirks & IO_PGTABLE_QUIRK_NON_STRICT) { -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCHv3 0/2] iommu/arm-smmu-qcom: Add SC7280 support
On 2021-06-08 17:31, Will Deacon wrote: On Tue, Apr 20, 2021 at 11:34:55AM +0530, Sai Prakash Ranjan wrote: Patch 1 adds the sc7280 smmu compatible. Patch 2 moves the adreno smmu check before apss smmu to enable adreno smmu specific implementation. Note that dt-binding for sc7280 is already merged. This conflicts with what I've already got queued at [1]. Please can you send an updated version, as I wasn't sure about the initialisation order you need here wrt to the ACPI parts. Thanks, Will [1] https://git.kernel.org/pub/scm/linux/kernel/git/will/linux.git/log/?h=for-joerg/arm-smmu/updates Sure, have rebased and sent the updated patch [1] after testing for the order. Thanks, Sai [1] https://lore.kernel.org/lkml/cover.1623155117.git.saiprakash.ran...@codeaurora.org/ -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCHv4 1/2] iommu/arm-smmu-qcom: Add SC7280 SMMU compatible
Add compatible for SC7280 SMMU to use the Qualcomm Technologies, Inc. specific implementation. Signed-off-by: Sai Prakash Ranjan Reviewed-by: Bjorn Andersson --- drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c index 6f70f0e57c64..e93b5dbda7de 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c @@ -178,6 +178,7 @@ static const struct of_device_id qcom_smmu_client_of_match[] __maybe_unused = { { .compatible = "qcom,mdss" }, { .compatible = "qcom,sc7180-mdss" }, { .compatible = "qcom,sc7180-mss-pil" }, + { .compatible = "qcom,sc7280-mdss" }, { .compatible = "qcom,sc8180x-mdss" }, { .compatible = "qcom,sdm845-mdss" }, { .compatible = "qcom,sdm845-mss-pil" }, @@ -342,6 +343,7 @@ static struct arm_smmu_device *qcom_smmu_create(struct arm_smmu_device *smmu, static const struct of_device_id __maybe_unused qcom_smmu_impl_of_match[] = { { .compatible = "qcom,msm8998-smmu-v2" }, { .compatible = "qcom,sc7180-smmu-500" }, + { .compatible = "qcom,sc7280-smmu-500" }, { .compatible = "qcom,sc8180x-smmu-500" }, { .compatible = "qcom,sdm630-smmu-v2" }, { .compatible = "qcom,sdm845-smmu-500" }, -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCHv4 2/2] iommu/arm-smmu-qcom: Move the adreno smmu specific impl
Adreno(GPU) SMMU and APSS(Application Processor SubSystem) SMMU both implement "arm,mmu-500" in some QTI SoCs and to run through adreno smmu specific implementation such as enabling split pagetables support, we need to match the "qcom,adreno-smmu" compatible first before apss smmu or else we will be running apps smmu implementation for adreno smmu and the additional features for adreno smmu is never set. For ex: we have "qcom,sc7280-smmu-500" compatible for both apps and adreno smmu implementing "arm,mmu-500", so the adreno smmu implementation is never reached because the current sequence checks for apps smmu compatible(qcom,sc7280-smmu-500) first and runs that specific impl and we never reach adreno smmu specific implementation. Suggested-by: Akhil P Oommen Signed-off-by: Sai Prakash Ranjan Reviewed-by: Bjorn Andersson Acked-by: Jordan Crouse --- drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 12 +--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c index e93b5dbda7de..83c32566bf64 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c @@ -370,11 +370,17 @@ struct arm_smmu_device *qcom_smmu_impl_init(struct arm_smmu_device *smmu) return qcom_smmu_create(smmu, &qcom_smmu_impl); } - if (of_match_node(qcom_smmu_impl_of_match, np)) - return qcom_smmu_create(smmu, &qcom_smmu_impl); - + /* +* Do not change this order of implementation, i.e., first adreno +* smmu impl and then apss smmu since we can have both implementing +* arm,mmu-500 in which case we will miss setting adreno smmu specific +* features if the order is changed. +*/ if (of_device_is_compatible(np, "qcom,adreno-smmu")) return qcom_smmu_create(smmu, &qcom_adreno_smmu_impl); + if (of_match_node(qcom_smmu_impl_of_match, np)) + return qcom_smmu_create(smmu, &qcom_smmu_impl); + return smmu; } -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. 
is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCHv4 0/2] iommu/arm-smmu-qcom: Add SC7280 support
Patch 1 adds the sc7280 smmu compatible. Patch 2 moves the adreno smmu check before apss smmu to enable adreno smmu specific implementation. Note that dt-binding for sc7280 is already merged. Changes in v4: * Rebased on top of arm-smmu/updates with acpi changes. Changes in v3: * Collect acks and reviews * Rebase on top of for-joerg/arm-smmu/updates Changes in v2: * Add a comment to make sure this order is not changed in future (Jordan) Sai Prakash Ranjan (2): iommu/arm-smmu-qcom: Add SC7280 SMMU compatible iommu/arm-smmu-qcom: Move the adreno smmu specific impl drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 14 +++--- 1 file changed, 11 insertions(+), 3 deletions(-) -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCHv3 0/2] iommu/arm-smmu-qcom: Add SC7280 support
Hi Will, On 2021-05-24 08:13, Sai Prakash Ranjan wrote: Hi Will, On 2021-04-20 11:34, Sai Prakash Ranjan wrote: Patch 1 adds the sc7280 smmu compatible. Patch 2 moves the adreno smmu check before apss smmu to enable adreno smmu specific implementation. Note that dt-binding for sc7280 is already merged. Changes in v3: * Collect acks and reviews * Rebase on top of for-joerg/arm-smmu/updates Changes in v2: * Add a comment to make sure this order is not changed in future (Jordan) Sai Prakash Ranjan (2): iommu/arm-smmu-qcom: Add SC7280 SMMU compatible iommu/arm-smmu-qcom: Move the adreno smmu specific impl earlier drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 14 +++--- 1 file changed, 11 insertions(+), 3 deletions(-) Gentle Ping! Is this going to be taken for 5.14 or needs one more release cycle? Thanks, Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCHv3 0/2] iommu/arm-smmu-qcom: Add SC7280 support
Hi Will, On 2021-04-20 11:34, Sai Prakash Ranjan wrote: Patch 1 adds the sc7280 smmu compatible. Patch 2 moves the adreno smmu check before apss smmu to enable adreno smmu specific implementation. Note that dt-binding for sc7280 is already merged. Changes in v3: * Collect acks and reviews * Rebase on top of for-joerg/arm-smmu/updates Changes in v2: * Add a comment to make sure this order is not changed in future (Jordan) Sai Prakash Ranjan (2): iommu/arm-smmu-qcom: Add SC7280 SMMU compatible iommu/arm-smmu-qcom: Move the adreno smmu specific impl earlier drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 14 +++--- 1 file changed, 11 insertions(+), 3 deletions(-) Gentle Ping! Thanks, Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCHv2 2/2] iommu/arm-smmu-qcom: Move the adreno smmu specific impl earlier
On 2021-04-19 20:08, Bjorn Andersson wrote: On Fri 26 Feb 03:55 CST 2021, Sai Prakash Ranjan wrote: Adreno(GPU) SMMU and APSS(Application Processor SubSystem) SMMU both implement "arm,mmu-500" in some QTI SoCs and to run through adreno smmu specific implementation such as enabling split pagetables support, we need to match the "qcom,adreno-smmu" compatible first before apss smmu or else we will be running apps smmu implementation for adreno smmu and the additional features for adreno smmu is never set. For ex: we have "qcom,sc7280-smmu-500" compatible for both apps and adreno smmu implementing "arm,mmu-500", so the adreno smmu implementation is never reached because the current sequence checks for apps smmu compatible(qcom,sc7280-smmu-500) first and runs that specific impl and we never reach adreno smmu specific implementation. Suggested-by: Akhil P Oommen Signed-off-by: Sai Prakash Ranjan Sorry for taking my time thinking about this. Reviewed-by: Bjorn Andersson No worries, thanks Bjorn. -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCHv3 2/2] iommu/arm-smmu-qcom: Move the adreno smmu specific impl earlier
Adreno(GPU) SMMU and APSS(Application Processor SubSystem) SMMU both implement "arm,mmu-500" in some QTI SoCs and to run through adreno smmu specific implementation such as enabling split pagetables support, we need to match the "qcom,adreno-smmu" compatible first before apss smmu or else we will be running apps smmu implementation for adreno smmu and the additional features for adreno smmu is never set. For ex: we have "qcom,sc7280-smmu-500" compatible for both apps and adreno smmu implementing "arm,mmu-500", so the adreno smmu implementation is never reached because the current sequence checks for apps smmu compatible(qcom,sc7280-smmu-500) first and runs that specific impl and we never reach adreno smmu specific implementation. Suggested-by: Akhil P Oommen Signed-off-by: Sai Prakash Ranjan Reviewed-by: Bjorn Andersson Acked-by: Jordan Crouse --- drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 12 +--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c index bea3ee0dabc2..03f048aebb80 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c @@ -345,11 +345,17 @@ struct arm_smmu_device *qcom_smmu_impl_init(struct arm_smmu_device *smmu) { const struct device_node *np = smmu->dev->of_node; - if (of_match_node(qcom_smmu_impl_of_match, np)) - return qcom_smmu_create(smmu, &qcom_smmu_impl); - + /* +* Do not change this order of implementation, i.e., first adreno +* smmu impl and then apss smmu since we can have both implementing +* arm,mmu-500 in which case we will miss setting adreno smmu specific +* features if the order is changed. +*/ if (of_device_is_compatible(np, "qcom,adreno-smmu")) return qcom_smmu_create(smmu, &qcom_adreno_smmu_impl); + if (of_match_node(qcom_smmu_impl_of_match, np)) + return qcom_smmu_create(smmu, &qcom_smmu_impl); + return smmu; } -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. 
is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCHv3 1/2] iommu/arm-smmu-qcom: Add SC7280 SMMU compatible
Add compatible for SC7280 SMMU to use the Qualcomm Technologies, Inc. specific implementation. Signed-off-by: Sai Prakash Ranjan --- drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c index 98b3a1c2a181..bea3ee0dabc2 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c @@ -166,6 +166,7 @@ static const struct of_device_id qcom_smmu_client_of_match[] __maybe_unused = { { .compatible = "qcom,mdss" }, { .compatible = "qcom,sc7180-mdss" }, { .compatible = "qcom,sc7180-mss-pil" }, + { .compatible = "qcom,sc7280-mdss" }, { .compatible = "qcom,sc8180x-mdss" }, { .compatible = "qcom,sdm845-mdss" }, { .compatible = "qcom,sdm845-mss-pil" }, @@ -330,6 +331,7 @@ static struct arm_smmu_device *qcom_smmu_create(struct arm_smmu_device *smmu, static const struct of_device_id __maybe_unused qcom_smmu_impl_of_match[] = { { .compatible = "qcom,msm8998-smmu-v2" }, { .compatible = "qcom,sc7180-smmu-500" }, + { .compatible = "qcom,sc7280-smmu-500" }, { .compatible = "qcom,sc8180x-smmu-500" }, { .compatible = "qcom,sdm630-smmu-v2" }, { .compatible = "qcom,sdm845-smmu-500" }, -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCHv3 0/2] iommu/arm-smmu-qcom: Add SC7280 support
Patch 1 adds the sc7280 smmu compatible. Patch 2 moves the adreno smmu check before apss smmu to enable adreno smmu specific implementation. Note that dt-binding for sc7280 is already merged. Changes in v3: * Collect acks and reviews * Rebase on top of for-joerg/arm-smmu/updates Changes in v2: * Add a comment to make sure this order is not changed in future (Jordan) Sai Prakash Ranjan (2): iommu/arm-smmu-qcom: Add SC7280 SMMU compatible iommu/arm-smmu-qcom: Move the adreno smmu specific impl earlier drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 14 +++--- 1 file changed, 11 insertions(+), 3 deletions(-) -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCHv2 2/2] iommu/arm-smmu-qcom: Move the adreno smmu specific impl earlier
On 2021-04-05 14:12, Sai Prakash Ranjan wrote: Hi Bjorn, On 2021-03-25 20:35, Will Deacon wrote: On Thu, Mar 25, 2021 at 01:10:12PM +0530, Sai Prakash Ranjan wrote: ... I think there is consensus on this series. I can resend if required but it still applies cleanly, let me know if you have any comments? Please resend with the bindings patch, and I'd like Bjorn's Ack as well. Can we have your review/ack in case there is nothing pending here? Gentle Ping! Thanks, Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCHv2 2/2] iommu/arm-smmu-qcom: Move the adreno smmu specific impl earlier
Hi Bjorn, On 2021-03-25 20:35, Will Deacon wrote: On Thu, Mar 25, 2021 at 01:10:12PM +0530, Sai Prakash Ranjan wrote: ... I think there is consensus on this series. I can resend if required but it still applies cleanly, let me know if you have any comments? Please resend with the bindings patch, and I'd like Bjorn's Ack as well. Can we have your review/ack in case there is nothing pending here? Thanks, Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCHv2 2/2] iommu/arm-smmu-qcom: Move the adreno smmu specific impl earlier
Hi Will, On 2021-03-15 00:31, Sai Prakash Ranjan wrote: On 2021-03-12 04:59, Bjorn Andersson wrote: On Sat 27 Feb 07:53 CST 2021, Sai Prakash Ranjan wrote: Hi Bjorn, On 2021-02-27 00:44, Bjorn Andersson wrote: > On Fri 26 Feb 12:23 CST 2021, Rob Clark wrote: > > > The current logic picks one of: > 1) is the compatible mentioned in qcom_smmu_impl_of_match[] > 2) is the compatible an adreno > 3) no quirks needed > > The change flips the order of these, so the only way I can see this > change affecting things is if we expected a match on #2, but we got one > on #1. > > Which implies that the instance that we want to act according to the > adreno impl was listed in qcom_smmu_impl_of_match[] - which either is > wrong, or there's a single instance that needs both behaviors. > > (And I believe Jordan's answer confirms the latter - there's a single > SMMU instance that needs all them quirks at once) > Let me go through the problem statement in case my commit message wasn't clear. There are two SMMUs (APSS and GPU) on SC7280 and both are SMMU500 (ARM SMMU IP). APSS SMMU compatible - ("qcom,sc7280-smmu-500", "arm,mmu-500") GPU SMMU compatible - ("qcom,sc7280-smmu-500", "qcom,adreno-smmu", "arm,mmu-500") Now if we take SC7180 as an example, GPU SMMU was QSMMU(QCOM SMMU IP) and APSS SMMU was SMMU500(ARM SMMU IP). APSS SMMU compatible - ("qcom,sc7180-smmu-500", "arm,mmu-500") GPU SMMU compatible - ("qcom,sc7180-smmu-v2", "qcom,adreno-smmu", "qcom,smmu-v2") Current code sequence without this patch, if (of_match_node(qcom_smmu_impl_of_match, np)) return qcom_smmu_create(smmu, &qcom_smmu_impl); if (of_device_is_compatible(np, "qcom,adreno-smmu")) return qcom_smmu_create(smmu, &qcom_adreno_smmu_impl); Now if we look at the compatible for SC7180, there is no problem because the APSS SMMU will match the one in qcom_smmu_impl_of_match[] and GPU SMMU will match "qcom,adreno-smmu" because the compatible strings are different. 
But for SC7280, both the APSS SMMU and GPU SMMU compatible("qcom,sc7280-smmu-500") are same. So GPU SMMU will match with the one in qcom_smmu_impl_of_match[] i.e.., "qcom,sc7280-smmu-500" which functionally doesn't cause any problem but we will miss settings for split pagetables which are part of GPU SMMU specific implementation. We can avoid this with yet another new compatible for GPU SMMU something like "qcom,sc7280-adreno-smmu-500" but since we can handle this easily in the driver and since the IPs are same, meaning if there was a hardware quirk required, then we would need to apply to both of them and would this additional compatible be of any help? No, I think you're doing the right thing of having them both. I just didn't remember us doing that. Coming to the part of quirks now, you are right saying both SMMUs will need to have the same quirks in SC7280 and similar others where both are based on same IPs but those should probably be *hardware quirks* and if they are software based like the S2CR quirk depending on the firmware, then it might not be applicable to both. In case if it is applicable, then as Jordan mentioned, we can add the same quirks in GPU SMMU implementation. I do suspect that at some point (probably sooner than later) we'd have to support both inheriting of stream from the bootloader and the Adreno "quirks" in the same instance. But for now this is okay to me. Sure, let me know if you or anyone face any issues without it and I will add it. I will resend this series with the dt-bindings patch for sc7280 smmu which wasn't cc'd to smmu folks by mistake. I think there is consensus on this series. I can resend if required but it still applies cleanly, let me know if you have any comments? Thanks, Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCHv2 2/2] iommu/arm-smmu-qcom: Move the adreno smmu specific impl earlier
On 2021-03-12 04:59, Bjorn Andersson wrote: On Sat 27 Feb 07:53 CST 2021, Sai Prakash Ranjan wrote: Hi Bjorn, On 2021-02-27 00:44, Bjorn Andersson wrote: > On Fri 26 Feb 12:23 CST 2021, Rob Clark wrote: > > > The current logic picks one of: > 1) is the compatible mentioned in qcom_smmu_impl_of_match[] > 2) is the compatible an adreno > 3) no quirks needed > > The change flips the order of these, so the only way I can see this > change affecting things is if we expected a match on #2, but we got one > on #1. > > Which implies that the instance that we want to act according to the > adreno impl was listed in qcom_smmu_impl_of_match[] - which either is > wrong, or there's a single instance that needs both behaviors. > > (And I believe Jordan's answer confirms the latter - there's a single > SMMU instance that needs all them quirks at once) > Let me go through the problem statement in case my commit message wasn't clear. There are two SMMUs (APSS and GPU) on SC7280 and both are SMMU500 (ARM SMMU IP). APSS SMMU compatible - ("qcom,sc7280-smmu-500", "arm,mmu-500") GPU SMMU compatible - ("qcom,sc7280-smmu-500", "qcom,adreno-smmu", "arm,mmu-500") Now if we take SC7180 as an example, GPU SMMU was QSMMU(QCOM SMMU IP) and APSS SMMU was SMMU500(ARM SMMU IP). APSS SMMU compatible - ("qcom,sc7180-smmu-500", "arm,mmu-500") GPU SMMU compatible - ("qcom,sc7180-smmu-v2", "qcom,adreno-smmu", "qcom,smmu-v2") Current code sequence without this patch, if (of_match_node(qcom_smmu_impl_of_match, np)) return qcom_smmu_create(smmu, &qcom_smmu_impl); if (of_device_is_compatible(np, "qcom,adreno-smmu")) return qcom_smmu_create(smmu, &qcom_adreno_smmu_impl); Now if we look at the compatible for SC7180, there is no problem because the APSS SMMU will match the one in qcom_smmu_impl_of_match[] and GPU SMMU will match "qcom,adreno-smmu" because the compatible strings are different. But for SC7280, both the APSS SMMU and GPU SMMU compatible("qcom,sc7280-smmu-500") are same. 
So GPU SMMU will match with the one in qcom_smmu_impl_of_match[] i.e.., "qcom,sc7280-smmu-500" which functionally doesn't cause any problem but we will miss settings for split pagetables which are part of GPU SMMU specific implementation. We can avoid this with yet another new compatible for GPU SMMU something like "qcom,sc7280-adreno-smmu-500" but since we can handle this easily in the driver and since the IPs are same, meaning if there was a hardware quirk required, then we would need to apply to both of them and would this additional compatible be of any help? No, I think you're doing the right thing of having them both. I just didn't remember us doing that. Coming to the part of quirks now, you are right saying both SMMUs will need to have the same quirks in SC7280 and similar others where both are based on same IPs but those should probably be *hardware quirks* and if they are software based like the S2CR quirk depending on the firmware, then it might not be applicable to both. In case if it is applicable, then as Jordan mentioned, we can add the same quirks in GPU SMMU implementation. I do suspect that at some point (probably sooner than later) we'd have to support both inheriting of stream from the bootloader and the Adreno "quirks" in the same instance. But for now this is okay to me. Sure, let me know if you or anyone face any issues without it and I will add it. I will resend this series with the dt-bindings patch for sc7280 smmu which wasn't cc'd to smmu folks by mistake. Thanks, Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH 2/3] iommu/io-pgtable-arm: Add IOMMU_LLC page protection flag
Hi, On 2021-02-05 17:38, Sai Prakash Ranjan wrote: On 2021-02-04 03:16, Will Deacon wrote: On Tue, Feb 02, 2021 at 11:56:27AM +0530, Sai Prakash Ranjan wrote: On 2021-02-01 23:50, Jordan Crouse wrote: > On Mon, Feb 01, 2021 at 08:20:44AM -0800, Rob Clark wrote: > > On Mon, Feb 1, 2021 at 3:16 AM Will Deacon wrote: > > > On Fri, Jan 29, 2021 at 03:12:59PM +0530, Sai Prakash Ranjan wrote: > > > > On 2021-01-29 14:35, Will Deacon wrote: > > > > > On Mon, Jan 11, 2021 at 07:45:04PM +0530, Sai Prakash Ranjan wrote: > > > > > > +#define IOMMU_LLC(1 << 6) > > > > > > > > > > On reflection, I'm a bit worried about exposing this because I think it > > > > > will > > > > > introduce a mismatched virtual alias with the CPU (we don't even have a > > > > > MAIR > > > > > set up for this memory type). Now, we also have that issue for the PTW, > > > > > but > > > > > since we always use cache maintenance (i.e. the streaming API) for > > > > > publishing the page-tables to a non-coheren walker, it works out. > > > > > However, > > > > > if somebody expects IOMMU_LLC to be coherent with a DMA API coherent > > > > > allocation, then they're potentially in for a nasty surprise due to the > > > > > mismatched outer-cacheability attributes. > > > > > > > > > > > > > Can't we add the syscached memory type similar to what is done on android? > > > > > > Maybe. How does the GPU driver map these things on the CPU side? > > > > Currently we use writecombine mappings for everything, although there > > are some cases that we'd like to use cached (but have not merged > > patches that would give userspace a way to flush/invalidate) > > > > LLC/system cache doesn't have a relationship with the CPU cache. Its > just a > little accelerator that sits on the connection from the GPU to DDR and > caches > accesses. 
The hint that Sai is suggesting is used to mark the buffers as > 'no-write-allocate' to prevent GPU write operations from being cached in > the LLC > which a) isn't interesting and b) takes up cache space for read > operations. > > Its easiest to think of the LLC as a bonus accelerator that has no cost > for > us to use outside of the unfortunate per buffer hint. > > We do have to worry about the CPU cache w.r.t I/O coherency (which is a > different hint) and in that case we have all of concerns that Will > identified. > For mismatched outer cacheability attributes which Will mentioned, I was referring to [1] in android kernel. I've lost track of the conversation here :/ When the GPU has a buffer mapped with IOMMU_LLC, is the buffer also mapped into the CPU and with what attributes? Rob said "writecombine for everything" -- does that mean ioremap_wc() / MEMREMAP_WC? Rob answered this. Finally, we need to be careful when we use the word "hint" as "allocation hint" has a specific meaning in the architecture, and if we only mismatch on those then we're actually ok. But I think IOMMU_LLC is more than just a hint, since it actually drives eviction policy (i.e. it enables writeback). Sorry for the pedantry, but I just want to make sure we're all talking about the same things! Sorry for the confusion which probably was caused by my mentioning of android, NWA(no write allocate) is an allocation hint which we can ignore for now as it is not introduced yet in upstream. Any chance of taking this forward? We do not want to miss out on small fps gain when the product gets released. Thanks, Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCHv2 2/2] iommu/arm-smmu-qcom: Move the adreno smmu specific impl earlier
Hi Bjorn, On 2021-02-27 00:44, Bjorn Andersson wrote: > On Fri 26 Feb 12:23 CST 2021, Rob Clark wrote: > > > The current logic picks one of: > 1) is the compatible mentioned in qcom_smmu_impl_of_match[] > 2) is the compatible an adreno > 3) no quirks needed > > The change flips the order of these, so the only way I can see this > change affecting things is if we expected a match on #2, but we got one > on #1. > > Which implies that the instance that we want to act according to the > adreno impl was listed in qcom_smmu_impl_of_match[] - which either is > wrong, or there's a single instance that needs both behaviors. > > (And I believe Jordan's answer confirms the latter - there's a single > SMMU instance that needs all them quirks at once) > Let me go through the problem statement in case my commit message wasn't clear. There are two SMMUs (APSS and GPU) on SC7280 and both are SMMU500 (ARM SMMU IP). APSS SMMU compatible - ("qcom,sc7280-smmu-500", "arm,mmu-500") GPU SMMU compatible - ("qcom,sc7280-smmu-500", "qcom,adreno-smmu", "arm,mmu-500") Now if we take SC7180 as an example, GPU SMMU was QSMMU(QCOM SMMU IP) and APSS SMMU was SMMU500(ARM SMMU IP). APSS SMMU compatible - ("qcom,sc7180-smmu-500", "arm,mmu-500") GPU SMMU compatible - ("qcom,sc7180-smmu-v2", "qcom,adreno-smmu", "qcom,smmu-v2") Current code sequence without this patch, if (of_match_node(qcom_smmu_impl_of_match, np)) return qcom_smmu_create(smmu, &qcom_smmu_impl); if (of_device_is_compatible(np, "qcom,adreno-smmu")) return qcom_smmu_create(smmu, &qcom_adreno_smmu_impl); Now if we look at the compatible for SC7180, there is no problem because the APSS SMMU will match the one in qcom_smmu_impl_of_match[] and GPU SMMU will match "qcom,adreno-smmu" because the compatible strings are different. But for SC7280, both the APSS SMMU and GPU SMMU compatible("qcom,sc7280-smmu-500") are same. 
So GPU SMMU will match with the one in qcom_smmu_impl_of_match[] i.e.., "qcom,sc7280-smmu-500" which functionally doesn't cause any problem but we will miss settings for split pagetables which are part of GPU SMMU specific implementation. We can avoid this with yet another new compatible for GPU SMMU something like "qcom,sc7280-adreno-smmu-500" but since we can handle this easily in the driver and since the IPs are same, meaning if there was a hardware quirk required, then we would need to apply to both of them and would this additional compatible be of any help? Coming to the part of quirks now, you are right saying both SMMUs will need to have the same quirks in SC7280 and similar others where both are based on same IPs but those should probably be *hardware quirks* and if they are software based like the S2CR quirk depending on the firmware, then it might not be applicable to both. In case if it is applicable, then as Jordan mentioned, we can add the same quirks in GPU SMMU implementation. Thanks, Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCHv2 2/2] iommu/arm-smmu-qcom: Move the adreno smmu specific impl earlier
Adreno(GPU) SMMU and APSS(Application Processor SubSystem) SMMU both implement "arm,mmu-500" in some QTI SoCs and to run through adreno smmu specific implementation such as enabling split pagetables support, we need to match the "qcom,adreno-smmu" compatible first before apss smmu or else we will be running apps smmu implementation for adreno smmu and the additional features for adreno smmu is never set. For ex: we have "qcom,sc7280-smmu-500" compatible for both apps and adreno smmu implementing "arm,mmu-500", so the adreno smmu implementation is never reached because the current sequence checks for apps smmu compatible(qcom,sc7280-smmu-500) first and runs that specific impl and we never reach adreno smmu specific implementation. Suggested-by: Akhil P Oommen Signed-off-by: Sai Prakash Ranjan --- drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 12 +--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c index bea3ee0dabc2..03f048aebb80 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c @@ -345,11 +345,17 @@ struct arm_smmu_device *qcom_smmu_impl_init(struct arm_smmu_device *smmu) { const struct device_node *np = smmu->dev->of_node; - if (of_match_node(qcom_smmu_impl_of_match, np)) - return qcom_smmu_create(smmu, &qcom_smmu_impl); - + /* +* Do not change this order of implementation, i.e., first adreno +* smmu impl and then apss smmu since we can have both implementing +* arm,mmu-500 in which case we will miss setting adreno smmu specific +* features if the order is changed. +*/ if (of_device_is_compatible(np, "qcom,adreno-smmu")) return qcom_smmu_create(smmu, &qcom_adreno_smmu_impl); + if (of_match_node(qcom_smmu_impl_of_match, np)) + return qcom_smmu_create(smmu, &qcom_smmu_impl); + return smmu; } -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. 
is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCHv2 1/2] iommu/arm-smmu-qcom: Add SC7280 SMMU compatible
Add compatible for SC7280 SMMU to use the Qualcomm Technologies, Inc. specific implementation. Signed-off-by: Sai Prakash Ranjan --- drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c index 98b3a1c2a181..bea3ee0dabc2 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c @@ -166,6 +166,7 @@ static const struct of_device_id qcom_smmu_client_of_match[] __maybe_unused = { { .compatible = "qcom,mdss" }, { .compatible = "qcom,sc7180-mdss" }, { .compatible = "qcom,sc7180-mss-pil" }, + { .compatible = "qcom,sc7280-mdss" }, { .compatible = "qcom,sc8180x-mdss" }, { .compatible = "qcom,sdm845-mdss" }, { .compatible = "qcom,sdm845-mss-pil" }, @@ -330,6 +331,7 @@ static struct arm_smmu_device *qcom_smmu_create(struct arm_smmu_device *smmu, static const struct of_device_id __maybe_unused qcom_smmu_impl_of_match[] = { { .compatible = "qcom,msm8998-smmu-v2" }, { .compatible = "qcom,sc7180-smmu-500" }, + { .compatible = "qcom,sc7280-smmu-500" }, { .compatible = "qcom,sc8180x-smmu-500" }, { .compatible = "qcom,sdm630-smmu-v2" }, { .compatible = "qcom,sdm845-smmu-500" }, -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCHv2 0/2] iommu/arm-smmu-qcom: Add SC7280 support
Patch 1 adds the sc7280 smmu compatible. Patch 2 moves the adreno smmu check before apss smmu to enable adreno smmu specific implementation. Changes in v2: * Add a comment to make sure this order is not changed in future (Jordan) Sai Prakash Ranjan (2): iommu/arm-smmu-qcom: Add SC7280 SMMU compatible iommu/arm-smmu-qcom: Move the adreno smmu specific impl earlier drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 14 +++--- 1 file changed, 11 insertions(+), 3 deletions(-) base-commit: 7060377ce06f9cd3ed6274c0f2310463feb5baec -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH 2/2] iommu/arm-smmu-qcom: Move the adreno smmu specific impl earlier
On 2021-02-25 23:36, Jordan Crouse wrote: On Thu, Feb 25, 2021 at 03:54:10PM +0530, Sai Prakash Ranjan wrote: Adreno(GPU) SMMU and APSS(Application Processor SubSystem) SMMU both implement "arm,mmu-500" in some QTI SoCs and to run through adreno smmu specific implementation such as enabling split pagetables support, we need to match the "qcom,adreno-smmu" compatible first before apss smmu or else we will be running apps smmu implementation for adreno smmu and the additional features for adreno smmu is never set. For ex: we have "qcom,sc7280-smmu-500" compatible for both apps and adreno smmu implementing "arm,mmu-500", so the adreno smmu implementation is never reached because the current sequence checks for apps smmu compatible(qcom,sc7280-smmu-500) first and runs that specific impl and we never reach adreno smmu specific implementation. Suggested-by: Akhil P Oommen Signed-off-by: Sai Prakash Ranjan --- Its either this or we add a new compatible for adreno smmu implementing arm,mmu-500 like "qcom,sc7280-adreno-smmu-500". --- drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c index bea3ee0dabc2..7d0fc2c8e72f 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c @@ -345,11 +345,11 @@ struct arm_smmu_device *qcom_smmu_impl_init(struct arm_smmu_device *smmu) { const struct device_node *np = smmu->dev->of_node; - if (of_match_node(qcom_smmu_impl_of_match, np)) - return qcom_smmu_create(smmu, &qcom_smmu_impl); - if (of_device_is_compatible(np, "qcom,adreno-smmu")) return qcom_smmu_create(smmu, &qcom_adreno_smmu_impl); + if (of_match_node(qcom_smmu_impl_of_match, np)) + return qcom_smmu_create(smmu, &qcom_smmu_impl); + It would be good to add a comment here explaining the order here so we don't accidentally reorganize ourselves back into a problem later. 
Sure, it's better, will add it. Thanks, Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH 2/2] iommu/arm-smmu-qcom: Move the adreno smmu specific impl earlier
Adreno(GPU) SMMU and APSS(Application Processor SubSystem) SMMU both implement "arm,mmu-500" in some QTI SoCs and to run through adreno smmu specific implementation such as enabling split pagetables support, we need to match the "qcom,adreno-smmu" compatible first before apss smmu or else we will be running apps smmu implementation for adreno smmu and the additional features for adreno smmu is never set. For ex: we have "qcom,sc7280-smmu-500" compatible for both apps and adreno smmu implementing "arm,mmu-500", so the adreno smmu implementation is never reached because the current sequence checks for apps smmu compatible(qcom,sc7280-smmu-500) first and runs that specific impl and we never reach adreno smmu specific implementation. Suggested-by: Akhil P Oommen Signed-off-by: Sai Prakash Ranjan --- Its either this or we add a new compatible for adreno smmu implementing arm,mmu-500 like "qcom,sc7280-adreno-smmu-500". --- drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c index bea3ee0dabc2..7d0fc2c8e72f 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c @@ -345,11 +345,11 @@ struct arm_smmu_device *qcom_smmu_impl_init(struct arm_smmu_device *smmu) { const struct device_node *np = smmu->dev->of_node; - if (of_match_node(qcom_smmu_impl_of_match, np)) - return qcom_smmu_create(smmu, &qcom_smmu_impl); - if (of_device_is_compatible(np, "qcom,adreno-smmu")) return qcom_smmu_create(smmu, &qcom_adreno_smmu_impl); + if (of_match_node(qcom_smmu_impl_of_match, np)) + return qcom_smmu_create(smmu, &qcom_smmu_impl); + return smmu; } -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH 1/2] iommu/arm-smmu-qcom: Add SC7280 SMMU compatible
Add compatible for SC7280 SMMU to use the Qualcomm Technologies, Inc. specific implementation. Signed-off-by: Sai Prakash Ranjan --- drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c index 98b3a1c2a181..bea3ee0dabc2 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c @@ -166,6 +166,7 @@ static const struct of_device_id qcom_smmu_client_of_match[] __maybe_unused = { { .compatible = "qcom,mdss" }, { .compatible = "qcom,sc7180-mdss" }, { .compatible = "qcom,sc7180-mss-pil" }, + { .compatible = "qcom,sc7280-mdss" }, { .compatible = "qcom,sc8180x-mdss" }, { .compatible = "qcom,sdm845-mdss" }, { .compatible = "qcom,sdm845-mss-pil" }, @@ -330,6 +331,7 @@ static struct arm_smmu_device *qcom_smmu_create(struct arm_smmu_device *smmu, static const struct of_device_id __maybe_unused qcom_smmu_impl_of_match[] = { { .compatible = "qcom,msm8998-smmu-v2" }, { .compatible = "qcom,sc7180-smmu-500" }, + { .compatible = "qcom,sc7280-smmu-500" }, { .compatible = "qcom,sc8180x-smmu-500" }, { .compatible = "qcom,sdm630-smmu-v2" }, { .compatible = "qcom,sdm845-smmu-500" }, -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH 0/2] iommu/arm-smmu-qcom: Add SC7280 support
Patch 1 adds the sc7280 smmu compatible. Patch 2 moves the adreno smmu check before apss smmu to enable adreno smmu specific implementation. Sai Prakash Ranjan (2): iommu/arm-smmu-qcom: Add SC7280 SMMU compatible iommu/arm-smmu-qcom: Move the adreno smmu specific impl earlier drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 8 +--- 1 file changed, 5 insertions(+), 3 deletions(-) base-commit: 7060377ce06f9cd3ed6274c0f2310463feb5baec -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH] iommu: Add device name to iommu map/unmap trace events
IOMMU map/unmap traces become hard to decode i.e., it becomes hard to associate the map/unmap events with the particular device from the iova/paddr/size parameters alone when there are multiple devices attached. So it is useful to add the device name to iommu trace events which can be used to filter out map/unmap traces for a particular device when we are debugging iommu faults such as context faults where we are interested with the map/unmap traces for a specific device. Before: map: IOMMU: iova=0x00036000 paddr=0x0001164d8000 size=4096 unmap: IOMMU: iova=0x00036000 size=4096 unmapped_size=4096 After: map: IOMMU: dev=1d84000.ufshc iova=0x000fffa88000 paddr=0x0001063db000 size=4096 unmap: IOMMU: dev=1d84000.ufshc iova=0x000fffa88000 size=4096 unmapped_size=4096 Signed-off-by: Sai Prakash Ranjan --- drivers/iommu/iommu.c| 8 +--- include/linux/iommu.h| 1 + include/trace/events/iommu.h | 20 3 files changed, 18 insertions(+), 11 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index d0b0a15dba84..37081b745f38 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1947,8 +1947,10 @@ static int __iommu_attach_device(struct iommu_domain *domain, return -ENODEV; ret = domain->ops->attach_dev(domain, dev); - if (!ret) + if (!ret) { trace_attach_device_to_domain(dev); + strscpy(domain->dev_name, dev_name(dev), sizeof(domain->dev_name)); + } return ret; } @@ -2440,7 +2442,7 @@ static int __iommu_map(struct iommu_domain *domain, unsigned long iova, if (ret) iommu_unmap(domain, orig_iova, orig_size - size); else - trace_map(orig_iova, orig_paddr, orig_size); + trace_map(orig_iova, orig_paddr, orig_size, domain->dev_name); return ret; } @@ -2523,7 +2525,7 @@ static size_t __iommu_unmap(struct iommu_domain *domain, unmapped += unmapped_page; } - trace_unmap(orig_iova, size, unmapped); + trace_unmap(orig_iova, size, unmapped, domain->dev_name); return unmapped; } diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 
5e7fe519430a..6064187d9bb6 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -87,6 +87,7 @@ struct iommu_domain { void *handler_token; struct iommu_domain_geometry geometry; void *iova_cookie; + char dev_name[32]; }; enum iommu_cap { diff --git a/include/trace/events/iommu.h b/include/trace/events/iommu.h index 72b4582322ff..44e48fb8b677 100644 --- a/include/trace/events/iommu.h +++ b/include/trace/events/iommu.h @@ -85,47 +85,51 @@ DEFINE_EVENT(iommu_device_event, detach_device_from_domain, TRACE_EVENT(map, - TP_PROTO(unsigned long iova, phys_addr_t paddr, size_t size), + TP_PROTO(unsigned long iova, phys_addr_t paddr, size_t size, const char *dev_name), - TP_ARGS(iova, paddr, size), + TP_ARGS(iova, paddr, size, dev_name), TP_STRUCT__entry( __field(u64, iova) __field(u64, paddr) __field(size_t, size) + __string(dev_name, dev_name) ), TP_fast_assign( __entry->iova = iova; __entry->paddr = paddr; __entry->size = size; + __assign_str(dev_name, dev_name); ), - TP_printk("IOMMU: iova=0x%016llx paddr=0x%016llx size=%zu", - __entry->iova, __entry->paddr, __entry->size + TP_printk("IOMMU: dev=%s iova=0x%016llx paddr=0x%016llx size=%zu", + __get_str(dev_name), __entry->iova, __entry->paddr, __entry->size ) ); TRACE_EVENT(unmap, - TP_PROTO(unsigned long iova, size_t size, size_t unmapped_size), + TP_PROTO(unsigned long iova, size_t size, size_t unmapped_size, const char *dev_name), - TP_ARGS(iova, size, unmapped_size), + TP_ARGS(iova, size, unmapped_size, dev_name), TP_STRUCT__entry( __field(u64, iova) __field(size_t, size) __field(size_t, unmapped_size) + __string(dev_name, dev_name) ), TP_fast_assign( __entry->iova = iova; __entry->size = size; __entry->unmapped_size = unmapped_size; + __assign_str(dev_name, dev_name); ), - TP_printk("IOMMU: iova=0x%016llx size=%zu unmapped_size=%zu", - __entry->iova, __entry->size, __entry->unmapped_size + TP_printk("IOMMU: dev=%s iova=0x%016llx size=%zu unmapped_size=%zu", + __get_str(dev_name), __entry->iova, 
__entry->size, __entry->unmapped_size ) ); -- QUALCOMM INDIA
Re: Consult on ARM SMMU debugfs
On 2021-01-15 22:47, Robin Murphy wrote: On 2021-01-15 15:14, Russell King - ARM Linux admin wrote: On Mon, Jan 11, 2021 at 08:01:48PM +, Robin Murphy wrote: On 2021-01-07 02:45, chenxiang (M) wrote: Hi Will, Robin or other guys, When debugging SMMU/SVA issue on huawei ARM64 board, we find that it lacks enough debugfs for ARM SMMU driver (such as the value of STE/CD which we need to check sometimes). Currently it creates top-level iommu directory in debugfs, but there is no debugfs for ARM SMMU driver specially. Do you know whether ARM have the plan to do that recently? FWIW I don't think I've ever felt the need to inspect the Stream Table on a live system. So far the nature of the STE code has been simple enough that it's very hard for any given STE to be *wrong* - either it's set up as expected and thus works fine, or it's not initialised at all and you get C_BAD_STE, where 99% of the time you then just cross-reference the Stream ID against the firmware and find that the DT/IORT is wrong. Similarly I don't think I've ever even *seen* an issue that could be attributed to a context descriptor, although I appreciate that as we start landing more PASID and SVA support the scope for that starts to widen considerably. Feel free to propose a patch if you believe it would be genuinely useful and won't just bit-rot into a maintenance burden, but it's not something that's on our roadmap here. I do think that the IOMMU stuff needs better debugging. I've hit the WARN_ON() in __arm_lpae_map(), and it's been pretty much undebuggable, so I've resorted to putting the IOMMU into bypass mode permanently to work around the issue. The reason that it's undebuggable is if one puts printk() or trace statements in the code, boots the platform, you get flooded with those debugging messages, because every access to the rootfs generates and tears down a mapping.
It would be nice to be able to inspect the IOMMU page tables and state of the IOMMU, rather than having to resort to effectively disabling the IOMMU. Certainly once we get to stuff like unpinned VFIO, having the ability to inspect pagetables for arbitrary IOMMU API usage will indeed be useful. From the DMA mapping perspective, though, unless you're working on the io-pgtable code itself it's not really going to tell you much that dumping the mappings from dma-debug can't already. FWIW whenever I encounter that particular warning in iommu-dma context, I don't care where the existing mapping is pointing, since it's merely a symptom of the damage already having been done. At that point I'd usually go off and audit all the DMA API calls in the offending driver, since it's typically caused by corruption in the IOVA allocator from passing the wrong size in a dma_unmap_*() call, and those can often be spotted by inspection. For active debugging, what you really want to know is the *history* of operations around that IOVA, since you're primarily interested in the request that last mapped it, then the corresponding unmap request for nominally the same buffer (which allowed the IOVA region to be freed for reuse) that for some reason didn't cover one or more pages that it should have. The IOMMU API tracepoints can be a handy tool there. Currently IOMMU trace events are not straight forward to decode if there are multiple devices attached. For ex: consider below: map: IOMMU: iova=0x00035000 paddr=0x000113be2000 size=4096 unmap: IOMMU: iova=0x00034000 size=4096 unmapped_size=4096 unmap: IOMMU: iova=0x00035000 size=4096 unmapped_size=4096 map: IOMMU: iova=0x00036000 paddr=0x0001164d8000 size=4096 map: IOMMU: iova=0x00037000 paddr=0x0001164da000 size=4096 unmap: IOMMU: iova=0x00036000 size=4096 unmapped_size=4096 unmap: IOMMU: iova=0x00037000 size=4096 unmapped_size=4096 How about making it more useful adding the device name as well? 
Ex: map: IOMMU:ae0.mdss iova=0x0002b000 paddr=0x00010a9e6000 size=8192 map: IOMMU:ae0.mdss iova=0x0002d000 paddr=0x00010a9ec000 size=21790 map: IOMMU:ae0.mdss iova=0x00241000 paddr=0x00010c40 size=59392 map: IOMMU:a60.dwc3 iova=0x0004a000 paddr=0x00010a821000 size=4096 map: IOMMU:a60.dwc3 iova=0x00049000 paddr=0x00010a82 size=4096 unmap: IOMMU:a60.dwc3 iova=0x0004a000 size=4096 unmapped_size=4096 unmap: IOMMU:a60.dwc3 iova=0x00049000 size=4096 unmapped_size=4096 We have been carrying a local patch downstream like forever, I can post a patch if you guys think it is useful in general. Thanks Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH 2/3] iommu/io-pgtable-arm: Add IOMMU_LLC page protection flag
On 2021-02-04 03:16, Will Deacon wrote: On Tue, Feb 02, 2021 at 11:56:27AM +0530, Sai Prakash Ranjan wrote: On 2021-02-01 23:50, Jordan Crouse wrote: > On Mon, Feb 01, 2021 at 08:20:44AM -0800, Rob Clark wrote: > > On Mon, Feb 1, 2021 at 3:16 AM Will Deacon wrote: > > > On Fri, Jan 29, 2021 at 03:12:59PM +0530, Sai Prakash Ranjan wrote: > > > > On 2021-01-29 14:35, Will Deacon wrote: > > > > > On Mon, Jan 11, 2021 at 07:45:04PM +0530, Sai Prakash Ranjan wrote: > > > > > > +#define IOMMU_LLC(1 << 6) > > > > > > > > > > On reflection, I'm a bit worried about exposing this because I think it > > > > > will > > > > > introduce a mismatched virtual alias with the CPU (we don't even have a > > > > > MAIR > > > > > set up for this memory type). Now, we also have that issue for the PTW, > > > > > but > > > > > since we always use cache maintenance (i.e. the streaming API) for > > > > > publishing the page-tables to a non-coheren walker, it works out. > > > > > However, > > > > > if somebody expects IOMMU_LLC to be coherent with a DMA API coherent > > > > > allocation, then they're potentially in for a nasty surprise due to the > > > > > mismatched outer-cacheability attributes. > > > > > > > > > > > > > Can't we add the syscached memory type similar to what is done on android? > > > > > > Maybe. How does the GPU driver map these things on the CPU side? > > > > Currently we use writecombine mappings for everything, although there > > are some cases that we'd like to use cached (but have not merged > > patches that would give userspace a way to flush/invalidate) > > > > LLC/system cache doesn't have a relationship with the CPU cache. Its > just a > little accelerator that sits on the connection from the GPU to DDR and > caches > accesses. The hint that Sai is suggesting is used to mark the buffers as > 'no-write-allocate' to prevent GPU write operations from being cached in > the LLC > which a) isn't interesting and b) takes up cache space for read > operations. 
> > Its easiest to think of the LLC as a bonus accelerator that has no cost > for > us to use outside of the unfortunate per buffer hint. > > We do have to worry about the CPU cache w.r.t I/O coherency (which is a > different hint) and in that case we have all of concerns that Will > identified. > For mismatched outer cacheability attributes which Will mentioned, I was referring to [1] in android kernel. I've lost track of the conversation here :/ When the GPU has a buffer mapped with IOMMU_LLC, is the buffer also mapped into the CPU and with what attributes? Rob said "writecombine for everything" -- does that mean ioremap_wc() / MEMREMAP_WC? Rob answered this. Finally, we need to be careful when we use the word "hint" as "allocation hint" has a specific meaning in the architecture, and if we only mismatch on those then we're actually ok. But I think IOMMU_LLC is more than just a hint, since it actually drives eviction policy (i.e. it enables writeback). Sorry for the pedantry, but I just want to make sure we're all talking about the same things! Sorry for the confusion which probably was caused by my mentioning of android, NWA(no write allocate) is an allocation hint which we can ignore for now as it is not introduced yet in upstream. Thanks, Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH 2/3] iommu/io-pgtable-arm: Add IOMMU_LLC page protection flag
On 2021-02-01 23:50, Jordan Crouse wrote: On Mon, Feb 01, 2021 at 08:20:44AM -0800, Rob Clark wrote: On Mon, Feb 1, 2021 at 3:16 AM Will Deacon wrote: > > On Fri, Jan 29, 2021 at 03:12:59PM +0530, Sai Prakash Ranjan wrote: > > On 2021-01-29 14:35, Will Deacon wrote: > > > On Mon, Jan 11, 2021 at 07:45:04PM +0530, Sai Prakash Ranjan wrote: > > > > Add a new page protection flag IOMMU_LLC which can be used > > > > by non-coherent masters to set cacheable memory attributes > > > > for an outer level of cache called as last-level cache or > > > > system cache. Initial user of this page protection flag is > > > > the adreno gpu and then can later be used by other clients > > > > such as video where this can be used for per-buffer based > > > > mapping. > > > > > > > > Signed-off-by: Sai Prakash Ranjan > > > > --- > > > > drivers/iommu/io-pgtable-arm.c | 3 +++ > > > > include/linux/iommu.h | 6 ++ > > > > 2 files changed, 9 insertions(+) > > > > > > > > diff --git a/drivers/iommu/io-pgtable-arm.c > > > > b/drivers/iommu/io-pgtable-arm.c > > > > index 7439ee7fdcdb..ebe653ef601b 100644 > > > > --- a/drivers/iommu/io-pgtable-arm.c > > > > +++ b/drivers/iommu/io-pgtable-arm.c > > > > @@ -415,6 +415,9 @@ static arm_lpae_iopte > > > > arm_lpae_prot_to_pte(struct arm_lpae_io_pgtable *data, > > > > else if (prot & IOMMU_CACHE) > > > > pte |= (ARM_LPAE_MAIR_ATTR_IDX_CACHE > > > > << ARM_LPAE_PTE_ATTRINDX_SHIFT); > > > > + else if (prot & IOMMU_LLC) > > > > + pte |= (ARM_LPAE_MAIR_ATTR_IDX_INC_OCACHE > > > > + << ARM_LPAE_PTE_ATTRINDX_SHIFT); > > > > } > > > > > > > > if (prot & IOMMU_CACHE) > > > > diff --git a/include/linux/iommu.h b/include/linux/iommu.h > > > > index ffaa389ea128..1f82057df531 100644 > > > > --- a/include/linux/iommu.h > > > > +++ b/include/linux/iommu.h > > > > @@ -31,6 +31,12 @@ > > > > * if the IOMMU page table format is equivalent. 
> > > > */ > > > > #define IOMMU_PRIV (1 << 5) > > > > +/* > > > > + * Non-coherent masters can use this page protection flag to set > > > > cacheable > > > > + * memory attributes for only a transparent outer level of cache, > > > > also known as > > > > + * the last-level or system cache. > > > > + */ > > > > +#define IOMMU_LLC(1 << 6) > > > > > > On reflection, I'm a bit worried about exposing this because I think it > > > will > > > introduce a mismatched virtual alias with the CPU (we don't even have a > > > MAIR > > > set up for this memory type). Now, we also have that issue for the PTW, > > > but > > > since we always use cache maintenance (i.e. the streaming API) for > > > publishing the page-tables to a non-coheren walker, it works out. > > > However, > > > if somebody expects IOMMU_LLC to be coherent with a DMA API coherent > > > allocation, then they're potentially in for a nasty surprise due to the > > > mismatched outer-cacheability attributes. > > > > > > > Can't we add the syscached memory type similar to what is done on android? > > Maybe. How does the GPU driver map these things on the CPU side? Currently we use writecombine mappings for everything, although there are some cases that we'd like to use cached (but have not merged patches that would give userspace a way to flush/invalidate) BR, -R LLC/system cache doesn't have a relationship with the CPU cache. Its just a little accelerator that sits on the connection from the GPU to DDR and caches accesses. The hint that Sai is suggesting is used to mark the buffers as 'no-write-allocate' to prevent GPU write operations from being cached in the LLC which a) isn't interesting and b) takes up cache space for read operations. Its easiest to think of the LLC as a bonus accelerator that has no cost for us to use outside of the unfortunate per buffer hint. We do have to worry about the CPU cache w.r.t I/O coherency (which is a different hint) and in that case we have all of concerns that Will identified. 
For mismatched outer cacheability attributes which Will mentioned, I was referring to [1] in android kernel. [1] https://android-review.googlesource.com/c/kernel/common/+/1549097/3 Thanks, Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH 2/3] iommu/io-pgtable-arm: Add IOMMU_LLC page protection flag
On 2021-02-01 23:50, Jordan Crouse wrote: On Mon, Feb 01, 2021 at 08:20:44AM -0800, Rob Clark wrote: On Mon, Feb 1, 2021 at 3:16 AM Will Deacon wrote: > > On Fri, Jan 29, 2021 at 03:12:59PM +0530, Sai Prakash Ranjan wrote: > > On 2021-01-29 14:35, Will Deacon wrote: > > > On Mon, Jan 11, 2021 at 07:45:04PM +0530, Sai Prakash Ranjan wrote: > > > > Add a new page protection flag IOMMU_LLC which can be used > > > > by non-coherent masters to set cacheable memory attributes > > > > for an outer level of cache called as last-level cache or > > > > system cache. Initial user of this page protection flag is > > > > the adreno gpu and then can later be used by other clients > > > > such as video where this can be used for per-buffer based > > > > mapping. > > > > > > > > Signed-off-by: Sai Prakash Ranjan > > > > --- > > > > drivers/iommu/io-pgtable-arm.c | 3 +++ > > > > include/linux/iommu.h | 6 ++ > > > > 2 files changed, 9 insertions(+) > > > > > > > > diff --git a/drivers/iommu/io-pgtable-arm.c > > > > b/drivers/iommu/io-pgtable-arm.c > > > > index 7439ee7fdcdb..ebe653ef601b 100644 > > > > --- a/drivers/iommu/io-pgtable-arm.c > > > > +++ b/drivers/iommu/io-pgtable-arm.c > > > > @@ -415,6 +415,9 @@ static arm_lpae_iopte > > > > arm_lpae_prot_to_pte(struct arm_lpae_io_pgtable *data, > > > > else if (prot & IOMMU_CACHE) > > > > pte |= (ARM_LPAE_MAIR_ATTR_IDX_CACHE > > > > << ARM_LPAE_PTE_ATTRINDX_SHIFT); > > > > + else if (prot & IOMMU_LLC) > > > > + pte |= (ARM_LPAE_MAIR_ATTR_IDX_INC_OCACHE > > > > + << ARM_LPAE_PTE_ATTRINDX_SHIFT); > > > > } > > > > > > > > if (prot & IOMMU_CACHE) > > > > diff --git a/include/linux/iommu.h b/include/linux/iommu.h > > > > index ffaa389ea128..1f82057df531 100644 > > > > --- a/include/linux/iommu.h > > > > +++ b/include/linux/iommu.h > > > > @@ -31,6 +31,12 @@ > > > > * if the IOMMU page table format is equivalent. 
> > > > */ > > > > #define IOMMU_PRIV (1 << 5) > > > > +/* > > > > + * Non-coherent masters can use this page protection flag to set > > > > cacheable > > > > + * memory attributes for only a transparent outer level of cache, > > > > also known as > > > > + * the last-level or system cache. > > > > + */ > > > > +#define IOMMU_LLC(1 << 6) > > > > > > On reflection, I'm a bit worried about exposing this because I think it > > > will > > > introduce a mismatched virtual alias with the CPU (we don't even have a > > > MAIR > > > set up for this memory type). Now, we also have that issue for the PTW, > > > but > > > since we always use cache maintenance (i.e. the streaming API) for > > > publishing the page-tables to a non-coheren walker, it works out. > > > However, > > > if somebody expects IOMMU_LLC to be coherent with a DMA API coherent > > > allocation, then they're potentially in for a nasty surprise due to the > > > mismatched outer-cacheability attributes. > > > > > > > Can't we add the syscached memory type similar to what is done on android? > > Maybe. How does the GPU driver map these things on the CPU side? Currently we use writecombine mappings for everything, although there are some cases that we'd like to use cached (but have not merged patches that would give userspace a way to flush/invalidate) BR, -R LLC/system cache doesn't have a relationship with the CPU cache. Its just a little accelerator that sits on the connection from the GPU to DDR and caches accesses. The hint that Sai is suggesting is used to mark the buffers as 'no-write-allocate' to prevent GPU write operations from being cached in the LLC which a) isn't interesting and b) takes up cache space for read operations. Its easiest to think of the LLC as a bonus accelerator that has no cost for us to use outside of the unfortunate per buffer hint. We do have to worry about the CPU cache w.r.t I/O coherency (which is a different hint) and in that case we have all of concerns that Will identified. 
For mismatched outer cacheability attributes which Will mentioned, I was referring to [1] in android kernel. [1] https://android-review.googlesource.com/c/kernel/common/+/1549097/3 Thanks, Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH 2/3] iommu/io-pgtable-arm: Add IOMMU_LLC page protection flag
On 2021-01-29 14:35, Will Deacon wrote: On Mon, Jan 11, 2021 at 07:45:04PM +0530, Sai Prakash Ranjan wrote: Add a new page protection flag IOMMU_LLC which can be used by non-coherent masters to set cacheable memory attributes for an outer level of cache called as last-level cache or system cache. Initial user of this page protection flag is the adreno gpu and then can later be used by other clients such as video where this can be used for per-buffer based mapping. Signed-off-by: Sai Prakash Ranjan --- drivers/iommu/io-pgtable-arm.c | 3 +++ include/linux/iommu.h | 6 ++ 2 files changed, 9 insertions(+) diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index 7439ee7fdcdb..ebe653ef601b 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -415,6 +415,9 @@ static arm_lpae_iopte arm_lpae_prot_to_pte(struct arm_lpae_io_pgtable *data, else if (prot & IOMMU_CACHE) pte |= (ARM_LPAE_MAIR_ATTR_IDX_CACHE << ARM_LPAE_PTE_ATTRINDX_SHIFT); + else if (prot & IOMMU_LLC) + pte |= (ARM_LPAE_MAIR_ATTR_IDX_INC_OCACHE + << ARM_LPAE_PTE_ATTRINDX_SHIFT); } if (prot & IOMMU_CACHE) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index ffaa389ea128..1f82057df531 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -31,6 +31,12 @@ * if the IOMMU page table format is equivalent. */ #define IOMMU_PRIV (1 << 5) +/* + * Non-coherent masters can use this page protection flag to set cacheable + * memory attributes for only a transparent outer level of cache, also known as + * the last-level or system cache. + */ +#define IOMMU_LLC (1 << 6) On reflection, I'm a bit worried about exposing this because I think it will introduce a mismatched virtual alias with the CPU (we don't even have a MAIR set up for this memory type). Now, we also have that issue for the PTW, but since we always use cache maintenance (i.e. the streaming API) for publishing the page-tables to a non-coherent walker, it works out.
However, if somebody expects IOMMU_LLC to be coherent with a DMA API coherent allocation, then they're potentially in for a nasty surprise due to the mismatched outer-cacheability attributes. Can't we add the syscached memory type similar to what is done on android? So I can take patch (1) as a trivial rename, but unfortunately I think this needs more thought before exposing it beyond the PTW. That wouldn't be of much use, would it :) , we would be losing on perf gain for GPU usecases without the rest of the patches. Thanks, Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH 0/3] iommu/drm/msm: Allow non-coherent masters to use system cache
On 2021-01-20 10:48, Sai Prakash Ranjan wrote: On 2021-01-11 19:45, Sai Prakash Ranjan wrote: commit ecd7274fb4cd ("iommu: Remove unused IOMMU_SYS_CACHE_ONLY flag") removed unused IOMMU_SYS_CACHE_ONLY prot flag and along with it went the memory type setting required for the non-coherent masters to use system cache. Now that system cache support for GPU is added, we will need to set the right PTE attribute for GPU buffers to be sys cached. Without this, the system cache lines are not allocated for GPU. So the patches in this series introduces a new prot flag IOMMU_LLC, renames IO_PGTABLE_QUIRK_ARM_OUTER_WBWA to IO_PGTABLE_QUIRK_PTW_LLC and makes GPU the user of this protection flag. The series slightly depends on following 2 patches posted earlier and is based on msm-next branch: * https://lore.kernel.org/patchwork/patch/1363008/ * https://lore.kernel.org/patchwork/patch/1363010/ Sai Prakash Ranjan (3): iommu/io-pgtable: Rename last-level cache quirk to IO_PGTABLE_QUIRK_PTW_LLC iommu/io-pgtable-arm: Add IOMMU_LLC page protection flag drm/msm: Use IOMMU_LLC page protection flag to map gpu buffers drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 3 +++ drivers/gpu/drm/msm/adreno/adreno_gpu.c | 2 +- drivers/gpu/drm/msm/msm_iommu.c | 3 +++ drivers/gpu/drm/msm/msm_mmu.h | 4 drivers/iommu/io-pgtable-arm.c | 9 ++--- include/linux/io-pgtable.h | 6 +++--- include/linux/iommu.h | 6 ++ 7 files changed, 26 insertions(+), 7 deletions(-) base-commit: 00fd44a1a4700718d5d962432b55c09820f7e709 Gentle Ping! Gentle Ping!! Thanks, Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH 0/3] iommu/drm/msm: Allow non-coherent masters to use system cache
On 2021-01-11 19:45, Sai Prakash Ranjan wrote: commit ecd7274fb4cd ("iommu: Remove unused IOMMU_SYS_CACHE_ONLY flag") removed unused IOMMU_SYS_CACHE_ONLY prot flag and along with it went the memory type setting required for the non-coherent masters to use system cache. Now that system cache support for GPU is added, we will need to set the right PTE attribute for GPU buffers to be sys cached. Without this, the system cache lines are not allocated for GPU. So the patches in this series introduces a new prot flag IOMMU_LLC, renames IO_PGTABLE_QUIRK_ARM_OUTER_WBWA to IO_PGTABLE_QUIRK_PTW_LLC and makes GPU the user of this protection flag. The series slightly depends on following 2 patches posted earlier and is based on msm-next branch: * https://lore.kernel.org/patchwork/patch/1363008/ * https://lore.kernel.org/patchwork/patch/1363010/ Sai Prakash Ranjan (3): iommu/io-pgtable: Rename last-level cache quirk to IO_PGTABLE_QUIRK_PTW_LLC iommu/io-pgtable-arm: Add IOMMU_LLC page protection flag drm/msm: Use IOMMU_LLC page protection flag to map gpu buffers drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 3 +++ drivers/gpu/drm/msm/adreno/adreno_gpu.c | 2 +- drivers/gpu/drm/msm/msm_iommu.c | 3 +++ drivers/gpu/drm/msm/msm_mmu.h | 4 drivers/iommu/io-pgtable-arm.c | 9 ++--- include/linux/io-pgtable.h | 6 +++--- include/linux/iommu.h | 6 ++ 7 files changed, 26 insertions(+), 7 deletions(-) base-commit: 00fd44a1a4700718d5d962432b55c09820f7e709 Gentle Ping! Thanks, Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH 3/3] drm/msm: Use IOMMU_LLC page protection flag to map gpu buffers
Use the newly introduced IOMMU_LLC page protection flag to map GPU buffers. This will make sure that proper stage-1 PTE attributes are set for GPU buffers to use system cache. This also introduces the MMU_FEATURE_USE_LLC feature bit to check for GPUs supporting LLC and set them in the target specific address space creation; in this case we set them for A6XX GPUs. Signed-off-by: Sai Prakash Ranjan --- drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 3 +++ drivers/gpu/drm/msm/msm_iommu.c | 3 +++ drivers/gpu/drm/msm/msm_mmu.h | 4 3 files changed, 10 insertions(+) diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c index 3c7ad51732bb..23da21b6f0ff 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c @@ -1266,6 +1266,9 @@ a6xx_create_address_space(struct msm_gpu *gpu, struct platform_device *pdev) return ERR_CAST(mmu); } + if (!IS_ERR_OR_NULL(a6xx_gpu->llc_slice)) + mmu->features |= MMU_FEATURE_USE_LLC; + /* * Use the aperture start or SZ_16M, whichever is greater. 
This will * ensure that we align with the allocated pagetable range while still diff --git a/drivers/gpu/drm/msm/msm_iommu.c b/drivers/gpu/drm/msm/msm_iommu.c index 22ac7c692a81..a329f9836422 100644 --- a/drivers/gpu/drm/msm/msm_iommu.c +++ b/drivers/gpu/drm/msm/msm_iommu.c @@ -235,6 +235,9 @@ static int msm_iommu_map(struct msm_mmu *mmu, uint64_t iova, if (iova & BIT_ULL(48)) iova |= GENMASK_ULL(63, 49); + if (mmu->features & MMU_FEATURE_USE_LLC) + prot |= IOMMU_LLC; + ret = iommu_map_sgtable(iommu->domain, iova, sgt, prot); WARN_ON(!ret); diff --git a/drivers/gpu/drm/msm/msm_mmu.h b/drivers/gpu/drm/msm/msm_mmu.h index 61ade89d9e48..efcd1939c98e 100644 --- a/drivers/gpu/drm/msm/msm_mmu.h +++ b/drivers/gpu/drm/msm/msm_mmu.h @@ -23,12 +23,16 @@ enum msm_mmu_type { MSM_MMU_IOMMU_PAGETABLE, }; +/* MMU features */ +#define MMU_FEATURE_USE_LLCBIT(0) + struct msm_mmu { const struct msm_mmu_funcs *funcs; struct device *dev; int (*handler)(void *arg, unsigned long iova, int flags); void *arg; enum msm_mmu_type type; + u32 features; }; static inline void msm_mmu_init(struct msm_mmu *mmu, struct device *dev, -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH 2/3] iommu/io-pgtable-arm: Add IOMMU_LLC page protection flag
Add a new page protection flag IOMMU_LLC which can be used by non-coherent masters to set cacheable memory attributes for an outer level of cache called as last-level cache or system cache. Initial user of this page protection flag is the adreno gpu and then can later be used by other clients such as video where this can be used for per-buffer based mapping. Signed-off-by: Sai Prakash Ranjan --- drivers/iommu/io-pgtable-arm.c | 3 +++ include/linux/iommu.h | 6 ++ 2 files changed, 9 insertions(+) diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index 7439ee7fdcdb..ebe653ef601b 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -415,6 +415,9 @@ static arm_lpae_iopte arm_lpae_prot_to_pte(struct arm_lpae_io_pgtable *data, else if (prot & IOMMU_CACHE) pte |= (ARM_LPAE_MAIR_ATTR_IDX_CACHE << ARM_LPAE_PTE_ATTRINDX_SHIFT); + else if (prot & IOMMU_LLC) + pte |= (ARM_LPAE_MAIR_ATTR_IDX_INC_OCACHE + << ARM_LPAE_PTE_ATTRINDX_SHIFT); } if (prot & IOMMU_CACHE) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index ffaa389ea128..1f82057df531 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -31,6 +31,12 @@ * if the IOMMU page table format is equivalent. */ #define IOMMU_PRIV (1 << 5) +/* + * Non-coherent masters can use this page protection flag to set cacheable + * memory attributes for only a transparent outer level of cache, also known as + * the last-level or system cache. + */ +#define IOMMU_LLC (1 << 6) struct iommu_ops; struct iommu_group; -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH 1/3] iommu/io-pgtable: Rename last-level cache quirk to IO_PGTABLE_QUIRK_PTW_LLC
Rename last-level cache quirk IO_PGTABLE_QUIRK_ARM_OUTER_WBWA to IO_PGTABLE_QUIRK_PTW_LLC which is used to set the required TCR attributes for non-coherent page table walker to be more generic and in sync with the upcoming page protection flag IOMMU_LLC. Signed-off-by: Sai Prakash Ranjan --- drivers/gpu/drm/msm/adreno/adreno_gpu.c | 2 +- drivers/iommu/io-pgtable-arm.c | 6 +++--- include/linux/io-pgtable.h | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c b/drivers/gpu/drm/msm/adreno/adreno_gpu.c index 0f184c3dd9d9..82b5e4969195 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c @@ -190,7 +190,7 @@ void adreno_set_llc_attributes(struct iommu_domain *iommu) { struct io_pgtable_domain_attr pgtbl_cfg; - pgtbl_cfg.quirks = IO_PGTABLE_QUIRK_ARM_OUTER_WBWA; + pgtbl_cfg.quirks = IO_PGTABLE_QUIRK_PTW_LLC; iommu_domain_set_attr(iommu, DOMAIN_ATTR_IO_PGTABLE_CFG, &pgtbl_cfg); } diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index 7c9ea9d7874a..7439ee7fdcdb 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -762,7 +762,7 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie) if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_NS | IO_PGTABLE_QUIRK_NON_STRICT | IO_PGTABLE_QUIRK_ARM_TTBR1 | - IO_PGTABLE_QUIRK_ARM_OUTER_WBWA)) + IO_PGTABLE_QUIRK_PTW_LLC)) return NULL; data = arm_lpae_alloc_pgtable(cfg); @@ -774,12 +774,12 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie) tcr->sh = ARM_LPAE_TCR_SH_IS; tcr->irgn = ARM_LPAE_TCR_RGN_WBWA; tcr->orgn = ARM_LPAE_TCR_RGN_WBWA; - if (cfg->quirks & IO_PGTABLE_QUIRK_ARM_OUTER_WBWA) + if (cfg->quirks & IO_PGTABLE_QUIRK_PTW_LLC) goto out_free_data; } else { tcr->sh = ARM_LPAE_TCR_SH_OS; tcr->irgn = ARM_LPAE_TCR_RGN_NC; - if (!(cfg->quirks & IO_PGTABLE_QUIRK_ARM_OUTER_WBWA)) + if (!(cfg->quirks & IO_PGTABLE_QUIRK_PTW_LLC)) tcr->orgn = 
ARM_LPAE_TCR_RGN_NC; else tcr->orgn = ARM_LPAE_TCR_RGN_WBWA; diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h index fb4d5a763e0c..6f996a817441 100644 --- a/include/linux/io-pgtable.h +++ b/include/linux/io-pgtable.h @@ -87,8 +87,8 @@ struct io_pgtable_cfg { * IO_PGTABLE_QUIRK_ARM_TTBR1: (ARM LPAE format) Configure the table * for use in the upper half of a split address space. * -* IO_PGTABLE_QUIRK_ARM_OUTER_WBWA: Override the outer-cacheability -* attributes set in the TCR for a non-coherent page-table walker. +* IO_PGTABLE_QUIRK_PTW_LLC: Override the outer-cacheability attributes +* set in the TCR for a non-coherent page-table walker. */ #define IO_PGTABLE_QUIRK_ARM_NS BIT(0) #define IO_PGTABLE_QUIRK_NO_PERMS BIT(1) @@ -96,7 +96,7 @@ struct io_pgtable_cfg { #define IO_PGTABLE_QUIRK_ARM_MTK_EXTBIT(3) #define IO_PGTABLE_QUIRK_NON_STRICT BIT(4) #define IO_PGTABLE_QUIRK_ARM_TTBR1 BIT(5) - #define IO_PGTABLE_QUIRK_ARM_OUTER_WBWA BIT(6) + #define IO_PGTABLE_QUIRK_PTW_LLCBIT(6) unsigned long quirks; unsigned long pgsize_bitmap; unsigned intias; -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH 0/3] iommu/drm/msm: Allow non-coherent masters to use system cache
commit ecd7274fb4cd ("iommu: Remove unused IOMMU_SYS_CACHE_ONLY flag") removed unused IOMMU_SYS_CACHE_ONLY prot flag and along with it went the memory type setting required for the non-coherent masters to use system cache. Now that system cache support for GPU is added, we will need to set the right PTE attribute for GPU buffers to be sys cached. Without this, the system cache lines are not allocated for GPU. So the patches in this series introduces a new prot flag IOMMU_LLC, renames IO_PGTABLE_QUIRK_ARM_OUTER_WBWA to IO_PGTABLE_QUIRK_PTW_LLC and makes GPU the user of this protection flag. The series slightly depends on following 2 patches posted earlier and is based on msm-next branch: * https://lore.kernel.org/patchwork/patch/1363008/ * https://lore.kernel.org/patchwork/patch/1363010/ Sai Prakash Ranjan (3): iommu/io-pgtable: Rename last-level cache quirk to IO_PGTABLE_QUIRK_PTW_LLC iommu/io-pgtable-arm: Add IOMMU_LLC page protection flag drm/msm: Use IOMMU_LLC page protection flag to map gpu buffers drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 3 +++ drivers/gpu/drm/msm/adreno/adreno_gpu.c | 2 +- drivers/gpu/drm/msm/msm_iommu.c | 3 +++ drivers/gpu/drm/msm/msm_mmu.h | 4 drivers/iommu/io-pgtable-arm.c | 9 ++--- include/linux/io-pgtable.h | 6 +++--- include/linux/iommu.h | 6 ++ 7 files changed, 26 insertions(+), 7 deletions(-) base-commit: 00fd44a1a4700718d5d962432b55c09820f7e709 -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH 0/5] Optimize iommu_map_sg() performance
On 2021-01-11 11:52, Sai Prakash Ranjan wrote: Hi Isaac, I gave this series a go on chromebook and saw these warnings and several device probe failures, logs attached below: WARN corresponds to this code in arm_lpae_map_by_pgsize() if (WARN_ON(iaext || (paddr + size) >> cfg->oas)) return -ERANGE; Logs: [2.411391] [ cut here ] [2.416149] WARNING: CPU: 6 PID: 56 at drivers/iommu/io-pgtable-arm.c:492 arm_lpae_map_sg+0x234/0x248 [2.425606] Modules linked in: [2.428749] CPU: 6 PID: 56 Comm: kworker/6:1 Not tainted 5.10.5 #970 [2.440287] Workqueue: events deferred_probe_work_func [2.445563] pstate: 20c9 (nzCv daif +PAN +UAO -TCO BTYPE=--) [2.451726] pc : arm_lpae_map_sg+0x234/0x248 [2.456112] lr : arm_lpae_map_sg+0xe0/0x248 [2.460410] sp : ffc010513750 [2.463820] x29: ffc010513790 x28: ffb943332000 [2.469281] x27: 000ff000 x26: ffb943d14900 [2.474738] x25: 1000 x24: 000103465000 [2.480196] x23: 0001 x22: 000103466000 [2.485645] x21: 0003 x20: 0a20 [2.491103] x19: ffc010513850 x18: 0001 [2.496562] x17: 0002 x16: [2.502021] x15: x14: [2.507479] x13: 0001 x12: [2.512928] x11: 0010 x10: [2.518385] x9 : 0001 x8 : 40201000 [2.523844] x7 : 0a20 x6 : ffb943463000 [2.529302] x5 : 0003 x4 : 1000 [2.534760] x3 : 0001 x2 : ffb941f605a0 [2.540219] x1 : 0003 x0 : 0e40 [2.545679] Call trace: [2.548196] arm_lpae_map_sg+0x234/0x248 [2.552225] arm_smmu_map_sg+0x80/0xc4 [2.556078] __iommu_map_sg+0x6c/0x188 [2.559931] iommu_map_sg_atomic+0x18/0x20 [2.564144] iommu_dma_alloc_remap+0x26c/0x34c [2.568703] iommu_dma_alloc+0x9c/0x268 [2.572647] dma_alloc_attrs+0x88/0xfc [2.576503] gsi_ring_alloc+0x50/0x144 [2.580356] gsi_init+0x2c4/0x5c4 [2.583766] ipa_probe+0x14c/0x2b4 [2.587263] platform_drv_probe+0x94/0xb4 [2.591377] really_probe+0x138/0x348 [2.595145] driver_probe_device+0x80/0xb8 [2.599358] __device_attach_driver+0x90/0xa8 [2.603829] bus_for_each_drv+0x84/0xcc [2.607772] __device_attach+0xc0/0x148 [2.611713] device_initial_probe+0x18/0x20 [2.616012] bus_probe_device+0x38/0x94 [2.619953] 
deferred_probe_work_func+0x78/0xb0 [2.624611] process_one_work+0x210/0x3dc [2.628726] worker_thread+0x284/0x3e0 [2.632578] kthread+0x148/0x1a8 [2.635891] ret_from_fork+0x10/0x18 [2.639562] ---[ end trace 9bac18cad6a9862e ]--- [2.644414] ipa 1e4.ipa: error -12 allocating channel 0 event ring [2.651656] ipa: probe of 1e4.ipa failed with error -12 [2.660072] dwc3 a60.dwc3: Adding to iommu group 8 [2.668632] xhci-hcd xhci-hcd.13.auto: xHCI Host Controller [2.674680] xhci-hcd xhci-hcd.13.auto: new USB bus registered, assigned bus number 1 ... Isaac provided a fix which he will post as v2 and no warnings were observed with that fix. Tested-by: Sai Prakash Ranjan Thanks, Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH 0/5] Optimize iommu_map_sg() performance
Hi Isaac, On 2021-01-09 07:20, Isaac J. Manjarres wrote: > The iommu_map_sg() code currently iterates through the given > scatter-gather list, and in the worst case, invokes iommu_map() > for each element in the scatter-gather list, which calls into > the IOMMU driver through an indirect call. For an IOMMU driver > that uses a format supported by the io-pgtable code, the IOMMU > driver will then call into the io-pgtable code to map the chunk. > > Jumping between the IOMMU core code, the IOMMU driver, and the > io-pgtable code and back for each element in a scatter-gather list > is not efficient. > > Instead, add a map_sg() hook in both the IOMMU driver ops and the > io-pgtable ops. iommu_map_sg() can then call into the IOMMU driver's > map_sg() hook with the entire scatter-gather list, which can call > into the io-pgtable map_sg() hook, which can process the entire > scatter-gather list, signficantly reducing the number of indirect > calls, and jumps between these layers, boosting performance. > > On a system that uses the ARM SMMU driver, and the ARM LPAE format, > the current implementation of iommu_map_sg() yields the following > latencies for mapping scatter-gather lists of various sizes. These > latencies are calculated by repeating the mapping operation 10 times: > > sizeiommu_map_sg latency > 4K0.624 us > 64K9.468 us > 1M 122.557 us > 2M 239.807 us > 12M 1435.979 us > 24M 2884.968 us > 32M 3832.979 us > > On the same system, the proposed modifications yield the following > results: > > sizeiommu_map_sg latency > 4K3.645 us > 64K4.198 us > 1M 11.010 us > 2M 17.125 us > 12M 82.416 us > 24M 158.677 us > 32M 210.468 us > > The procedure for collecting the iommu_map_sg latencies is > the same in both experiments. Clearly, reducing the jumps > between the different layers in the IOMMU code offers a > signficant performance boost in iommu_map_sg() latency. 
> I gave this series a go on chromebook and saw these warnings and several device probe failures, logs attached below: WARN corresponds to this code in arm_lpae_map_by_pgsize() if (WARN_ON(iaext || (paddr + size) >> cfg->oas)) return -ERANGE; Logs: [2.411391] [ cut here ] [2.416149] WARNING: CPU: 6 PID: 56 at drivers/iommu/io-pgtable-arm.c:492 arm_lpae_map_sg+0x234/0x248 [2.425606] Modules linked in: [2.428749] CPU: 6 PID: 56 Comm: kworker/6:1 Not tainted 5.10.5 #970 [2.440287] Workqueue: events deferred_probe_work_func [2.445563] pstate: 20c9 (nzCv daif +PAN +UAO -TCO BTYPE=--) [2.451726] pc : arm_lpae_map_sg+0x234/0x248 [2.456112] lr : arm_lpae_map_sg+0xe0/0x248 [2.460410] sp : ffc010513750 [2.463820] x29: ffc010513790 x28: ffb943332000 [2.469281] x27: 000ff000 x26: ffb943d14900 [2.474738] x25: 1000 x24: 000103465000 [2.480196] x23: 0001 x22: 000103466000 [2.485645] x21: 0003 x20: 0a20 [2.491103] x19: ffc010513850 x18: 0001 [2.496562] x17: 0002 x16: [2.502021] x15: x14: [2.507479] x13: 0001 x12: [2.512928] x11: 0010 x10: [2.518385] x9 : 0001 x8 : 40201000 [2.523844] x7 : 0a20 x6 : ffb943463000 [2.529302] x5 : 0003 x4 : 1000 [2.534760] x3 : 0001 x2 : ffb941f605a0 [2.540219] x1 : 0003 x0 : 0e40 [2.545679] Call trace: [2.548196] arm_lpae_map_sg+0x234/0x248 [2.552225] arm_smmu_map_sg+0x80/0xc4 [2.556078] __iommu_map_sg+0x6c/0x188 [2.559931] iommu_map_sg_atomic+0x18/0x20 [2.564144] iommu_dma_alloc_remap+0x26c/0x34c [2.568703] iommu_dma_alloc+0x9c/0x268 [2.572647] dma_alloc_attrs+0x88/0xfc [2.576503] gsi_ring_alloc+0x50/0x144 [2.580356] gsi_init+0x2c4/0x5c4 [2.583766] ipa_probe+0x14c/0x2b4 [2.587263] platform_drv_probe+0x94/0xb4 [2.591377] really_probe+0x138/0x348 [2.595145] driver_probe_device+0x80/0xb8 [2.599358] __device_attach_driver+0x90/0xa8 [2.603829] bus_for_each_drv+0x84/0xcc [2.607772] __device_attach+0xc0/0x148 [2.611713] device_initial_probe+0x18/0x20 [2.616012] bus_probe_device+0x38/0x94 [2.619953] deferred_probe_work_func+0x78/0xb0 [2.624611] 
process_one_work+0x210/0x3dc [2.628726] worker_thread+0x284/0x3e0 [2.632578] kthread+0x148/0x1a8 [2.635891] ret_from_fork+0x10/0x18 [2.639562] ---[ end trace 9bac18cad6a9862e ]--- [2.644414] ipa 1e4.ipa: error -12 allocating channel 0 event ring [2.651656
Re: [PATCH] iommu/io-pgtable-arm: Allow non-coherent masters to use system cache
On 2021-01-08 23:48, Will Deacon wrote: On Fri, Jan 08, 2021 at 11:17:25AM +0530, Sai Prakash Ranjan wrote: On 2021-01-07 22:27, isa...@codeaurora.org wrote: > On 2021-01-06 03:56, Will Deacon wrote: > > On Thu, Dec 24, 2020 at 12:10:07PM +0530, Sai Prakash Ranjan wrote: > > > commit ecd7274fb4cd ("iommu: Remove unused IOMMU_SYS_CACHE_ONLY > > > flag") > > > removed unused IOMMU_SYS_CACHE_ONLY prot flag and along with it went > > > the memory type setting required for the non-coherent masters to use > > > system cache. Now that system cache support for GPU is added, we will > > > need to mark the memory as normal sys-cached for GPU to use > > > system cache. > > > Without this, the system cache lines are not allocated for GPU. > > > We use > > > the IO_PGTABLE_QUIRK_ARM_OUTER_WBWA quirk instead of a page > > > protection > > > flag as the flag cannot be exposed via DMA api because of no in-tree > > > users. > > > > > > Signed-off-by: Sai Prakash Ranjan > > > --- > > > drivers/iommu/io-pgtable-arm.c | 3 +++ > > > 1 file changed, 3 insertions(+) > > > > > > diff --git a/drivers/iommu/io-pgtable-arm.c > > > b/drivers/iommu/io-pgtable-arm.c > > > index 7c9ea9d7874a..3fb7de8304a2 100644 > > > --- a/drivers/iommu/io-pgtable-arm.c > > > +++ b/drivers/iommu/io-pgtable-arm.c > > > @@ -415,6 +415,9 @@ static arm_lpae_iopte > > > arm_lpae_prot_to_pte(struct arm_lpae_io_pgtable *data, > > > else if (prot & IOMMU_CACHE) > > > pte |= (ARM_LPAE_MAIR_ATTR_IDX_CACHE > > > << ARM_LPAE_PTE_ATTRINDX_SHIFT); > > > +else if (data->iop.cfg.quirks & IO_PGTABLE_QUIRK_ARM_OUTER_WBWA) > > > +pte |= (ARM_LPAE_MAIR_ATTR_IDX_INC_OCACHE > > > +<< ARM_LPAE_PTE_ATTRINDX_SHIFT); > > > } > > > While this approach of enabling system cache globally for both page > tables and other buffers > works for the GPU usecase, this isn't ideal for other clients that use > system cache. 
For example, > video clients only want to cache a subset of their buffers in the > system cache, due to the sizing constraint > imposed by how much of the system cache they can use. So, it would be > ideal to have > a way of expressing the desire to use the system cache on a per-buffer > basis. Additionally, > our video clients use the DMA layer, and since the requirement is for > caching in the system cache > to be a per buffer attribute, it seems like we would have to have a > DMA attribute to express > this on a per-buffer basis. > I did bring this up initially [1], also where is this video client in upstream? AFAIK, only system cache user in upstream is GPU. We cannot add any DMA attribute unless there is any user upstream as per [2], so when the support for such a client is added, wouldn't ((data->iop.cfg.quirks & IO_PGTABLE_QUIRK_ARM_OUTER_WBWA) || PROT_FLAG) work? Hmm, I think this is another case where we need to separate out the page-table walker attributes from the access attributes. Currently, IO_PGTABLE_QUIRK_ARM_OUTER_WBWA applies _only_ to the page-table walker and I don't think it makes any sense for that to be per-buffer (how would you even manage that?). However, if we want to extend this to data accesses and we know that there are valid use-cases where this should be per-buffer, then shoe-horning it in with the walker quirk does not feel like the best thing to do. As a starting point, we could: 1. Rename IO_PGTABLE_QUIRK_ARM_OUTER_WBWA to IO_PGTABLE_QUIRK_PTW_LLC 2. Add a new prot flag IOMMU_LLC 3. Have the GPU pass the new prot for its buffer mappings This looks good to me, I will work on this and post something soon. Does that work? One thing I'm not sure about is whether IOMMU_CACHE should imply IOMMU_LLC, or whether there is a use-case for inner-cacheable, outer non-cacheable mappings for a coherent device. Have you ever seen that sort of thing before? I don't think there is such a usecase as Isaac mentioned. 
Thanks, Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH] iommu/io-pgtable-arm: Allow non-coherent masters to use system cache
On 2021-01-08 23:39, isa...@codeaurora.org wrote: On 2021-01-07 21:47, Sai Prakash Ranjan wrote: On 2021-01-07 22:27, isa...@codeaurora.org wrote: On 2021-01-06 03:56, Will Deacon wrote: On Thu, Dec 24, 2020 at 12:10:07PM +0530, Sai Prakash Ranjan wrote: commit ecd7274fb4cd ("iommu: Remove unused IOMMU_SYS_CACHE_ONLY flag") removed unused IOMMU_SYS_CACHE_ONLY prot flag and along with it went the memory type setting required for the non-coherent masters to use system cache. Now that system cache support for GPU is added, we will need to mark the memory as normal sys-cached for GPU to use system cache. Without this, the system cache lines are not allocated for GPU. We use the IO_PGTABLE_QUIRK_ARM_OUTER_WBWA quirk instead of a page protection flag as the flag cannot be exposed via DMA api because of no in-tree users. Signed-off-by: Sai Prakash Ranjan --- drivers/iommu/io-pgtable-arm.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index 7c9ea9d7874a..3fb7de8304a2 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -415,6 +415,9 @@ static arm_lpae_iopte arm_lpae_prot_to_pte(struct arm_lpae_io_pgtable *data, else if (prot & IOMMU_CACHE) pte |= (ARM_LPAE_MAIR_ATTR_IDX_CACHE << ARM_LPAE_PTE_ATTRINDX_SHIFT); + else if (data->iop.cfg.quirks & IO_PGTABLE_QUIRK_ARM_OUTER_WBWA) + pte |= (ARM_LPAE_MAIR_ATTR_IDX_INC_OCACHE + << ARM_LPAE_PTE_ATTRINDX_SHIFT); } While this approach of enabling system cache globally for both page tables and other buffers works for the GPU usecase, this isn't ideal for other clients that use system cache. For example, video clients only want to cache a subset of their buffers in the system cache, due to the sizing constraint imposed by how much of the system cache they can use. So, it would be ideal to have a way of expressing the desire to use the system cache on a per-buffer basis. 
Additionally, our video clients use the DMA layer, and since the requirement is for caching in the system cache to be a per buffer attribute, it seems like we would have to have a DMA attribute to express this on a per-buffer basis. I did bring this up initially [1], also where is this video client in upstream? AFAIK, only system cache user in upstream is GPU. We cannot add any DMA attribute unless there is any user upstream Right, there wouldn't be an upstream user, which would be problematic, but I was thinking of having it so that when video or any of our other clients that use this attribute on a per buffer basis upstreams their code, it's not too much of a stretch to add the support. Agreed. as per [2], so when the support for such a client is added, wouldn't ((data->iop.cfg.quirks & IO_PGTABLE_QUIRK_ARM_OUTER_WBWA) || PROT_FLAG) work? I don't think that will work, because we currently have clients who use the system cache as follows: -cache only page tables in the system cache -cache only data buffers in the system cache -cache both page tables and all buffers in the system cache -cache both page tables and some buffers in the system cache The approach you're suggesting doesn't allow for the last case, as caching the page tables in the system cache involves setting IO_PGTABLE_QUIRK_ARM_OUTER_WBWA, so we will end up losing the flexibility to cache some data buffers in the system cache. Ah yes, you are right, I believe Jordan mentioned the same [1]. [1] https://lore.kernel.org/lkml/20200709161352.gc21...@jcrouse1-lnx.qualcomm.com/ Ideally, the page table quirk would drive the settings for the TCR, and the prot flag drives the PTE for the mapping, as is done with the page table walker being dma-coherent, while buffers are mapped as cacheable based on IOMMU_CACHE. Thoughts? Right, mixing the two is not correct. Will's suggestion for a new prot flag sounds good to me, I will work on that. Thanks, Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. 
is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH] iommu/io-pgtable-arm: Allow non-coherent masters to use system cache
On 2021-01-07 22:27, isa...@codeaurora.org wrote: On 2021-01-06 03:56, Will Deacon wrote: On Thu, Dec 24, 2020 at 12:10:07PM +0530, Sai Prakash Ranjan wrote: commit ecd7274fb4cd ("iommu: Remove unused IOMMU_SYS_CACHE_ONLY flag") removed unused IOMMU_SYS_CACHE_ONLY prot flag and along with it went the memory type setting required for the non-coherent masters to use system cache. Now that system cache support for GPU is added, we will need to mark the memory as normal sys-cached for GPU to use system cache. Without this, the system cache lines are not allocated for GPU. We use the IO_PGTABLE_QUIRK_ARM_OUTER_WBWA quirk instead of a page protection flag as the flag cannot be exposed via DMA api because of no in-tree users. Signed-off-by: Sai Prakash Ranjan --- drivers/iommu/io-pgtable-arm.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index 7c9ea9d7874a..3fb7de8304a2 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -415,6 +415,9 @@ static arm_lpae_iopte arm_lpae_prot_to_pte(struct arm_lpae_io_pgtable *data, else if (prot & IOMMU_CACHE) pte |= (ARM_LPAE_MAIR_ATTR_IDX_CACHE << ARM_LPAE_PTE_ATTRINDX_SHIFT); + else if (data->iop.cfg.quirks & IO_PGTABLE_QUIRK_ARM_OUTER_WBWA) + pte |= (ARM_LPAE_MAIR_ATTR_IDX_INC_OCACHE + << ARM_LPAE_PTE_ATTRINDX_SHIFT); } While this approach of enabling system cache globally for both page tables and other buffers works for the GPU usecase, this isn't ideal for other clients that use system cache. For example, video clients only want to cache a subset of their buffers in the system cache, due to the sizing constraint imposed by how much of the system cache they can use. So, it would be ideal to have a way of expressing the desire to use the system cache on a per-buffer basis. 
Additionally, our video clients use the DMA layer, and since the requirement is for caching in the system cache to be a per buffer attribute, it seems like we would have to have a DMA attribute to express this on a per-buffer basis. I did bring this up initially [1], also where is this video client in upstream? AFAIK, only system cache user in upstream is GPU. We cannot add any DMA attribute unless there is any user upstream as per [2], so when the support for such a client is added, wouldn't ((data->iop.cfg.quirks & IO_PGTABLE_QUIRK_ARM_OUTER_WBWA) || PROT_FLAG) work? [1] https://lore.kernel.org/dri-devel/ecfda7ca80f6d7b4ff3d89b8758f4...@codeaurora.org/ [2] https://lore.kernel.org/linux-iommu/20191026053026.ga14...@lst.de/T/ Thanks, Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH] iommu/io-pgtable-arm: Allow non-coherent masters to use system cache
Hi Will, On 2021-01-06 17:26, Will Deacon wrote: On Thu, Dec 24, 2020 at 12:10:07PM +0530, Sai Prakash Ranjan wrote: commit ecd7274fb4cd ("iommu: Remove unused IOMMU_SYS_CACHE_ONLY flag") removed unused IOMMU_SYS_CACHE_ONLY prot flag and along with it went the memory type setting required for the non-coherent masters to use system cache. Now that system cache support for GPU is added, we will need to mark the memory as normal sys-cached for GPU to use system cache. Without this, the system cache lines are not allocated for GPU. We use the IO_PGTABLE_QUIRK_ARM_OUTER_WBWA quirk instead of a page protection flag as the flag cannot be exposed via DMA api because of no in-tree users. Signed-off-by: Sai Prakash Ranjan --- drivers/iommu/io-pgtable-arm.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index 7c9ea9d7874a..3fb7de8304a2 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -415,6 +415,9 @@ static arm_lpae_iopte arm_lpae_prot_to_pte(struct arm_lpae_io_pgtable *data, else if (prot & IOMMU_CACHE) pte |= (ARM_LPAE_MAIR_ATTR_IDX_CACHE << ARM_LPAE_PTE_ATTRINDX_SHIFT); + else if (data->iop.cfg.quirks & IO_PGTABLE_QUIRK_ARM_OUTER_WBWA) + pte |= (ARM_LPAE_MAIR_ATTR_IDX_INC_OCACHE + << ARM_LPAE_PTE_ATTRINDX_SHIFT); } drivers/iommu/io-pgtable.c currently documents this quirk as applying only to the page-table walker. Given that we only have one user at the moment, I think it's ok to change that, but please update the comment. Sure, how about this change in comment: * IO_PGTABLE_QUIRK_ARM_OUTER_WBWA: Override the outer-cacheability -* attributes set in the TCR for a non-coherent page-table walker. +* attributes set in the TCR for a non-coherent page-table walker +* and also to set the correct cacheability attributes to use an +* outer level of cache for non-coherent masters. 
We also need to decide on whether we want to allow the quirk to be passed if the coherency of the page-table walker differs from the DMA device, since we have these combinations: Coherent walker?IOMMU_CACHE IO_PGTABLE_QUIRK_ARM_OUTER_WBWA 0: N 0 0 1: N 0 1 2: N 1 0 3: N 1 1 4: Y 0 0 5: Y 0 1 6: Y 1 0 7: Y 1 1 Some of them are obviously bogus, such as (7), but I don't know what to do about cases such as (3) and (5). I thought this was already decided when IOMMU_SYS_CACHE_ONLY prot flag was added in this same location [1]. dma-coherent masters can use the normal cached memory type to use the system cache and non dma-coherent masters willing to use system cache should use normal sys-cached memory type with this quirk. [1] https://lore.kernel.org/linux-arm-msm/20190516093020.18028-1-vivek.gau...@codeaurora.org/ Thanks, Sai -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH] iommu/io-pgtable-arm: Allow non-coherent masters to use system cache
commit ecd7274fb4cd ("iommu: Remove unused IOMMU_SYS_CACHE_ONLY flag") removed unused IOMMU_SYS_CACHE_ONLY prot flag and along with it went the memory type setting required for the non-coherent masters to use system cache. Now that system cache support for GPU is added, we will need to mark the memory as normal sys-cached for GPU to use system cache. Without this, the system cache lines are not allocated for GPU. We use the IO_PGTABLE_QUIRK_ARM_OUTER_WBWA quirk instead of a page protection flag as the flag cannot be exposed via DMA api because of no in-tree users. Signed-off-by: Sai Prakash Ranjan --- drivers/iommu/io-pgtable-arm.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index 7c9ea9d7874a..3fb7de8304a2 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -415,6 +415,9 @@ static arm_lpae_iopte arm_lpae_prot_to_pte(struct arm_lpae_io_pgtable *data, else if (prot & IOMMU_CACHE) pte |= (ARM_LPAE_MAIR_ATTR_IDX_CACHE << ARM_LPAE_PTE_ATTRINDX_SHIFT); + else if (data->iop.cfg.quirks & IO_PGTABLE_QUIRK_ARM_OUTER_WBWA) + pte |= (ARM_LPAE_MAIR_ATTR_IDX_INC_OCACHE + << ARM_LPAE_PTE_ATTRINDX_SHIFT); } if (prot & IOMMU_CACHE) -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCHv10 8/9] iommu: arm-smmu-impl: Use table to list QCOM implementations
Use table and of_match_node() to match qcom implementation instead of multiple of_device_compatible() calls for each QCOM SMMU implementation. Signed-off-by: Sai Prakash Ranjan Acked-by: Will Deacon --- drivers/iommu/arm/arm-smmu/arm-smmu-impl.c | 9 + drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 21 - drivers/iommu/arm/arm-smmu/arm-smmu.h | 1 - 3 files changed, 17 insertions(+), 14 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c b/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c index 7fed89c9d18a..26e2734eb4d7 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c @@ -214,14 +214,7 @@ struct arm_smmu_device *arm_smmu_impl_init(struct arm_smmu_device *smmu) if (of_device_is_compatible(np, "nvidia,tegra194-smmu")) return nvidia_smmu_impl_init(smmu); - if (of_device_is_compatible(np, "qcom,sdm845-smmu-500") || - of_device_is_compatible(np, "qcom,sc7180-smmu-500") || - of_device_is_compatible(np, "qcom,sm8150-smmu-500") || - of_device_is_compatible(np, "qcom,sm8250-smmu-500")) - return qcom_smmu_impl_init(smmu); - - if (of_device_is_compatible(smmu->dev->of_node, "qcom,adreno-smmu")) - return qcom_adreno_smmu_impl_init(smmu); + smmu = qcom_smmu_impl_init(smmu); if (of_device_is_compatible(np, "marvell,ap806-smmu-500")) smmu->impl = &mrvl_mmu500_impl; diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c index d0636c803a36..add1859b2899 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c @@ -318,12 +318,23 @@ static struct arm_smmu_device *qcom_smmu_create(struct arm_smmu_device *smmu, return &qsmmu->smmu; } +static const struct of_device_id __maybe_unused qcom_smmu_impl_of_match[] = { + { .compatible = "qcom,sc7180-smmu-500" }, + { .compatible = "qcom,sdm845-smmu-500" }, + { .compatible = "qcom,sm8150-smmu-500" }, + { .compatible = "qcom,sm8250-smmu-500" }, + { } +}; + struct arm_smmu_device 
*qcom_smmu_impl_init(struct arm_smmu_device *smmu) { - return qcom_smmu_create(smmu, &qcom_smmu_impl); -} + const struct device_node *np = smmu->dev->of_node; -struct arm_smmu_device *qcom_adreno_smmu_impl_init(struct arm_smmu_device *smmu) -{ - return qcom_smmu_create(smmu, &qcom_adreno_smmu_impl); + if (of_match_node(qcom_smmu_impl_of_match, np)) + return qcom_smmu_create(smmu, &qcom_smmu_impl); + + if (of_device_is_compatible(np, "qcom,adreno-smmu")) + return qcom_smmu_create(smmu, &qcom_adreno_smmu_impl); + + return smmu; } diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.h b/drivers/iommu/arm/arm-smmu/arm-smmu.h index cb7ca3a444c9..d2a2d1bc58ba 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.h +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.h @@ -523,7 +523,6 @@ static inline void arm_smmu_writeq(struct arm_smmu_device *smmu, int page, struct arm_smmu_device *arm_smmu_impl_init(struct arm_smmu_device *smmu); struct arm_smmu_device *nvidia_smmu_impl_init(struct arm_smmu_device *smmu); struct arm_smmu_device *qcom_smmu_impl_init(struct arm_smmu_device *smmu); -struct arm_smmu_device *qcom_adreno_smmu_impl_init(struct arm_smmu_device *smmu); void arm_smmu_write_context_bank(struct arm_smmu_device *smmu, int idx); int arm_mmu500_reset(struct arm_smmu_device *smmu); -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCHv10 7/9] drm/msm/a6xx: Add support for using system cache on MMU500 based targets
From: Jordan Crouse GPU targets with an MMU-500 attached have a slightly different process for enabling system cache. Use the compatible string on the IOMMU phandle to see if an MMU-500 is attached and modify the programming sequence accordingly. Signed-off-by: Jordan Crouse Signed-off-by: Sai Prakash Ranjan --- drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 46 +-- drivers/gpu/drm/msm/adreno/a6xx_gpu.h | 1 + 2 files changed, 37 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c index 95c98c642876..3f8b92da8cba 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c @@ -1042,6 +1042,8 @@ static void a6xx_llc_deactivate(struct a6xx_gpu *a6xx_gpu) static void a6xx_llc_activate(struct a6xx_gpu *a6xx_gpu) { + struct adreno_gpu *adreno_gpu = &a6xx_gpu->base; + struct msm_gpu *gpu = &adreno_gpu->base; u32 cntl1_regval = 0; if (IS_ERR(a6xx_gpu->llc_mmio)) @@ -1055,11 +1057,17 @@ static void a6xx_llc_activate(struct a6xx_gpu *a6xx_gpu) (gpu_scid << 15) | (gpu_scid << 20); } + /* +* For targets with a MMU500, activate the slice but don't program the +* register. The XBL will take care of that. 
+*/ if (!llcc_slice_activate(a6xx_gpu->htw_llc_slice)) { - u32 gpuhtw_scid = llcc_get_slice_id(a6xx_gpu->htw_llc_slice); + if (!a6xx_gpu->have_mmu500) { + u32 gpuhtw_scid = llcc_get_slice_id(a6xx_gpu->htw_llc_slice); - gpuhtw_scid &= 0x1f; - cntl1_regval |= FIELD_PREP(GENMASK(29, 25), gpuhtw_scid); + gpuhtw_scid &= 0x1f; + cntl1_regval |= FIELD_PREP(GENMASK(29, 25), gpuhtw_scid); + } } if (cntl1_regval) { @@ -1067,13 +1075,20 @@ static void a6xx_llc_activate(struct a6xx_gpu *a6xx_gpu) * Program the slice IDs for the various GPU blocks and GPU MMU * pagetables */ - a6xx_llc_write(a6xx_gpu, REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_1, cntl1_regval); - - /* -* Program cacheability overrides to not allocate cache lines on -* a write miss -*/ - a6xx_llc_rmw(a6xx_gpu, REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_0, 0xF, 0x03); + if (a6xx_gpu->have_mmu500) + gpu_rmw(gpu, REG_A6XX_GBIF_SCACHE_CNTL1, GENMASK(24, 0), + cntl1_regval); + else { + a6xx_llc_write(a6xx_gpu, + REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_1, cntl1_regval); + + /* +* Program cacheability overrides to not allocate cache +* lines on a write miss +*/ + a6xx_llc_rmw(a6xx_gpu, + REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_0, 0xF, 0x03); + } } } @@ -1086,10 +1101,21 @@ static void a6xx_llc_slices_destroy(struct a6xx_gpu *a6xx_gpu) static void a6xx_llc_slices_init(struct platform_device *pdev, struct a6xx_gpu *a6xx_gpu) { + struct device_node *phandle; + a6xx_gpu->llc_mmio = msm_ioremap(pdev, "cx_mem", "gpu_cx"); if (IS_ERR(a6xx_gpu->llc_mmio)) return; + /* +* There is a different programming path for targets with an mmu500 +* attached, so detect if that is the case +*/ + phandle = of_parse_phandle(pdev->dev.of_node, "iommus", 0); + a6xx_gpu->have_mmu500 = (phandle && + of_device_is_compatible(phandle, "arm,mmu-500")); + of_node_put(phandle); + a6xx_gpu->llc_slice = llcc_slice_getd(LLCC_GPU); a6xx_gpu->htw_llc_slice = llcc_slice_getd(LLCC_GPUHTW); diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h 
index 9e6079af679c..e793d329e77b 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h @@ -32,6 +32,7 @@ struct a6xx_gpu { void __iomem *llc_mmio; void *llc_slice; void *htw_llc_slice; + bool have_mmu500; }; #define to_a6xx_gpu(x) container_of(x, struct a6xx_gpu, base) -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCHv10 9/9] iommu: arm-smmu-impl: Add a space before open parenthesis
Fix the checkpatch warning for space required before the open parenthesis. Signed-off-by: Sai Prakash Ranjan Acked-by: Will Deacon --- drivers/iommu/arm/arm-smmu/arm-smmu-impl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c b/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c index 26e2734eb4d7..136872e77195 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c @@ -12,7 +12,7 @@ static int arm_smmu_gr0_ns(int offset) { - switch(offset) { + switch (offset) { case ARM_SMMU_GR0_sCR0: case ARM_SMMU_GR0_sACR: case ARM_SMMU_GR0_sGFSR: -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCHv10 0/9] System Cache support for GPU and required SMMU support
Some hardware variants contain a system cache or the last level cache(llc). This cache is typically a large block which is shared by multiple clients on the SOC. GPU uses the system cache to cache both the GPU data buffers(like textures) as well the SMMU pagetables. This helps with improved render performance as well as lower power consumption by reducing the bus traffic to the system memory. The system cache architecture allows the cache to be split into slices which then be used by multiple SOC clients. This patch series is an effort to enable and use two of those slices preallocated for the GPU, one for the GPU data buffers and another for the GPU SMMU hardware pagetables. Patch 1 - Patch 7 adds system cache support in SMMU and GPU driver. Patch 8 and 9 are minor cleanups for arm-smmu impl. Changes in v10: * Fix non-strict mode domain attr handling (Will) * Split the domain attribute patch into two (Will) Changes in v9: * Change name from domain_attr_io_pgtbl_cfg to io_pgtable_domain_attr (Will) * Modify comment for the quirk as suggested (Will) * Compare with IO_PGTABLE_QUIRK_NON_STRICT for non-strict mode (Will) Changes in v8: * Introduce a generic domain attribute for pagetable config (Will) * Rename quirk to more generic IO_PGTABLE_QUIRK_ARM_OUTER_WBWA (Will) * Move non-strict mode to use new struct domain_attr_io_pgtbl_config (Will) Changes in v7: * Squash Jordan's patch to support MMU500 targets * Rebase on top of for-joerg/arm-smmu/updates and Jordan's short series for adreno-smmu impl Changes in v6: * Move table to arm-smmu-qcom (Robin) Changes in v5: * Drop cleanup of blank lines since it was intentional (Robin) * Rebase again on top of msm-next-pgtables as it moves pretty fast Changes in v4: * Drop IOMMU_SYS_CACHE prot flag * Rebase on top of https://gitlab.freedesktop.org/drm/msm/-/tree/msm-next-pgtables Changes in v3: * Fix domain attribute setting to before iommu_attach_device() * Fix few code style and checkpatch warnings * Rebase on top of 
Jordan's latest split pagetables and per-instance pagetables support Changes in v2: * Addressed review comments and rebased on top of Jordan's split pagetables series Jordan Crouse (1): drm/msm/a6xx: Add support for using system cache on MMU500 based targets Sai Prakash Ranjan (6): iommu/io-pgtable: Add a domain attribute for pagetable configuration iommu/io-pgtable-arm: Add support to use system cache iommu/arm-smmu: Add support for pagetable config domain attribute iommu/arm-smmu: Move non-strict mode to use io_pgtable_domain_attr iommu: arm-smmu-impl: Use table to list QCOM implementations iommu: arm-smmu-impl: Add a space before open parenthesis Sharat Masetty (2): drm/msm: rearrange the gpu_rmw() function drm/msm/a6xx: Add support for using system cache(LLC) drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 109 + drivers/gpu/drm/msm/adreno/a6xx_gpu.h | 5 + drivers/gpu/drm/msm/adreno/adreno_gpu.c| 17 drivers/gpu/drm/msm/msm_drv.c | 8 ++ drivers/gpu/drm/msm/msm_drv.h | 1 + drivers/gpu/drm/msm/msm_gpu.h | 5 +- drivers/iommu/arm/arm-smmu/arm-smmu-impl.c | 11 +-- drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 21 +++- drivers/iommu/arm/arm-smmu/arm-smmu.c | 33 ++- drivers/iommu/arm/arm-smmu/arm-smmu.h | 3 +- drivers/iommu/io-pgtable-arm.c | 10 +- include/linux/io-pgtable.h | 8 ++ include/linux/iommu.h | 1 + 13 files changed, 205 insertions(+), 27 deletions(-) base-commit: a29bbb0861f487a5e144dc997a9f71a36c7a2404 -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCHv10 5/9] drm/msm: rearrange the gpu_rmw() function
From: Sharat Masetty The register read-modify-write construct is generic enough that it can be used by other subsystems as needed, create a more generic rmw() function and have the gpu_rmw() use this new function. Signed-off-by: Sharat Masetty Reviewed-by: Jordan Crouse Signed-off-by: Sai Prakash Ranjan --- drivers/gpu/drm/msm/msm_drv.c | 8 drivers/gpu/drm/msm/msm_drv.h | 1 + drivers/gpu/drm/msm/msm_gpu.h | 5 + 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c index 49685571dc0e..a1e22b974b77 100644 --- a/drivers/gpu/drm/msm/msm_drv.c +++ b/drivers/gpu/drm/msm/msm_drv.c @@ -180,6 +180,14 @@ u32 msm_readl(const void __iomem *addr) return val; } +void msm_rmw(void __iomem *addr, u32 mask, u32 or) +{ + u32 val = msm_readl(addr); + + val &= ~mask; + msm_writel(val | or, addr); +} + struct msm_vblank_work { struct work_struct work; int crtc_id; diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h index b9dd8f8f4887..655b3b0424a1 100644 --- a/drivers/gpu/drm/msm/msm_drv.h +++ b/drivers/gpu/drm/msm/msm_drv.h @@ -478,6 +478,7 @@ void __iomem *msm_ioremap_quiet(struct platform_device *pdev, const char *name, const char *dbgname); void msm_writel(u32 data, void __iomem *addr); u32 msm_readl(const void __iomem *addr); +void msm_rmw(void __iomem *addr, u32 mask, u32 or); struct msm_gpu_submitqueue; int msm_submitqueue_init(struct drm_device *drm, struct msm_file_private *ctx); diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h index 6c9e1fdc1a76..b2b419277953 100644 --- a/drivers/gpu/drm/msm/msm_gpu.h +++ b/drivers/gpu/drm/msm/msm_gpu.h @@ -246,10 +246,7 @@ static inline u32 gpu_read(struct msm_gpu *gpu, u32 reg) static inline void gpu_rmw(struct msm_gpu *gpu, u32 reg, u32 mask, u32 or) { - uint32_t val = gpu_read(gpu, reg); - - val &= ~mask; - gpu_write(gpu, reg, val | or); + msm_rmw(gpu->mmio + (reg << 2), mask, or); } static inline u64 gpu_read64(struct 
msm_gpu *gpu, u32 lo, u32 hi) -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCHv10 6/9] drm/msm/a6xx: Add support for using system cache(LLC)
From: Sharat Masetty The last level system cache can be partitioned to 32 different slices of which GPU has two slices preallocated. One slice is used for caching GPU buffers and the other slice is used for caching the GPU SMMU pagetables. This talks to the core system cache driver to acquire the slice handles, configure the SCID's to those slices and activates and deactivates the slices upon GPU power collapse and restore. Some support from the IOMMU driver is also needed to make use of the system cache to set the right TCR attributes. GPU then has the ability to override a few cacheability parameters which it does to override write-allocate to write-no-allocate as the GPU hardware does not benefit much from it. DOMAIN_ATTR_IO_PGTABLE_CFG is another domain level attribute used by the IOMMU driver for pagetable configuration which will be used to set a quirk initially to set the right attributes to cache the hardware pagetables into the system cache. Signed-off-by: Sharat Masetty [saiprakash.ranjan: fix to set attr before device attach to iommu and rebase] Signed-off-by: Sai Prakash Ranjan --- drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 83 + drivers/gpu/drm/msm/adreno/a6xx_gpu.h | 4 ++ drivers/gpu/drm/msm/adreno/adreno_gpu.c | 17 + 3 files changed, 104 insertions(+) diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c index 948f3656c20c..95c98c642876 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c @@ -8,7 +8,9 @@ #include "a6xx_gpu.h" #include "a6xx_gmu.xml.h" +#include #include +#include #define GPU_PAS_ID 13 @@ -1022,6 +1024,79 @@ static irqreturn_t a6xx_irq(struct msm_gpu *gpu) return IRQ_HANDLED; } +static void a6xx_llc_rmw(struct a6xx_gpu *a6xx_gpu, u32 reg, u32 mask, u32 or) +{ + return msm_rmw(a6xx_gpu->llc_mmio + (reg << 2), mask, or); +} + +static void a6xx_llc_write(struct a6xx_gpu *a6xx_gpu, u32 reg, u32 value) +{ + return msm_writel(value, a6xx_gpu->llc_mmio + (reg << 2)); 
+} + +static void a6xx_llc_deactivate(struct a6xx_gpu *a6xx_gpu) +{ + llcc_slice_deactivate(a6xx_gpu->llc_slice); + llcc_slice_deactivate(a6xx_gpu->htw_llc_slice); +} + +static void a6xx_llc_activate(struct a6xx_gpu *a6xx_gpu) +{ + u32 cntl1_regval = 0; + + if (IS_ERR(a6xx_gpu->llc_mmio)) + return; + + if (!llcc_slice_activate(a6xx_gpu->llc_slice)) { + u32 gpu_scid = llcc_get_slice_id(a6xx_gpu->llc_slice); + + gpu_scid &= 0x1f; + cntl1_regval = (gpu_scid << 0) | (gpu_scid << 5) | (gpu_scid << 10) | + (gpu_scid << 15) | (gpu_scid << 20); + } + + if (!llcc_slice_activate(a6xx_gpu->htw_llc_slice)) { + u32 gpuhtw_scid = llcc_get_slice_id(a6xx_gpu->htw_llc_slice); + + gpuhtw_scid &= 0x1f; + cntl1_regval |= FIELD_PREP(GENMASK(29, 25), gpuhtw_scid); + } + + if (cntl1_regval) { + /* +* Program the slice IDs for the various GPU blocks and GPU MMU +* pagetables +*/ + a6xx_llc_write(a6xx_gpu, REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_1, cntl1_regval); + + /* +* Program cacheability overrides to not allocate cache lines on +* a write miss +*/ + a6xx_llc_rmw(a6xx_gpu, REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_0, 0xF, 0x03); + } +} + +static void a6xx_llc_slices_destroy(struct a6xx_gpu *a6xx_gpu) +{ + llcc_slice_putd(a6xx_gpu->llc_slice); + llcc_slice_putd(a6xx_gpu->htw_llc_slice); +} + +static void a6xx_llc_slices_init(struct platform_device *pdev, + struct a6xx_gpu *a6xx_gpu) +{ + a6xx_gpu->llc_mmio = msm_ioremap(pdev, "cx_mem", "gpu_cx"); + if (IS_ERR(a6xx_gpu->llc_mmio)) + return; + + a6xx_gpu->llc_slice = llcc_slice_getd(LLCC_GPU); + a6xx_gpu->htw_llc_slice = llcc_slice_getd(LLCC_GPUHTW); + + if (IS_ERR(a6xx_gpu->llc_slice) && IS_ERR(a6xx_gpu->htw_llc_slice)) + a6xx_gpu->llc_mmio = ERR_PTR(-EINVAL); +} + static int a6xx_pm_resume(struct msm_gpu *gpu) { struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); @@ -1038,6 +1113,8 @@ static int a6xx_pm_resume(struct msm_gpu *gpu) msm_gpu_resume_devfreq(gpu); + a6xx_llc_activate(a6xx_gpu); + return 0; } @@ -1048,6 +1125,8 @@ static int 
a6xx_pm_suspend(struct msm_gpu *gpu) trace_msm_gpu_suspend(0); + a6xx_llc_deactivate(a6xx_gpu); + devfreq_suspend_device(gpu->devfreq.devfreq); return a6xx_gmu_stop(a6xx_gpu); @@ -1091,6 +1170,8 @@ static void a6xx_destroy(struct msm_gpu *gpu) d
[PATCHv10 2/9] iommu/io-pgtable-arm: Add support to use system cache
Add a quirk IO_PGTABLE_QUIRK_ARM_OUTER_WBWA to override the outer-cacheability attributes set in the TCR for a non-coherent page table walker when using system cache. Signed-off-by: Sai Prakash Ranjan --- drivers/iommu/io-pgtable-arm.c | 10 -- include/linux/io-pgtable.h | 4 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index a7a9bc08dcd1..7c9ea9d7874a 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -761,7 +761,8 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie) if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_NS | IO_PGTABLE_QUIRK_NON_STRICT | - IO_PGTABLE_QUIRK_ARM_TTBR1)) + IO_PGTABLE_QUIRK_ARM_TTBR1 | + IO_PGTABLE_QUIRK_ARM_OUTER_WBWA)) return NULL; data = arm_lpae_alloc_pgtable(cfg); @@ -773,10 +774,15 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie) tcr->sh = ARM_LPAE_TCR_SH_IS; tcr->irgn = ARM_LPAE_TCR_RGN_WBWA; tcr->orgn = ARM_LPAE_TCR_RGN_WBWA; + if (cfg->quirks & IO_PGTABLE_QUIRK_ARM_OUTER_WBWA) + goto out_free_data; } else { tcr->sh = ARM_LPAE_TCR_SH_OS; tcr->irgn = ARM_LPAE_TCR_RGN_NC; - tcr->orgn = ARM_LPAE_TCR_RGN_NC; + if (!(cfg->quirks & IO_PGTABLE_QUIRK_ARM_OUTER_WBWA)) + tcr->orgn = ARM_LPAE_TCR_RGN_NC; + else + tcr->orgn = ARM_LPAE_TCR_RGN_WBWA; } tg1 = cfg->quirks & IO_PGTABLE_QUIRK_ARM_TTBR1; diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h index 215fd9d69540..fb4d5a763e0c 100644 --- a/include/linux/io-pgtable.h +++ b/include/linux/io-pgtable.h @@ -86,6 +86,9 @@ struct io_pgtable_cfg { * * IO_PGTABLE_QUIRK_ARM_TTBR1: (ARM LPAE format) Configure the table * for use in the upper half of a split address space. +* +* IO_PGTABLE_QUIRK_ARM_OUTER_WBWA: Override the outer-cacheability +* attributes set in the TCR for a non-coherent page-table walker. 
*/ #define IO_PGTABLE_QUIRK_ARM_NS BIT(0) #define IO_PGTABLE_QUIRK_NO_PERMS BIT(1) @@ -93,6 +96,7 @@ struct io_pgtable_cfg { #define IO_PGTABLE_QUIRK_ARM_MTK_EXT BIT(3) #define IO_PGTABLE_QUIRK_NON_STRICT BIT(4) #define IO_PGTABLE_QUIRK_ARM_TTBR1 BIT(5) + #define IO_PGTABLE_QUIRK_ARM_OUTER_WBWA BIT(6) unsigned long quirks; unsigned long pgsize_bitmap; unsigned int ias; -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCHv10 4/9] iommu/arm-smmu: Move non-strict mode to use io_pgtable_domain_attr
Now that we have a struct io_pgtable_domain_attr with quirks, use that for non_strict mode as well thereby removing the need for more members of arm_smmu_domain in the future. Signed-off-by: Sai Prakash Ranjan --- drivers/iommu/arm/arm-smmu/arm-smmu.c | 15 +-- drivers/iommu/arm/arm-smmu/arm-smmu.h | 1 - 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index 4b9b10fe50ed..d8979bb71fc0 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -786,9 +786,6 @@ static int arm_smmu_init_domain_context(struct iommu_domain *domain, goto out_clear_smmu; } - if (smmu_domain->non_strict) - pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_NON_STRICT; - if (smmu_domain->pgtbl_cfg.quirks) pgtbl_cfg.quirks |= smmu_domain->pgtbl_cfg.quirks; @@ -1526,9 +1523,12 @@ static int arm_smmu_domain_get_attr(struct iommu_domain *domain, break; case IOMMU_DOMAIN_DMA: switch (attr) { - case DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE: - *(int *)data = smmu_domain->non_strict; + case DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE: { + bool non_strict = smmu_domain->pgtbl_cfg.quirks & + IO_PGTABLE_QUIRK_NON_STRICT; + *(int *)data = non_strict; return 0; + } default: return -ENODEV; } @@ -1578,7 +1578,10 @@ static int arm_smmu_domain_set_attr(struct iommu_domain *domain, case IOMMU_DOMAIN_DMA: switch (attr) { case DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE: - smmu_domain->non_strict = *(int *)data; + if (*(int *)data) + smmu_domain->pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_NON_STRICT; + else + smmu_domain->pgtbl_cfg.quirks &= ~IO_PGTABLE_QUIRK_NON_STRICT; break; default: ret = -ENODEV; diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.h b/drivers/iommu/arm/arm-smmu/arm-smmu.h index bb5a419f240f..cb7ca3a444c9 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.h +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.h @@ -368,7 +368,6 @@ struct arm_smmu_domain { const struct iommu_flush_ops*flush_ops; struct arm_smmu_cfg cfg; enum 
arm_smmu_domain_stage stage; - bool non_strict; struct mutex init_mutex; /* Protects smmu pointer */ spinlock_t cb_lock; /* Serialises ATS1* ops and TLB syncs */ struct iommu_domain domain; -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCHv10 3/9] iommu/arm-smmu: Add support for pagetable config domain attribute
Add support for domain attribute DOMAIN_ATTR_IO_PGTABLE_CFG to get/set pagetable configuration data which initially will be used to set quirks and later can be extended to include other pagetable configuration data. Signed-off-by: Sai Prakash Ranjan --- drivers/iommu/arm/arm-smmu/arm-smmu.c | 20 drivers/iommu/arm/arm-smmu/arm-smmu.h | 1 + 2 files changed, 21 insertions(+) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index 0f28a8614da3..4b9b10fe50ed 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -789,6 +789,9 @@ static int arm_smmu_init_domain_context(struct iommu_domain *domain, if (smmu_domain->non_strict) pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_NON_STRICT; + if (smmu_domain->pgtbl_cfg.quirks) + pgtbl_cfg.quirks |= smmu_domain->pgtbl_cfg.quirks; + pgtbl_ops = alloc_io_pgtable_ops(fmt, &pgtbl_cfg, smmu_domain); if (!pgtbl_ops) { ret = -ENOMEM; @@ -1511,6 +1514,12 @@ static int arm_smmu_domain_get_attr(struct iommu_domain *domain, case DOMAIN_ATTR_NESTING: *(int *)data = (smmu_domain->stage == ARM_SMMU_DOMAIN_NESTED); return 0; + case DOMAIN_ATTR_IO_PGTABLE_CFG: { + struct io_pgtable_domain_attr *pgtbl_cfg = data; + *pgtbl_cfg = smmu_domain->pgtbl_cfg; + + return 0; + } default: return -ENODEV; } @@ -1551,6 +1560,17 @@ static int arm_smmu_domain_set_attr(struct iommu_domain *domain, else smmu_domain->stage = ARM_SMMU_DOMAIN_S1; break; + case DOMAIN_ATTR_IO_PGTABLE_CFG: { + struct io_pgtable_domain_attr *pgtbl_cfg = data; + + if (smmu_domain->smmu) { + ret = -EPERM; + goto out_unlock; + } + + smmu_domain->pgtbl_cfg = *pgtbl_cfg; + break; + } default: ret = -ENODEV; } diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.h b/drivers/iommu/arm/arm-smmu/arm-smmu.h index 04288b6fc619..bb5a419f240f 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.h +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.h @@ -364,6 +364,7 @@ enum arm_smmu_domain_stage { struct arm_smmu_domain { struct 
arm_smmu_device *smmu; struct io_pgtable_ops *pgtbl_ops; + struct io_pgtable_domain_attr pgtbl_cfg; const struct iommu_flush_ops*flush_ops; struct arm_smmu_cfg cfg; enum arm_smmu_domain_stage stage; -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu