arm-smmu-v3: Add support for NVIDIA CMDQ-Virtualization hw

Nicolin Chen via iommu Fri, 23 Jul 2021 12:32:39 -0700

From: Nate Watterson <[email protected]>

NVIDIA's Grace SoC includes custom CMDQ-Virtualization (CMDQV)
hardware, which adds multiple VCMDQ interfaces to supplement
the architected SMMU_CMDQ in an effort to reduce contention.


To make use of these supplemental CMDQs in arm-smmu-v3 driver,
we borrow the "implementation infrastructure" design from the
arm-smmu driver, and add support for implementation defined
issue_cmdlist methods.

Signed-off-by: Nate Watterson <[email protected]>
Signed-off-by: Nicolin Chen <[email protected]>
---
 MAINTAINERS                                   |   2 +
 drivers/iommu/arm/arm-smmu-v3/Makefile        |   2 +-
 .../iommu/arm/arm-smmu-v3/arm-smmu-v3-impl.c  |   7 +
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c   |  67 +--
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h   |  11 +
 .../iommu/arm/arm-smmu-v3/nvidia-smmu-v3.c    | 425 ++++++++++++++++++
 6 files changed, 487 insertions(+), 27 deletions(-)
 create mode 100644 drivers/iommu/arm/arm-smmu-v3/nvidia-smmu-v3.c

diff --git a/MAINTAINERS b/MAINTAINERS
index d69b2d4646be..e72e3459c9be 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -18240,8 +18240,10 @@ F:     drivers/i2c/busses/i2c-tegra.c
 TEGRA IOMMU DRIVERS
 M:     Thierry Reding <[email protected]>
 R:     Krishna Reddy <[email protected]>
+R:     Nicolin Chen <[email protected]>
 L:     [email protected]
 S:     Supported
+F:     drivers/iommu/arm/arm-smmu-v3/nvidia-smmu-v3.c
 F:     drivers/iommu/arm/arm-smmu/arm-smmu-nvidia.c
 F:     drivers/iommu/tegra*
 
diff --git a/drivers/iommu/arm/arm-smmu-v3/Makefile 
b/drivers/iommu/arm/arm-smmu-v3/Makefile
index 1f5838d3351b..0aa84c0a50ea 100644
--- a/drivers/iommu/arm/arm-smmu-v3/Makefile
+++ b/drivers/iommu/arm/arm-smmu-v3/Makefile
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0
 obj-$(CONFIG_ARM_SMMU_V3) += arm_smmu_v3.o
-arm_smmu_v3-objs-y += arm-smmu-v3.o arm-smmu-v3-impl.o
+arm_smmu_v3-objs-y += arm-smmu-v3.o arm-smmu-v3-impl.o nvidia-smmu-v3.o
 arm_smmu_v3-objs-$(CONFIG_ARM_SMMU_V3_SVA) += arm-smmu-v3-sva.o
 arm_smmu_v3-objs := $(arm_smmu_v3-objs-y)
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-impl.c 
b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-impl.c
index 6947d28067a8..37d062e40eb5 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-impl.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-impl.c
@@ -4,5 +4,12 @@
 
 struct arm_smmu_device *arm_smmu_v3_impl_init(struct arm_smmu_device *smmu)
 {
+       /*
+        * The NVIDIA implementation supports ACPI only, so its init() is
+        * called unconditionally and walks the ACPI tables to look for a
+        * companion CMDQV device.
+        *
+        * If no companion device is found, the smmu pointer is returned
+        * unchanged. Otherwise the structure is reallocated, so the returned
+        * pointer must replace the one passed in; it may also be an ERR_PTR()
+        * if the allocation or the CMDQV probe fails.
+        */
+       smmu = nvidia_smmu_v3_impl_init(smmu);
+
        return smmu;
 }
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c 
b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index b2d23de2b207..439809e1acd4 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -336,9 +336,9 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct 
arm_smmu_cmdq_ent *ent)
 }
 
 static void arm_smmu_cmdq_build_sync_cmd(u64 *cmd, struct arm_smmu_device 
*smmu,
-                                        u32 prod)
+                                        u32 prod, struct arm_smmu_cmdq *cmdq)
 {
-       struct arm_smmu_queue *q = &smmu->cmdq.q;
+       struct arm_smmu_queue *q = &cmdq->q;
        struct arm_smmu_cmdq_ent ent = {
                .opcode = CMDQ_OP_CMD_SYNC,
        };
@@ -575,11 +575,11 @@ static void arm_smmu_cmdq_poll_valid_map(struct 
arm_smmu_cmdq *cmdq,
 
 /* Wait for the command queue to become non-full */
 static int arm_smmu_cmdq_poll_until_not_full(struct arm_smmu_device *smmu,
-                                            struct arm_smmu_ll_queue *llq)
+                                            struct arm_smmu_ll_queue *llq,
+                                            struct arm_smmu_cmdq *cmdq)
 {
        unsigned long flags;
        struct arm_smmu_queue_poll qp;
-       struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
        int ret = 0;
 
        /*
@@ -595,7 +595,7 @@ static int arm_smmu_cmdq_poll_until_not_full(struct 
arm_smmu_device *smmu,
 
        queue_poll_init(smmu, &qp);
        do {
-               llq->val = READ_ONCE(smmu->cmdq.q.llq.val);
+               llq->val = READ_ONCE(cmdq->q.llq.val);
                if (!queue_full(llq))
                        break;
 
@@ -610,11 +610,11 @@ static int arm_smmu_cmdq_poll_until_not_full(struct 
arm_smmu_device *smmu,
  * Must be called with the cmdq lock held in some capacity.
  */
 static int __arm_smmu_cmdq_poll_until_msi(struct arm_smmu_device *smmu,
-                                         struct arm_smmu_ll_queue *llq)
+                                         struct arm_smmu_ll_queue *llq,
+                                         struct arm_smmu_cmdq *cmdq)
 {
        int ret = 0;
        struct arm_smmu_queue_poll qp;
-       struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
        u32 *cmd = (u32 *)(Q_ENT(&cmdq->q, llq->prod));
 
        queue_poll_init(smmu, &qp);
@@ -634,15 +634,15 @@ static int __arm_smmu_cmdq_poll_until_msi(struct 
arm_smmu_device *smmu,
  * Must be called with the cmdq lock held in some capacity.
  */
 static int __arm_smmu_cmdq_poll_until_consumed(struct arm_smmu_device *smmu,
-                                              struct arm_smmu_ll_queue *llq)
+                                              struct arm_smmu_ll_queue *llq,
+                                              struct arm_smmu_cmdq *cmdq)
 {
        struct arm_smmu_queue_poll qp;
-       struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
        u32 prod = llq->prod;
        int ret = 0;
 
        queue_poll_init(smmu, &qp);
-       llq->val = READ_ONCE(smmu->cmdq.q.llq.val);
+       llq->val = READ_ONCE(cmdq->q.llq.val);
        do {
                if (queue_consumed(llq, prod))
                        break;
@@ -684,12 +684,13 @@ static int __arm_smmu_cmdq_poll_until_consumed(struct 
arm_smmu_device *smmu,
 }
 
 static int arm_smmu_cmdq_poll_until_sync(struct arm_smmu_device *smmu,
-                                        struct arm_smmu_ll_queue *llq)
+                                        struct arm_smmu_ll_queue *llq,
+                                        struct arm_smmu_cmdq *cmdq)
 {
        if (smmu->options & ARM_SMMU_OPT_MSIPOLL)
-               return __arm_smmu_cmdq_poll_until_msi(smmu, llq);
+               return __arm_smmu_cmdq_poll_until_msi(smmu, llq, cmdq);
 
-       return __arm_smmu_cmdq_poll_until_consumed(smmu, llq);
+       return __arm_smmu_cmdq_poll_until_consumed(smmu, llq, cmdq);
 }
 
 static void arm_smmu_cmdq_write_entries(struct arm_smmu_cmdq *cmdq, u64 *cmds,
@@ -709,6 +710,14 @@ static void arm_smmu_cmdq_write_entries(struct 
arm_smmu_cmdq *cmdq, u64 *cmds,
        }
 }
 
+/*
+ * Submit a list of commands, letting an implementation-specific
+ * issue_cmdlist hook pick the queue when one is installed. Without a hook
+ * the commands go to the architected SMMU command queue.
+ */
+static int arm_smmu_issue_cmdlist(struct arm_smmu_device *smmu, u64 *cmds, int n, bool sync)
+{
+       const struct arm_smmu_impl *impl = smmu->impl;
+
+       if (!impl || !impl->issue_cmdlist)
+               return arm_smmu_cmdq_issue_cmdlist(smmu, cmds, n, sync, &smmu->cmdq);
+
+       return impl->issue_cmdlist(smmu, cmds, n, sync);
+}
+
+
 /*
  * This is the actual insertion function, and provides the following
  * ordering guarantees to callers:
@@ -725,14 +734,13 @@ static void arm_smmu_cmdq_write_entries(struct 
arm_smmu_cmdq *cmdq, u64 *cmds,
  *   insert their own list of commands then all of the commands from one
  *   CPU will appear before any of the commands from the other CPU.
  */
-static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu,
-                                      u64 *cmds, int n, bool sync)
+int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu, u64 *cmds, int 
n, bool sync,
+                               struct arm_smmu_cmdq *cmdq)
 {
        u64 cmd_sync[CMDQ_ENT_DWORDS];
        u32 prod;
        unsigned long flags;
        bool owner;
-       struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
        struct arm_smmu_ll_queue llq = {
                .max_n_shift = cmdq->q.llq.max_n_shift,
        }, head = llq;
@@ -746,7 +754,7 @@ static int arm_smmu_cmdq_issue_cmdlist(struct 
arm_smmu_device *smmu,
 
                while (!queue_has_space(&llq, n + sync)) {
                        local_irq_restore(flags);
-                       if (arm_smmu_cmdq_poll_until_not_full(smmu, &llq))
+                       if (arm_smmu_cmdq_poll_until_not_full(smmu, &llq, cmdq))
                                dev_err_ratelimited(smmu->dev, "CMDQ 
timeout\n");
                        local_irq_save(flags);
                }
@@ -772,7 +780,7 @@ static int arm_smmu_cmdq_issue_cmdlist(struct 
arm_smmu_device *smmu,
        arm_smmu_cmdq_write_entries(cmdq, cmds, llq.prod, n);
        if (sync) {
                prod = queue_inc_prod_n(&llq, n);
-               arm_smmu_cmdq_build_sync_cmd(cmd_sync, smmu, prod);
+               arm_smmu_cmdq_build_sync_cmd(cmd_sync, smmu, prod, cmdq);
                queue_write(Q_ENT(&cmdq->q, prod), cmd_sync, CMDQ_ENT_DWORDS);
 
                /*
@@ -822,7 +830,7 @@ static int arm_smmu_cmdq_issue_cmdlist(struct 
arm_smmu_device *smmu,
        /* 5. If we are inserting a CMD_SYNC, we must wait for it to complete */
        if (sync) {
                llq.prod = queue_inc_prod_n(&llq, n);
-               ret = arm_smmu_cmdq_poll_until_sync(smmu, &llq);
+               ret = arm_smmu_cmdq_poll_until_sync(smmu, &llq, cmdq);
                if (ret) {
                        dev_err_ratelimited(smmu->dev,
                                            "CMD_SYNC timeout at 0x%08x [hwprod 
0x%08x, hwcons 0x%08x]\n",
@@ -856,12 +864,12 @@ static int arm_smmu_cmdq_issue_cmd(struct arm_smmu_device 
*smmu,
                return -EINVAL;
        }
 
-       return arm_smmu_cmdq_issue_cmdlist(smmu, cmd, 1, false);
+       return arm_smmu_issue_cmdlist(smmu, cmd, 1, false);
 }
 
 static int arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
 {
-       return arm_smmu_cmdq_issue_cmdlist(smmu, NULL, 0, true);
+       return arm_smmu_issue_cmdlist(smmu, NULL, 0, true);
 }
 
 static void arm_smmu_cmdq_batch_add(struct arm_smmu_device *smmu,
@@ -869,7 +877,7 @@ static void arm_smmu_cmdq_batch_add(struct arm_smmu_device 
*smmu,
                                    struct arm_smmu_cmdq_ent *cmd)
 {
        if (cmds->num == CMDQ_BATCH_ENTRIES) {
-               arm_smmu_cmdq_issue_cmdlist(smmu, cmds->cmds, cmds->num, false);
+               arm_smmu_issue_cmdlist(smmu, cmds->cmds, cmds->num, false);
                cmds->num = 0;
        }
        arm_smmu_cmdq_build_cmd(&cmds->cmds[cmds->num * CMDQ_ENT_DWORDS], cmd);
@@ -879,7 +887,7 @@ static void arm_smmu_cmdq_batch_add(struct arm_smmu_device 
*smmu,
 static int arm_smmu_cmdq_batch_submit(struct arm_smmu_device *smmu,
                                      struct arm_smmu_cmdq_batch *cmds)
 {
-       return arm_smmu_cmdq_issue_cmdlist(smmu, cmds->cmds, cmds->num, true);
+       return arm_smmu_issue_cmdlist(smmu, cmds->cmds, cmds->num, true);
 }
 
 static int arm_smmu_page_response(struct device *dev,
@@ -2899,10 +2907,9 @@ static void arm_smmu_cmdq_free_bitmap(void *data)
        bitmap_free(bitmap);
 }
 
-static int arm_smmu_cmdq_init(struct arm_smmu_device *smmu)
+static int arm_smmu_cmdq_init(struct arm_smmu_device *smmu, struct 
arm_smmu_cmdq *cmdq)
 {
        int ret = 0;
-       struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
        unsigned int nents = 1 << cmdq->q.llq.max_n_shift;
        atomic_long_t *bitmap;
 
@@ -2932,7 +2939,7 @@ static int arm_smmu_init_queues(struct arm_smmu_device 
*smmu)
        if (ret)
                return ret;
 
-       ret = arm_smmu_cmdq_init(smmu);
+       ret = arm_smmu_cmdq_init(smmu, &smmu->cmdq);
        if (ret)
                return ret;
 
@@ -3416,6 +3423,14 @@ static int arm_smmu_device_reset(struct arm_smmu_device 
*smmu, bool bypass)
                return ret;
        }
 
+       if (smmu->impl && smmu->impl->device_reset) {
+               ret = smmu->impl->device_reset(smmu);
+               if (ret) {
+                       dev_err(smmu->dev, "failed at implementation specific 
device_reset\n");
+                       return ret;
+               }
+       }
+
        return 0;
 }
 
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h 
b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
index 4c60ba14221b..baec2d3a46f9 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
@@ -647,6 +647,8 @@ struct arm_smmu_device {
 #define ARM_SMMU_OPT_MSIPOLL           (1 << 2)
        u32                             options;
 
+       const struct arm_smmu_impl      *impl;
+
        struct arm_smmu_cmdq            cmdq;
        struct arm_smmu_evtq            evtq;
        struct arm_smmu_priq            priq;
@@ -807,7 +809,16 @@ static inline u32 arm_smmu_sva_get_pasid(struct iommu_sva 
*handle)
 static inline void arm_smmu_sva_notifier_synchronize(void) {}
 #endif /* CONFIG_ARM_SMMU_V3_SVA */
 
+int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu, u64 *cmds, int 
n, bool sync,
+                               struct arm_smmu_cmdq *cmdq);
+
 /* Implementation details */
+/*
+ * Optional implementation-specific hooks. A NULL smmu->impl or a NULL
+ * member means the generic behaviour is used.
+ */
+struct arm_smmu_impl {
+       /*
+        * Called at the end of arm_smmu_device_reset(); a non-zero return
+        * value is propagated as the reset failure.
+        */
+       int (*device_reset)(struct arm_smmu_device *smmu);
+       /*
+        * When set, replaces the default submission to smmu->cmdq; takes the
+        * same cmds/n/sync arguments as arm_smmu_cmdq_issue_cmdlist().
+        */
+       int (*issue_cmdlist)(struct arm_smmu_device *smmu, u64 *cmds, int n, 
bool sync);
+};
+
 struct arm_smmu_device *arm_smmu_v3_impl_init(struct arm_smmu_device *smmu);
+struct arm_smmu_device *nvidia_smmu_v3_impl_init(struct arm_smmu_device *smmu);
 
 #endif /* _ARM_SMMU_V3_H */
diff --git a/drivers/iommu/arm/arm-smmu-v3/nvidia-smmu-v3.c 
b/drivers/iommu/arm/arm-smmu-v3/nvidia-smmu-v3.c
new file mode 100644
index 000000000000..ceec2a24057f
--- /dev/null
+++ b/drivers/iommu/arm/arm-smmu-v3/nvidia-smmu-v3.c
@@ -0,0 +1,425 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define dev_fmt(fmt) "nvidia_smmu_cmdqv: " fmt
+
+#include <linux/acpi.h>
+#include <linux/dma-mapping.h>
+#include <linux/iommu.h>
+#include <linux/iopoll.h>
+#include <linux/platform_device.h>
+
+#include <acpi/acpixf.h>
+
+#include "arm-smmu-v3.h"
+
+#define NVIDIA_SMMU_CMDQV_HID          "NVDA0600"
+
+/* CMDQV global config regs */
+#define NVIDIA_CMDQV_CONFIG            0x0000
+#define  CMDQV_EN                      BIT(0)
+
+#define NVIDIA_CMDQV_PARAM             0x0004
+#define  CMDQV_NUM_VINTF_LOG2          GENMASK(11, 8)
+#define  CMDQV_NUM_VCMDQ_LOG2          GENMASK(7, 4)
+
+#define NVIDIA_CMDQV_STATUS            0x0008
+#define  CMDQV_STATUS                  GENMASK(2, 1)
+#define  CMDQV_ENABLED                 BIT(0)
+
+#define NVIDIA_CMDQV_VINTF_ERR_MAP     0x000C
+#define NVIDIA_CMDQV_VINTF_INT_MASK    0x0014
+#define NVIDIA_CMDQV_VCMDQ_ERR_MAP     0x001C
+
+#define NVIDIA_CMDQV_CMDQ_ALLOC(q)     (0x0200 + 0x4*(q))
+#define  CMDQV_CMDQ_ALLOC_VINTF                GENMASK(20, 15)
+#define  CMDQV_CMDQ_ALLOC_LVCMDQ       GENMASK(7, 1)
+#define  CMDQV_CMDQ_ALLOCATED          BIT(0)
+
+/* VINTF config regs */
+#define NVIDIA_CMDQV_VINTF(v)          (0x1000 + 0x100*(v))
+
+#define NVIDIA_VINTF_CONFIG            0x0000
+#define  VINTF_HYP_OWN                 BIT(17)
+#define  VINTF_VMID                    GENMASK(16, 1)
+#define  VINTF_EN                      BIT(0)
+
+#define NVIDIA_VINTF_STATUS            0x0004
+#define  VINTF_STATUS                  GENMASK(3, 1)
+#define  VINTF_ENABLED                 BIT(0)
+
+/* VCMDQ config regs */
+#define NVIDIA_CMDQV_VCMDQ(q)          (0x10000 + 0x80*(q))
+
+#define NVIDIA_VCMDQ_CONS              0x00000
+#define  VCMDQ_CONS_ERR                        GENMASK(30, 24)
+
+#define NVIDIA_VCMDQ_PROD              0x00004
+
+#define NVIDIA_VCMDQ_CONFIG            0x00008
+#define  VCMDQ_EN                      BIT(0)
+
+#define NVIDIA_VCMDQ_STATUS            0x0000C
+#define  VCMDQ_ENABLED                 BIT(0)
+
+#define NVIDIA_VCMDQ_GERROR            0x00010
+#define NVIDIA_VCMDQ_GERRORN           0x00014
+
+#define NVIDIA_VCMDQ_BASE              0x10000
+#define  VCMDQ_ADDR                    GENMASK(63, 5)
+#define  VCMDQ_LOG2SIZE                        GENMASK(4, 0)
+
+/* Software state of one CMDQV virtual interface (VINTF) */
+struct nvidia_smmu_vintf {
+       /* Index of this VINTF; used to locate its bit in the VINTF error map */
+       u16                     idx;
+       /* Last value written to NVIDIA_VINTF_CONFIG */
+       u32                     cfg;
+       /* Last value read from NVIDIA_VINTF_STATUS (at init or in the ISR) */
+       u32                     status;
+
+       /* MMIO base of this VINTF's register block */
+       void __iomem            *base;
+       /* Array of num_vcmdqs_per_vintf command queues allocated to this VINTF */
+       struct arm_smmu_cmdq    *vcmdqs;
+};
+
+/*
+ * NVIDIA extension of struct arm_smmu_device.
+ *
+ * The embedded smmu must stay the first member: the object is created by
+ * reallocating the original arm_smmu_device, and the impl hooks cast the
+ * arm_smmu_device pointer straight back to struct nvidia_smmu.
+ */
+struct nvidia_smmu {
+       struct arm_smmu_device  smmu;
+
+       /* Companion CMDQV platform device, its MMIO base and IRQ (0 if none) */
+       struct device           *cmdqv_dev;
+       void __iomem            *cmdqv_base;
+       int                     cmdqv_irq;
+
+       /* CMDQV Hardware Params */
+       u16                     num_total_vintfs;
+       u16                     num_total_vcmdqs;
+       u16                     num_vcmdqs_per_vintf;
+
+       /* CMDQV_VINTF(0) reserved for host kernel use */
+       struct nvidia_smmu_vintf vintf0;
+};
+
+/*
+ * CMDQV error interrupt handler.
+ *
+ * Dumps the VINTF and VCMDQ error maps. If VINTF0 is flagged, the cached
+ * vintf0->status is refreshed from NVIDIA_VINTF_STATUS; the issue path
+ * compares that cached value against VINTF_ENABLED before choosing a VCMDQ,
+ * so a faulted VINTF0 stops being used.
+ *
+ * @irq:   interrupt number (unused)
+ * @devid: the struct nvidia_smmu registered with the interrupt
+ *
+ * Always returns IRQ_HANDLED.
+ * NOTE(review): the line is requested with IRQF_SHARED, yet the handler warns
+ * and claims the interrupt even when no error bit is set - confirm intended.
+ */
+static irqreturn_t nvidia_smmu_cmdqv_isr(int irq, void *devid)
+{
+       struct nvidia_smmu *nsmmu = devid;
+       struct nvidia_smmu_vintf *vintf0 = &nsmmu->vintf0;
+       u32 vintf_err_map[2];
+       u32 vcmdq_err_map[4];
+       unsigned int i;
+
+       /* Each error map is a run of consecutive 32-bit registers */
+       for (i = 0; i < ARRAY_SIZE(vintf_err_map); i++)
+               vintf_err_map[i] = readl_relaxed(nsmmu->cmdqv_base +
+                                                NVIDIA_CMDQV_VINTF_ERR_MAP + 0x4 * i);
+
+       for (i = 0; i < ARRAY_SIZE(vcmdq_err_map); i++)
+               vcmdq_err_map[i] = readl_relaxed(nsmmu->cmdqv_base +
+                                                NVIDIA_CMDQV_VCMDQ_ERR_MAP + 0x4 * i);
+
+       dev_warn(nsmmu->cmdqv_dev,
+                "Unexpected cmdqv error reported: vintf_map %08X %08X, vcmdq_map %08X %08X %08X %08X\n",
+                vintf_err_map[0], vintf_err_map[1], vcmdq_err_map[0], vcmdq_err_map[1],
+                vcmdq_err_map[2], vcmdq_err_map[3]);
+
+       /*
+        * If the error was reported by vintf0, avoid using any of its VCMDQs.
+        * BIT() keeps the mask unsigned; "1 << 31" would overflow a signed int.
+        */
+       if (vintf_err_map[vintf0->idx / 32] & BIT(vintf0->idx % 32)) {
+               vintf0->status = readl_relaxed(vintf0->base + NVIDIA_VINTF_STATUS);
+
+               dev_warn(nsmmu->cmdqv_dev, "error (0x%lX) reported by host vintf0 - disabling its vcmdqs\n",
+                        FIELD_GET(VINTF_STATUS, vintf0->status));
+       } else if (vintf_err_map[0] || vintf_err_map[1]) {
+               dev_err(nsmmu->cmdqv_dev, "cmdqv error interrupt triggered by unassigned vintf!\n");
+       }
+
+       return IRQ_HANDLED;
+}
+
+/*
+ * Adapt struct arm_smmu_cmdq init sequences from arm-smmu-v3.c for VCMDQs.
+ *
+ * Allocates a 64KB command ring and its valid_map bitmap for one VCMDQ and
+ * fills in the software queue state (register pointers, q_base, prod/cons).
+ * No hardware register is written here. All allocations are managed by
+ * cmdqv_dev, so nothing has to be freed by the caller on failure.
+ *
+ * @nsmmu:      owning NVIDIA SMMU instance
+ * @cmdq:       queue structure to initialise
+ * @vcmdq_base: MMIO base of this VCMDQ's register block
+ * @idx:        VCMDQ index, used for messages only
+ *
+ * Returns 0 on success or -ENOMEM if an allocation fails.
+ */
+static int nvidia_smmu_init_one_arm_smmu_cmdq(struct nvidia_smmu *nsmmu,
+                                             struct arm_smmu_cmdq *cmdq,
+                                             void __iomem *vcmdq_base,
+                                             u16 idx)
+{
+       struct arm_smmu_queue *q = &cmdq->q;
+       unsigned int nents;
+       size_t qsz;
+
+       /* struct arm_smmu_cmdq config normally done in arm_smmu_device_hw_probe() */
+       q->llq.max_n_shift = ilog2(SZ_64K >> CMDQ_ENT_SZ_SHIFT);
+       nents = 1 << q->llq.max_n_shift;
+
+       /* struct arm_smmu_cmdq config normally done in arm_smmu_init_one_queue() */
+       qsz = (size_t)nents << CMDQ_ENT_SZ_SHIFT;
+       q->base = dmam_alloc_coherent(nsmmu->cmdqv_dev, qsz, &q->base_dma, GFP_KERNEL);
+       if (!q->base) {
+               dev_err(nsmmu->cmdqv_dev, "failed to allocate 0x%zX bytes for VCMDQ%u\n",
+                       qsz, idx);
+               return -ENOMEM;
+       }
+       dev_dbg(nsmmu->cmdqv_dev, "allocated %u entries for VCMDQ%u @ 0x%llX [%pad] ++ %zX\n",
+               nents, idx, (u64)q->base, &q->base_dma, qsz);
+
+       q->prod_reg = vcmdq_base + NVIDIA_VCMDQ_PROD;
+       q->cons_reg = vcmdq_base + NVIDIA_VCMDQ_CONS;
+       q->ent_dwords = CMDQ_ENT_DWORDS;
+
+       q->q_base  = q->base_dma & VCMDQ_ADDR;
+       q->q_base |= FIELD_PREP(VCMDQ_LOG2SIZE, q->llq.max_n_shift);
+
+       q->llq.prod = q->llq.cons = 0;
+
+       /* struct arm_smmu_cmdq config normally done in arm_smmu_cmdq_init() */
+       atomic_set(&cmdq->owner_prod, 0);
+       atomic_set(&cmdq->lock, 0);
+
+       /*
+        * Use a device-managed, zeroed allocation for the bitmap so that it is
+        * released together with the ring instead of being leaked.
+        */
+       cmdq->valid_map = devm_kcalloc(nsmmu->cmdqv_dev, BITS_TO_LONGS(nents),
+                                      sizeof(*cmdq->valid_map), GFP_KERNEL);
+       if (!cmdq->valid_map) {
+               dev_err(nsmmu->cmdqv_dev, "failed to allocate valid_map for VCMDQ%u\n", idx);
+               return -ENOMEM;
+       }
+
+       return 0;
+}
+
+/*
+ * Bring up VINTF0 for the host kernel and enable its VCMDQs.
+ *
+ * Sequence: program and enable VINTF0, wait for it to report enabled,
+ * allocate the first num_vcmdqs_per_vintf VCMDQs to it, then for each of
+ * them build the software queue, program PROD/CONS/BASE and enable it.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, or the poll error if
+ * VINTF0 or a VCMDQ does not report enabled in time.
+ */
+static int nvidia_smmu_cmdqv_init(struct nvidia_smmu *nsmmu)
+{
+       struct nvidia_smmu_vintf *vintf0 = &nsmmu->vintf0;
+       u32 regval;
+       u16 idx;
+       int ret;
+
+       /* Setup vintf0 for host kernel */
+       vintf0->idx = 0;
+       vintf0->base = nsmmu->cmdqv_base + NVIDIA_CMDQV_VINTF(0);
+
+       /* HYP_OWN is only requested when there is more than one VINTF */
+       regval = FIELD_PREP(VINTF_HYP_OWN, nsmmu->num_total_vintfs > 1);
+       writel_relaxed(regval, vintf0->base + NVIDIA_VINTF_CONFIG);
+
+       regval |= FIELD_PREP(VINTF_EN, 1);
+       writel_relaxed(regval, vintf0->base + NVIDIA_VINTF_CONFIG);
+
+       vintf0->cfg = regval;
+
+       ret = readl_relaxed_poll_timeout(vintf0->base + NVIDIA_VINTF_STATUS,
+                                        regval, regval == VINTF_ENABLED,
+                                        1, ARM_SMMU_POLL_TIMEOUT_US);
+       vintf0->status = regval;
+       if (ret) {
+               dev_err(nsmmu->cmdqv_dev, "failed to enable VINTF[%u]: STATUS = 0x%08X\n",
+                       vintf0->idx, regval);
+               return ret;
+       }
+
+       /* Allocate vcmdqs to vintf0 */
+       for (idx = 0; idx < nsmmu->num_vcmdqs_per_vintf; idx++) {
+               regval  = FIELD_PREP(CMDQV_CMDQ_ALLOC_VINTF, vintf0->idx);
+               regval |= FIELD_PREP(CMDQV_CMDQ_ALLOC_LVCMDQ, idx);
+               regval |= CMDQV_CMDQ_ALLOCATED;
+               writel_relaxed(regval, nsmmu->cmdqv_base + NVIDIA_CMDQV_CMDQ_ALLOC(idx));
+       }
+
+       /* Build an arm_smmu_cmdq for each vcmdq allocated to vintf0 */
+       vintf0->vcmdqs = devm_kcalloc(nsmmu->cmdqv_dev, nsmmu->num_vcmdqs_per_vintf,
+                                     sizeof(*vintf0->vcmdqs), GFP_KERNEL);
+       if (!vintf0->vcmdqs)
+               return -ENOMEM;
+
+       for (idx = 0; idx < nsmmu->num_vcmdqs_per_vintf; idx++) {
+               void __iomem *vcmdq_base = nsmmu->cmdqv_base + NVIDIA_CMDQV_VCMDQ(idx);
+               struct arm_smmu_cmdq *cmdq = &vintf0->vcmdqs[idx];
+
+               /*
+                * Setup struct arm_smmu_cmdq data members. Bail out on failure:
+                * otherwise an unallocated ring would be programmed into the
+                * hardware and later used for command submission.
+                */
+               ret = nvidia_smmu_init_one_arm_smmu_cmdq(nsmmu, cmdq, vcmdq_base, idx);
+               if (ret)
+                       return ret;
+
+               /* Configure and enable the vcmdq */
+               writel_relaxed(0, vcmdq_base + NVIDIA_VCMDQ_PROD);
+               writel_relaxed(0, vcmdq_base + NVIDIA_VCMDQ_CONS);
+
+               writeq_relaxed(cmdq->q.q_base, vcmdq_base + NVIDIA_VCMDQ_BASE);
+
+               writel_relaxed(VCMDQ_EN, vcmdq_base + NVIDIA_VCMDQ_CONFIG);
+               ret = readl_poll_timeout(vcmdq_base + NVIDIA_VCMDQ_STATUS,
+                                        regval, regval == VCMDQ_ENABLED,
+                                        1, ARM_SMMU_POLL_TIMEOUT_US);
+               if (ret) {
+                       u32 gerror = readl_relaxed(vcmdq_base + NVIDIA_VCMDQ_GERROR);
+                       u32 gerrorn = readl_relaxed(vcmdq_base + NVIDIA_VCMDQ_GERRORN);
+                       u32 cons = readl_relaxed(vcmdq_base + NVIDIA_VCMDQ_CONS);
+
+                       dev_err(nsmmu->cmdqv_dev,
+                               "failed to enable VCMDQ[%u]: GERROR=0x%X, GERRORN=0x%X, CONS=0x%X\n",
+                               idx, gerror, gerrorn, cons);
+                       return ret;
+               }
+
+               dev_info(nsmmu->cmdqv_dev, "VCMDQ%u allocated to VINTF%u as CMDQ%u\n",
+                        idx, vintf0->idx, idx);
+       }
+
+       return 0;
+}
+
+/*
+ * Map the CMDQV registers, look up its interrupt and read the hardware
+ * parameters (number of VINTFs and VCMDQs).
+ *
+ * A missing interrupt is tolerated: cmdqv_irq is set to 0 and errors are
+ * simply not reported.
+ *
+ * Returns 0 on success, -ENXIO if there is no MMIO resource, the ioremap
+ * error, or -ENODEV if the hardware is disabled, not ready, or exposes no
+ * VCMDQ per VINTF.
+ */
+static int nvidia_smmu_probe(struct nvidia_smmu *nsmmu)
+{
+       struct platform_device *cmdqv_pdev = to_platform_device(nsmmu->cmdqv_dev);
+       struct resource *res;
+       u32 regval;
+
+       /* Base address */
+       res = platform_get_resource(cmdqv_pdev, IORESOURCE_MEM, 0);
+       if (!res)
+               return -ENXIO;
+
+       nsmmu->cmdqv_base = devm_ioremap_resource(nsmmu->cmdqv_dev, res);
+       if (IS_ERR(nsmmu->cmdqv_base))
+               return PTR_ERR(nsmmu->cmdqv_base);
+
+       /* Interrupt */
+       nsmmu->cmdqv_irq = platform_get_irq(cmdqv_pdev, 0);
+       if (nsmmu->cmdqv_irq < 0) {
+               dev_warn(nsmmu->cmdqv_dev, "no cmdqv interrupt - errors will not be reported\n");
+               nsmmu->cmdqv_irq = 0;
+       }
+
+       /* Probe the h/w */
+       regval = readl_relaxed(nsmmu->cmdqv_base + NVIDIA_CMDQV_CONFIG);
+       if (!FIELD_GET(CMDQV_EN, regval)) {
+               dev_err(nsmmu->cmdqv_dev, "CMDQV h/w is disabled: CMDQV_CONFIG=0x%08X\n", regval);
+               return -ENODEV;
+       }
+
+       regval = readl_relaxed(nsmmu->cmdqv_base + NVIDIA_CMDQV_STATUS);
+       if (!FIELD_GET(CMDQV_ENABLED, regval) || FIELD_GET(CMDQV_STATUS, regval)) {
+               dev_err(nsmmu->cmdqv_dev, "CMDQV h/w not ready: CMDQV_STATUS=0x%08X\n", regval);
+               return -ENODEV;
+       }
+
+       regval = readl_relaxed(nsmmu->cmdqv_base + NVIDIA_CMDQV_PARAM);
+       nsmmu->num_total_vintfs = 1 << FIELD_GET(CMDQV_NUM_VINTF_LOG2, regval);
+       nsmmu->num_total_vcmdqs = 1 << FIELD_GET(CMDQV_NUM_VCMDQ_LOG2, regval);
+       nsmmu->num_vcmdqs_per_vintf = nsmmu->num_total_vcmdqs / nsmmu->num_total_vintfs;
+
+       /*
+        * The issue path selects a queue with "cpu % num_vcmdqs_per_vintf", so
+        * a configuration with fewer VCMDQs than VINTFs must be rejected here
+        * rather than dividing by zero later.
+        */
+       if (!nsmmu->num_vcmdqs_per_vintf) {
+               dev_err(nsmmu->cmdqv_dev, "no VCMDQ available per VINTF: CMDQV_PARAM=0x%08X\n",
+                       regval);
+               return -ENODEV;
+       }
+
+       return 0;
+}
+
+/*
+ * Implementation-specific issue_cmdlist hook: route a command list to one
+ * of VINTF0's VCMDQs when possible, else to the architected SMMU CMDQ.
+ *
+ * The architected queue is used when the cached VINTF0 status is not
+ * exactly VINTF_ENABLED, or when VINTF0 is not hypervisor-owned and the
+ * first command (CMD_SYNC for an empty list) is not one of the opcodes
+ * accepted below.
+ *
+ * @smmu:  SMMU device; embedded as the first member of struct nvidia_smmu
+ * @cmds:  command words; only cmds[0] is inspected, and only when n != 0
+ * @n:     number of commands in @cmds
+ * @sync:  whether a CMD_SYNC is appended and waited for
+ *
+ * Returns the result of arm_smmu_cmdq_issue_cmdlist() on the chosen queue.
+ *
+ * NOTE(review): without HYP_OWN a stand-alone CMD_SYNC (n == 0) goes to the
+ * architected queue even if the preceding commands went to a VCMDQ - confirm
+ * that this still gives the required completion guarantee.
+ */
+static int nvidia_smmu_issue_cmdlist(struct arm_smmu_device *smmu, u64 *cmds, int n, bool sync)
+{
+       struct nvidia_smmu *nsmmu = container_of(smmu, struct nvidia_smmu, smmu);
+       struct nvidia_smmu_vintf *vintf0 = &nsmmu->vintf0;
+       struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
+       u16 idx;
+
+       /* Make sure vintf0 is enabled and healthy */
+       if (vintf0->status != VINTF_ENABLED)
+               goto issue_cmdlist;
+
+       /* Check for illegal CMDs */
+       if (!FIELD_GET(VINTF_HYP_OWN, vintf0->cfg)) {
+               u64 opcode = (n) ? FIELD_GET(CMDQ_0_OP, cmds[0]) : CMDQ_OP_CMD_SYNC;
+
+               switch (opcode) {
+               case CMDQ_OP_TLBI_NH_ASID:
+               case CMDQ_OP_TLBI_NH_VA:
+               case CMDQ_OP_TLBI_S12_VMALL:
+               case CMDQ_OP_TLBI_S2_IPA:
+               case CMDQ_OP_ATC_INV:
+                       break;
+               default:
+                       goto issue_cmdlist;
+               }
+       }
+
+       /*
+        * Select a vcmdq to use. Here we use a temporary solution to
+        * balance out traffic on cmdq issuing: each cmdq has its own
+        * lock, if all cpus issue cmdlist using the same cmdq, only
+        * one CPU at a time can enter the process, while the others
+        * will be spinning at the same lock.
+        *
+        * The CPU number is only a load-spreading hint and callers may be
+        * preemptible, so use raw_smp_processor_id(): migrating right after
+        * the read is harmless, while smp_processor_id() would trigger the
+        * preemption debug check.
+        */
+       idx = raw_smp_processor_id() % nsmmu->num_vcmdqs_per_vintf;
+       cmdq = &vintf0->vcmdqs[idx];
+
+issue_cmdlist:
+       return arm_smmu_cmdq_issue_cmdlist(smmu, cmds, n, sync, cmdq);
+}
+
+/*
+ * Implementation-specific device_reset hook: initialise the CMDQV hardware,
+ * claim its error interrupt when one was found at probe time, and turn off
+ * the MSI feature/option bits on the SMMU.
+ *
+ * Returns 0 on success or the error from CMDQV init / IRQ registration.
+ */
+static int nvidia_smmu_device_reset(struct arm_smmu_device *smmu)
+{
+       struct nvidia_smmu *nsmmu = container_of(smmu, struct nvidia_smmu, smmu);
+       int err = nvidia_smmu_cmdqv_init(nsmmu);
+
+       if (err)
+               return err;
+
+       /* cmdqv_irq == 0 means no interrupt is available for the CMDQV */
+       if (nsmmu->cmdqv_irq) {
+               err = devm_request_irq(nsmmu->cmdqv_dev, nsmmu->cmdqv_irq,
+                                      nvidia_smmu_cmdqv_isr, IRQF_SHARED,
+                                      "nvidia-smmu-cmdqv", nsmmu);
+               if (err) {
+                       dev_err(nsmmu->cmdqv_dev, "failed to claim irq (%d): %d\n",
+                               nsmmu->cmdqv_irq, err);
+                       return err;
+               }
+       }
+
+       /* Disable FEAT_MSI and OPT_MSIPOLL since VCMDQs only support CMD_SYNC w/CS_NONE */
+       smmu->options &= ~ARM_SMMU_OPT_MSIPOLL;
+       smmu->features &= ~ARM_SMMU_FEAT_MSI;
+
+       return 0;
+}
+
+/* NVIDIA CMDQV hooks; only referenced from this file, hence static */
+static const struct arm_smmu_impl nvidia_smmu_impl = {
+       .device_reset = nvidia_smmu_device_reset,
+       .issue_cmdlist = nvidia_smmu_issue_cmdlist,
+};
+
+#ifdef CONFIG_ACPI
+/*
+ * Look for a companion CMDQV device and, if one exists, grow @smmu into a
+ * struct nvidia_smmu.
+ *
+ * The companion is the ACPI device with HID NVIDIA_SMMU_CMDQV_HID whose _UID
+ * equals the identifier of the SMMU's IORT node.
+ *
+ * Returns NULL when there is no companion device (@smmu is left untouched),
+ * ERR_PTR(-ENOMEM) when the reallocation fails (@smmu is still valid), or
+ * the new object. On success @smmu must no longer be used: the reallocation
+ * may have moved it. The reference on the CMDQV device is kept for as long
+ * as the returned object refers to it.
+ */
+static struct nvidia_smmu *nvidia_smmu_create(struct arm_smmu_device *smmu)
+{
+       struct nvidia_smmu *nsmmu;
+       struct acpi_iort_node **nodep;
+       struct acpi_device *adev;
+       struct device *cmdqv_dev;
+       char *match_uid;
+
+       if (acpi_disabled)
+               return NULL;
+
+       /* Look for a device in the DSDT whose _UID matches the SMMU's iort_node identifier */
+       nodep = dev_get_platdata(smmu->dev);
+       if (!nodep || !*nodep)
+               return NULL;
+
+       /*
+        * A NULL uid would make the lookup below match any UID and could
+        * pick the wrong CMDQV instance, so treat an allocation failure as
+        * "no companion" and fall back to the plain SMMU.
+        */
+       match_uid = kasprintf(GFP_KERNEL, "%u", (*nodep)->identifier);
+       if (!match_uid)
+               return NULL;
+
+       adev = acpi_dev_get_first_match_dev(NVIDIA_SMMU_CMDQV_HID, match_uid, -1);
+       kfree(match_uid);
+
+       if (!adev)
+               return NULL;
+
+       cmdqv_dev = bus_find_device_by_acpi_dev(&platform_bus_type, adev);
+       /* The ACPI device was only needed for the lookup; drop its reference */
+       acpi_dev_put(adev);
+       if (!cmdqv_dev)
+               return NULL;
+
+       dev_info(smmu->dev, "found companion CMDQV device, %s\n", dev_name(cmdqv_dev));
+
+       nsmmu = devm_krealloc(smmu->dev, smmu, sizeof(*nsmmu), GFP_KERNEL);
+       if (!nsmmu) {
+               put_device(cmdqv_dev);
+               return ERR_PTR(-ENOMEM);
+       }
+
+       /*
+        * Only the embedded arm_smmu_device is preserved by the reallocation.
+        * Clear the NVIDIA-specific tail: commands can be issued before the
+        * device_reset hook has initialised vintf0, and the issue path reads
+        * vintf0.status to decide whether the VCMDQs may be used.
+        */
+       memset((u8 *)nsmmu + sizeof(nsmmu->smmu), 0, sizeof(*nsmmu) - sizeof(nsmmu->smmu));
+
+       nsmmu->cmdqv_dev = cmdqv_dev;
+
+       return nsmmu;
+}
+#else
+/* Without ACPI there is no way to discover a CMDQV companion device */
+static struct nvidia_smmu *nvidia_smmu_create(struct arm_smmu_device *smmu)
+{
+       return NULL;
+}
+#endif
+
+/*
+ * Attach the NVIDIA CMDQV implementation to @smmu if a companion CMDQV
+ * device exists.
+ *
+ * Returns @smmu unchanged when there is no companion device, an ERR_PTR()
+ * when creating or probing the CMDQV fails, or a pointer to the embedded
+ * arm_smmu_device of the new struct nvidia_smmu. The caller must use the
+ * returned pointer in place of @smmu, since the structure may have been
+ * reallocated.
+ */
+struct arm_smmu_device *nvidia_smmu_v3_impl_init(struct arm_smmu_device *smmu)
+{
+       struct nvidia_smmu *nsmmu;
+       int ret;
+
+       nsmmu = nvidia_smmu_create(smmu);
+       if (!nsmmu)
+               return smmu;
+
+       /* nvidia_smmu_create() reports allocation failure as an ERR_PTR() */
+       if (IS_ERR(nsmmu))
+               return ERR_CAST(nsmmu);
+
+       ret = nvidia_smmu_probe(nsmmu);
+       if (ret)
+               return ERR_PTR(ret);
+
+       nsmmu->smmu.impl = &nvidia_smmu_impl;
+
+       return &nsmmu->smmu;
+}
-- 
2.17.1

_______________________________________________
iommu mailing list
[email protected]
https://lists.linuxfoundation.org/mailman/listinfo/iommu

[RFC][Patch v1 2/2] iommu/arm-smmu-v3: Add support for NVIDIA CMDQ-Virtualization hw

Reply via email to