According to the Arm SMMUv3 spec (ARM IHI 0070), a system may contain
SMMUs that are not I/O-coherent with the PE (processing element). In
such cases, PE accesses to memory shared with the SMMU must either use
non-cacheable mappings or be accompanied by manual cache maintenance.
SMMU cache coherency is reported by bit 4 (COHACC) of the SMMU_IDR0
register, and the Xen driver already reads this bit. However, the
current implementation performs no cache maintenance on memory shared
between the PE and non-coherent SMMUs. It allocates this memory with
dmam_alloc_coherent(), added during the Linux driver port, which is in
fact a wrapper around _xzalloc() and therefore returns normal
write-back memory (fine only for coherent SMMUs).
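
For reference, ARM_SMMU_FEAT_COHERENCY (the feature bit consumed by this
patch) is derived from COHACC in the existing probe path, roughly as
sketched below (a simplified sketch only; register/feature names follow
the existing driver headers, and the real probe also reconciles the bit
with the firmware description):

  /* Sketch: hw probe, simplified. IDR0_COHACC is bit 4 of SMMU_IDR0. */
  reg = readl_relaxed(smmu->base + ARM_SMMU_IDR0);
  if (reg & IDR0_COHACC)
          smmu->features |= ARM_SMMU_FEAT_COHERENCY;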

During Xen bring-up on a system with non-coherent SMMUs, the driver did
not work properly: the SMMU was not functional, and initialization
halted at the very beginning due to a timeout while waiting for CMD_SYNC
completion:

  (XEN) SMMUv3: /soc/iommu@fa000000: CMD_SYNC timeout
  (XEN) SMMUv3: /soc/iommu@fa000000: CMD_SYNC timeout

To handle such scenarios properly, add a non_coherent flag to the
arm_smmu_queue struct. It is initialized from the features reported by
the SMMU hardware and is used to trigger cache clean/invalidate
operations. The flag is not really queue-specific (it applies to the
whole SMMU), but placing it in arm_smmu_queue avoids changing function
signatures and keeps the patch small (smmu->features, which holds the
relevant bit, is not available in the code paths that need cache
maintenance).
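
The resulting pattern around the queue accessors is sketched below
(condensed from the hunks that follow; clean_dcache_va_range() and
invalidate_dcache_va_range() are Xen's existing cache maintenance
helpers):

  /* Producer side: make the freshly written entry visible to the SMMU. */
  __le64 *q_addr = Q_ENT(q, q->llq.prod);

  queue_write(q_addr, ent, q->ent_dwords);
  if (q->non_coherent)
          clean_dcache_va_range(q_addr, q->ent_dwords * sizeof(*q_addr));

  /* Consumer side: drop stale lines before reading what the SMMU wrote. */
  if (q->non_coherent)
          invalidate_dcache_va_range(q_addr, q->ent_dwords * sizeof(*q_addr));
  queue_read(ent, q_addr, q->ent_dwords);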

Signed-off-by: Dmytro Firsov <dmytro_fir...@epam.com>
Reviewed-by: Julien Grall <jgr...@amazon.com>
Tested-by: Mykola Kvach <mykola_kv...@epam.com>
---
v2:
 - changed comment for non_coherent struct member
 - added Julien's RB
 - added Mykola's TB
---
 xen/drivers/passthrough/arm/smmu-v3.c | 27 +++++++++++++++++++++++----
 xen/drivers/passthrough/arm/smmu-v3.h |  3 +++
 2 files changed, 26 insertions(+), 4 deletions(-)

diff --git a/xen/drivers/passthrough/arm/smmu-v3.c b/xen/drivers/passthrough/arm/smmu-v3.c
index bca5866b35..c65c47c038 100644
--- a/xen/drivers/passthrough/arm/smmu-v3.c
+++ b/xen/drivers/passthrough/arm/smmu-v3.c
@@ -341,10 +341,14 @@ static void queue_write(__le64 *dst, u64 *src, size_t n_dwords)
 
 static int queue_insert_raw(struct arm_smmu_queue *q, u64 *ent)
 {
+       __le64 *q_addr = Q_ENT(q, q->llq.prod);
+
        if (queue_full(&q->llq))
                return -ENOSPC;
 
-       queue_write(Q_ENT(q, q->llq.prod), ent, q->ent_dwords);
+       queue_write(q_addr, ent, q->ent_dwords);
+       if (q->non_coherent)
+               clean_dcache_va_range(q_addr, q->ent_dwords * sizeof(*q_addr));
        queue_inc_prod(&q->llq);
        queue_sync_prod_out(q);
        return 0;
@@ -360,10 +364,15 @@ static void queue_read(u64 *dst, __le64 *src, size_t n_dwords)
 
 static int queue_remove_raw(struct arm_smmu_queue *q, u64 *ent)
 {
+       __le64 *q_addr = Q_ENT(q, q->llq.cons);
+
        if (queue_empty(&q->llq))
                return -EAGAIN;
 
-       queue_read(ent, Q_ENT(q, q->llq.cons), q->ent_dwords);
+       if (q->non_coherent)
+               invalidate_dcache_va_range(q_addr, q->ent_dwords * sizeof(*q_addr));
+
+       queue_read(ent, q_addr, q->ent_dwords);
        queue_inc_cons(&q->llq);
        queue_sync_cons_out(q);
        return 0;
@@ -458,6 +467,7 @@ static void arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu)
        struct arm_smmu_queue *q = &smmu->cmdq.q;
        u32 cons = readl_relaxed(q->cons_reg);
        u32 idx = FIELD_GET(CMDQ_CONS_ERR, cons);
+       __le64 *q_addr = Q_ENT(q, cons);
        struct arm_smmu_cmdq_ent cmd_sync = {
                .opcode = CMDQ_OP_CMD_SYNC,
        };
@@ -484,11 +494,14 @@ static void arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu)
                break;
        }
 
+       if (q->non_coherent)
+               invalidate_dcache_va_range(q_addr, q->ent_dwords * sizeof(*q_addr));
+
        /*
         * We may have concurrent producers, so we need to be careful
         * not to touch any of the shadow cmdq state.
         */
-       queue_read(cmd, Q_ENT(q, cons), q->ent_dwords);
+       queue_read(cmd, q_addr, q->ent_dwords);
        dev_err(smmu->dev, "skipping command in error state:\n");
        for (i = 0; i < ARRAY_SIZE(cmd); ++i)
                dev_err(smmu->dev, "\t0x%016llx\n", (unsigned long long)cmd[i]);
@@ -499,7 +512,10 @@ static void arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu)
                return;
        }
 
-       queue_write(Q_ENT(q, cons), cmd, q->ent_dwords);
+       queue_write(q_addr, cmd, q->ent_dwords);
+
+       if (q->non_coherent)
+               clean_dcache_va_range(q_addr, q->ent_dwords * sizeof(*q_addr));
 }
 
 static void arm_smmu_cmdq_insert_cmd(struct arm_smmu_device *smmu, u64 *cmd)
@@ -1587,6 +1603,9 @@ static int arm_smmu_init_one_queue(struct arm_smmu_device *smmu,
        q->q_base |= FIELD_PREP(Q_BASE_LOG2SIZE, q->llq.max_n_shift);
 
        q->llq.prod = q->llq.cons = 0;
+
+       q->non_coherent = !(smmu->features & ARM_SMMU_FEAT_COHERENCY);
+
        return 0;
 }
 
diff --git a/xen/drivers/passthrough/arm/smmu-v3.h b/xen/drivers/passthrough/arm/smmu-v3.h
index f09048812c..ab07366294 100644
--- a/xen/drivers/passthrough/arm/smmu-v3.h
+++ b/xen/drivers/passthrough/arm/smmu-v3.h
@@ -522,6 +522,9 @@ struct arm_smmu_queue {
 
        u32 __iomem                     *prod_reg;
        u32 __iomem                     *cons_reg;
+
+       /* Is the queue memory non-coherent with the PE? */
+       bool                            non_coherent;
 };
 
 struct arm_smmu_cmdq {
-- 
2.50.1
