According to the Arm SMMUv3 spec (ARM IHI 0070), a system may have
SMMUs that are non-coherent with the PE (processing element). In such
cases, memory shared with the SMMU must either be mapped non-cacheable
by the PE or be accessed with manual cache maintenance. SMMU cache
coherency is reported by bit 4 (COHACC) of the SMMU_IDR0 register, and
the Xen driver already reads it. However, the current implementation
performs no cache maintenance for memory that is shared between the PE
and non-coherent SMMUs. It provides a dmam_alloc_coherent() function,
added during the Linux driver port, but that is actually a wrapper
around _xzalloc(), which returns normal write-back memory (fine for
coherent SMMUs, but not otherwise).
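
For reference, the coherency detection in the probe path looks roughly
like the sketch below (simplified; IDR0_COHACC and
ARM_SMMU_FEAT_COHERENCY follow the Linux naming that the Xen port
inherits, the exact surrounding code may differ):

  /* arm_smmu_device_hw_probe(): read SMMU_IDR0 and record coherency. */
  reg = readl_relaxed(smmu->base + ARM_SMMU_IDR0);
  if (reg & IDR0_COHACC)
          smmu->features |= ARM_SMMU_FEAT_COHERENCY;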

During Xen bring-up on a system with non-coherent SMMUs, the driver did
not work: commands written by the PE stayed in its data cache, so the
SMMU fetched stale queue entries, and initialization halted at the very
beginning with a timeout while waiting for CMD_SYNC completion:

  (XEN) SMMUv3: /soc/iommu@fa000000: CMD_SYNC timeout
  (XEN) SMMUv3: /soc/iommu@fa000000: CMD_SYNC timeout

To handle such scenarios, add a non_coherent flag to the arm_smmu_queue
struct. It is initialized from the features reported by the SMMU
hardware and is used to trigger cache clean/invalidate operations. The
flag is not really queue-specific (it applies to the whole SMMU), but
placing it in arm_smmu_queue avoids changing function signatures and
keeps the patch small: smmu->features, which holds the required flag,
is not available in the code paths that need cache maintenance.
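
The resulting pattern on the queue producer/consumer paths is sketched
below (a minimal sketch assuming Xen's clean_dcache_va_range() and
invalidate_dcache_va_range() helpers; the hunks below contain the
actual changes):

  /* Producer: push the freshly written entry out to memory. */
  queue_write(q_addr, ent, q->ent_dwords);
  if (q->non_coherent)
          clean_dcache_va_range(q_addr, q->ent_dwords * sizeof(*q_addr));

  /* Consumer: discard stale cache lines before reading the entry. */
  if (q->non_coherent)
          invalidate_dcache_va_range(q_addr, q->ent_dwords * sizeof(*q_addr));
  queue_read(ent, q_addr, q->ent_dwords);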

Signed-off-by: Dmytro Firsov <dmytro_fir...@epam.com>
---
 xen/drivers/passthrough/arm/smmu-v3.c | 27 +++++++++++++++++++++++----
 xen/drivers/passthrough/arm/smmu-v3.h |  7 +++++++
 2 files changed, 30 insertions(+), 4 deletions(-)

diff --git a/xen/drivers/passthrough/arm/smmu-v3.c b/xen/drivers/passthrough/arm/smmu-v3.c
index 5e9e3e048e..bf153227db 100644
--- a/xen/drivers/passthrough/arm/smmu-v3.c
+++ b/xen/drivers/passthrough/arm/smmu-v3.c
@@ -346,10 +346,14 @@ static void queue_write(__le64 *dst, u64 *src, size_t n_dwords)
 
 static int queue_insert_raw(struct arm_smmu_queue *q, u64 *ent)
 {
+       __le64 *q_addr = Q_ENT(q, q->llq.prod);
+
        if (queue_full(&q->llq))
                return -ENOSPC;
 
-       queue_write(Q_ENT(q, q->llq.prod), ent, q->ent_dwords);
+       queue_write(q_addr, ent, q->ent_dwords);
+       if (q->non_coherent)
+               clean_dcache_va_range(q_addr, q->ent_dwords * sizeof(*q_addr));
        queue_inc_prod(&q->llq);
        queue_sync_prod_out(q);
        return 0;
@@ -365,10 +369,15 @@ static void queue_read(u64 *dst, __le64 *src, size_t n_dwords)
 
 static int queue_remove_raw(struct arm_smmu_queue *q, u64 *ent)
 {
+       __le64 *q_addr = Q_ENT(q, q->llq.cons);
+
        if (queue_empty(&q->llq))
                return -EAGAIN;
 
-       queue_read(ent, Q_ENT(q, q->llq.cons), q->ent_dwords);
+       if (q->non_coherent)
+       if (q->non_coherent)
+               invalidate_dcache_va_range(q_addr, q->ent_dwords * sizeof(*q_addr));
+
+       queue_read(ent, q_addr, q->ent_dwords);
        queue_inc_cons(&q->llq);
        queue_sync_cons_out(q);
        return 0;
@@ -463,6 +472,7 @@ static void arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu)
        struct arm_smmu_queue *q = &smmu->cmdq.q;
        u32 cons = readl_relaxed(q->cons_reg);
        u32 idx = FIELD_GET(CMDQ_CONS_ERR, cons);
+       __le64 *q_addr = Q_ENT(q, cons);
        struct arm_smmu_cmdq_ent cmd_sync = {
                .opcode = CMDQ_OP_CMD_SYNC,
        };
@@ -489,11 +499,14 @@ static void arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu)
                break;
        }
 
+       if (q->non_coherent)
+               invalidate_dcache_va_range(q_addr, q->ent_dwords * sizeof(*q_addr));
+
        /*
         * We may have concurrent producers, so we need to be careful
         * not to touch any of the shadow cmdq state.
         */
-       queue_read(cmd, Q_ENT(q, cons), q->ent_dwords);
+       queue_read(cmd, q_addr, q->ent_dwords);
        dev_err(smmu->dev, "skipping command in error state:\n");
        for (i = 0; i < ARRAY_SIZE(cmd); ++i)
                dev_err(smmu->dev, "\t0x%016llx\n", (unsigned long long)cmd[i]);
@@ -504,7 +517,10 @@ static void arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu)
                return;
        }
 
-       queue_write(Q_ENT(q, cons), cmd, q->ent_dwords);
+       queue_write(q_addr, cmd, q->ent_dwords);
+
+       if (q->non_coherent)
+               clean_dcache_va_range(q_addr, q->ent_dwords * sizeof(*q_addr));
 }
 
 static void arm_smmu_cmdq_insert_cmd(struct arm_smmu_device *smmu, u64 *cmd)
@@ -1634,6 +1650,9 @@ static int __init arm_smmu_init_one_queue(struct arm_smmu_device *smmu,
        q->q_base |= FIELD_PREP(Q_BASE_LOG2SIZE, q->llq.max_n_shift);
 
        q->llq.prod = q->llq.cons = 0;
+
+       q->non_coherent = !(smmu->features & ARM_SMMU_FEAT_COHERENCY);
+
        return 0;
 }
 
diff --git a/xen/drivers/passthrough/arm/smmu-v3.h b/xen/drivers/passthrough/arm/smmu-v3.h
index f09048812c..db936b9bd4 100644
--- a/xen/drivers/passthrough/arm/smmu-v3.h
+++ b/xen/drivers/passthrough/arm/smmu-v3.h
@@ -522,6 +522,13 @@ struct arm_smmu_queue {
 
        u32 __iomem                     *prod_reg;
        u32 __iomem                     *cons_reg;
+
+       /*
+        * According to the SMMU spec, section 3.16, some systems may
+        * have SMMUs that are non-coherent with the PE (processing
+        * element). In such cases manual cache maintenance is needed.
+        */
+       bool                            non_coherent;
 };
 
 struct arm_smmu_cmdq {
-- 
2.50.1
