Hi Dmytro,

> On 3 Sep 2025, at 20:40, Dmytro Firsov <dmytro_fir...@epam.com> wrote:
> 
> According to the Arm SMMUv3 spec (ARM IHI 0070), a system may have
> SMMU(s) that is/are non-coherent to the PE (processing element). In such
> cases, memory accesses from the PE should be either non-cached or be
> augmented with manual cache maintenance. SMMU cache coherency is reported
> by bit 4 (COHACC) of the SMMU_IDR0 register and is already present in the
> Xen driver. However, the current implementation is not aware of cache
> maintenance for memory that is shared between the PE and non-coherent
> SMMUs. It contains dmam_alloc_coherent() function, that is added during
> Linux driver porting. But it is actually a wrapper for _xzalloc(), that
> returns normal writeback memory (which is OK for coherent SMMUs).
> 
> During Xen bring-up on a system with non-coherent SMMUs, the driver did
> not work properly - the SMMU was not functional and halted initialization
> at the very beginning due to a timeout while waiting for CMD_SYNC
> completion:
> 
>  (XEN) SMMUv3: /soc/iommu@fa000000: CMD_SYNC timeout
>  (XEN) SMMUv3: /soc/iommu@fa000000: CMD_SYNC timeout
> 
> To properly handle such scenarios, add the non_coherent flag to the
> arm_smmu_queue struct. It is initialized using features reported by the
> SMMU HW and will be used for triggering cache clean/invalidate operations.
> This flag is not queue-specific (it is applicable to the whole SMMU), but
> adding it to arm_smmu_queue allows us to not change function signatures
> and simplify the patch (smmu->features, which contains the required flag,
> are not available in code parts that require cache maintenance).
> 
> Signed-off-by: Dmytro Firsov <dmytro_fir...@epam.com>
> Reviewed-by: Julien Grall <jgr...@amazon.com>
> Tested-by: Mykola Kvach <mykola_kv...@epam.com>

Acked-by: Bertrand Marquis <bertrand.marq...@arm.com>

Cheers
Bertrand

> ---
> v2:
> - changed comment for non_coherent struct member
> - added Julien's RB
> - added Mykola's TB
> ---
> xen/drivers/passthrough/arm/smmu-v3.c | 27 +++++++++++++++++++++++----
> xen/drivers/passthrough/arm/smmu-v3.h |  3 +++
> 2 files changed, 26 insertions(+), 4 deletions(-)
> 
> diff --git a/xen/drivers/passthrough/arm/smmu-v3.c 
> b/xen/drivers/passthrough/arm/smmu-v3.c
> index bca5866b35..c65c47c038 100644
> --- a/xen/drivers/passthrough/arm/smmu-v3.c
> +++ b/xen/drivers/passthrough/arm/smmu-v3.c
> @@ -341,10 +341,14 @@ static void queue_write(__le64 *dst, u64 *src, size_t 
> n_dwords)
> 
> static int queue_insert_raw(struct arm_smmu_queue *q, u64 *ent)
> {
> + __le64 *q_addr = Q_ENT(q, q->llq.prod);
> +
> if (queue_full(&q->llq))
> return -ENOSPC;
> 
> - queue_write(Q_ENT(q, q->llq.prod), ent, q->ent_dwords);
> + queue_write(q_addr, ent, q->ent_dwords);
> + if (q->non_coherent)
> + clean_dcache_va_range(q_addr, q->ent_dwords * sizeof(*q_addr));
> queue_inc_prod(&q->llq);
> queue_sync_prod_out(q);
> return 0;
> @@ -360,10 +364,15 @@ static void queue_read(u64 *dst, __le64 *src, size_t 
> n_dwords)
> 
> static int queue_remove_raw(struct arm_smmu_queue *q, u64 *ent)
> {
> + __le64 *q_addr = Q_ENT(q, q->llq.cons);
> +
> if (queue_empty(&q->llq))
> return -EAGAIN;
> 
> - queue_read(ent, Q_ENT(q, q->llq.cons), q->ent_dwords);
> + if (q->non_coherent)
> + invalidate_dcache_va_range(q_addr, q->ent_dwords * sizeof(*q_addr));
> +
> + queue_read(ent, q_addr, q->ent_dwords);
> queue_inc_cons(&q->llq);
> queue_sync_cons_out(q);
> return 0;
> @@ -458,6 +467,7 @@ static void arm_smmu_cmdq_skip_err(struct arm_smmu_device 
> *smmu)
> struct arm_smmu_queue *q = &smmu->cmdq.q;
> u32 cons = readl_relaxed(q->cons_reg);
> u32 idx = FIELD_GET(CMDQ_CONS_ERR, cons);
> + __le64 *q_addr = Q_ENT(q, cons);
> struct arm_smmu_cmdq_ent cmd_sync = {
> .opcode = CMDQ_OP_CMD_SYNC,
> };
> @@ -484,11 +494,14 @@ static void arm_smmu_cmdq_skip_err(struct 
> arm_smmu_device *smmu)
> break;
> }
> 
> + if (q->non_coherent)
> + invalidate_dcache_va_range(q_addr, q->ent_dwords * sizeof(*q_addr));
> +
> /*
> * We may have concurrent producers, so we need to be careful
> * not to touch any of the shadow cmdq state.
> */
> - queue_read(cmd, Q_ENT(q, cons), q->ent_dwords);
> + queue_read(cmd, q_addr, q->ent_dwords);
> dev_err(smmu->dev, "skipping command in error state:\n");
> for (i = 0; i < ARRAY_SIZE(cmd); ++i)
> dev_err(smmu->dev, "\t0x%016llx\n", (unsigned long long)cmd[i]);
> @@ -499,7 +512,10 @@ static void arm_smmu_cmdq_skip_err(struct 
> arm_smmu_device *smmu)
> return;
> }
> 
> - queue_write(Q_ENT(q, cons), cmd, q->ent_dwords);
> + queue_write(q_addr, cmd, q->ent_dwords);
> +
> + if (q->non_coherent)
> + clean_dcache_va_range(q_addr, q->ent_dwords * sizeof(*q_addr));
> }
> 
> static void arm_smmu_cmdq_insert_cmd(struct arm_smmu_device *smmu, u64 *cmd)
> @@ -1587,6 +1603,9 @@ static int arm_smmu_init_one_queue(struct 
> arm_smmu_device *smmu,
> q->q_base |= FIELD_PREP(Q_BASE_LOG2SIZE, q->llq.max_n_shift);
> 
> q->llq.prod = q->llq.cons = 0;
> +
> + q->non_coherent = !(smmu->features & ARM_SMMU_FEAT_COHERENCY);
> +
> return 0;
> }
> 
> diff --git a/xen/drivers/passthrough/arm/smmu-v3.h 
> b/xen/drivers/passthrough/arm/smmu-v3.h
> index f09048812c..ab07366294 100644
> --- a/xen/drivers/passthrough/arm/smmu-v3.h
> +++ b/xen/drivers/passthrough/arm/smmu-v3.h
> @@ -522,6 +522,9 @@ struct arm_smmu_queue {
> 
> u32 __iomem *prod_reg;
> u32 __iomem *cons_reg;
> +
> + /* Is the memory access coherent? */
> + bool non_coherent;
> };
> 
> struct arm_smmu_cmdq {
> -- 
> 2.50.1


Reply via email to