Hi Dmytro, > On 3 Sep 2025, at 20:40, Dmytro Firsov <dmytro_fir...@epam.com> wrote: > > According to the Arm SMMUv3 spec (ARM IHI 0070), a system may have > SMMU(s) that is/are non-coherent to the PE (processing element). In such > cases, memory accesses from the PE should be either non-cached or be > augmented with manual cache maintenance. SMMU cache coherency is reported > by bit 4 (COHACC) of the SMMU_IDR0 register and is already present in the > Xen driver. However, the current implementation is not aware of cache > maintenance for memory that is shared between the PE and non-coherent > SMMUs. It contains dmam_alloc_coherent() function, that is added during > Linux driver porting. But it is actually a wrapper for _xzalloc(), that > returns normal writeback memory (which is OK for coherent SMMUs). > > During Xen bring-up on a system with non-coherent SMMUs, the driver did > not work properly - the SMMU was not functional and halted initialization > at the very beginning due to a timeout while waiting for CMD_SYNC > completion: > > (XEN) SMMUv3: /soc/iommu@fa000000: CMD_SYNC timeout > (XEN) SMMUv3: /soc/iommu@fa000000: CMD_SYNC timeout > > To properly handle such scenarios, add the non_coherent flag to the > arm_smmu_queue struct. It is initialized using features reported by the > SMMU HW and will be used for triggering cache clean/invalidate operations. > This flag is not queue-specific (it is applicable to the whole SMMU), but > adding it to arm_smmu_queue allows us to not change function signatures > and simplify the patch (smmu->features, which contains the required flag, > are not available in code parts that require cache maintenance). > > Signed-off-by: Dmytro Firsov <dmytro_fir...@epam.com> > Reviewed-by: Julien Grall <jgr...@amazon.com> > Tested-by: Mykola Kvach <mykola_kv...@epam.com>
Acked-by: Bertrand Marquis <bertrand.marq...@arm.com> Cheers Bertrand > --- > v2: > - changed comment for non_coherent struct member > - added Julien's RB > - added Mykola's TB > --- > xen/drivers/passthrough/arm/smmu-v3.c | 27 +++++++++++++++++++++++---- > xen/drivers/passthrough/arm/smmu-v3.h | 3 +++ > 2 files changed, 26 insertions(+), 4 deletions(-) > > diff --git a/xen/drivers/passthrough/arm/smmu-v3.c > b/xen/drivers/passthrough/arm/smmu-v3.c > index bca5866b35..c65c47c038 100644 > --- a/xen/drivers/passthrough/arm/smmu-v3.c > +++ b/xen/drivers/passthrough/arm/smmu-v3.c > @@ -341,10 +341,14 @@ static void queue_write(__le64 *dst, u64 *src, size_t > n_dwords) > > static int queue_insert_raw(struct arm_smmu_queue *q, u64 *ent) > { > + __le64 *q_addr = Q_ENT(q, q->llq.prod); > + > if (queue_full(&q->llq)) > return -ENOSPC; > > - queue_write(Q_ENT(q, q->llq.prod), ent, q->ent_dwords); > + queue_write(q_addr, ent, q->ent_dwords); > + if (q->non_coherent) > + clean_dcache_va_range(q_addr, q->ent_dwords * sizeof(*q_addr)); > queue_inc_prod(&q->llq); > queue_sync_prod_out(q); > return 0; > @@ -360,10 +364,15 @@ static void queue_read(u64 *dst, __le64 *src, size_t > n_dwords) > > static int queue_remove_raw(struct arm_smmu_queue *q, u64 *ent) > { > + __le64 *q_addr = Q_ENT(q, q->llq.cons); > + > if (queue_empty(&q->llq)) > return -EAGAIN; > > - queue_read(ent, Q_ENT(q, q->llq.cons), q->ent_dwords); > + if (q->non_coherent) > + invalidate_dcache_va_range(q_addr, q->ent_dwords * sizeof(*q_addr)); > + > + queue_read(ent, q_addr, q->ent_dwords); > queue_inc_cons(&q->llq); > queue_sync_cons_out(q); > return 0; > @@ -458,6 +467,7 @@ static void arm_smmu_cmdq_skip_err(struct arm_smmu_device > *smmu) > struct arm_smmu_queue *q = &smmu->cmdq.q; > u32 cons = readl_relaxed(q->cons_reg); > u32 idx = FIELD_GET(CMDQ_CONS_ERR, cons); > + __le64 *q_addr = Q_ENT(q, cons); > struct arm_smmu_cmdq_ent cmd_sync = { > .opcode = CMDQ_OP_CMD_SYNC, > }; > @@ -484,11 +494,14 @@ static void arm_smmu_cmdq_skip_err(struct > arm_smmu_device *smmu) > break; > } > > + if (q->non_coherent) > + invalidate_dcache_va_range(q_addr, q->ent_dwords * sizeof(*q_addr)); > + > /* > * We may have concurrent producers, so we need to be careful > * not to touch any of the shadow cmdq state. > */ > - queue_read(cmd, Q_ENT(q, cons), q->ent_dwords); > + queue_read(cmd, q_addr, q->ent_dwords); > dev_err(smmu->dev, "skipping command in error state:\n"); > for (i = 0; i < ARRAY_SIZE(cmd); ++i) > dev_err(smmu->dev, "\t0x%016llx\n", (unsigned long long)cmd[i]); > @@ -499,7 +512,10 @@ static void arm_smmu_cmdq_skip_err(struct > arm_smmu_device *smmu) > return; > } > > - queue_write(Q_ENT(q, cons), cmd, q->ent_dwords); > + queue_write(q_addr, cmd, q->ent_dwords); > + > + if (q->non_coherent) > + clean_dcache_va_range(q_addr, q->ent_dwords * sizeof(*q_addr)); > } > > static void arm_smmu_cmdq_insert_cmd(struct arm_smmu_device *smmu, u64 *cmd) > @@ -1587,6 +1603,9 @@ static int arm_smmu_init_one_queue(struct > arm_smmu_device *smmu, > q->q_base |= FIELD_PREP(Q_BASE_LOG2SIZE, q->llq.max_n_shift); > > q->llq.prod = q->llq.cons = 0; > + > + q->non_coherent = !(smmu->features & ARM_SMMU_FEAT_COHERENCY); > + > return 0; > } > > diff --git a/xen/drivers/passthrough/arm/smmu-v3.h > b/xen/drivers/passthrough/arm/smmu-v3.h > index f09048812c..ab07366294 100644 > --- a/xen/drivers/passthrough/arm/smmu-v3.h > +++ b/xen/drivers/passthrough/arm/smmu-v3.h > @@ -522,6 +522,9 @@ struct arm_smmu_queue { > > u32 __iomem *prod_reg; > u32 __iomem *cons_reg; > + > + /* Is the memory access coherent? */ > + bool non_coherent; > }; > > struct arm_smmu_cmdq { > -- > 2.50.1