Hi Dmytro, On Wed, Aug 6, 2025 at 5:59 PM Dmytro Firsov <dmytro_fir...@epam.com> wrote: > > According to the Arm SMMUv3 spec (ARM IHI 0070), a system may have > SMMU(s) that is/are non-coherent to the PE (processing element). In such > cases, memory accesses from the PE should be either non-cached or > augmented with manual cache maintenance. SMMU cache coherency is reported > by bit 4 (COHACC) of the SMMU_IDR0 register and is already present in the > Xen driver. However, the current implementation is not aware of cache > maintenance for memory that is shared between the PE and non-coherent > SMMUs. It contains a dmam_alloc_coherent() function, which was added during > Linux driver porting. But it is actually a wrapper for _xzalloc(), which > returns normal writeback memory (which is OK for coherent SMMUs). > > During Xen bring-up on a system with non-coherent SMMUs, the driver did > not work properly - the SMMU was not functional and halted initialization > at the very beginning due to a timeout while waiting for CMD_SYNC > completion: > > (XEN) SMMUv3: /soc/iommu@fa000000: CMD_SYNC timeout > (XEN) SMMUv3: /soc/iommu@fa000000: CMD_SYNC timeout
Thank you for your patch. I have encountered the same issue while testing other Xen functionality on the Orange Pi 5 board (SoC RK3588S): (XEN) [ 0.040350] SMMUv3: /iommu@fc900000: ias 48-bit, oas 48-bit (features 0x00001c0f) (XEN) [ 0.043164] SMMUv3: /iommu@fc900000: allocated 524288 entries for cmdq (XEN) [ 0.048505] SMMUv3: /iommu@fc900000: allocated 524288 entries for evtq (XEN) [ 1.099335] SMMUv3: /iommu@fc900000: CMD_SYNC timeout This patch resolves the problem. Tested-by: Mykola Kvach <mykola_kv...@epam.com> > > To properly handle such scenarios, add the non_coherent flag to the > arm_smmu_queue struct. It is initialized using features reported by the > SMMU HW and will be used for triggering cache clean/invalidate operations. > This flag is not queue-specific (it is applicable to the whole SMMU), but > adding it to arm_smmu_queue allows us to avoid changing function signatures > and simplifies the patch (smmu->features, which contains the required flag, > is not available in code parts that require cache maintenance). 
> > Signed-off-by: Dmytro Firsov <dmytro_fir...@epam.com> > --- > xen/drivers/passthrough/arm/smmu-v3.c | 27 +++++++++++++++++++++++---- > xen/drivers/passthrough/arm/smmu-v3.h | 7 +++++++ > 2 files changed, 30 insertions(+), 4 deletions(-) > > diff --git a/xen/drivers/passthrough/arm/smmu-v3.c > b/xen/drivers/passthrough/arm/smmu-v3.c > index 5e9e3e048e..bf153227db 100644 > --- a/xen/drivers/passthrough/arm/smmu-v3.c > +++ b/xen/drivers/passthrough/arm/smmu-v3.c > @@ -346,10 +346,14 @@ static void queue_write(__le64 *dst, u64 *src, size_t > n_dwords) > > static int queue_insert_raw(struct arm_smmu_queue *q, u64 *ent) > { > + __le64 *q_addr = Q_ENT(q, q->llq.prod); > + > if (queue_full(&q->llq)) > return -ENOSPC; > > - queue_write(Q_ENT(q, q->llq.prod), ent, q->ent_dwords); > + queue_write(q_addr, ent, q->ent_dwords); > + if (q->non_coherent) > + clean_dcache_va_range(q_addr, q->ent_dwords * > sizeof(*q_addr)); > queue_inc_prod(&q->llq); > queue_sync_prod_out(q); > return 0; > @@ -365,10 +369,15 @@ static void queue_read(u64 *dst, __le64 *src, size_t > n_dwords) > > static int queue_remove_raw(struct arm_smmu_queue *q, u64 *ent) > { > + __le64 *q_addr = Q_ENT(q, q->llq.cons); > + > if (queue_empty(&q->llq)) > return -EAGAIN; > > - queue_read(ent, Q_ENT(q, q->llq.cons), q->ent_dwords); > + if (q->non_coherent) > + invalidate_dcache_va_range(q_addr, q->ent_dwords * > sizeof(*q_addr)); > + > + queue_read(ent, q_addr, q->ent_dwords); > queue_inc_cons(&q->llq); > queue_sync_cons_out(q); > return 0; > @@ -463,6 +472,7 @@ static void arm_smmu_cmdq_skip_err(struct arm_smmu_device > *smmu) > struct arm_smmu_queue *q = &smmu->cmdq.q; > u32 cons = readl_relaxed(q->cons_reg); > u32 idx = FIELD_GET(CMDQ_CONS_ERR, cons); > + __le64 *q_addr = Q_ENT(q, cons); > struct arm_smmu_cmdq_ent cmd_sync = { > .opcode = CMDQ_OP_CMD_SYNC, > }; > @@ -489,11 +499,14 @@ static void arm_smmu_cmdq_skip_err(struct > arm_smmu_device *smmu) > break; > } > > + if (q->non_coherent) > + 
invalidate_dcache_va_range(q_addr, q->ent_dwords * > sizeof(*q_addr)); > + > /* > * We may have concurrent producers, so we need to be careful > * not to touch any of the shadow cmdq state. > */ > - queue_read(cmd, Q_ENT(q, cons), q->ent_dwords); > + queue_read(cmd, q_addr, q->ent_dwords); > dev_err(smmu->dev, "skipping command in error state:\n"); > for (i = 0; i < ARRAY_SIZE(cmd); ++i) > dev_err(smmu->dev, "\t0x%016llx\n", (unsigned long > long)cmd[i]); > @@ -504,7 +517,10 @@ static void arm_smmu_cmdq_skip_err(struct > arm_smmu_device *smmu) > return; > } > > - queue_write(Q_ENT(q, cons), cmd, q->ent_dwords); > + queue_write(q_addr, cmd, q->ent_dwords); > + > + if (q->non_coherent) > + clean_dcache_va_range(q_addr, q->ent_dwords * > sizeof(*q_addr)); > } > > static void arm_smmu_cmdq_insert_cmd(struct arm_smmu_device *smmu, u64 *cmd) > @@ -1634,6 +1650,9 @@ static int __init arm_smmu_init_one_queue(struct > arm_smmu_device *smmu, > q->q_base |= FIELD_PREP(Q_BASE_LOG2SIZE, q->llq.max_n_shift); > > q->llq.prod = q->llq.cons = 0; > + > + q->non_coherent = !(smmu->features & ARM_SMMU_FEAT_COHERENCY); > + > return 0; > } > > diff --git a/xen/drivers/passthrough/arm/smmu-v3.h > b/xen/drivers/passthrough/arm/smmu-v3.h > index f09048812c..db936b9bd4 100644 > --- a/xen/drivers/passthrough/arm/smmu-v3.h > +++ b/xen/drivers/passthrough/arm/smmu-v3.h > @@ -522,6 +522,13 @@ struct arm_smmu_queue { > > u32 __iomem *prod_reg; > u32 __iomem *cons_reg; > + > + /* > + * According to SMMU spec section 3.16, some systems may have > + * SMMUs, that are non-coherent to PE (processing elements). > + * In such case manual cache management is needed. > + */ > + bool non_coherent; > }; > > struct arm_smmu_cmdq { > -- > 2.50.1 > Best regards, Mykola