Hi Dmytro,

On Wed, Aug 6, 2025 at 5:59 PM Dmytro Firsov <dmytro_fir...@epam.com> wrote:
>
> According to the Arm SMMUv3 spec (ARM IHI 0070), a system may have
> SMMU(s) that is/are non-coherent to the PE (processing element). In such
> cases, memory accesses from the PE should be either non-cached or be
> augmented with manual cache maintenance. SMMU cache coherency is reported
> by bit 4 (COHACC) of the SMMU_IDR0 register and is already present in the
> Xen driver. However, the current implementation is not aware of cache
> maintenance for memory that is shared between the PE and non-coherent
> SMMUs. It contains the dmam_alloc_coherent() function, which was added
> during Linux driver porting. However, it is actually a wrapper for
> _xzalloc(), which returns normal writeback memory (which is OK for
> coherent SMMUs).
>
> During Xen bring-up on a system with non-coherent SMMUs, the driver did
> not work properly - the SMMU was not functional and halted initialization
> at the very beginning due to a timeout while waiting for CMD_SYNC
> completion:
>
>   (XEN) SMMUv3: /soc/iommu@fa000000: CMD_SYNC timeout
>   (XEN) SMMUv3: /soc/iommu@fa000000: CMD_SYNC timeout

Thank you for your patch.

I have encountered the same issue while testing other Xen functionality
on the Orange Pi 5 board (SoC RK3588S):

(XEN) [ 0.040350] SMMUv3: /iommu@fc900000: ias 48-bit, oas 48-bit
(features 0x00001c0f)
(XEN) [ 0.043164] SMMUv3: /iommu@fc900000: allocated 524288 entries for cmdq
(XEN) [ 0.048505] SMMUv3: /iommu@fc900000: allocated 524288 entries for evtq
(XEN) [ 1.099335] SMMUv3: /iommu@fc900000: CMD_SYNC timeout

This patch resolves the problem.

Tested-by: Mykola Kvach <mykola_kv...@epam.com>

>
> To properly handle such scenarios, add the non_coherent flag to the
> arm_smmu_queue struct. It is initialized using features reported by the
> SMMU HW and will be used for triggering cache clean/invalidate operations.
> This flag is not queue-specific (it is applicable to the whole SMMU), but
> adding it to arm_smmu_queue allows us to avoid changing function
> signatures and simplifies the patch (smmu->features, which contains the
> required flag, is not available in the code parts that require cache
> maintenance).
>
> Signed-off-by: Dmytro Firsov <dmytro_fir...@epam.com>
> ---
>  xen/drivers/passthrough/arm/smmu-v3.c | 27 +++++++++++++++++++++++----
>  xen/drivers/passthrough/arm/smmu-v3.h |  7 +++++++
>  2 files changed, 30 insertions(+), 4 deletions(-)
>
> diff --git a/xen/drivers/passthrough/arm/smmu-v3.c 
> b/xen/drivers/passthrough/arm/smmu-v3.c
> index 5e9e3e048e..bf153227db 100644
> --- a/xen/drivers/passthrough/arm/smmu-v3.c
> +++ b/xen/drivers/passthrough/arm/smmu-v3.c
> @@ -346,10 +346,14 @@ static void queue_write(__le64 *dst, u64 *src, size_t 
> n_dwords)
>
>  static int queue_insert_raw(struct arm_smmu_queue *q, u64 *ent)
>  {
> +       __le64 *q_addr = Q_ENT(q, q->llq.prod);
> +
>         if (queue_full(&q->llq))
>                 return -ENOSPC;
>
> -       queue_write(Q_ENT(q, q->llq.prod), ent, q->ent_dwords);
> +       queue_write(q_addr, ent, q->ent_dwords);
> +       if (q->non_coherent)
> +               clean_dcache_va_range(q_addr, q->ent_dwords * 
> sizeof(*q_addr));
>         queue_inc_prod(&q->llq);
>         queue_sync_prod_out(q);
>         return 0;
> @@ -365,10 +369,15 @@ static void queue_read(u64 *dst, __le64 *src, size_t 
> n_dwords)
>
>  static int queue_remove_raw(struct arm_smmu_queue *q, u64 *ent)
>  {
> +       __le64 *q_addr = Q_ENT(q, q->llq.cons);
> +
>         if (queue_empty(&q->llq))
>                 return -EAGAIN;
>
> -       queue_read(ent, Q_ENT(q, q->llq.cons), q->ent_dwords);
> +       if (q->non_coherent)
> +               invalidate_dcache_va_range(q_addr, q->ent_dwords * 
> sizeof(*q_addr));
> +
> +       queue_read(ent, q_addr, q->ent_dwords);
>         queue_inc_cons(&q->llq);
>         queue_sync_cons_out(q);
>         return 0;
> @@ -463,6 +472,7 @@ static void arm_smmu_cmdq_skip_err(struct arm_smmu_device 
> *smmu)
>         struct arm_smmu_queue *q = &smmu->cmdq.q;
>         u32 cons = readl_relaxed(q->cons_reg);
>         u32 idx = FIELD_GET(CMDQ_CONS_ERR, cons);
> +       __le64 *q_addr = Q_ENT(q, cons);
>         struct arm_smmu_cmdq_ent cmd_sync = {
>                 .opcode = CMDQ_OP_CMD_SYNC,
>         };
> @@ -489,11 +499,14 @@ static void arm_smmu_cmdq_skip_err(struct 
> arm_smmu_device *smmu)
>                 break;
>         }
>
> +       if (q->non_coherent)
> +               invalidate_dcache_va_range(q_addr, q->ent_dwords * 
> sizeof(*q_addr));
> +
>         /*
>          * We may have concurrent producers, so we need to be careful
>          * not to touch any of the shadow cmdq state.
>          */
> -       queue_read(cmd, Q_ENT(q, cons), q->ent_dwords);
> +       queue_read(cmd, q_addr, q->ent_dwords);
>         dev_err(smmu->dev, "skipping command in error state:\n");
>         for (i = 0; i < ARRAY_SIZE(cmd); ++i)
>                 dev_err(smmu->dev, "\t0x%016llx\n", (unsigned long 
> long)cmd[i]);
> @@ -504,7 +517,10 @@ static void arm_smmu_cmdq_skip_err(struct 
> arm_smmu_device *smmu)
>                 return;
>         }
>
> -       queue_write(Q_ENT(q, cons), cmd, q->ent_dwords);
> +       queue_write(q_addr, cmd, q->ent_dwords);
> +
> +       if (q->non_coherent)
> +               clean_dcache_va_range(q_addr, q->ent_dwords * 
> sizeof(*q_addr));
>  }
>
>  static void arm_smmu_cmdq_insert_cmd(struct arm_smmu_device *smmu, u64 *cmd)
> @@ -1634,6 +1650,9 @@ static int __init arm_smmu_init_one_queue(struct 
> arm_smmu_device *smmu,
>         q->q_base |= FIELD_PREP(Q_BASE_LOG2SIZE, q->llq.max_n_shift);
>
>         q->llq.prod = q->llq.cons = 0;
> +
> +       q->non_coherent = !(smmu->features & ARM_SMMU_FEAT_COHERENCY);
> +
>         return 0;
>  }
>
> diff --git a/xen/drivers/passthrough/arm/smmu-v3.h 
> b/xen/drivers/passthrough/arm/smmu-v3.h
> index f09048812c..db936b9bd4 100644
> --- a/xen/drivers/passthrough/arm/smmu-v3.h
> +++ b/xen/drivers/passthrough/arm/smmu-v3.h
> @@ -522,6 +522,13 @@ struct arm_smmu_queue {
>
>         u32 __iomem                     *prod_reg;
>         u32 __iomem                     *cons_reg;
> +
> +       /*
> +        * According to SMMU spec section 3.16, some systems may have
> +        * SMMUs that are non-coherent to the PE (processing element).
> +        * In such cases, manual cache management is needed.
> +        */
> +       bool                            non_coherent;
>  };
>
>  struct arm_smmu_cmdq {
> --
> 2.50.1
>

Best regards,
Mykola

Reply via email to