Hi Rob,
On 1/17/20 10:16 PM, Rob Herring wrote:
> Arm SMMUv3.2 adds support for TLB range invalidate operations.
> Support for range invalidate is determined by the RIL bit in the IDR3
> register.
> 
> The range invalidate is in units of the leaf page size and operates on
> 1-32 chunks of a power of 2 multiple pages. First, we determine from the
> size what power of 2 multiple we can use. Then we calculate how many
> chunks (1-31) of the power of 2 size for the range on the iteration. On
> each iteration, we move up in size by at least 5 bits.
> 
> Cc: Eric Auger <[email protected]>
> Cc: Jean-Philippe Brucker <[email protected]>
> Cc: Will Deacon <[email protected]>
> Cc: Robin Murphy <[email protected]>
> Cc: Joerg Roedel <[email protected]>
> Signed-off-by: Rob Herring <[email protected]>
> ---
>  drivers/iommu/arm-smmu-v3.c | 66 ++++++++++++++++++++++++++++++++++++-
>  1 file changed, 65 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
> index e91b4a098215..0ee561db7149 100644
> --- a/drivers/iommu/arm-smmu-v3.c
> +++ b/drivers/iommu/arm-smmu-v3.c
> @@ -70,6 +70,9 @@
>  #define IDR1_SSIDSIZE                        GENMASK(10, 6)
>  #define IDR1_SIDSIZE                 GENMASK(5, 0)
>  
> +#define ARM_SMMU_IDR3                        0xc
> +#define IDR3_RIL                     (1 << 10)
> +
>  #define ARM_SMMU_IDR5                        0x14
>  #define IDR5_STALL_MAX                       GENMASK(31, 16)
>  #define IDR5_GRAN64K                 (1 << 6)
> @@ -327,9 +330,14 @@
>  #define CMDQ_CFGI_1_LEAF             (1UL << 0)
>  #define CMDQ_CFGI_1_RANGE            GENMASK_ULL(4, 0)
>  
> +#define CMDQ_TLBI_0_NUM                      GENMASK_ULL(16, 12)
> +#define CMDQ_TLBI_RANGE_NUM_MAX              31
> +#define CMDQ_TLBI_0_SCALE            GENMASK_ULL(24, 20)
>  #define CMDQ_TLBI_0_VMID             GENMASK_ULL(47, 32)
>  #define CMDQ_TLBI_0_ASID             GENMASK_ULL(63, 48)
>  #define CMDQ_TLBI_1_LEAF             (1UL << 0)
> +#define CMDQ_TLBI_1_TTL                      GENMASK_ULL(9, 8)
> +#define CMDQ_TLBI_1_TG                       GENMASK_ULL(11, 10)
>  #define CMDQ_TLBI_1_VA_MASK          GENMASK_ULL(63, 12)
>  #define CMDQ_TLBI_1_IPA_MASK         GENMASK_ULL(51, 12)
>  
> @@ -455,9 +463,13 @@ struct arm_smmu_cmdq_ent {
>               #define CMDQ_OP_TLBI_S2_IPA     0x2a
>               #define CMDQ_OP_TLBI_NSNH_ALL   0x30
>               struct {
> +                     u8                      num;
> +                     u8                      scale;
>                       u16                     asid;
>                       u16                     vmid;
>                       bool                    leaf;
> +                     u8                      ttl;
> +                     u8                      tg;
>                       u64                     addr;
>               } tlbi;
>  
> @@ -595,6 +607,7 @@ struct arm_smmu_device {
>  #define ARM_SMMU_FEAT_HYP            (1 << 12)
>  #define ARM_SMMU_FEAT_STALL_FORCE    (1 << 13)
>  #define ARM_SMMU_FEAT_VAX            (1 << 14)
> +#define ARM_SMMU_FEAT_RANGE_INV              (1 << 15)
>       u32                             features;
>  
>  #define ARM_SMMU_OPT_SKIP_PREFETCH   (1 << 0)
> @@ -856,13 +869,21 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct 
> arm_smmu_cmdq_ent *ent)
>               cmd[1] |= FIELD_PREP(CMDQ_CFGI_1_RANGE, 31);
>               break;
>       case CMDQ_OP_TLBI_NH_VA:
> +             cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_NUM, ent->tlbi.num);
> +             cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_SCALE, ent->tlbi.scale);
>               cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_ASID, ent->tlbi.asid);
>               cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_LEAF, ent->tlbi.leaf);
> +             cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_TTL, ent->tlbi.ttl);
> +             cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_TG, ent->tlbi.tg);
>               cmd[1] |= ent->tlbi.addr & CMDQ_TLBI_1_VA_MASK;
>               break;
>       case CMDQ_OP_TLBI_S2_IPA:
> +             cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_NUM, ent->tlbi.num);
> +             cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_SCALE, ent->tlbi.scale);
>               cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_VMID, ent->tlbi.vmid);
>               cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_LEAF, ent->tlbi.leaf);
> +             cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_TTL, ent->tlbi.ttl);
> +             cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_TG, ent->tlbi.tg);
>               cmd[1] |= ent->tlbi.addr & CMDQ_TLBI_1_IPA_MASK;
>               break;
>       case CMDQ_OP_TLBI_NH_ASID:
> @@ -2003,7 +2024,7 @@ static void arm_smmu_tlb_inv_range(unsigned long iova, 
> size_t size,
>  {
>       u64 cmds[CMDQ_BATCH_ENTRIES * CMDQ_ENT_DWORDS];
>       struct arm_smmu_device *smmu = smmu_domain->smmu;
> -     unsigned long start = iova, end = iova + size;
> +     unsigned long start = iova, end = iova + size, num_pages = 0, tg = 0;
>       int i = 0;
>       struct arm_smmu_cmdq_ent cmd = {
>               .tlbi = {
> @@ -2022,12 +2043,50 @@ static void arm_smmu_tlb_inv_range(unsigned long 
> iova, size_t size,
>               cmd.tlbi.vmid   = smmu_domain->s2_cfg.vmid;
>       }
>  
> +     if (smmu->features & ARM_SMMU_FEAT_RANGE_INV) {
> +             /* Get the leaf page size */
> +             tg = __ffs(smmu_domain->domain.pgsize_bitmap);
> +
> +             /* Convert page size of 12,14,16 (log2) to 1,2,3 */
> +             cmd.tlbi.tg = ((tg - ilog2(SZ_4K)) / 2) + 1;
> +
> +             /* Determine what level the granule is at */
> +             cmd.tlbi.ttl = 4 - ((ilog2(granule) - 3) / (tg - 3));
> +
> +             num_pages = size / (1UL << tg);
> +     }
> +
>       while (iova < end) {
>               if (i == CMDQ_BATCH_ENTRIES) {
>                       arm_smmu_cmdq_issue_cmdlist(smmu, cmds, i, false);
>                       i = 0;
>               }
>  
> +             if (smmu->features & ARM_SMMU_FEAT_RANGE_INV) {
> +                     /*
> +                      * On each iteration of the loop, the range is 5 bits
> +                      * worth of the aligned size remaining.
> +                      * The range in pages is:
> +                      *
> +                      * range = (num_pages & (0x1f << __ffs(num_pages)))
> +                      */
> +                     unsigned long scale, num;
> +
> +                     /* Determine the power of 2 multiple number of pages */
> +                     scale = __ffs(num_pages);
> +                     cmd.tlbi.scale = scale;
> +
> +                     /* Determine how many chunks of 2^scale size we have */
> +                     num = (num_pages >> scale) & CMDQ_TLBI_RANGE_NUM_MAX;
> +                     cmd.tlbi.num = num - 1;
> +
> +                     /* range is num * 2^scale * pgsize */
> +                     granule = num << (scale + tg);
> +
> +                     /* Clear out the lower order bits for the next 
> iteration */
> +                     num_pages -= num << scale;
Regarding the 2 options given in
https://lore.kernel.org/linux-arm-kernel/CAL_JsqKABoE+0crGwyZdNogNgEoG=moopf6deqgh6s73c0u...@mail.gmail.com/raw,

I understand you implemented 2), but I still do not understand why you
preferred that one over 1).

In your case of 1023*4k pages this will invalidate by 31 32*2^0*4K +
31*2^0*4K pages,
whereas you could achieve that with 10 invalidations with the 1st algorithm.
I did not get the case where it is more efficient. Could you please elaborate?

Also, a question about TG: reading the spec again and again, it says that
the entries to be invalidated were inserted using this
granule size. Here you pick the smallest granule supported by the domain.
Does that mean this was the one actually being used?

Thanks

Eric
> +             }
> +
>               cmd.tlbi.addr = iova;
>               arm_smmu_cmdq_build_cmd(&cmds[i * CMDQ_ENT_DWORDS], &cmd);
>               iova += granule;
> @@ -3449,6 +3508,11 @@ static int arm_smmu_device_hw_probe(struct 
> arm_smmu_device *smmu)
>       if (smmu->sid_bits <= STRTAB_SPLIT)
>               smmu->features &= ~ARM_SMMU_FEAT_2_LVL_STRTAB;
>  
> +     /* IDR3 */
> +     reg = readl_relaxed(smmu->base + ARM_SMMU_IDR3);
> +     if (FIELD_GET(IDR3_RIL, reg))
> +             smmu->features |= ARM_SMMU_FEAT_RANGE_INV;
> +
>       /* IDR5 */
>       reg = readl_relaxed(smmu->base + ARM_SMMU_IDR5);
>  
> 

_______________________________________________
iommu mailing list
[email protected]
https://lists.linuxfoundation.org/mailman/listinfo/iommu

Reply via email to