From: Haseeb Ashraf <[email protected]>

FEAT_nTLBPA (quoting the architectural definition) indicates that the
intermediate caching of translation table walks does not include
non-coherent caches of previous valid translation table entries since
the last completed TLBI applicable to the PE.

As no non-coherent cached entries can persist past the last completed
TLBI, the stage-1 TLBI is not required when performing a stage-2 TLBI.
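
In effect, each stage-1 flush helper reduces to a capability-gated
call; for instance, the arm32 helper in the diff below becomes:

    static inline void flush_guest_tlb_s1(void)
    {
        /*
         * With FEAT_nTLBPA no stale non-coherent entries can exist,
         * so the stage-1 flush can be skipped entirely.
         */
        if ( !cpus_have_const_cap(ARM_HAS_NTLBPA) )
            flush_guest_tlb();
    }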

This feature is optional on both arm32 and arm64.

Suggested-by: Mohamed Mediouni <[email protected]>
Signed-off-by: Haseeb Ashraf <[email protected]>

Changes in v3:
- No functional change in v3; only rebasing on top of the updates in
  commit-1.

Changes in v2:
- This commit was introduced in v2, split out from commit-1 of v1. It
  is implemented using a CPU capability.
---
 xen/arch/arm/cpufeature.c                 | 19 ++++++
 xen/arch/arm/include/asm/arm32/flushtlb.h | 14 +++--
 xen/arch/arm/include/asm/arm64/flushtlb.h | 77 ++++++++++++++++-------
 xen/arch/arm/include/asm/cpufeature.h     | 24 ++++++-
 xen/arch/arm/include/asm/processor.h      |  7 +++
 5 files changed, 109 insertions(+), 32 deletions(-)
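
For reference, the capability is detected from the architectural ID
registers: ID_AA64MMFR1_EL1.nTLBPA (bits [51:48]) on arm64 and
ID_MMFR5.nTLBPA (bits [7:4]) on arm32. A minimal standalone sketch of
the arm64 check (cpu_has_ntlbpa is a hypothetical name, not part of
this patch, which instead goes through system_cpuinfo and the
ARM_HAS_NTLBPA capability):

    #include <stdbool.h>
    #include <stdint.h>

    static inline bool cpu_has_ntlbpa(void)
    {
        uint64_t mmfr1;

        /* Read ID_AA64MMFR1_EL1; nTLBPA lives in bits [51:48]. */
        asm volatile ( "mrs %0, ID_AA64MMFR1_EL1" : "=r" (mmfr1) );

        /* 0x1 means implemented, matching MM64_NTLBPA_SUPPORT_IMP. */
        return ((mmfr1 >> 48) & 0xf) == 0x1;
    }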

diff --git a/xen/arch/arm/cpufeature.c b/xen/arch/arm/cpufeature.c
index 1a80738571..9fa1c45869 100644
--- a/xen/arch/arm/cpufeature.c
+++ b/xen/arch/arm/cpufeature.c
@@ -17,7 +17,19 @@ DECLARE_BITMAP(cpu_hwcaps, ARM_NCAPS);
 
 struct cpuinfo_arm __read_mostly domain_cpuinfo;
 
+#ifdef CONFIG_ARM_32
+static bool has_ntlbpa(const struct arm_cpu_capabilities *entry)
+{
+    return system_cpuinfo.mm32.ntlbpa == MM32_NTLBPA_SUPPORT_IMP;
+}
+#endif
+
 #ifdef CONFIG_ARM_64
+static bool has_ntlbpa(const struct arm_cpu_capabilities *entry)
+{
+    return system_cpuinfo.mm64.ntlbpa == MM64_NTLBPA_SUPPORT_IMP;
+}
+
 static bool has_sb_instruction(const struct arm_cpu_capabilities *entry)
 {
     return system_cpuinfo.isa64.sb;
@@ -25,6 +37,13 @@ static bool has_sb_instruction(const struct arm_cpu_capabilities *entry)
 #endif
 
 static const struct arm_cpu_capabilities arm_features[] = {
+#if defined(CONFIG_ARM_32) || defined(CONFIG_ARM_64)
+    {
+        .desc = "Intermediate caching of translation table walks (nTLBPA)",
+        .capability = ARM_HAS_NTLBPA,
+        .matches = has_ntlbpa,
+    },
+#endif
 #ifdef CONFIG_ARM_64
     {
         .desc = "Speculation barrier instruction (SB)",
diff --git a/xen/arch/arm/include/asm/arm32/flushtlb.h b/xen/arch/arm/include/asm/arm32/flushtlb.h
index 3c0c2123d4..7cff042508 100644
--- a/xen/arch/arm/include/asm/arm32/flushtlb.h
+++ b/xen/arch/arm/include/asm/arm32/flushtlb.h
@@ -49,8 +49,8 @@ TLB_HELPER(flush_xen_tlb_local, TLBIALLH, nsh)
  * Flush TLB of local processor. Use when flush for only stage-1 is intended.
  *
  * The following function should be used where intention is to clear only
- * stage-1 TLBs. This would be helpful in future in identifying which stage-1
- * TLB flushes can be skipped such as in present of FEAT_nTLBPA.
+ * stage-1 TLBs. This would be helpful in identifying which stage-1 TLB flushes
+ * can be skipped, such as in the presence of FEAT_nTLBPA.
  */
 static inline void flush_guest_tlb_s1_local(void)
 {
@@ -60,7 +60,8 @@ static inline void flush_guest_tlb_s1_local(void)
      *
      * See ARMv8 (DDI 0487L.b): G5-11698 Table G5-23.
      */
-    return flush_guest_tlb_local();
+    if ( !cpus_have_const_cap(ARM_HAS_NTLBPA) )
+        flush_guest_tlb_local();
 }
 
 /*
@@ -68,8 +69,8 @@ static inline void flush_guest_tlb_s1_local(void)
  * stage-1 is intended.
  *
  * The following function should be used where intention is to clear only
- * stage-1 TLBs. This would be helpful in future in identifying which stage-1
- * TLB flushes can be skipped such as in present of FEAT_nTLBPA.
+ * stage-1 TLBs. This would be helpful in identifying which stage-1 TLB flushes
+ * can be skipped, such as in the presence of FEAT_nTLBPA.
  */
 static inline void flush_guest_tlb_s1(void)
 {
@@ -79,7 +80,8 @@ static inline void flush_guest_tlb_s1(void)
      *
      * See ARMv8 (DDI 0487L.b): G5-11698 Table G5-23.
      */
-    return flush_guest_tlb();
+    if ( !cpus_have_const_cap(ARM_HAS_NTLBPA) )
+        flush_guest_tlb();
 }
 
 /* Flush TLB of local processor for address va. */
diff --git a/xen/arch/arm/include/asm/arm64/flushtlb.h b/xen/arch/arm/include/asm/arm64/flushtlb.h
index 67ae616993..0f0d5050e5 100644
--- a/xen/arch/arm/include/asm/arm64/flushtlb.h
+++ b/xen/arch/arm/include/asm/arm64/flushtlb.h
@@ -47,6 +47,24 @@ static inline void name(void)                    \
         : : : "memory");                         \
 }
 
+#define TLB_HELPER_NTLBPA(name, tlbop, sh)           \
+static inline void name(void)                        \
+{                                                    \
+    if ( !cpus_have_const_cap(ARM_HAS_NTLBPA) )      \
+        asm_inline volatile (                        \
+            "dsb  "  # sh  "st;"                     \
+            "tlbi "  # tlbop  ";"                    \
+            ALTERNATIVE(                             \
+                "nop; nop;",                         \
+                "dsb  ish;"                          \
+                "tlbi "  # tlbop  ";",               \
+                ARM64_WORKAROUND_REPEAT_TLBI,        \
+                CONFIG_ARM64_WORKAROUND_REPEAT_TLBI) \
+            "dsb  "  # sh  ";"                       \
+            "isb;"                                   \
+            : : : "memory");                         \
+}
+
 /*
  * FLush TLB by VA. This will likely be used in a loop, so the caller
  * is responsible to use the appropriate memory barriers before/after
@@ -75,10 +93,10 @@ TLB_HELPER(flush_guest_tlb_local, vmalls12e1, nsh)
 TLB_HELPER(flush_guest_tlb, vmalls12e1is, ish)
 
 /* Flush local TLBs, current VMID, stage-1 only */
-TLB_HELPER(flush_guest_tlb_s1_local, vmalle1, nsh)
+TLB_HELPER_NTLBPA(flush_guest_tlb_s1_local, vmalle1, nsh)
 
 /* Flush innershareable TLBs, current VMID, stage-1 only */
-TLB_HELPER(flush_guest_tlb_s1, vmalle1is, ish)
+TLB_HELPER_NTLBPA(flush_guest_tlb_s1, vmalle1is, ish)
 
 /* Flush local TLBs, all VMIDs, non-hypervisor mode */
 TLB_HELPER(flush_all_guests_tlb_local, alle1, nsh)
@@ -104,8 +122,6 @@ TLB_HELPER_VA(__flush_xen_tlb_one, vae2is)
  */
 static inline void flush_guest_tlb_range_ipa(paddr_t ipa, unsigned long size)
 {
-    paddr_t end;
-
     /*
      * If IPA range is too big (empirically found to be 256M), then fallback to
      * full TLB flush.
@@ -113,27 +129,42 @@ static inline void flush_guest_tlb_range_ipa(paddr_t ipa, unsigned long size)
     if ( size > SZ_256M )
         return flush_guest_tlb();
 
-    end = ipa + size;
-
-    /*
-     * See ARM ARM DDI 0487L.b D8.17.6.1 (Invalidating TLB entries from stage 2
-     * translations) for details of TLBI sequence.
-     */
-    dsb(ishst); /* Ensure prior page-tables updates have completed */
-    while ( ipa < end )
+    else if ( size > 0 )
     {
-        /* Flush stage-2 TLBs for ipa address */
-        asm_inline volatile (
-            "tlbi ipas2e1is, %0;" : : "r" (ipa >> PAGE_SHIFT) : "memory" );
-        ipa += PAGE_SIZE;
+        paddr_t end = ipa + size;
+
+        /*
+         * See ARM ARM DDI 0487L.b D8.17.6.1 (Invalidating TLB entries from
+         * stage 2 translations) for details of the TLBI sequence.
+         */
+        dsb(ishst); /* Ensure prior page-table updates have completed */
+        while ( ipa < end )
+        {
+            /* Flush stage-2 TLBs for ipa address */
+            asm_inline volatile (
+                "tlbi ipas2e1is, %0;" : : "r" (ipa >> PAGE_SHIFT) : "memory" );
+            ipa += PAGE_SIZE;
+        }
+        if ( cpus_have_const_cap(ARM_HAS_NTLBPA) )
+            asm_inline volatile (
+                ALTERNATIVE(
+                    "nop; nop;",
+                    "dsb  ish;"
+                    "tlbi ipas2e1is, %0;",
+                    ARM64_WORKAROUND_REPEAT_TLBI,
+                    CONFIG_ARM64_WORKAROUND_REPEAT_TLBI)
+                "dsb  ish;"
+                "isb;"
+                : : "r" ((ipa - PAGE_SIZE) >> PAGE_SHIFT) : "memory" );
+        else
+            /*
+             * As ARM64_WORKAROUND_REPEAT_TLBI must be applied to the last
+             * TLBI of the sequence, it only needs to be handled in the
+             * following invocation, which also issues the final dsb() and
+             * isb().
+             */
+            flush_guest_tlb_s1();
     }
-    /*
-     * As ARM64_WORKAROUND_REPEAT_TLBI is required to be applied to last TLBI
-     * of the sequence, it is only needed to be handled in the following
-     * invocation. Final dsb() and isb() are also applied in the following
-     * invocation.
-     */
-    flush_guest_tlb_s1();
 }
 
 #endif /* __ASM_ARM_ARM64_FLUSHTLB_H__ */
diff --git a/xen/arch/arm/include/asm/cpufeature.h b/xen/arch/arm/include/asm/cpufeature.h
index 13353c8e1a..9f796ed4c1 100644
--- a/xen/arch/arm/include/asm/cpufeature.h
+++ b/xen/arch/arm/include/asm/cpufeature.h
@@ -76,8 +76,9 @@
 #define ARM_WORKAROUND_BHB_SMCC_3 15
 #define ARM_HAS_SB 16
 #define ARM64_WORKAROUND_1508412 17
+#define ARM_HAS_NTLBPA 18
 
-#define ARM_NCAPS           18
+#define ARM_NCAPS           19
 
 #ifndef __ASSEMBLER__
 
@@ -269,7 +270,8 @@ struct cpuinfo_arm {
             unsigned long ets:4;
             unsigned long __res1:4;
             unsigned long afp:4;
-            unsigned long __res2:12;
+            unsigned long ntlbpa:4;
+            unsigned long __res2:8;
             unsigned long ecbhb:4;
 
             /* MMFR2 */
@@ -430,8 +432,24 @@ struct cpuinfo_arm {
         register_t bits[1];
     } aux32;
 
-    struct {
+    union {
         register_t bits[6];
+        struct {
+            /* MMFR0 */
+            unsigned long __res0:32;
+            /* MMFR1 */
+            unsigned long __res1:32;
+            /* MMFR2 */
+            unsigned long __res2:32;
+            /* MMFR3 */
+            unsigned long __res3:32;
+            /* MMFR4 */
+            unsigned long __res4:32;
+            /* MMFR5 */
+            unsigned long __res5:4;
+            unsigned long ntlbpa:4;
+            unsigned long __res6:24;
+        };
     } mm32;
 
     struct {
diff --git a/xen/arch/arm/include/asm/processor.h b/xen/arch/arm/include/asm/processor.h
index 1a48c9ff3b..85f3b643a0 100644
--- a/xen/arch/arm/include/asm/processor.h
+++ b/xen/arch/arm/include/asm/processor.h
@@ -459,9 +459,16 @@
 /* FSR long format */
 #define FSRL_STATUS_DEBUG       (_AC(0x22,UL)<<0)
 
+#ifdef CONFIG_ARM_32
+#define MM32_NTLBPA_SUPPORT_NI      0x0
+#define MM32_NTLBPA_SUPPORT_IMP     0x1
+#endif
+
 #ifdef CONFIG_ARM_64
 #define MM64_VMID_8_BITS_SUPPORT    0x0
 #define MM64_VMID_16_BITS_SUPPORT   0x2
+#define MM64_NTLBPA_SUPPORT_NI      0x0
+#define MM64_NTLBPA_SUPPORT_IMP     0x1
 #endif
 
 #ifndef __ASSEMBLER__
-- 
2.43.0