Compilers estimate the size of an asm() block for inlining purposes.

Constructs such as ALTERNATIVE appear large due to their metadata, despite
often expanding to only a handful of instructions.  asm inline() overrides
the estimate, identifying the block to the compiler as being small.

This has a substantial impact on inlining decisions; the change is expected
to be for the better, as the compiler now has a more accurate picture to
work with.
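As an illustration (a minimal sketch, not code from this patch;
has_feature_x() and FEATURE_X are placeholder names), a capability check
like the one below puts a single mov into .text, yet the ALTERNATIVE
expansion also emits section metadata which the compiler counts towards
the asm() block's size:

    static inline bool has_feature_x(void)
    {
        register_t ret;

        /*
         * Plain asm() here looks large to the compiler's size heuristic;
         * asm_inline marks the block as small, so callers of this
         * function remain candidates for inlining.
         */
        asm_inline volatile (
            ALTERNATIVE("mov %0, #0",
                        "mov %0, #1", FEATURE_X)
            : "=r" (ret) );

        return unlikely(ret);
    }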

Signed-off-by: Andrew Cooper <andrew.coop...@citrix.com>
---
CC: Jan Beulich <jbeul...@suse.com>
CC: Roger Pau Monné <roger....@citrix.com>
CC: Stefano Stabellini <sstabell...@kernel.org>
CC: Julien Grall <jul...@xen.org>
CC: Volodymyr Babchuk <volodymyr_babc...@epam.com>
CC: Bertrand Marquis <bertrand.marq...@arm.com>
CC: Michal Orzel <michal.or...@amd.com>
---
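For reference, the CC_HAS_ASM_INLINE probe below amounts to compiling this
one-liner (shown standalone for clarity); compilers predating support
(GCC < 9, Clang < 11) reject it, in which case asm_inline degrades to
plain asm:

    /* Standalone form of the $(success,...) probe. */
    void foo(void) { asm inline (""); }
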
 xen/Kconfig                               |  4 +++
 xen/arch/arm/include/asm/alternative.h    |  4 +--
 xen/arch/arm/include/asm/arm64/flushtlb.h |  4 +--
 xen/arch/arm/include/asm/arm64/io.h       | 43 ++++++++++++++---------
 xen/arch/arm/include/asm/cpuerrata.h      |  8 ++---
 xen/arch/arm/include/asm/cpufeature.h     |  8 ++---
 xen/arch/arm/include/asm/page.h           | 12 ++++---
 xen/arch/arm/include/asm/processor.h      |  7 ++--
 xen/arch/arm/include/asm/sysregs.h        | 10 +++---
 xen/arch/arm/mmu/p2m.c                    |  3 +-
 xen/arch/x86/include/asm/alternative.h    | 36 ++++++++++---------
 xen/include/xen/compiler.h                | 14 ++++++++
 12 files changed, 95 insertions(+), 58 deletions(-)

diff --git a/xen/Kconfig b/xen/Kconfig
index ae1c401a981e..ab4ab42ae987 100644
--- a/xen/Kconfig
+++ b/xen/Kconfig
@@ -29,6 +29,10 @@ config LD_IS_GNU
 config LD_IS_LLVM
        def_bool $(success,$(LD) --version | head -n 1 | grep -q "^LLD")
 
+config CC_HAS_ASM_INLINE
+       # GCC >= 9, Clang >= 11
+       def_bool $(success,echo 'void foo(void) { asm inline (""); }' | $(CC) 
-x c - -c -o /dev/null)
+
 # Use -f{function,data}-sections compiler parameters
 config CC_SPLIT_SECTIONS
        bool
diff --git a/xen/arch/arm/include/asm/alternative.h b/xen/arch/arm/include/asm/alternative.h
index 22477d9497a3..1563f03a0f5a 100644
--- a/xen/arch/arm/include/asm/alternative.h
+++ b/xen/arch/arm/include/asm/alternative.h
@@ -209,9 +209,9 @@ alternative_endif
 #endif  /*  __ASSEMBLY__  */
 
 /*
- * Usage: asm(ALTERNATIVE(oldinstr, newinstr, feature));
+ * Usage: asm_inline (ALTERNATIVE(oldinstr, newinstr, feature));
  *
- * Usage: asm(ALTERNATIVE(oldinstr, newinstr, feature, CONFIG_FOO));
+ * Usage: asm_inline (ALTERNATIVE(oldinstr, newinstr, feature, CONFIG_FOO));
  * N.B. If CONFIG_FOO is specified, but not selected, the whole block
  *      will be omitted, including oldinstr.
  */
diff --git a/xen/arch/arm/include/asm/arm64/flushtlb.h b/xen/arch/arm/include/asm/arm64/flushtlb.h
index 45642201d147..3b99c11b50d1 100644
--- a/xen/arch/arm/include/asm/arm64/flushtlb.h
+++ b/xen/arch/arm/include/asm/arm64/flushtlb.h
@@ -31,7 +31,7 @@
 #define TLB_HELPER(name, tlbop, sh)              \
 static inline void name(void)                    \
 {                                                \
-    asm volatile(                                \
+    asm_inline volatile (                        \
         "dsb  "  # sh  "st;"                     \
         "tlbi "  # tlbop  ";"                    \
         ALTERNATIVE(                             \
@@ -55,7 +55,7 @@ static inline void name(void)                    \
 #define TLB_HELPER_VA(name, tlbop)               \
 static inline void name(vaddr_t va)              \
 {                                                \
-    asm volatile(                                \
+    asm_inline volatile (                        \
         "tlbi "  # tlbop  ", %0;"                \
         ALTERNATIVE(                             \
             "nop; nop;",                         \
diff --git a/xen/arch/arm/include/asm/arm64/io.h b/xen/arch/arm/include/asm/arm64/io.h
index 7d5959877759..ac90b729c44d 100644
--- a/xen/arch/arm/include/asm/arm64/io.h
+++ b/xen/arch/arm/include/asm/arm64/io.h
@@ -51,40 +51,51 @@ static inline void __raw_writeq(u64 val, volatile void __iomem *addr)
 static inline u8 __raw_readb(const volatile void __iomem *addr)
 {
         u8 val;
-        asm volatile(ALTERNATIVE("ldrb %w0, [%1]",
-                                 "ldarb %w0, [%1]",
-                                 ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE)
-                     : "=r" (val) : "r" (addr));
+
+        asm_inline volatile (
+            ALTERNATIVE("ldrb %w0, [%1]",
+                        "ldarb %w0, [%1]",
+                        ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE)
+            : "=r" (val) : "r" (addr) );
+
         return val;
 }
 
 static inline u16 __raw_readw(const volatile void __iomem *addr)
 {
         u16 val;
-        asm volatile(ALTERNATIVE("ldrh %w0, [%1]",
-                                 "ldarh %w0, [%1]",
-                                 ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE)
-                     : "=r" (val) : "r" (addr));
+        asm_inline volatile (
+            ALTERNATIVE("ldrh %w0, [%1]",
+                        "ldarh %w0, [%1]",
+                        ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE)
+            : "=r" (val) : "r" (addr) );
+
         return val;
 }
 
 static inline u32 __raw_readl(const volatile void __iomem *addr)
 {
         u32 val;
-        asm volatile(ALTERNATIVE("ldr %w0, [%1]",
-                                 "ldar %w0, [%1]",
-                                 ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE)
-                     : "=r" (val) : "r" (addr));
+
+        asm_inline volatile (
+            ALTERNATIVE("ldr %w0, [%1]",
+                        "ldar %w0, [%1]",
+                        ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE)
+            : "=r" (val) : "r" (addr) );
+
         return val;
 }
 
 static inline u64 __raw_readq(const volatile void __iomem *addr)
 {
         u64 val;
-        asm volatile(ALTERNATIVE("ldr %0, [%1]",
-                                 "ldar %0, [%1]",
-                                 ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE)
-                     : "=r" (val) : "r" (addr));
+
+        asm_inline volatile (
+            ALTERNATIVE("ldr %0, [%1]",
+                        "ldar %0, [%1]",
+                        ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE)
+            : "=r" (val) : "r" (addr) );
+
         return val;
 }
 
diff --git a/xen/arch/arm/include/asm/cpuerrata.h b/xen/arch/arm/include/asm/cpuerrata.h
index 8d7e7b9375bd..1799a16d7e7f 100644
--- a/xen/arch/arm/include/asm/cpuerrata.h
+++ b/xen/arch/arm/include/asm/cpuerrata.h
@@ -16,10 +16,10 @@ static inline bool check_workaround_##erratum(void)    \
     {                                                           \
         register_t ret;                                         \
                                                                 \
-        asm volatile (ALTERNATIVE("mov %0, #0",                 \
-                                  "mov %0, #1",                 \
-                                  feature)                      \
-                      : "=r" (ret));                            \
+        asm_inline volatile (                                   \
+            ALTERNATIVE("mov %0, #0",                           \
+                        "mov %0, #1", feature)                  \
+            : "=r" (ret) );                                     \
                                                                 \
         return unlikely(ret);                                   \
     }                                                           \
diff --git a/xen/arch/arm/include/asm/cpufeature.h b/xen/arch/arm/include/asm/cpufeature.h
index 50297e53d90e..b6df18801166 100644
--- a/xen/arch/arm/include/asm/cpufeature.h
+++ b/xen/arch/arm/include/asm/cpufeature.h
@@ -102,10 +102,10 @@ static inline bool cpus_have_cap(unsigned int num)
 #define cpus_have_const_cap(num) ({                 \
         register_t __ret;                           \
                                                     \
-        asm volatile (ALTERNATIVE("mov %0, #0",     \
-                                  "mov %0, #1",     \
-                                  num)              \
-                      : "=r" (__ret));              \
+        asm_inline volatile (                       \
+            ALTERNATIVE("mov %0, #0",               \
+                        "mov %0, #1", num)          \
+            : "=r" (__ret) );                       \
                                                     \
         unlikely(__ret);                            \
         })
diff --git a/xen/arch/arm/include/asm/page.h b/xen/arch/arm/include/asm/page.h
index 69f817d1e68a..27bc96b9f401 100644
--- a/xen/arch/arm/include/asm/page.h
+++ b/xen/arch/arm/include/asm/page.h
@@ -176,7 +176,8 @@ static inline int invalidate_dcache_va_range(const void *p, unsigned long size)
     {
         size -= dcache_line_bytes - ((uintptr_t)p & cacheline_mask);
         p = (void *)((uintptr_t)p & ~cacheline_mask);
-        asm volatile (__clean_and_invalidate_dcache_one(0) : : "r" (p));
+        asm_inline volatile (
+            __clean_and_invalidate_dcache_one(0) :: "r" (p) );
         p += dcache_line_bytes;
     }
 
@@ -185,7 +186,8 @@ static inline int invalidate_dcache_va_range(const void *p, unsigned long size)
         asm volatile (__invalidate_dcache_one(0) : : "r" (p + idx));
 
     if ( size > 0 )
-        asm volatile (__clean_and_invalidate_dcache_one(0) : : "r" (p + idx));
+        asm_inline volatile (
+            __clean_and_invalidate_dcache_one(0) :: "r" (p + idx) );
 
     dsb(sy);           /* So we know the flushes happen before continuing */
 
@@ -209,7 +211,7 @@ static inline int clean_dcache_va_range(const void *p, unsigned long size)
     p = (void *)((uintptr_t)p & ~cacheline_mask);
     for ( ; size >= dcache_line_bytes;
             idx += dcache_line_bytes, size -= dcache_line_bytes )
-        asm volatile (__clean_dcache_one(0) : : "r" (p + idx));
+        asm_inline volatile ( __clean_dcache_one(0) : : "r" (p + idx) );
     dsb(sy);           /* So we know the flushes happen before continuing */
     /* ARM callers assume that dcache_* functions cannot fail. */
     return 0;
@@ -247,7 +249,7 @@ static inline int clean_and_invalidate_dcache_va_range
     if ( sizeof(x) > MIN_CACHELINE_BYTES || sizeof(x) > alignof(x) )    \
         clean_dcache_va_range(_p, sizeof(x));                           \
     else                                                                \
-        asm volatile (                                                  \
+        asm_inline volatile (                                           \
             "dsb sy;"   /* Finish all earlier writes */                 \
             __clean_dcache_one(0)                                       \
             "dsb sy;"   /* Finish flush before continuing */            \
@@ -259,7 +261,7 @@ static inline int clean_and_invalidate_dcache_va_range
     if ( sizeof(x) > MIN_CACHELINE_BYTES || sizeof(x) > alignof(x) )    \
         clean_and_invalidate_dcache_va_range(_p, sizeof(x));            \
     else                                                                \
-        asm volatile (                                                  \
+        asm_inline volatile (                                           \
             "dsb sy;"   /* Finish all earlier writes */                 \
             __clean_and_invalidate_dcache_one(0)                        \
             "dsb sy;"   /* Finish flush before continuing */            \
diff --git a/xen/arch/arm/include/asm/processor.h b/xen/arch/arm/include/asm/processor.h
index 60b587db697f..9cbc4f911061 100644
--- a/xen/arch/arm/include/asm/processor.h
+++ b/xen/arch/arm/include/asm/processor.h
@@ -607,9 +607,10 @@ register_t get_default_cptr_flags(void);
 #define SYNCHRONIZE_SERROR(feat)                                  \
     do {                                                          \
         ASSERT(local_abort_is_enabled());                         \
-        asm volatile(ALTERNATIVE("dsb sy; isb",                   \
-                                 "nop; nop", feat)                \
-                                 : : : "memory");                 \
+        asm_inline volatile (                                     \
+            ALTERNATIVE("dsb sy; isb",                            \
+                        "nop; nop", feat)                         \
+            ::: "memory" );                                       \
     } while (0)
 
 /*
diff --git a/xen/arch/arm/include/asm/sysregs.h b/xen/arch/arm/include/asm/sysregs.h
index 61e30c9e517c..5c2d362be3d8 100644
--- a/xen/arch/arm/include/asm/sysregs.h
+++ b/xen/arch/arm/include/asm/sysregs.h
@@ -22,11 +22,13 @@ static inline register_t read_sysreg_par(void)
      * DMB SY before and after accessing it, as part of the workaround for the
      * errata 1508412.
      */
-    asm volatile(ALTERNATIVE("nop", "dmb sy", ARM64_WORKAROUND_1508412,
-                 CONFIG_ARM64_ERRATUM_1508412));
+    asm_inline volatile (
+        ALTERNATIVE("nop", "dmb sy", ARM64_WORKAROUND_1508412,
+                    CONFIG_ARM64_ERRATUM_1508412) );
     par_el1 = READ_SYSREG64(PAR_EL1);
-    asm volatile(ALTERNATIVE("nop", "dmb sy", ARM64_WORKAROUND_1508412,
-                 CONFIG_ARM64_ERRATUM_1508412));
+    asm_inline volatile (
+        ALTERNATIVE("nop", "dmb sy", ARM64_WORKAROUND_1508412,
+                    CONFIG_ARM64_ERRATUM_1508412) );
 
     return par_el1;
 }
diff --git a/xen/arch/arm/mmu/p2m.c b/xen/arch/arm/mmu/p2m.c
index 7642dbc7c55b..d96078f547d5 100644
--- a/xen/arch/arm/mmu/p2m.c
+++ b/xen/arch/arm/mmu/p2m.c
@@ -228,7 +228,8 @@ void p2m_restore_state(struct vcpu *n)
      * registers associated to EL1/EL0 translations regime have been
      * synchronized.
      */
-    asm volatile(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_AT_SPECULATE));
+    asm_inline volatile (
+        ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_AT_SPECULATE) );
     WRITE_SYSREG64(p2m->vttbr, VTTBR_EL2);
 
     last_vcpu_ran = &p2m->last_vcpu_ran[smp_processor_id()];
diff --git a/xen/arch/x86/include/asm/alternative.h b/xen/arch/x86/include/asm/alternative.h
index 38472fb58e2d..4c8be51d0e23 100644
--- a/xen/arch/x86/include/asm/alternative.h
+++ b/xen/arch/x86/include/asm/alternative.h
@@ -116,12 +116,13 @@ extern void alternative_branches(void);
  * without volatile and memory clobber.
  */
 #define alternative(oldinstr, newinstr, feature)                        \
-        asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) : : : "memory")
+    asm_inline volatile ( ALTERNATIVE(oldinstr, newinstr, feature)      \
+                          ::: "memory" )
 
 #define alternative_2(oldinstr, newinstr1, feature1, newinstr2, feature2) \
-       asm volatile (ALTERNATIVE_2(oldinstr, newinstr1, feature1,      \
-                                   newinstr2, feature2)                \
-                     : : : "memory")
+    asm_inline volatile ( ALTERNATIVE_2(oldinstr, newinstr1, feature1,  \
+                                        newinstr2, feature2)            \
+                          ::: "memory" )
 
 /*
  * Alternative inline assembly with input.
@@ -133,14 +134,14 @@ extern void alternative_branches(void);
  * If you use variable sized constraints like "m" or "g" in the
  * replacement make sure to pad to the worst case length.
  */
-#define alternative_input(oldinstr, newinstr, feature, input...)       \
-       asm volatile (ALTERNATIVE(oldinstr, newinstr, feature)          \
-                     : : input)
+#define alternative_input(oldinstr, newinstr, feature, input...)        \
+    asm_inline volatile ( ALTERNATIVE(oldinstr, newinstr, feature)      \
+                          :: input )
 
 /* Like alternative_input, but with a single output argument */
-#define alternative_io(oldinstr, newinstr, feature, output, input...)  \
-       asm volatile (ALTERNATIVE(oldinstr, newinstr, feature)          \
-                     : output : input)
+#define alternative_io(oldinstr, newinstr, feature, output, input...)   \
+    asm_inline volatile ( ALTERNATIVE(oldinstr, newinstr, feature)      \
+                          : output : input )
 
 /*
  * This is similar to alternative_io. But it has two features and
@@ -150,11 +151,11 @@ extern void alternative_branches(void);
  * Otherwise, if CPU has feature1, newinstr1 is used.
  * Otherwise, oldinstr is used.
  */
-#define alternative_io_2(oldinstr, newinstr1, feature1, newinstr2,     \
-                        feature2, output, input...)                    \
-       asm volatile(ALTERNATIVE_2(oldinstr, newinstr1, feature1,       \
-                                  newinstr2, feature2)                 \
-                    : output : input)
+#define alternative_io_2(oldinstr, newinstr1, feature1, newinstr2,      \
+                         feature2, output, input...)                    \
+    asm_inline volatile ( ALTERNATIVE_2(oldinstr, newinstr1, feature1,  \
+                                        newinstr2, feature2)            \
+                          : output : input )
 
 /* Use this macro(s) if you need more than one output parameter. */
 #define ASM_OUTPUT2(a...) a
@@ -234,8 +235,9 @@ extern void alternative_branches(void);
     rettype ret_;                                                  \
     register unsigned long r10_ asm("r10");                        \
     register unsigned long r11_ asm("r11");                        \
-    asm volatile (ALTERNATIVE("call *%c[addr](%%rip)", "call .",   \
-                              X86_FEATURE_ALWAYS)                  \
+    asm_inline volatile (                                          \
+        ALTERNATIVE("call *%c[addr](%%rip)", "call .",             \
+                    X86_FEATURE_ALWAYS)                            \
                   : ALT_CALL ## n ## _OUT, "=a" (ret_),            \
                     "=r" (r10_), "=r" (r11_) ASM_CALL_CONSTRAINT   \
                   : [addr] "i" (&(func)), "g" (func)               \
diff --git a/xen/include/xen/compiler.h b/xen/include/xen/compiler.h
index c68fab189154..d9b133016bb6 100644
--- a/xen/include/xen/compiler.h
+++ b/xen/include/xen/compiler.h
@@ -53,6 +53,20 @@
 #define unreachable() __builtin_unreachable()
 #endif
 
+/*
+ * Compilers estimate the size of an asm() block for inlining purposes.
+ * Constructs such as ALTERNATIVE appear large due to the metadata, despite
+ * often only being a handful of instructions.  asm inline() overrides the
+ * estimation to identify the block as being small.
+ *
+ * Note: __inline is needed to avoid getting caught up in INIT_SECTIONS_ONLY.
+ */
+#ifdef CONFIG_CC_HAS_ASM_INLINE
+# define asm_inline asm __inline
+#else
+# define asm_inline asm
+#endif
+
 /*
  * Add the pseudo keyword 'fallthrough' so case statement blocks
  * must end with any of these keywords:
-- 
2.39.5

