Compilers estimate the size of an asm() block for inlining purposes. Constructs such as ALTERNATIVE appear large due to the metadata, depsite often only being a handful of instructions. asm inline() overrides the estimation to identify the block as being small.
This has a substantial impact on inlining decisions, expected to be for the better given that the compiler has a more accurate picture to work with. Signed-off-by: Andrew Cooper <andrew.coop...@citrix.com> --- CC: Jan Beulich <jbeul...@suse.com> CC: Roger Pau Monné <roger....@citrix.com> CC: Stefano Stabellini <sstabell...@kernel.org> CC: Julien Grall <jul...@xen.org> CC: Volodymyr Babchuk <volodymyr_babc...@epam.com> CC: Bertrand Marquis <bertrand.marq...@arm.com> CC: Michal Orzel <michal.or...@amd.com> --- xen/Kconfig | 4 +++ xen/arch/arm/include/asm/alternative.h | 4 +-- xen/arch/arm/include/asm/arm64/flushtlb.h | 4 +-- xen/arch/arm/include/asm/arm64/io.h | 43 ++++++++++++++--------- xen/arch/arm/include/asm/cpuerrata.h | 8 ++--- xen/arch/arm/include/asm/cpufeature.h | 8 ++--- xen/arch/arm/include/asm/page.h | 12 ++++--- xen/arch/arm/include/asm/processor.h | 7 ++-- xen/arch/arm/include/asm/sysregs.h | 10 +++--- xen/arch/arm/mmu/p2m.c | 3 +- xen/arch/x86/include/asm/alternative.h | 36 ++++++++++--------- xen/include/xen/compiler.h | 14 ++++++++ 12 files changed, 95 insertions(+), 58 deletions(-) diff --git a/xen/Kconfig b/xen/Kconfig index ae1c401a981e..ab4ab42ae987 100644 --- a/xen/Kconfig +++ b/xen/Kconfig @@ -29,6 +29,10 @@ config LD_IS_GNU config LD_IS_LLVM def_bool $(success,$(LD) --version | head -n 1 | grep -q "^LLD") +config CC_HAS_ASM_INLINE + # GCC >= 9, Clang >= 11 + def_bool $(success,echo 'void foo(void) { asm inline (""); }' | $(CC) -x c - -c -o /dev/null) + # Use -f{function,data}-sections compiler parameters config CC_SPLIT_SECTIONS bool diff --git a/xen/arch/arm/include/asm/alternative.h b/xen/arch/arm/include/asm/alternative.h index 22477d9497a3..1563f03a0f5a 100644 --- a/xen/arch/arm/include/asm/alternative.h +++ b/xen/arch/arm/include/asm/alternative.h @@ -209,9 +209,9 @@ alternative_endif #endif /* __ASSEMBLY__ */ /* - * Usage: asm(ALTERNATIVE(oldinstr, newinstr, feature)); + * Usage: asm_inline (ALTERNATIVE(oldinstr, newinstr, feature)); * - * Usage: asm(ALTERNATIVE(oldinstr, newinstr, feature, CONFIG_FOO)); + * Usage: asm_inline (ALTERNATIVE(oldinstr, newinstr, feature, CONFIG_FOO)); * N.B. If CONFIG_FOO is specified, but not selected, the whole block * will be omitted, including oldinstr. */ diff --git a/xen/arch/arm/include/asm/arm64/flushtlb.h b/xen/arch/arm/include/asm/arm64/flushtlb.h index 45642201d147..3b99c11b50d1 100644 --- a/xen/arch/arm/include/asm/arm64/flushtlb.h +++ b/xen/arch/arm/include/asm/arm64/flushtlb.h @@ -31,7 +31,7 @@ #define TLB_HELPER(name, tlbop, sh) \ static inline void name(void) \ { \ - asm volatile( \ + asm_inline volatile ( \ "dsb " # sh "st;" \ "tlbi " # tlbop ";" \ ALTERNATIVE( \ @@ -55,7 +55,7 @@ static inline void name(void) \ #define TLB_HELPER_VA(name, tlbop) \ static inline void name(vaddr_t va) \ { \ - asm volatile( \ + asm_inline volatile ( \ "tlbi " # tlbop ", %0;" \ ALTERNATIVE( \ "nop; nop;", \ diff --git a/xen/arch/arm/include/asm/arm64/io.h b/xen/arch/arm/include/asm/arm64/io.h index 7d5959877759..ac90b729c44d 100644 --- a/xen/arch/arm/include/asm/arm64/io.h +++ b/xen/arch/arm/include/asm/arm64/io.h @@ -51,40 +51,51 @@ static inline void __raw_writeq(u64 val, volatile void __iomem *addr) static inline u8 __raw_readb(const volatile void __iomem *addr) { u8 val; - asm volatile(ALTERNATIVE("ldrb %w0, [%1]", - "ldarb %w0, [%1]", - ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE) - : "=r" (val) : "r" (addr)); + + asm_inline volatile ( + ALTERNATIVE("ldrb %w0, [%1]", + "ldarb %w0, [%1]", + ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE) + : "=r" (val) : "r" (addr) ); + return val; } static inline u16 __raw_readw(const volatile void __iomem *addr) { u16 val; - asm volatile(ALTERNATIVE("ldrh %w0, [%1]", - "ldarh %w0, [%1]", - ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE) - : "=r" (val) : "r" (addr)); + asm_inline volatile ( + ALTERNATIVE("ldrh %w0, [%1]", + "ldarh %w0, [%1]", + ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE) + : "=r" (val) : "r" (addr) ); + return val; } static inline u32 __raw_readl(const volatile void __iomem *addr) { u32 val; - asm volatile(ALTERNATIVE("ldr %w0, [%1]", - "ldar %w0, [%1]", - ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE) - : "=r" (val) : "r" (addr)); + + asm_inline volatile ( + ALTERNATIVE("ldr %w0, [%1]", + "ldar %w0, [%1]", + ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE) + : "=r" (val) : "r" (addr) ); + return val; } static inline u64 __raw_readq(const volatile void __iomem *addr) { u64 val; - asm volatile(ALTERNATIVE("ldr %0, [%1]", - "ldar %0, [%1]", - ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE) - : "=r" (val) : "r" (addr)); + + asm_inline volatile ( + ALTERNATIVE("ldr %0, [%1]", + "ldar %0, [%1]", + ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE) + : "=r" (val) : "r" (addr) ); + return val; } diff --git a/xen/arch/arm/include/asm/cpuerrata.h b/xen/arch/arm/include/asm/cpuerrata.h index 8d7e7b9375bd..1799a16d7e7f 100644 --- a/xen/arch/arm/include/asm/cpuerrata.h +++ b/xen/arch/arm/include/asm/cpuerrata.h @@ -16,10 +16,10 @@ static inline bool check_workaround_##erratum(void) \ { \ register_t ret; \ \ - asm volatile (ALTERNATIVE("mov %0, #0", \ - "mov %0, #1", \ - feature) \ - : "=r" (ret)); \ + asm_inline volatile ( \ + ALTERNATIVE("mov %0, #0", \ + "mov %0, #1", feature) \ + : "=r" (ret) ); \ \ return unlikely(ret); \ } \ diff --git a/xen/arch/arm/include/asm/cpufeature.h b/xen/arch/arm/include/asm/cpufeature.h index 50297e53d90e..b6df18801166 100644 --- a/xen/arch/arm/include/asm/cpufeature.h +++ b/xen/arch/arm/include/asm/cpufeature.h @@ -102,10 +102,10 @@ static inline bool cpus_have_cap(unsigned int num) #define cpus_have_const_cap(num) ({ \ register_t __ret; \ \ - asm volatile (ALTERNATIVE("mov %0, #0", \ - "mov %0, #1", \ - num) \ - : "=r" (__ret)); \ + asm_inline volatile ( \ + ALTERNATIVE("mov %0, #0", \ + "mov %0, #1", num) \ + : "=r" (__ret) ); \ \ unlikely(__ret); \ }) diff --git a/xen/arch/arm/include/asm/page.h b/xen/arch/arm/include/asm/page.h index 69f817d1e68a..27bc96b9f401 100644 --- a/xen/arch/arm/include/asm/page.h +++ b/xen/arch/arm/include/asm/page.h @@ -176,7 +176,8 @@ static inline int invalidate_dcache_va_range(const void *p, unsigned long size) { size -= dcache_line_bytes - ((uintptr_t)p & cacheline_mask); p = (void *)((uintptr_t)p & ~cacheline_mask); - asm volatile (__clean_and_invalidate_dcache_one(0) : : "r" (p)); + asm_inline volatile ( + __clean_and_invalidate_dcache_one(0) :: "r" (p) ); p += dcache_line_bytes; } @@ -185,7 +186,8 @@ static inline int invalidate_dcache_va_range(const void *p, unsigned long size) asm volatile (__invalidate_dcache_one(0) : : "r" (p + idx)); if ( size > 0 ) - asm volatile (__clean_and_invalidate_dcache_one(0) : : "r" (p + idx)); + asm_inline volatile ( + __clean_and_invalidate_dcache_one(0) :: "r" (p + idx) ); dsb(sy); /* So we know the flushes happen before continuing */ @@ -209,7 +211,7 @@ static inline int clean_dcache_va_range(const void *p, unsigned long size) p = (void *)((uintptr_t)p & ~cacheline_mask); for ( ; size >= dcache_line_bytes; idx += dcache_line_bytes, size -= dcache_line_bytes ) - asm volatile (__clean_dcache_one(0) : : "r" (p + idx)); + asm_inline volatile ( __clean_dcache_one(0) : : "r" (p + idx) ); dsb(sy); /* So we know the flushes happen before continuing */ /* ARM callers assume that dcache_* functions cannot fail. */ return 0; @@ -247,7 +249,7 @@ static inline int clean_and_invalidate_dcache_va_range if ( sizeof(x) > MIN_CACHELINE_BYTES || sizeof(x) > alignof(x) ) \ clean_dcache_va_range(_p, sizeof(x)); \ else \ - asm volatile ( \ + asm_inline volatile ( \ "dsb sy;" /* Finish all earlier writes */ \ __clean_dcache_one(0) \ "dsb sy;" /* Finish flush before continuing */ \ @@ -259,7 +261,7 @@ static inline int clean_and_invalidate_dcache_va_range if ( sizeof(x) > MIN_CACHELINE_BYTES || sizeof(x) > alignof(x) ) \ clean_and_invalidate_dcache_va_range(_p, sizeof(x)); \ else \ - asm volatile ( \ + asm_inline volatile ( \ "dsb sy;" /* Finish all earlier writes */ \ __clean_and_invalidate_dcache_one(0) \ "dsb sy;" /* Finish flush before continuing */ \ diff --git a/xen/arch/arm/include/asm/processor.h b/xen/arch/arm/include/asm/processor.h index 60b587db697f..9cbc4f911061 100644 --- a/xen/arch/arm/include/asm/processor.h +++ b/xen/arch/arm/include/asm/processor.h @@ -607,9 +607,10 @@ register_t get_default_cptr_flags(void); #define SYNCHRONIZE_SERROR(feat) \ do { \ ASSERT(local_abort_is_enabled()); \ - asm volatile(ALTERNATIVE("dsb sy; isb", \ - "nop; nop", feat) \ - : : : "memory"); \ + asm_inline volatile ( \ + ALTERNATIVE("dsb sy; isb", \ + "nop; nop", feat) \ + ::: "memory" ); \ } while (0) /* diff --git a/xen/arch/arm/include/asm/sysregs.h b/xen/arch/arm/include/asm/sysregs.h index 61e30c9e517c..5c2d362be3d8 100644 --- a/xen/arch/arm/include/asm/sysregs.h +++ b/xen/arch/arm/include/asm/sysregs.h @@ -22,11 +22,13 @@ static inline register_t read_sysreg_par(void) * DMB SY before and after accessing it, as part of the workaround for the * errata 1508412. */ - asm volatile(ALTERNATIVE("nop", "dmb sy", ARM64_WORKAROUND_1508412, - CONFIG_ARM64_ERRATUM_1508412)); + asm_inline volatile ( + ALTERNATIVE("nop", "dmb sy", ARM64_WORKAROUND_1508412, + CONFIG_ARM64_ERRATUM_1508412) ); par_el1 = READ_SYSREG64(PAR_EL1); - asm volatile(ALTERNATIVE("nop", "dmb sy", ARM64_WORKAROUND_1508412, - CONFIG_ARM64_ERRATUM_1508412)); + asm_inline volatile ( + ALTERNATIVE("nop", "dmb sy", ARM64_WORKAROUND_1508412, + CONFIG_ARM64_ERRATUM_1508412) ); return par_el1; } diff --git a/xen/arch/arm/mmu/p2m.c b/xen/arch/arm/mmu/p2m.c index 7642dbc7c55b..d96078f547d5 100644 --- a/xen/arch/arm/mmu/p2m.c +++ b/xen/arch/arm/mmu/p2m.c @@ -228,7 +228,8 @@ void p2m_restore_state(struct vcpu *n) * registers associated to EL1/EL0 translations regime have been * synchronized. */ - asm volatile(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_AT_SPECULATE)); + asm_inline volatile ( + ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_AT_SPECULATE) ); WRITE_SYSREG64(p2m->vttbr, VTTBR_EL2); last_vcpu_ran = &p2m->last_vcpu_ran[smp_processor_id()]; diff --git a/xen/arch/x86/include/asm/alternative.h b/xen/arch/x86/include/asm/alternative.h index 38472fb58e2d..4c8be51d0e23 100644 --- a/xen/arch/x86/include/asm/alternative.h +++ b/xen/arch/x86/include/asm/alternative.h @@ -116,12 +116,13 @@ extern void alternative_branches(void); * without volatile and memory clobber. */ #define alternative(oldinstr, newinstr, feature) \ - asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) : : : "memory") + asm_inline volatile ( ALTERNATIVE(oldinstr, newinstr, feature) \ + ::: "memory" ) #define alternative_2(oldinstr, newinstr1, feature1, newinstr2, feature2) \ - asm volatile (ALTERNATIVE_2(oldinstr, newinstr1, feature1, \ - newinstr2, feature2) \ - : : : "memory") + asm_inline volatile ( ALTERNATIVE_2(oldinstr, newinstr1, feature1, \ + newinstr2, feature2) \ + ::: "memory" ) /* * Alternative inline assembly with input. @@ -133,14 +134,14 @@ extern void alternative_branches(void); * If you use variable sized constraints like "m" or "g" in the * replacement make sure to pad to the worst case length. */ -#define alternative_input(oldinstr, newinstr, feature, input...) \ - asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) \ - : : input) +#define alternative_input(oldinstr, newinstr, feature, input...) \ + asm_inline volatile ( ALTERNATIVE(oldinstr, newinstr, feature) \ + :: input ) /* Like alternative_input, but with a single output argument */ -#define alternative_io(oldinstr, newinstr, feature, output, input...) \ - asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) \ - : output : input) +#define alternative_io(oldinstr, newinstr, feature, output, input...) \ + asm_inline volatile ( ALTERNATIVE(oldinstr, newinstr, feature) \ + : output : input ) /* * This is similar to alternative_io. But it has two features and @@ -150,11 +151,11 @@ extern void alternative_branches(void); * Otherwise, if CPU has feature1, newinstr1 is used. * Otherwise, oldinstr is used. */ -#define alternative_io_2(oldinstr, newinstr1, feature1, newinstr2, \ - feature2, output, input...) \ - asm volatile(ALTERNATIVE_2(oldinstr, newinstr1, feature1, \ - newinstr2, feature2) \ - : output : input) +#define alternative_io_2(oldinstr, newinstr1, feature1, newinstr2, \ + feature2, output, input...) \ + asm_inline volatile ( ALTERNATIVE_2(oldinstr, newinstr1, feature1, \ + newinstr2, feature2) \ + : output : input ) /* Use this macro(s) if you need more than one output parameter. */ #define ASM_OUTPUT2(a...) a @@ -234,8 +235,9 @@ extern void alternative_branches(void); rettype ret_; \ register unsigned long r10_ asm("r10"); \ register unsigned long r11_ asm("r11"); \ - asm volatile (ALTERNATIVE("call *%c[addr](%%rip)", "call .", \ - X86_FEATURE_ALWAYS) \ + asm_inline volatile ( \ + ALTERNATIVE("call *%c[addr](%%rip)", "call .", \ + X86_FEATURE_ALWAYS) \ : ALT_CALL ## n ## _OUT, "=a" (ret_), \ "=r" (r10_), "=r" (r11_) ASM_CALL_CONSTRAINT \ : [addr] "i" (&(func)), "g" (func) \ diff --git a/xen/include/xen/compiler.h b/xen/include/xen/compiler.h index c68fab189154..d9b133016bb6 100644 --- a/xen/include/xen/compiler.h +++ b/xen/include/xen/compiler.h @@ -53,6 +53,20 @@ #define unreachable() __builtin_unreachable() #endif +/* + * Compilers estimate the size of an asm() block for inlining purposes. + * Constructs such as ALTERNATIVE appear large due to the metadata, depsite + * often only being a handful of instructions. asm inline() overrides the + * estimation to identify the block as being small. + * + * Note: __inline is needed to avoid getting caught up in INIT_SECTIONS_ONLY. + */ +#if CONFIG_CC_HAS_ASM_INLINE +# define asm_inline asm __inline +#else +# define asm_inline asm +#endif + /* * Add the pseudo keyword 'fallthrough' so case statement blocks * must end with any of these keywords: -- 2.39.5