It has existed in x86 CPUs since 2008, so we're only 16 years late adding support. With all the other scafolding in place, implement arch_hweightl() for x86.
The only complication is that the call to arch_generic_hweightl() is behind the compilers back. Address this by writing it in ASM and ensure that it preserves all registers. Copy the code generation from generic_hweightl(). It's not a complicated algorithm, and is easy to regenerate if needs be, but cover it with the same unit tests as test_generic_hweightl() just for piece of mind. Signed-off-by: Andrew Cooper <andrew.coop...@citrix.com> --- CC: Jan Beulich <jbeul...@suse.com> CC: Roger Pau Monné <roger....@citrix.com> A few RFC points. * I throught we had an x86 general lib-y but I can't find one, hence why it's still in xen/lib/ for now. * When we up the minimum toolchain to GCC 7 / Clang 5, we can use a __attribute__((no_caller_saved_registers)) and can forgo writing this in asm. GCC seems to need extra help, and wants -mgeneral-regs-only too. It has a habit of complaining about incompatible instructions even when it's not emitting them. --- xen/arch/x86/include/asm/bitops.h | 21 ++++++++++++++ xen/lib/Makefile | 1 + xen/lib/arch-generic-hweightl.S | 46 +++++++++++++++++++++++++++++++ xen/lib/generic-hweightl.c | 15 ++++++++++ 4 files changed, 83 insertions(+) create mode 100644 xen/lib/arch-generic-hweightl.S diff --git a/xen/arch/x86/include/asm/bitops.h b/xen/arch/x86/include/asm/bitops.h index 642d8e58b288..0db698ed3f4c 100644 --- a/xen/arch/x86/include/asm/bitops.h +++ b/xen/arch/x86/include/asm/bitops.h @@ -6,6 +6,7 @@ */ #include <asm/alternative.h> +#include <asm/asm_defns.h> #include <asm/cpufeatureset.h> /* @@ -475,4 +476,24 @@ static always_inline unsigned int arch_flsl(unsigned long x) } #define arch_flsl arch_flsl +static always_inline unsigned int arch_hweightl(unsigned long x) +{ + unsigned int r; + + /* + * arch_generic_hweightl() is written in ASM in order to preserve all + * registers, as the compiler can't see the call. + * + * This limits the POPCNT instruction to using the same ABI as a function + * call (input in %rdi, output in %eax) but that's fine. + */ + alternative_io("call arch_generic_hweightl", + "popcnt %[val], %q[res]", X86_FEATURE_POPCNT, + ASM_OUTPUT2([res] "=a" (r) ASM_CALL_CONSTRAINT), + [val] "D" (x)); + + return r; +} +#define arch_hweightl arch_hweightl + #endif /* _X86_BITOPS_H */ diff --git a/xen/lib/Makefile b/xen/lib/Makefile index b6558e108bd9..84d731dc0ac8 100644 --- a/xen/lib/Makefile +++ b/xen/lib/Makefile @@ -1,5 +1,6 @@ obj-$(CONFIG_X86) += x86/ +lib-$(CONFIG_X86) += arch-generic-hweightl.o lib-y += bsearch.o lib-y += ctors.o lib-y += ctype.o diff --git a/xen/lib/arch-generic-hweightl.S b/xen/lib/arch-generic-hweightl.S new file mode 100644 index 000000000000..15c6e3394845 --- /dev/null +++ b/xen/lib/arch-generic-hweightl.S @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + + .file __FILE__ + +#include <xen/linkage.h> + +/* + * An implementation of generic_hweightl() used on hardware without the POPCNT + * instruction. + * + * This function is called from within an ALTERNATIVE in arch_hweightl(). + * i.e. behind the back of the compiler. Therefore all registers are callee + * preserved. + * + * The ASM is what GCC-12 emits for generic_hweightl() in a release build of + * Xen, with spilling of %rdi/%rdx to preserve the callers registers. + */ +FUNC(arch_generic_hweightl) + push %rdi + push %rdx + + movabs $0x5555555555555555, %rdx + mov %rdi, %rax + shr $1, %rax + and %rdx, %rax + sub %rax, %rdi + movabs $0x3333333333333333, %rax + mov %rdi, %rdx + shr $0x2, %rdi + and %rax, %rdx + and %rax, %rdi + add %rdi, %rdx + mov %rdx, %rax + shr $0x4, %rax + add %rdx, %rax + movabs $0xf0f0f0f0f0f0f0f, %rdx + and %rdx, %rax + movabs $0x101010101010101, %rdx + imul %rdx, %rax + shr $0x38, %rax + + pop %rdx + pop %rdi + + ret +END(arch_generic_hweightl) diff --git a/xen/lib/generic-hweightl.c b/xen/lib/generic-hweightl.c index fa4bbec273ab..4b39dd84de5e 100644 --- a/xen/lib/generic-hweightl.c +++ b/xen/lib/generic-hweightl.c @@ -43,4 +43,19 @@ static void __init __constructor test_generic_hweightl(void) RUNTIME_CHECK(generic_hweightl, 1 | (1UL << (BITS_PER_LONG - 1)), 2); RUNTIME_CHECK(generic_hweightl, -1UL, BITS_PER_LONG); } + +#ifdef CONFIG_X86 +unsigned int arch_generic_hweightl(unsigned long); +static void __init __constructor test_arch_generic_hweightl(void) +{ + RUNTIME_CHECK(arch_generic_hweightl, 0, 0); + RUNTIME_CHECK(arch_generic_hweightl, 1, 1); + RUNTIME_CHECK(arch_generic_hweightl, 3, 2); + RUNTIME_CHECK(arch_generic_hweightl, 7, 3); + RUNTIME_CHECK(arch_generic_hweightl, 0xff, 8); + + RUNTIME_CHECK(arch_generic_hweightl, 1 | (1UL << (BITS_PER_LONG - 1)), 2); + RUNTIME_CHECK(arch_generic_hweightl, -1UL, BITS_PER_LONG); +} +#endif /* CONFIG_X86 */ #endif /* CONFIG_SELF_TESTS */ -- 2.39.2