On 12/07/17 14:41, Alexey Dobriyan wrote:
> Mainline kernel can only generate "popcnt rax, rdi" instruction
> with alternative masquareading as function call. Patch allows
> to generate all POPCNT variations and inlines hweigth*() family of functions.
> 
>       $ objdump  -dr ../obj/vmlinux | grep popcnt
>       ffffffff81004f6d:       f3 48 0f b8 c9          popcnt rcx,rcx
>       ffffffff81008484:       f3 48 0f b8 03          popcnt rax,QWORD PTR 
> [rbx]
>       ffffffff81073aae:       f3 48 0f b8 d8          popcnt rbx,rax
>               ...
> 
> Signed-off-by: Alexey Dobriyan <[email protected]>
> ---
>  arch/x86/include/asm/arch_hweight.h | 32 ++++++++++++++++++++++++++++++--
>  arch/x86/lib/Makefile               |  5 ++++-
>  include/linux/bitops.h              |  2 ++
>  lib/Makefile                        |  2 ++
>  scripts/kconfig/cpuid.c             |  6 ++++++
>  scripts/march-native.sh             |  6 +++++-
>  6 files changed, 49 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/x86/include/asm/arch_hweight.h 
> b/arch/x86/include/asm/arch_hweight.h
> index 34a10b2d5b73..58e4f65d8665 100644
> --- a/arch/x86/include/asm/arch_hweight.h
> +++ b/arch/x86/include/asm/arch_hweight.h
> @@ -2,6 +2,34 @@
>  #ifndef _ASM_X86_HWEIGHT_H
>  #define _ASM_X86_HWEIGHT_H
>  
> +#define __HAVE_ARCH_SW_HWEIGHT
> +
> +#ifdef CONFIG_MARCH_NATIVE_POPCNT
> +static inline unsigned int __arch_hweight64(uint64_t x)
> +{
> +     uint64_t rv;
> +     asm ("popcnt %1, %0" : "=r" (rv) : "rm" (x));
> +     return rv;
> +}
> +
> +static inline unsigned int __arch_hweight32(uint32_t x)
> +{
> +     uint32_t rv;
> +     asm ("popcnt %1, %0" : "=r" (rv) : "rm" (x));
> +     return rv;
> +}
> +
> +static inline unsigned int __arch_hweight16(uint16_t x)
> +{
> +     return __arch_hweight32(x);
> +}
> +
> +static inline unsigned int __arch_hweight8(uint8_t x)
> +{
> +     return __arch_hweight32(x);
> +}


-march=native really would be better implemented by examining the macros
generated by gcc which correspond to the selected -m options
(-march=native really just selects a combination of -m options.)  It
seems bizarre to just reimplement the mechanism that already exists.

Now, this specific case would be better done with alternatives; we can
patch in a JMP to an out-of-line stub to mangle the arguments.  Then you
get the benefit on all systems and don't need to decide at compile time.

The reason to use -m options for this would be to be able to use the
__builtin_popcount() and __builtin_popcountl() intrinsics, which would
allow gcc to schedule it and optimize arbitrarily.

So, much more something like:

#ifdef __POPCNT__

static inline unsigned int __arch_hweight64(uint64_t x)
{
        return __builtin_popcountll(x);
}

static inline unsigned int __arch_hweight32(uint32_t x)
{
        return __builtin_popcount(x);
}

#else

/* Assembly code with alternatives */

        /* Enabled alternative */
        popcnt %1, %0

        /* Non-enabled alternative */
        jmp 1f
2:
        .pushsection .altinstr_aux
1:
        pushq %q1               /* pushl %k1 for i386 */
        call ___arch_hweight%z1
        popq %q0                /* popl %k0 for i386 */
        jmp 2b
        .popsection

#endif


The ___arch_hweight[bwlq] functions would have to be written in assembly
with all registers preserved.  The input and output is a common word on
the stack -- 8(%rsp) or 4(%esp) for x86-64 v. i386.

        -hpa


Reply via email to