Hi! Like any other instruction with a 32-bit GPR destination operand in 64-bit mode, popcntl also clears the upper 32 bits of the register (and other bits too, since it can only return values from 0 to 32 inclusive).
During combine, the zero or sign extensions of it show up as a paradoxical subreg of the popcount & 63, where 63 is the smallest (power of two - 1) mask that can represent all values from 0 to 32 inclusive. Fixed thusly, bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk? 2020-01-30 Jakub Jelinek <ja...@redhat.com> PR target/91824 * config/i386/i386.md (*popcountsi2_zext): New define_insn_and_split. (*popcountsi2_zext_falsedep): New define_insn. * gcc.target/i386/pr91824-1.c: New test. --- gcc/config/i386/i386.md.jj 2020-01-29 09:35:05.786248027 +0100 +++ gcc/config/i386/i386.md 2020-01-29 16:18:09.924717021 +0100 @@ -14563,6 +14563,60 @@ (define_insn "*popcount<mode>2_falsedep" (set_attr "type" "bitmanip") (set_attr "mode" "<MODE>")]) +(define_insn_and_split "*popcountsi2_zext" + [(set (match_operand:DI 0 "register_operand" "=r") + (and:DI + (subreg:DI + (popcount:SI + (match_operand:SI 1 "nonimmediate_operand" "rm")) 0) + (const_int 63))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_POPCNT && TARGET_64BIT" +{ +#if TARGET_MACHO + return "popcnt\t{%1, %k0|%k0, %1}"; +#else + return "popcnt{l}\t{%1, %k0|%k0, %1}"; +#endif +} + "&& TARGET_AVOID_FALSE_DEP_FOR_BMI && epilogue_completed + && optimize_function_for_speed_p (cfun) + && !reg_mentioned_p (operands[0], operands[1])" + [(parallel + [(set (match_dup 0) + (and:DI (subreg:DI (popcount:SI (match_dup 1)) 0) (const_int 63))) + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP) + (clobber (reg:CC FLAGS_REG))])] + "ix86_expand_clear (operands[0]);" + [(set_attr "prefix_rep" "1") + (set_attr "type" "bitmanip") + (set_attr "mode" "SI")]) + +; False dependency happens when destination is only updated by tzcnt, +; lzcnt or popcnt. There is no false dependency when destination is +; also used in source. 
+(define_insn "*popcountsi2_zext_falsedep" + [(set (match_operand:DI 0 "register_operand" "=r") + (and:DI + (subreg:DI + (popcount:SI + (match_operand:SI 1 "nonimmediate_operand" "rm")) 0) + (const_int 63))) + (unspec [(match_operand:DI 2 "register_operand" "0")] + UNSPEC_INSN_FALSE_DEP) + (clobber (reg:CC FLAGS_REG))] + "TARGET_POPCNT && TARGET_64BIT" +{ +#if TARGET_MACHO + return "popcnt\t{%1, %k0|%k0, %1}"; +#else + return "popcnt{l}\t{%1, %k0|%k0, %1}"; +#endif +} + [(set_attr "prefix_rep" "1") + (set_attr "type" "bitmanip") + (set_attr "mode" "SI")]) + (define_insn_and_split "*popcounthi2_1" [(set (match_operand:SI 0 "register_operand") (popcount:SI --- gcc/testsuite/gcc.target/i386/pr91824-1.c.jj 2020-01-29 16:23:13.290186089 +0100 +++ gcc/testsuite/gcc.target/i386/pr91824-1.c 2020-01-29 16:23:32.095905212 +0100 @@ -0,0 +1,54 @@ +/* PR target/91824 */ +/* { dg-do compile { target lp64 } } */ +/* { dg-options "-O2 -mpopcnt" } */ +/* { dg-final { scan-assembler-not "cltq" } } */ + +unsigned int foo (void); + +unsigned long +f1 (unsigned int x) +{ + return __builtin_popcount (x); +} + +unsigned long +f2 (unsigned int x) +{ + return (unsigned) __builtin_popcount (x); +} + +unsigned long +f3 (unsigned int x) +{ + return __builtin_popcount (x) & 63ULL; +} + +unsigned long +f4 (unsigned int x) +{ + return __builtin_popcount (x) & 1023ULL; +} + +unsigned long +f5 (void) +{ + return __builtin_popcount (foo ()); +} + +unsigned long +f6 (void) +{ + return (unsigned) __builtin_popcount (foo ()); +} + +unsigned long +f7 (void) +{ + return __builtin_popcount (foo ()) & 63ULL; +} + +unsigned long +f8 (void) +{ + return __builtin_popcount (foo ()) & 1023ULL; +} Jakub