Hi!

As mentioned in the PR, in some cases (one or both *movqi_internal operands
in %dil/%sil/%bpl/%spl and neither in %r*b) the movl variant is smaller than
movb, and in cases where movb and movl are the same size, we might as well
choose the faster instruction.
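For reference, a sketch of the byte encodings involved (sizes follow from the
standard x86-64 encoding rules; the REX prefix required merely to name
%sil/%dil/%bpl/%spl as byte registers is what makes the movb form longer in
the first case):

  movb %sil, %cl   # 40 88 f1 - 3 bytes (REX prefix needed for %sil)
  movl %esi, %ecx  # 89 f1    - 2 bytes
  movb %dl, %cl    # 88 d1    - 2 bytes, same size as movl %edx, %ecx (89 d1)
  movb %r8b, %cl   # 44 88 c1 - 3 bytes, same size as movl %r8d, %ecx (44 89 c1)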
Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2017-09-20  Jakub Jelinek  <ja...@redhat.com>

	PR target/82260
	* config/i386/i386.md (*movqi_internal): Replace =q <- q
	alternative with =Q <- Q, =R <- R and =r <- r alternatives, only
	enable the latter two for 64-bit, renumber alternatives, for -Os
	imov =q <- n alternative always use QI mode, for -Os imov
	=R <- R alternative always use SI mode, for imov =Q <- Q or
	=r <- r alternatives ignore -Os.

	* gcc.target/i386/pr82260-1.c: New test.
	* gcc.target/i386/pr82260-2.c: New test.

--- gcc/config/i386/i386.md.jj	2017-09-20 10:46:08.000000000 +0200
+++ gcc/config/i386/i386.md	2017-09-20 15:15:19.365142707 +0200
@@ -2571,9 +2571,9 @@ (define_insn "*movhi_internal"
 
 (define_insn "*movqi_internal"
   [(set (match_operand:QI 0 "nonimmediate_operand"
-			"=q,q ,q ,r,r ,?r,m ,k,k,r,m,k")
+			"=Q,R,r,q,q,r,r ,?r,m ,k,k,r,m,k")
 	(match_operand:QI 1 "general_operand"
-			"q ,qn,qm,q,rn,qm,qn,r,k,k,k,m"))]
+			"Q ,R,r,n,m,q,rn, m,qn,r,k,k,k,m"))]
   "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
 {
   static char buf[128];
@@ -2589,17 +2589,17 @@ (define_insn "*movqi_internal"
     case TYPE_MSKMOV:
       switch (which_alternative)
 	{
-	case 7:
+	case 9:
 	  ops = "kmov%s\t{%%k1, %%0|%%0, %%k1}";
 	  break;
-	case 9:
+	case 11:
 	  ops = "kmov%s\t{%%1, %%k0|%%k0, %%1}";
 	  break;
-	case 10:
-	case 11:
+	case 12:
+	case 13:
 	  gcc_assert (TARGET_AVX512DQ);
 	  /* FALLTHRU */
-	case 8:
+	case 10:
 	  ops = "kmov%s\t{%%1, %%0|%%0, %%1}";
 	  break;
 	default:
@@ -2619,51 +2619,67 @@ (define_insn "*movqi_internal"
     }
 }
   [(set (attr "isa")
-     (if_then_else (eq_attr "alternative" "10,11")
-       (const_string "avx512dq")
-       (const_string "*")))
+     (cond [(eq_attr "alternative" "1,2")
+	      (const_string "x64")
+	    (eq_attr "alternative" "12,13")
+	      (const_string "avx512dq")
+	   ]
+	   (const_string "*")))
   (set (attr "type")
-     (cond [(eq_attr "alternative" "7,8,9,10,11")
+     (cond [(eq_attr "alternative" "9,10,11,12,13")
 	      (const_string "mskmov")
-	    (and (eq_attr "alternative" "5")
+	    (and (eq_attr "alternative" "7")
 		 (not (match_operand:QI 1 "aligned_operand")))
 	      (const_string "imovx")
 	    (match_test "optimize_function_for_size_p (cfun)")
 	      (const_string "imov")
-	    (and (eq_attr "alternative" "3")
+	    (and (eq_attr "alternative" "5")
 		 (ior (not (match_test "TARGET_PARTIAL_REG_STALL"))
 		      (not (match_test "TARGET_QIMODE_MATH"))))
 	      (const_string "imov")
-	    (eq_attr "alternative" "3,5")
+	    (eq_attr "alternative" "5,7")
 	      (const_string "imovx")
 	    (and (match_test "TARGET_MOVX")
-		 (eq_attr "alternative" "2"))
+		 (eq_attr "alternative" "4"))
 	      (const_string "imovx")
 	   ]
 	   (const_string "imov")))
   (set (attr "prefix")
-     (if_then_else (eq_attr "alternative" "7,8,9")
+     (if_then_else (eq_attr "alternative" "9,10,11")
       (const_string "vex")
      (const_string "orig")))
   (set (attr "mode")
-	(cond [(eq_attr "alternative" "3,4,5")
+	(cond [(eq_attr "alternative" "5,6,7")
 		 (const_string "SI")
-	       (eq_attr "alternative" "6")
+	       (eq_attr "alternative" "8")
 		 (const_string "QI")
-	       (and (eq_attr "alternative" "7,8,9")
+	       (and (eq_attr "alternative" "9,10,11")
 		    (not (match_test "TARGET_AVX512DQ")))
 		 (const_string "HI")
 	       (eq_attr "type" "imovx")
 		 (const_string "SI")
+	       ;; For -Os, 8-bit immediates are always shorter than 32-bit
+	       ;; ones.
+	       (and (eq_attr "type" "imov")
+		    (and (eq_attr "alternative" "3")
+			 (match_test "optimize_function_for_size_p (cfun)")))
+		 (const_string "QI")
+	       ;; For -Os, movl where one or both operands are NON_Q_REGS
+	       ;; and both are LEGACY_REGS is shorter than movb.
+	       ;; Otherwise movb and movl sizes are the same, so decide purely
+	       ;; based on speed factors.
+	       (and (eq_attr "type" "imov")
+		    (and (eq_attr "alternative" "1")
+			 (match_test "optimize_function_for_size_p (cfun)")))
+		 (const_string "SI")
 	       (and (eq_attr "type" "imov")
-		    (and (eq_attr "alternative" "0,1")
+		    (and (eq_attr "alternative" "0,1,2,3")
 			 (and (match_test "TARGET_PARTIAL_REG_DEPENDENCY")
-			      (and (not (match_test "optimize_function_for_size_p (cfun)"))
-				   (not (match_test "TARGET_PARTIAL_REG_STALL"))))))
+			      (not (match_test "TARGET_PARTIAL_REG_STALL")))))
 		 (const_string "SI")
 	       ;; Avoid partial register stalls when not using QImode arithmetic
 	       (and (eq_attr "type" "imov")
-		    (and (eq_attr "alternative" "0,1")
+		    (and (eq_attr "alternative" "0,1,2,3")
 			 (and (match_test "TARGET_PARTIAL_REG_STALL")
 			      (not (match_test "TARGET_QIMODE_MATH")))))
 		 (const_string "SI")
--- gcc/testsuite/gcc.target/i386/pr82260-1.c.jj	2017-09-20 15:27:43.696823321 +0200
+++ gcc/testsuite/gcc.target/i386/pr82260-1.c	2017-09-20 15:34:45.913536355 +0200
@@ -0,0 +1,26 @@
+/* PR target/82260 */
+/* { dg-do compile { target lp64 } } */
+/* { dg-options "-Os -mtune=generic -masm=att" } */
+/* movl %esi, %ecx is shorter than movb %sil, %cl.  While
+   movl %edx, %ecx is the same size as movb %dl, %cl and
+   movl %r8d, %ecx is the same size as movb %r8b, %cl, movl
+   is faster on contemporary CPUs.  */
+/* { dg-final { scan-assembler-not {\mmovb\M} } } */
+
+int
+foo (int x, int c)
+{
+  return x >> c;
+}
+
+int
+bar (int x, int y, int z)
+{
+  return x >> z;
+}
+
+int
+baz (int x, int y, int z, int u, int v)
+{
+  return x >> v;
+}
--- gcc/testsuite/gcc.target/i386/pr82260-2.c.jj	2017-09-20 15:30:51.690469279 +0200
+++ gcc/testsuite/gcc.target/i386/pr82260-2.c	2017-09-20 15:35:31.358967291 +0200
@@ -0,0 +1,25 @@
+/* PR target/82260 */
+/* { dg-do compile { target lp64 } } */
+/* { dg-options "-Os -mtune=generic -masm=att -mtune-ctrl=^partial_reg_dependency" } */
+/* { dg-final { scan-assembler-not {\mmovb\t%sil, %cl} } } */
+/* { dg-final { scan-assembler {\mmovl\t%esi, %ecx} } } */
+/* { dg-final { scan-assembler {\mmovb\t%dl, %cl} } } */
+/* { dg-final { scan-assembler {\mmovb\t%r8b, %cl} } } */
+
+int
+foo (int x, int c)
+{
+  return x >> c;
+}
+
+int
+bar (int x, int y, int z)
+{
+  return x >> z;
+}
+
+int
+baz (int x, int y, int z, int u, int v)
+{
+  return x >> v;
+}

	Jakub