Improve the expansion of 64-bit immediates that can be created from a bitmask immediate followed by 2 MOVKs, so that they are generated in 3 instructions. This reduces the number of 4-instruction immediates in SPECINT/FP by 10-15%.
Passes regress, OK for commit? gcc/ChangeLog: PR target/106583 * config/aarch64/aarch64.cc (aarch64_internal_mov_immediate) Add support for a bitmask immediate with 2 MOVKs. gcc/testsuite: PR target/106583 * gcc.target/aarch64/pr106583.c: Add new test. --- diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index 926e81f028c82aac9a5fecc18f921f84399c24ae..1601d11710cb6132c80a77bb4fe2f8429519aa5a 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -5568,7 +5568,7 @@ aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate, one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) + ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0); - if (zero_match != 2 && one_match != 2) + if (zero_match < 2 && one_match < 2) { /* Try emitting a bitmask immediate with a movk replacing 16 bits. For a 64-bit bitmask try whether changing 16 bits to all ones or @@ -5600,6 +5600,43 @@ aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate, } } + /* Try a bitmask plus 2 movk to generate the immediate in 3 instructions. */ + if (zero_match + one_match == 0) + { + mask = 0xffffffff; + + for (i = 0; i < 64; i += 16) + { + val2 = val & ~mask; + if (aarch64_bitmask_imm (val2, mode)) + break; + val2 = val | mask; + if (aarch64_bitmask_imm (val2, mode)) + break; + val2 = val2 & ~mask; + val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask); + if (aarch64_bitmask_imm (val2, mode)) + break; + + mask = (mask << 16) | (mask >> 48); + } + + if (i != 64) + { + if (generate) + { + emit_insn (gen_rtx_SET (dest, GEN_INT (val2))); + emit_insn (gen_insv_immdi (dest, GEN_INT (i), + GEN_INT ((val >> i) & 0xffff))); + i = (i + 16) & 63; + emit_insn (gen_insv_immdi (dest, GEN_INT (i), + GEN_INT ((val >> i) & 0xffff))); + } + + return 3; + } + } + /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which are emitted by the initial mov. If one_match > zero_match, skip set bits, otherwise skip zero bits. 
*/ diff --git a/gcc/testsuite/gcc.target/aarch64/pr106583.c b/gcc/testsuite/gcc.target/aarch64/pr106583.c new file mode 100644 index 0000000000000000000000000000000000000000..f0a027a0950e506d4ddaacce5e151f57070948dc --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/pr106583.c @@ -0,0 +1,30 @@ +/* { dg-do assemble } */ +/* { dg-options "-O2 --save-temps" } */ + +long f1 (void) +{ + return 0x7efefefefefefeff; +} + +long f2 (void) +{ + return 0x12345678aaaaaaaa; +} + +long f3 (void) +{ + return 0x1234cccccccc5678; +} + +long f4 (void) +{ + return 0x7777123456787777; +} + +long f5 (void) +{ + return 0x5555555512345678; +} + +/* { dg-final { scan-assembler-times {\tmovk\t} 10 } } */ +/* { dg-final { scan-assembler-times {\tmov\t} 5 } } */