https://gcc.gnu.org/g:f2609a4c653a4993eec7ed9a2ad05caf936f0178
commit r17-548-gf2609a4c653a4993eec7ed9a2ad05caf936f0178 Author: Roger Sayle <[email protected]> Date: Sat May 16 08:44:06 2026 +0100 x86_64: Handle hard registers in TImode STV with inter-unit moves. This patch extends the types of chains that can be converted by x86's TImode Scalar-To-Vector (STV) pass, to include chains that originate and/or terminate with moves from/to hard registers. Currently STV candidate instructions explicitly exclude those that mention hard registers. As motivation, consider the four following functions: __int128 a, b, c, z; __int128 fun(); void foo_in(__int128 x) { z = (x ^ a ^ b ^ c); } __int128 foo_out() { return (z ^ a ^ b ^ c); } __int128 foo_inout(__int128 x) { return (x ^ a ^ b ^ c ^ z); } void foo_fun() { z = (fun() ^ a ^ b ^ c); } Of these, only the first, foo_in, is currently STV converted to use SSE instructions. Its incoming argument is constructed from a concat of two DImode registers, and support for this idiom was added in a previous STV patch. The next two functions aren't converted because the chain terminates with a return, which places the TImode result in a hard register. Likewise, the final foo_fun case isn't converted as the result from fun initiates a chain from a hard register. This patch supports STV conversion of TImode register-to-register moves, where either the source or the destination (but not both) is a hard register, by implementing it as a (relatively expensive) inter-unit move. 
Before, with -O2 -mavx: foo_out: movq z(%rip), %rax movq z+8(%rip), %rdx xorq a(%rip), %rax xorq a+8(%rip), %rdx xorq b(%rip), %rax xorq b+8(%rip), %rdx xorq c(%rip), %rax xorq c+8(%rip), %rdx ret After, with -O2 -mavx: foo_out: vmovdqa z(%rip), %xmm0 vpxor a(%rip), %xmm0, %xmm0 vpxor b(%rip), %xmm0, %xmm0 vpxor c(%rip), %xmm0, %xmm0 vpextrq $1, %xmm0, %rdx vmovq %xmm0, %rax ret Likewise for foo_fun, before with -O2 -mavx: foo_fun: subq $8, %rsp call fun movq a(%rip), %rsi movq a+8(%rip), %rdi xorq b(%rip), %rsi xorq b+8(%rip), %rdi xorq c(%rip), %rsi xorq c+8(%rip), %rdi xorq %rax, %rsi xorq %rdx, %rdi movq %rsi, z(%rip) movq %rdi, z+8(%rip) addq $8, %rsp ret After with -O2 -mavx: foo_fun: subq $8, %rsp call fun vmovdqa a(%rip), %xmm0 vpxor b(%rip), %xmm0, %xmm0 vmovq %rax, %xmm2 vpxor c(%rip), %xmm0, %xmm0 vpinsrq $1, %rdx, %xmm2, %xmm1 vpxor %xmm1, %xmm0, %xmm0 vmovdqa %xmm0, z(%rip) addq $8, %rsp ret The one small subtlety in this patch is in the cost calculation for inter-unit moves, which now correctly uses both sse_to_integer and integer_to_sse costs. This patch models double word transfers between units as interunit_cost + COSTS_N_INSNS(1), i.e. it assumes that the two transfers are pipelined in parallel, so that the high latency is accounted for once [rather than 2*interunit_cost, which assumes the transfers take place strictly sequentially with twice the single word transfer latency]. This revision implements Hongtao's suggestions/fixes to support TImode values in non-general hard registers, and adds two more test cases. Alas, things turned out to be a little more complicated than originally proposed; previously STV used PUT_MODE on TImode pseudo registers to change their mode everywhere, but something different is required for hard registers, which may be used in multiple modes in a function. 
To demonstrate the (additional) benefits, consider the function: register __int128 x __asm("xmm0"); register __int128 y __asm("xmm1"); __int128 m; void foo() { m = x ^ y; } Previously GCC on x86_64 with -O2 generated: foo: movaps %xmm0, -24(%rsp) movq -24(%rsp), %rax movq -16(%rsp), %rdx movaps %xmm1, -24(%rsp) xorq -24(%rsp), %rax xorq -16(%rsp), %rdx movq %rax, m(%rip) movq %rdx, m+8(%rip) ret With this revised patch, we now generate: foo: movdqa %xmm0, %xmm2 pxor %xmm1, %xmm2 movaps %xmm2, m(%rip) ret 2026-05-16 Roger Sayle <[email protected]> Hongtao Liu <[email protected]> gcc/ChangeLog * config/i386/i386-features.cc (scalar_chain::add_insn): If the chain starts with a register-to-register move from a hard register, then the hard register's defs don't need to be converted. (timode_scalar_chain::compute_convert_gain): Provide costs for hard_reg-to-pseudo and pseudo-to-hard_reg moves. Tweak speed cost of timode_concatdi_p moves. (timode_scalar_chain::convert_insn): Add support for hard_reg-to-pseudo and pseudo-to-hard_reg TImode transfers. (timode_scalar_to_vector_candidate_p): Likewise. gcc/testsuite/ChangeLog * gcc.target/i386/avx-stv-1.c: New test case. * gcc.target/i386/sse2-stv-3.c: Likewise. * gcc.target/i386/sse2-stv-4.c: Likewise. * gcc.target/i386/sse2-stv-5.c: Likewise. 
Diff: --- gcc/config/i386/i386-features.cc | 108 ++++++++++++++++++++++++++--- gcc/testsuite/gcc.target/i386/avx-stv-1.c | 30 ++++++++ gcc/testsuite/gcc.target/i386/sse2-stv-3.c | 31 +++++++++ gcc/testsuite/gcc.target/i386/sse2-stv-4.c | 12 ++++ gcc/testsuite/gcc.target/i386/sse2-stv-5.c | 13 ++++ 5 files changed, 186 insertions(+), 8 deletions(-) diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc index b8d4f672da0a..30c15e63a5e2 100644 --- a/gcc/config/i386/i386-features.cc +++ b/gcc/config/i386/i386-features.cc @@ -506,6 +506,10 @@ scalar_chain::add_insn (bitmap candidates, unsigned int insn_uid, if (def_set) switch (GET_CODE (SET_SRC (def_set))) { + case REG: + if (HARD_REGISTER_P (SET_SRC (def_set))) + return true; + break; case VEC_SELECT: return true; case ZERO_EXTEND: @@ -1641,7 +1645,33 @@ timode_scalar_chain::compute_convert_gain () switch (GET_CODE (src)) { case REG: - if (!speed_p) + if (GENERAL_REGNO_P (REGNO (src))) + { + if (TARGET_AVX) + /* vmovq + vpinsrq */ + igain = speed_p ? -ix86_cost->integer_to_sse + - COSTS_N_INSNS (1) + : -COSTS_N_BYTES (11); + else + /* movq + movq + punpcklqdq */ + igain = speed_p ? -ix86_cost->integer_to_sse + - COSTS_N_INSNS (2) + : -COSTS_N_BYTES (14); + } + else if (GENERAL_REG_P (dst)) + { + if (TARGET_AVX) + /* vpextrq + vmovq */ + igain = speed_p ? -ix86_cost->sse_to_integer + - COSTS_N_INSNS (1) + : -COSTS_N_BYTES (11); + else + /* movhlps + movq + movq */ + igain = speed_p ? -ix86_cost->sse_to_integer + - COSTS_N_INSNS (2) + : -COSTS_N_BYTES (13); + } + else if (!speed_p) igain = MEM_P (dst) ? COSTS_N_BYTES (6) : COSTS_N_BYTES (3); else igain = COSTS_N_INSNS (1); @@ -1680,7 +1710,7 @@ timode_scalar_chain::compute_convert_gain () if (timode_concatdi_p (src)) { /* vmovq;vpinsrq (11 bytes). */ - igain = speed_p ? -2 * ix86_cost->sse_to_integer + igain = speed_p ? 
-ix86_cost->integer_to_sse - COSTS_N_INSNS (1) : -COSTS_N_BYTES (11); break; } @@ -1693,7 +1723,7 @@ timode_scalar_chain::compute_convert_gain () case PLUS: if (timode_concatdi_p (src)) /* vmovq;vpinsrq (11 bytes). */ - igain = speed_p ? -2 * ix86_cost->sse_to_integer + igain = speed_p ? -ix86_cost->integer_to_sse - COSTS_N_INSNS (1) : -COSTS_N_BYTES (11); break; @@ -1963,8 +1993,13 @@ timode_scalar_chain::convert_insn (rtx_insn *insn) case REG: if (GET_MODE (dst) == TImode) { - PUT_MODE (dst, V1TImode); - fix_debug_reg_uses (dst); + if (!HARD_REGISTER_NUM_P (REGNO (dst))) + { + PUT_MODE (dst, V1TImode); + fix_debug_reg_uses (dst); + } + else if (!GENERAL_REGNO_P (REGNO (dst))) + dst = gen_raw_REG (V1TImode, REGNO (dst)); } if (GET_MODE (dst) == V1TImode) { @@ -1988,8 +2023,42 @@ timode_scalar_chain::convert_insn (rtx_insn *insn) case REG: if (GET_MODE (src) == TImode) { - PUT_MODE (src, V1TImode); - fix_debug_reg_uses (src); + if (GENERAL_REGNO_P (REGNO (src))) + { + rtx lo = gen_reg_rtx (DImode); + rtx hi = gen_reg_rtx (DImode); + emit_insn_before (gen_rtx_SET (lo, gen_lowpart (DImode, src)), + insn); + emit_insn_before (gen_rtx_SET (hi, gen_highpart (DImode, src)), + insn); + src = gen_reg_rtx (V2DImode); + emit_insn_before (gen_vec_concatv2di (src, lo, hi), insn); + src = gen_lowpart (V1TImode, src); + } + else if (!HARD_REGISTER_NUM_P (REGNO (src))) + { + PUT_MODE (src, V1TImode); + fix_debug_reg_uses (src); + } + else + src = gen_raw_REG (V1TImode, REGNO (src)); + } + if (GENERAL_REG_P (dst)) + { + rtx tmp = gen_reg_rtx (V2DImode); + src = gen_lowpart (V2DImode, src); + emit_insn_before (gen_rtx_SET (tmp, src), insn); + /* Extracting hi before lo helps register allocation. */ + rtx hi = gen_reg_rtx (DImode); + rtx lo = gen_reg_rtx (DImode); + emit_insn_before (gen_vec_extractv2didi (hi, tmp, const1_rtx), insn); + emit_insn_before (gen_vec_extractv2didi (lo, tmp, const0_rtx), insn); + + /* Construct *concatditi3 pattern from lo and hi. 
*/ + hi = gen_rtx_ZERO_EXTEND (TImode, hi); + hi = gen_rtx_ASHIFT (TImode, hi, GEN_INT (64)); + lo = gen_rtx_ZERO_EXTEND (TImode, lo); + src = gen_rtx_PLUS (TImode, hi, lo); } break; @@ -2453,8 +2522,31 @@ timode_scalar_to_vector_candidate_p (rtx_insn *insn) { rtx def_set = pseudo_reg_set (insn); + /* We allow two exceptions to the pseudo registers only rule. + Setting a hard register from a pseudo, and setting a pseudo + from a hard register. */ if (!def_set) - return false; + { + def_set = single_set (insn); + if (def_set) + { + rtx src = SET_SRC (def_set); + rtx dst = SET_DEST (def_set); + if (GET_MODE (dst) == TImode + && REG_P (src) && REG_P (dst)) + { + if (HARD_REGISTER_P (dst) + && !HARD_REGISTER_P (src) + && single_def_chain_p (src)) + return true; + if (HARD_REGISTER_P (src) + && !HARD_REGISTER_P (dst) + && single_def_chain_p (dst)) + return true; + } + } + return false; + } rtx src = SET_SRC (def_set); rtx dst = SET_DEST (def_set); diff --git a/gcc/testsuite/gcc.target/i386/avx-stv-1.c b/gcc/testsuite/gcc.target/i386/avx-stv-1.c new file mode 100644 index 000000000000..e9dea2d0f4a1 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx-stv-1.c @@ -0,0 +1,30 @@ +/* { dg-do compile { target int128 } } */ +/* { dg-options "-O2 -mavx -mno-stackrealign" } */ + +__int128 a, b, c, z; + +__int128 fun(); + +void foo_in(__int128 x) +{ + z = (x ^ a ^ b ^ c); +} + +__int128 foo_out() +{ + return (z ^ a ^ b ^ c); +} + +__int128 foo_inout(__int128 x) +{ + return (x ^ a ^ b ^ c ^ z); +} + +void foo_fun() +{ + z = (fun() ^ a ^ b ^ c); +} + +/* { dg-final { scan-assembler-times "vpinsrq" 3 } } */ +/* { dg-final { scan-assembler-times "vpextrq" 2 } } */ +/* { dg-final { scan-assembler-times "vpxor" 13 } } */ diff --git a/gcc/testsuite/gcc.target/i386/sse2-stv-3.c b/gcc/testsuite/gcc.target/i386/sse2-stv-3.c new file mode 100644 index 000000000000..0a638013aedb --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/sse2-stv-3.c @@ -0,0 +1,31 @@ +/* { dg-do compile { target 
int128 } } */ +/* { dg-options "-O2 -msse2 -mno-sse4 -mno-stackrealign" } */ + +__int128 a, b, c, z; + +__int128 fun(); + +void foo_in(__int128 x) +{ + z = (x ^ a ^ b ^ c); +} + +__int128 foo_out() +{ + return (z ^ a ^ b ^ c); +} + +__int128 foo_inout(__int128 x) +{ + return (x ^ a ^ b ^ c ^ z); +} + +void foo_fun() +{ + z = (fun() ^ a ^ b ^ c); +} + +/* { dg-final { scan-assembler-times "punpcklqdq" 2 } } */ +/* { dg-final { scan-assembler-times "movhlps" 1 } } */ +/* { dg-final { scan-assembler-times "pxor" 9 } } */ +/* { dg-final { scan-assembler-times "xorq" 8 } } */ diff --git a/gcc/testsuite/gcc.target/i386/sse2-stv-4.c b/gcc/testsuite/gcc.target/i386/sse2-stv-4.c new file mode 100644 index 000000000000..8c655ff9a4bf --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/sse2-stv-4.c @@ -0,0 +1,12 @@ +/* { dg-do compile { target int128 } } */ +/* { dg-options "-O2 -msse2 -mno-sse4 -mno-stackrealign" } */ + +__int128 a, b, c, z; + +void foo() +{ + register __int128 x __asm("xmm0"); + z = (x ^ a ^ b ^ c); +} + +/* { dg-final { scan-assembler-times "pxor" 3 } } */ diff --git a/gcc/testsuite/gcc.target/i386/sse2-stv-5.c b/gcc/testsuite/gcc.target/i386/sse2-stv-5.c new file mode 100644 index 000000000000..b4bbc2bbae57 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/sse2-stv-5.c @@ -0,0 +1,13 @@ +/* { dg-do compile { target int128 } } */ +/* { dg-options "-O2 -msse2 -mno-sse4 -mno-stackrealign" } */ + +__int128 m; + +void foo() +{ + register __int128 x __asm("xmm0"); + register __int128 y __asm("xmm1"); + m = x ^ y; +} + +/* { dg-final { scan-assembler-times "pxor" 1 } } */
