[gcc r17-548] x86_64: Handle hard registers in TImode STV with inter-unit moves.

Roger Sayle via Gcc-cvs Sat, 16 May 2026 00:46:25 -0700

https://gcc.gnu.org/g:f2609a4c653a4993eec7ed9a2ad05caf936f0178


commit r17-548-gf2609a4c653a4993eec7ed9a2ad05caf936f0178
Author: Roger Sayle <[email protected]>
Date:   Sat May 16 08:44:06 2026 +0100

    x86_64: Handle hard registers in TImode STV with inter-unit moves.
    
    This patch extends the types of chains that can be converted by x86's
    TImode Scalar-To-Vector (STV) pass, to include chains that originate
    and/or terminate with moves from/to hard registers.  Currently STV
    candidate instructions explicitly exclude those that mention hard
    registers.
    
    As motivation, consider the four following functions:
    
    __int128 a, b, c, z;
    __int128 fun();
    
    void foo_in(__int128 x) { z = (x ^ a ^ b ^ c); }
    
    __int128 foo_out() { return (z ^ a ^ b ^ c); }
    
    __int128 foo_inout(__int128 x) { return (x ^ a ^ b ^ c ^ z); }
    
    void foo_fun() { z = (fun() ^ a ^ b ^ c); }
    
    Of these, only the first, foo_in, is currently STV converted to use
    SSE instructions.  Its incoming argument is constructed from a concat
    of two DImode registers, and support for this idiom was added in a
    previous STV patch.  The next two functions aren't converted because
    the chain terminates with a return, which places the TImode result in
    a hard register.  Likewise, the final foo_fun case isn't converted as
    the result from fun initiates a chain from a hard register.
    
    This patch supports STV conversion of TImode register-to-register
    moves, where either the source or the destination (but not both) is
    a hard register, by implementing it as a (relatively expensive)
    inter-unit move.
    
    Before, with -O2 -mavx:
    
    foo_out:
            movq    z(%rip), %rax
            movq    z+8(%rip), %rdx
            xorq    a(%rip), %rax
            xorq    a+8(%rip), %rdx
            xorq    b(%rip), %rax
            xorq    b+8(%rip), %rdx
            xorq    c(%rip), %rax
            xorq    c+8(%rip), %rdx
            ret
    
    After, with -O2 -mavx:
    
    foo_out:
            vmovdqa z(%rip), %xmm0
            vpxor   a(%rip), %xmm0, %xmm0
            vpxor   b(%rip), %xmm0, %xmm0
            vpxor   c(%rip), %xmm0, %xmm0
            vpextrq $1, %xmm0, %rdx
            vmovq   %xmm0, %rax
            ret
    
    Likewise for foo_fun, before with -O2 -mavx:
    
    foo_fun:
            subq    $8, %rsp
            call    fun
            movq    a(%rip), %rsi
            movq    a+8(%rip), %rdi
            xorq    b(%rip), %rsi
            xorq    b+8(%rip), %rdi
            xorq    c(%rip), %rsi
            xorq    c+8(%rip), %rdi
            xorq    %rax, %rsi
            xorq    %rdx, %rdi
            movq    %rsi, z(%rip)
            movq    %rdi, z+8(%rip)
            addq    $8, %rsp
            ret
    
    After with -O2 -mavx:
    
    foo_fun:
            subq    $8, %rsp
            call    fun
            vmovdqa a(%rip), %xmm0
            vpxor   b(%rip), %xmm0, %xmm0
            vmovq   %rax, %xmm2
            vpxor   c(%rip), %xmm0, %xmm0
            vpinsrq $1, %rdx, %xmm2, %xmm1
            vpxor   %xmm1, %xmm0, %xmm0
            vmovdqa %xmm0, z(%rip)
            addq    $8, %rsp
            ret
    
    The one small subtlety in this patch is in the cost calculation
    for inter-unit moves, which now correctly uses both sse_to_integer
    and integer_to_sse costs.  This patch models double word
    transfers between units as interunit_cost + COSTS_N_INSNS(1),
    i.e. that the two transfers are pipelined in parallel, so that the
    high latency is accounted for once [rather than 2*interunit_cost
    that assumes the transfers take place strictly sequentially with
    twice the single word transfer latency].
    
    This revision implements Hongtao's suggestions/fixes to support
    TImode values in non-general hard registers, and adds two more
    test cases.  Alas things turned out to be a little more complicated
    than originally proposed; previously STV used PUT_MODE on TImode
    pseudo registers to change their mode everywhere, but something
    different is required for hard registers, which may be used in
    multiple modes in a function.
    
    To demonstrate the (additional) benefits, consider the function:
    
    register __int128 x __asm("xmm0");
    register __int128 y __asm("xmm1");
    __int128 m;
    
    void foo()
    {
      m = x ^ y;
    }
    
    Previously GCC on x86_64 with -O2 generated:
    
    foo:    movaps  %xmm0, -24(%rsp)
            movq    -24(%rsp), %rax
            movq    -16(%rsp), %rdx
            movaps  %xmm1, -24(%rsp)
            xorq    -24(%rsp), %rax
            xorq    -16(%rsp), %rdx
            movq    %rax, m(%rip)
            movq    %rdx, m+8(%rip)
            ret
    
    With this revised patch, we now generate:
    
    foo:    movdqa  %xmm0, %xmm2
            pxor    %xmm1, %xmm2
            movaps  %xmm2, m(%rip)
            ret
    
    2026-05-16  Roger Sayle  <[email protected]>
                Hongtao Liu  <[email protected]>
    
    gcc/ChangeLog
            * config/i386/i386-features.cc (scalar_chain): If the chain
            starts with a register-to-register move from a hard register,
            then the hard register's defs don't need to be converted.
            (timode_scalar_chain::compute_convert_gain): Provide costs
            for hard_reg-to-pseudo and pseudo-to-hard_reg moves.
            Tweak speed cost of timode_concatdi_p moves.
            (timode_scalar_chain::convert_insn): Add support for
            hard_reg-to-pseudo and pseudo-to-hard_reg TImode transfers.
            (timode_scalar_to_vector_candidate_p): Likewise.
    
    gcc/testsuite/ChangeLog
            * gcc.target/i386/avx-stv-1.c: New test case.
            * gcc.target/i386/sse2-stv-3.c: Likewise.
            * gcc.target/i386/sse2-stv-4.c: Likewise.
            * gcc.target/i386/sse2-stv-5.c: Likewise.

Diff:
---
 gcc/config/i386/i386-features.cc           | 108 ++++++++++++++++++++++++++---
 gcc/testsuite/gcc.target/i386/avx-stv-1.c  |  30 ++++++++
 gcc/testsuite/gcc.target/i386/sse2-stv-3.c |  31 +++++++++
 gcc/testsuite/gcc.target/i386/sse2-stv-4.c |  12 ++++
 gcc/testsuite/gcc.target/i386/sse2-stv-5.c |  13 ++++
 5 files changed, 186 insertions(+), 8 deletions(-)

diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index b8d4f672da0a..30c15e63a5e2 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -506,6 +506,10 @@ scalar_chain::add_insn (bitmap candidates, unsigned int insn_uid,
   if (def_set)
     switch (GET_CODE (SET_SRC (def_set)))
       {
+      case REG:
+       if (HARD_REGISTER_P (SET_SRC (def_set)))
+         return true;
+       break;
       case VEC_SELECT:
        return true;
       case ZERO_EXTEND:
@@ -1641,7 +1645,33 @@ timode_scalar_chain::compute_convert_gain ()
       switch (GET_CODE (src))
        {
        case REG:
-         if (!speed_p)
+         if (GENERAL_REGNO_P (REGNO (src)))
+           {
+             if (TARGET_AVX)
+               /* vmovq + vpinsrq */
+               igain = speed_p ? -ix86_cost->integer_to_sse
+                                 - COSTS_N_INSNS (1)
+                               : -COSTS_N_BYTES (11);
+             else
+               /* movq + movq + punpcklqdq */
+               igain = speed_p ? -ix86_cost->integer_to_sse
+                                 - COSTS_N_INSNS (2)
+                               : -COSTS_N_BYTES (14);
+           }
+         else if (GENERAL_REG_P (dst))
+           {
+             if (TARGET_AVX)
+               /* vpextrq + vmovq */
+               igain = speed_p ? -ix86_cost->sse_to_integer
+                                 - COSTS_N_INSNS (1)
+                               : -COSTS_N_BYTES (11);
+             else
+               /* movhlps + movq + movq */
+               igain = speed_p ? -ix86_cost->sse_to_integer
+                                 - COSTS_N_INSNS (2)
+                               : -COSTS_N_BYTES (13);
+           }
+         else if (!speed_p)
            igain = MEM_P (dst) ? COSTS_N_BYTES (6) : COSTS_N_BYTES (3);
          else
            igain = COSTS_N_INSNS (1);
@@ -1680,7 +1710,7 @@ timode_scalar_chain::compute_convert_gain ()
          if (timode_concatdi_p (src))
            {
              /* vmovq;vpinsrq (11 bytes).  */
-             igain = speed_p ? -2 * ix86_cost->sse_to_integer
+             igain = speed_p ? -ix86_cost->integer_to_sse - COSTS_N_INSNS (1)
                              : -COSTS_N_BYTES (11);
              break;
            }
@@ -1693,7 +1723,7 @@ timode_scalar_chain::compute_convert_gain ()
        case PLUS:
          if (timode_concatdi_p (src))
            /* vmovq;vpinsrq (11 bytes).  */
-           igain = speed_p ? -2 * ix86_cost->sse_to_integer
+           igain = speed_p ? -ix86_cost->integer_to_sse - COSTS_N_INSNS (1)
                            : -COSTS_N_BYTES (11);
          break;
 
@@ -1963,8 +1993,13 @@ timode_scalar_chain::convert_insn (rtx_insn *insn)
     case REG:
       if (GET_MODE (dst) == TImode)
        {
-         PUT_MODE (dst, V1TImode);
-         fix_debug_reg_uses (dst);
+         if (!HARD_REGISTER_NUM_P (REGNO (dst)))
+           {
+             PUT_MODE (dst, V1TImode);
+             fix_debug_reg_uses (dst);
+           }
+         else if (!GENERAL_REGNO_P (REGNO (dst)))
+           dst = gen_raw_REG (V1TImode, REGNO (dst));
        }
       if (GET_MODE (dst) == V1TImode)
        {
@@ -1988,8 +2023,42 @@ timode_scalar_chain::convert_insn (rtx_insn *insn)
     case REG:
       if (GET_MODE (src) == TImode)
        {
-         PUT_MODE (src, V1TImode);
-         fix_debug_reg_uses (src);
+         if (GENERAL_REGNO_P (REGNO (src)))
+           {
+             rtx lo = gen_reg_rtx (DImode);
+             rtx hi = gen_reg_rtx (DImode);
+             emit_insn_before (gen_rtx_SET (lo, gen_lowpart (DImode, src)),
+                               insn);
+             emit_insn_before (gen_rtx_SET (hi, gen_highpart (DImode, src)),
+                               insn);
+             src = gen_reg_rtx (V2DImode);
+             emit_insn_before (gen_vec_concatv2di (src, lo, hi), insn);
+             src = gen_lowpart (V1TImode, src);
+           }
+         else if (!HARD_REGISTER_NUM_P (REGNO (src)))
+           {
+             PUT_MODE (src, V1TImode);
+             fix_debug_reg_uses (src);
+           }
+         else
+           src = gen_raw_REG (V1TImode, REGNO (src));
+       }
+      if (GENERAL_REG_P (dst))
+       {
+         rtx tmp = gen_reg_rtx (V2DImode);
+         src = gen_lowpart (V2DImode, src);
+         emit_insn_before (gen_rtx_SET (tmp, src), insn);
+         /* Extracting hi before lo helps register allocation.  */
+         rtx hi = gen_reg_rtx (DImode);
+         rtx lo = gen_reg_rtx (DImode);
+         emit_insn_before (gen_vec_extractv2didi (hi, tmp, const1_rtx), insn);
+         emit_insn_before (gen_vec_extractv2didi (lo, tmp, const0_rtx), insn);
+
+         /* Construct *concatditi3 pattern from lo and hi.  */
+         hi = gen_rtx_ZERO_EXTEND (TImode, hi);
+         hi = gen_rtx_ASHIFT (TImode, hi, GEN_INT (64));
+         lo = gen_rtx_ZERO_EXTEND (TImode, lo);
+         src = gen_rtx_PLUS (TImode, hi, lo);
        }
       break;
 
@@ -2453,8 +2522,31 @@ timode_scalar_to_vector_candidate_p (rtx_insn *insn)
 {
   rtx def_set = pseudo_reg_set (insn);
 
+  /* We allow two exceptions to the pseudo registers only rule.
+     Setting a hard register from a pseudo, and setting a pseudo
+     from a hard register.  */
   if (!def_set)
-    return false;
+    {
+      def_set = single_set (insn);
+      if (def_set)
+       {
+         rtx src = SET_SRC (def_set);
+         rtx dst = SET_DEST (def_set);
+         if (GET_MODE (dst) == TImode
+             && REG_P (src) && REG_P (dst))
+           {
+             if (HARD_REGISTER_P (dst)
+                 && !HARD_REGISTER_P (src)
+                 && single_def_chain_p (src))
+               return true;
+             if (HARD_REGISTER_P (src)
+                 && !HARD_REGISTER_P (dst)
+                 && single_def_chain_p (dst))
+               return true;
+           }
+       }
+      return false;
+    }
 
   rtx src = SET_SRC (def_set);
   rtx dst = SET_DEST (def_set);
diff --git a/gcc/testsuite/gcc.target/i386/avx-stv-1.c b/gcc/testsuite/gcc.target/i386/avx-stv-1.c
new file mode 100644
index 000000000000..e9dea2d0f4a1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx-stv-1.c
@@ -0,0 +1,30 @@
+/* { dg-do compile { target int128 } } */
+/* { dg-options "-O2 -mavx -mno-stackrealign" } */
+
+__int128 a, b, c, z;
+
+__int128 fun();
+
+void foo_in(__int128 x)
+{
+  z = (x ^ a ^ b ^ c);
+}
+
+__int128 foo_out()
+{
+  return (z ^ a ^ b ^ c);
+}
+
+__int128 foo_inout(__int128 x)
+{
+  return (x ^ a ^ b ^ c ^ z);
+}
+
+void foo_fun()
+{
+  z = (fun() ^ a ^ b ^ c);
+}
+
+/* { dg-final { scan-assembler-times "vpinsrq" 3 } } */
+/* { dg-final { scan-assembler-times "vpextrq" 2 } } */
+/* { dg-final { scan-assembler-times "vpxor" 13 } } */
diff --git a/gcc/testsuite/gcc.target/i386/sse2-stv-3.c b/gcc/testsuite/gcc.target/i386/sse2-stv-3.c
new file mode 100644
index 000000000000..0a638013aedb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-stv-3.c
@@ -0,0 +1,31 @@
+/* { dg-do compile { target int128 } } */
+/* { dg-options "-O2 -msse2 -mno-sse4 -mno-stackrealign" } */
+
+__int128 a, b, c, z;
+
+__int128 fun();
+
+void foo_in(__int128 x)
+{
+  z = (x ^ a ^ b ^ c);
+}
+
+__int128 foo_out()
+{
+  return (z ^ a ^ b ^ c);
+}
+
+__int128 foo_inout(__int128 x)
+{
+  return (x ^ a ^ b ^ c ^ z);
+}
+
+void foo_fun()
+{
+  z = (fun() ^ a ^ b ^ c);
+}
+
+/* { dg-final { scan-assembler-times "punpcklqdq" 2 } } */
+/* { dg-final { scan-assembler-times "movhlps" 1 } } */
+/* { dg-final { scan-assembler-times "pxor" 9 } } */
+/* { dg-final { scan-assembler-times "xorq" 8 } } */
diff --git a/gcc/testsuite/gcc.target/i386/sse2-stv-4.c b/gcc/testsuite/gcc.target/i386/sse2-stv-4.c
new file mode 100644
index 000000000000..8c655ff9a4bf
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-stv-4.c
@@ -0,0 +1,12 @@
+/* { dg-do compile { target int128 } } */
+/* { dg-options "-O2 -msse2 -mno-sse4 -mno-stackrealign" } */
+
+__int128 a, b, c, z;
+
+void foo()
+{
+  register __int128 x __asm("xmm0");
+  z = (x ^ a ^ b ^ c);
+}
+
+/* { dg-final { scan-assembler-times "pxor" 3 } } */
diff --git a/gcc/testsuite/gcc.target/i386/sse2-stv-5.c b/gcc/testsuite/gcc.target/i386/sse2-stv-5.c
new file mode 100644
index 000000000000..b4bbc2bbae57
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-stv-5.c
@@ -0,0 +1,13 @@
+/* { dg-do compile { target int128 } } */
+/* { dg-options "-O2 -msse2 -mno-sse4 -mno-stackrealign" } */
+
+__int128 m;
+
+void foo()
+{
+  register __int128 x __asm("xmm0");
+  register __int128 y __asm("xmm1");
+  m = x ^ y;
+}
+
+/* { dg-final { scan-assembler-times "pxor" 1 } } */

[gcc r17-548] x86_64: Handle hard registers in TImode STV with inter-unit moves.

Reply via email to