Before this patch GCC would zero extend a DImode GPR value to TImode by first
zero extending the DImode value into a GPR TImode register pair, and then using
an MTVSRDD instruction to move this value to a VSX register.

For example, consider the following code:

        #ifndef TYPE
        #define TYPE unsigned long long
        #endif

        void
        gpr_to_vsx (TYPE x, __uint128_t *p)
        {
          __uint128_t y = x;
          __asm__ (" # %x0" : "+wa" (y));
          *p = y;
        }

Currently GCC generates:

        gpr_to_vsx:
                mr 10,3
                li 11,0
                mtvsrdd 0,11,10
        #APP
                 # 0
        #NO_APP
                stxv 0,0(4)
                blr

I.e. the mr and li instructions create the zero extended TImode value in a GPR,
and then the mtvsrdd instruction moves both registers into a single vector
register.

Instead, GCC should generate the following code, since the mtvsrdd instruction
will clear the upper 64 bits if the 2nd argument is 0 (a non-zero value names
the GPR whose contents are put in the upper 64 bits):

        gpr_to_vsx:
                mtvsrdd 0,0,3
        #APP
                 # 0
        #NO_APP
                stxv 0,0(4)
                blr

Originally, I posted a patch that added the zero_extendsiti2 insn.  I got some
pushback about using reload_completed in the split portion of the
define_insn_and_split.  However, this is a case where you absolutely have to use
the reload_completed test, because if you split the code before register
allocation to handle the normal case, the split insns will not be compiled to
generate the appropriate mtvsrdd without creating the TImode value in the GPR
register.  I can imagine there might be concern about favoring generating code
using the vector registers instead of using the GPR registers if the code does
not require the TImode value to be in a vector register.

I completely rewrote the patch.  This patch creates a peephole2 to catch this
case, and it eliminates creating the TImode variable.  Instead it just does the
MTVSRDD instruction directly.  That way it will not influence register
allocation, and the code will only be generated in the specific case where we
need the TImode value in a vector register.

I have built GCC with the patches in this patch set applied on both little and
big endian PowerPC systems and there were no regressions.  Can I apply this
patch to GCC 16?

2025-06-05  Michael Meissner  <meiss...@linux.ibm.com>

gcc/

        PR target/108958
        * config/rs6000/rs6000.md (UNSPEC_ZERO_EXTEND): New unspec.
        (zero_extendsiti2 peephole2): Add a peephole2 to simplify zero extension
        from a DImode value in a GPR to a TImode target in a vector register.
        (zero_extendsiti2_vsx): New insn.

gcc/testsuite/

        PR target/108958
        * gcc.target/powerpc/pr108958.c: New test.
---
 gcc/config/rs6000/rs6000.md                 | 26 ++++++++++++
 gcc/testsuite/gcc.target/powerpc/pr108958.c | 47 +++++++++++++++++++++
 2 files changed, 73 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr108958.c

diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index c65d564f514..0674ab92209 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -173,6 +173,7 @@ (define_c_enum "unspec"
    UNSPEC_XXSPLTIW_CONST
    UNSPEC_FMAX
    UNSPEC_FMIN
+   UNSPEC_ZERO_EXTEND
   ])
 
 ;;
@@ -969,6 +970,31 @@ (define_insn_and_split "*zero_extendhi<mode>2_dot2"
    (set_attr "dot" "yes")
    (set_attr "length" "4,8")])
 
+;; Optimize zero_extendsiti2 from a GPR to a GPR and then moving the GPR to a
+;; VSX register
+(define_peephole2
+  [(set (match_operand:DI 0 "int_reg_operand")
+       (match_operand:DI 1 "int_reg_operand"))
+   (set (match_operand:DI 2 "int_reg_operand")
+       (const_int 0))
+   (set (match_operand:TI 3 "vsx_register_operand")
+       (match_operand:TI 4 "int_reg_operand"))]
+  "TARGET_DIRECT_MOVE_64BIT
+   && (reg_or_subregno (operands[0])
+       == reg_or_subregno (operands[4]) + !!WORDS_BIG_ENDIAN)
+   && (reg_or_subregno (operands[2])
+       == reg_or_subregno (operands[4]) + !WORDS_BIG_ENDIAN)
+   && peep2_reg_dead_p (3, operands[4])"
+  [(set (match_dup 3)
+       (unspec:TI [(match_dup 1)] UNSPEC_ZERO_EXTEND))])
+
+(define_insn "*zero_extendsiti2_vsx"
+  [(set (match_operand:TI 0 "vsx_register_operand" "=wa")
+       (unspec:TI [(match_operand:DI 1 "int_reg_operand" "r")]
+                  UNSPEC_ZERO_EXTEND))]
+  "TARGET_DIRECT_MOVE_64BIT"
+  "mtvsrdd %x0,0,%1"
+  [(set_attr "type" "mtvsr")])
 
 (define_insn "zero_extendsi<mode>2"
   [(set (match_operand:EXTSI 0 "gpc_reg_operand" "=r,r,d,wa,wa,r,wa")
diff --git a/gcc/testsuite/gcc.target/powerpc/pr108958.c b/gcc/testsuite/gcc.target/powerpc/pr108958.c
new file mode 100644
index 00000000000..21b3f276691
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr108958.c
@@ -0,0 +1,47 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target int128 } */
+/* { dg-require-effective-target lp64 } */
+/* { dg-options "-mdejagnu-cpu=power9 -O2" } */
+
+#ifndef TYPE
+#define TYPE unsigned long long
+#endif
+
+/* PR target/108958, when zero extending a DImode to a TImode, and the TImode variable is in a VSX register, generate:
+
+       mtvsrdd vreg,0,gpr
+
+   instead of:
+
+       mr tmp,gpr
+       li tmp+1,0
+       mtvsrdd vreg,tmp+1,tmp.  */
+
+void
+gpr_to_vsx (TYPE x, __uint128_t *p)
+{
+  /* mtvsrdd 0,0,3
+     stxv 0,0(4)  */
+
+  __uint128_t y = x;
+  __asm__ (" # %x0" : "+wa" (y));
+  *p = y;
+}
+
+void
+gpr_to_gpr (TYPE x, __uint128_t *p)
+{
+  /* mr 2,3
+     li 3,0
+     std 2,0(4)
+     std 3,8(4)  */
+
+  __uint128_t y = x;
+  __asm__ (" # %0" : "+r" (y));
+  *p = y;
+}
+
+/* { dg-final { scan-assembler-times {\mli\M}              1 } } */
+/* { dg-final { scan-assembler-times {\mmtvsrdd .*,0,.*\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mstd\M}             2 } } */
+/* { dg-final { scan-assembler-times {\mstxv\M}            1 } } */
-- 
2.49.0


-- 
Michael Meissner, IBM
PO Box 98, Ayer, Massachusetts, USA, 01432
email: meiss...@linux.ibm.com

Reply via email to