Before this patch, GCC would zero extend a DImode GPR value to TImode by first zero extending the DImode value into a GPR TImode register pair, and then doing an MTVSRDD to move this value to a VSX register.
For example, consider the following code: #ifndef TYPE #define TYPE unsigned long long #endif void gpr_to_vsx (TYPE x, __uint128_t *p) { __uint128_t y = x; __asm__ (" # %x0" : "+wa" (y)); *p = y; } Currently GCC generates: gpr_to_vsx: mr 10,3 li 11,0 mtvsrdd 0,11,10 #APP # 0 #NO_APP stxv 0,0(4) blr I.e. the mr and li instructions create the zero extended TImode value in a GPR pair, and then the mtvsrdd instruction moves both registers into a single vector register. Instead, GCC should generate the following code, since the mtvsrdd instruction clears the upper 64 bits if the 2nd argument is 0 (a non-zero 2nd argument names the GPR whose contents are placed in the upper 64 bits): gpr_to_vsx: mtvsrdd 0,0,3 #APP # 0 #NO_APP stxv 0,0(4) blr Originally, I posted a patch that added the zero_extendsiti2 insn. I got some pushback about using reload_completed in the split portion of the define_insn_and_split. However, this is a case where you absolutely have to use the reload_completed test, because if you split the code before register allocation to handle the normal case, the split insns will not be compiled to generate the appropriate mtvsrdd without creating the TImode value in the GPR register. I can imagine there might be concern about favoring generating code using the vector registers instead of using the GPR registers if the code does not require the TImode value to be in a vector register. I completely rewrote the patch. This patch creates a peephole2 to catch this case, and it eliminates creating the TImode variable. Instead, it just generates the MTVSRDD instruction directly. That way it will not influence register allocation, and the code will only be generated in the specific case where we need the TImode value in a vector register. I have built GCC with the patches in this patch set applied on both little and big endian PowerPC systems and there were no regressions. Can I apply this patch to GCC 16? 
2025-06-05 Michael Meissner <meiss...@linux.ibm.com> gcc/ PR target/108958 * config/rs6000/rs6000.md (UNSPEC_ZERO_EXTEND): New unspec. (zero_extendsiti2 peephole2): Add a peephole2 to simplify zero extend between DImode value in a GPR to a TImode target in a vector register. (zero_extendsiti2_vsx): New insn. gcc/testsuite/ PR target/108958 * gcc.target/powerpc/pr108958.c: New test. --- gcc/config/rs6000/rs6000.md | 26 ++++++++++++ gcc/testsuite/gcc.target/powerpc/pr108958.c | 47 +++++++++++++++++++++ 2 files changed, 73 insertions(+) create mode 100644 gcc/testsuite/gcc.target/powerpc/pr108958.c diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md index c65d564f514..0674ab92209 100644 --- a/gcc/config/rs6000/rs6000.md +++ b/gcc/config/rs6000/rs6000.md @@ -173,6 +173,7 @@ (define_c_enum "unspec" UNSPEC_XXSPLTIW_CONST UNSPEC_FMAX UNSPEC_FMIN + UNSPEC_ZERO_EXTEND ]) ;; @@ -969,6 +970,31 @@ (define_insn_and_split "*zero_extendhi<mode>2_dot2" (set_attr "dot" "yes") (set_attr "length" "4,8")]) +;; Optimize zero_extendsiti2 from a GPR to a GPR and then moving the GPR to a +;; VSX register +(define_peephole2 + [(set (match_operand:DI 0 "int_reg_operand") + (match_operand:DI 1 "int_reg_operand")) + (set (match_operand:DI 2 "int_reg_operand") + (const_int 0)) + (set (match_operand:TI 3 "vsx_register_operand") + (match_operand:TI 4 "int_reg_operand"))] + "TARGET_DIRECT_MOVE_64BIT + && (reg_or_subregno (operands[0]) + == reg_or_subregno (operands[4]) + !!WORDS_BIG_ENDIAN) + && (reg_or_subregno (operands[2]) + == reg_or_subregno (operands[4]) + !WORDS_BIG_ENDIAN) + && peep2_reg_dead_p (3, operands[4])" + [(set (match_dup 3) + (unspec:TI [(match_dup 1)] UNSPEC_ZERO_EXTEND))]) + +(define_insn "*zero_extendsiti2_vsx" + [(set (match_operand:TI 0 "vsx_register_operand" "=wa") + (unspec:TI [(match_operand:DI 1 "int_reg_operand" "r")] + UNSPEC_ZERO_EXTEND))] + "TARGET_DIRECT_MOVE_64BIT" + "mtvsrdd %x0,0,%1" + [(set_attr "type" "mtvsr")]) (define_insn 
"zero_extendsi<mode>2" [(set (match_operand:EXTSI 0 "gpc_reg_operand" "=r,r,d,wa,wa,r,wa") diff --git a/gcc/testsuite/gcc.target/powerpc/pr108958.c b/gcc/testsuite/gcc.target/powerpc/pr108958.c new file mode 100644 index 00000000000..21b3f276691 --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/pr108958.c @@ -0,0 +1,47 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target int128 } */ +/* { dg-require-effective-target lp64 } */ +/* { dg-options "-mdejagnu-cpu=power9 -O2" } */ + +#ifndef TYPE +#define TYPE unsigned long long +#endif + +/* PR target/108958, when zero extending a DImode to a TImode, and the TImode variable is in a VSX register, generate: + + mtvsrdd vreg,0,gpr + + instead of: + + mr tmp,gpr + li tmp+1,0 + mtvsrdd vreg,tmp+1,tmp. */ + +void +gpr_to_vsx (TYPE x, __uint128_t *p) +{ + /* mtvsrdd 0,0,3 + stvx 0,0(4) */ + + __uint128_t y = x; + __asm__ (" # %x0" : "+wa" (y)); + *p = y; +} + +void +gpr_to_gpr (TYPE x, __uint128_t *p) +{ + /* mr 2,3 + li 3,0 + std 2,0(4) + std 3,8(4) */ + + __uint128_t y = x; + __asm__ (" # %0" : "+r" (y)); + *p = y; +} + +/* { dg-final { scan-assembler-times {\mli\M} 1 } } */ +/* { dg-final { scan-assembler-times {\mmtvsrdd .*,0,.*\M} 1 } } */ +/* { dg-final { scan-assembler-times {\mstd\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mstxv\M} 1 } } */ -- 2.49.0 -- Michael Meissner, IBM PO Box 98, Ayer, Massachusetts, USA, 01432 email: meiss...@linux.ibm.com