On Wed, 2022-04-06 at 14:21 -0400, Michael Meissner wrote: > From bf51c49f1481001c7b3223474d261dcbf9365eda Mon Sep 17 00:00:00 2001 > From: Michael Meissner <meiss...@linux.ibm.com> > Date: Fri, 1 Apr 2022 22:27:13 -0400 > Subject: [PATCH] Add zero_extendditi2. Improve lxvr*x code generation. >
Hi, > This pattern adds zero_extendditi2 so that if we are extending DImode to > TImode, and we want the result in a vector register, the compiler can > generate MTVSRDDD. > > In addition the patterns for generating lxvr{b,h,w,d}x were tuned to allow > loading to gpr registers. This prevents needlessly doing direct moves to > get the value into the vector registers if the gpr register was already > selected. ok > > In updating the insn counts for two tests due to these changes, I noticed > the tests were done at -O0. I changed this so that the tests are now done > at the normal -O2 optimization level. Per the comments (which you fixed up later in patch), I note they were deliberately done at -O0 since under higher optimizations gcc would generate other load instructions during those tests. Presumably with these changes that is no longer the case. :-) > > I have tested this patch with bootstrap builds and running the regression > testsuite using this patch on: > > Little endian power10, --with-cpu=power10 > Little endian power9, --with-cpu=power9 > Big endian power8, --with-cpu=power8 (both 64/32-bit tests done). > > There were no regressions. Can I check this into the master branch? > > 2022-04-06 Michael Meissner <meiss...@linux.ibm.com> > > gcc/ > * config/rs6000/vsx.md (vsx_lxvr<wd>x): Add support for loading to > GPR registers. > (vsx_stxvr<wd>x): Add support for storing from GPR registers. > (zero_extendditi2): New insn. > > gcc/testsuite/ > * gcc.target/powerpc/vsx-load-element-extend-int.c: Use -O2 > instead of -O0 and update insn counts. > * gcc.target/powerpc/vsx-load-element-extend-short.c: Likewise. > * gcc.target/powerpc/zero-extend-di-ti.c: New test. > > --- > gcc/config/rs6000/vsx.md | 82 +++++++++++++++++-- > .../powerpc/vsx-load-element-extend-int.c | 36 ++++---- > .../powerpc/vsx-load-element-extend-short.c | 35 ++++---- > .../gcc.target/powerpc/zero-extend-di-ti.c | 62 ++++++++++++++ > 4 files changed, 164 insertions(+), 51 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/powerpc/zero-extend-di-ti.c > > diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md > index c091e5e2f47..ad971e3a1de 100644 > --- a/gcc/config/rs6000/vsx.md > +++ b/gcc/config/rs6000/vsx.md > @@ -1315,14 +1315,32 @@ (define_expand "vsx_store_<mode>" > } > }) > > -;; Load rightmost element from load_data > -;; using lxvrbx, lxvrhx, lxvrwx, lxvrdx. > -(define_insn "vsx_lxvr<wd>x" > - [(set (match_operand:TI 0 "vsx_register_operand" "=wa") > - (zero_extend:TI (match_operand:INT_ISA3 1 "memory_operand" "Z")))] > - "TARGET_POWER10" > - "lxvr<wd>x %x0,%y1" > - [(set_attr "type" "vecload")]) > +;; Load rightmost element from load_data using lxvrbx, lxvrhx, lxvrwx, > lxvrdx. > +;; Support TImode being in a GPR register to prevent generating lvxr{d,w,b}x > +;; and then two direct moves if we ultimately need the value in a GPR > register. > +(define_insn_and_split "vsx_lxvr<wd>x" > + [(set (match_operand:TI 0 "register_operand" "=r,wa") > + (zero_extend:TI (match_operand:INT_ISA3 1 "memory_operand" "m,Z")))] > + "TARGET_POWERPC64 && TARGET_POWER10" > + "@ > + # > + lxvr<wd>x %x0,%y1" > + "&& reload_completed && int_reg_operand (operands[0], TImode)" > + [(set (match_dup 2) (match_dup 3)) > + (set (match_dup 4) (const_int 0))] > +{ > + rtx op0 = operands[0]; > + rtx op1 = operands[1]; > + > + operands[2] = gen_lowpart (DImode, op0); > + operands[3] = (<MODE>mode == DImode > + ? op1 > + : gen_rtx_ZERO_EXTEND (DImode, op1)); > + > + operands[4] = gen_highpart (DImode, op0); > +} > + [(set_attr "type" "load,vecload") > + (set_attr "num_insns" "2,*")]) > > ;; Store rightmost element into store_data > ;; using stxvrbx, stxvrhx, strvxwx, strvxdx. > @@ -5019,6 +5037,54 @@ (define_expand "vsignextend_si_v2di" > DONE; > }) > > +;; Zero extend DI to TI. If we don't have the MTVSRDD instruction (and > LXVRDX > +;; in the case of power10), we use the machine independent code. If we are > +;; loading up GPRs, we fall back to the old code. In this context it's not clear what is the "old code" ? The mtvsrdd instruction is referenced in this code path. I see no direct reference to lxvrdx here, though I suppose it's assumed somewhere behind the emit_ calls. > +(define_insn_and_split "zero_extendditi2" > + [(set (match_operand:TI 0 "register_operand" "=r,r, > wa,&wa") > + (zero_extend:TI (match_operand:DI 1 "register_operand" "r,wa,r, > wa")))] > + "TARGET_POWERPC64 && TARGET_P9_VECTOR" > + "@ > + # > + # > + mtvsrdd %x0,0,%1 > + #" > + "&& reload_completed > + && (int_reg_operand (operands[0], TImode) > + || vsx_register_operand (operands[1], DImode))" > + [(pc)] > +{ > + rtx dest = operands[0]; > + rtx src = operands[1]; > + int dest_regno = reg_or_subregno (dest); > + > + /* Handle conversion to GPR registers. Load up the low part and then do > + zero out the upper part. */ > + if (INT_REGNO_P (dest_regno)) > + { > + rtx dest_hi = gen_highpart (DImode, dest); > + rtx dest_lo = gen_lowpart (DImode, dest); > + > + emit_move_insn (dest_lo, src); > + emit_move_insn (dest_hi, const0_rtx); > + DONE; > + } > + > + /* For settomg a VSX register from another VSX register, clear the result > + register, and use XXPERMDI to shift the value into the lower 64-bits. > */ setting No reference to xxpermdi in the code chunk here, though we are pretty sure it will be generated via gen_vsx_concat_v2di. > + rtx dest_v2di = gen_rtx_REG (V2DImode, dest_regno); > + rtx dest_di = gen_rtx_REG (DImode, dest_regno); > + > + emit_move_insn (dest_v2di, CONST0_RTX (V2DImode)); > + if (BYTES_BIG_ENDIAN) > + emit_insn (gen_vsx_concat_v2di (dest_v2di, dest_di, src)); > + else > + emit_insn (gen_vsx_concat_v2di (dest_v2di, src, dest_di)); > + DONE; > +} > + [(set_attr "type" "integer,mfvsr,vecmove,vecperm") > + (set_attr "length" "8, 8, *, 8")]) > + > ;; Sign extend DI to TI. We provide both GPR targets and Altivec targets on > ;; power10. On earlier systems, the machine independent code will generate a > ;; shift left to sign extend the 64-bit value to 128-bit. > diff --git a/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-int.c > b/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-int.c > index c40e1a3a0f7..1f1281d6b75 100644 > --- a/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-int.c > +++ b/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-int.c > @@ -6,33 +6,25 @@ > /* { dg-do compile { target { ! power10_hw } } } */ > /* { dg-require-effective-target power10_ok } */ > /* { dg-require-effective-target int128 } */ > - > -/* Deliberately set optization to zero for this test to confirm > - the lxvr*x instruction is generated. At higher optimization levels > - the instruction we are looking for is sometimes replaced by other > - load instructions. */ > -/* { dg-options "-mdejagnu-cpu=power10 -O0 -save-temps" } */ > - > +/* { dg-options "-mdejagnu-cpu=power10 -O2 -save-temps" } */ > /* { dg-final { scan-assembler-times {\mlxvrwx\M} 2 } } */ > > #define NUM_VEC_ELEMS 4 > #define ITERS 16 > > -/* > -Codegen at time of writing is a single lxvrwx for the zero > -extended test, and a lwax,mtvsrdd,vextsd2q for the sign > -extended test. > - > -0000000010000c90 <test_sign_extended_load>: > - 10000c90: aa 1a 24 7d lwax r9,r4,r3 > - 10000c94: 67 4b 40 7c mtvsrdd vs34,0,r9 > - 10000c98: 02 16 5b 10 vextsd2q v2,v2 > - 10000c9c: 20 00 80 4e blr > - > -0000000010000cb0 <test_zero_extended_unsigned_load>: > - 10000cb0: 9b 18 44 7c lxvrwx vs34,r4,r3 > - 10000cb4: 20 00 80 4e blr > -*/ > +/* Codegen at time of writing is a single lxvrwx for the zero extended test, > + and a lxvrwx + vexts* sign extension instructions for the sign extended > + test. > + > + 0000000000000000 <test_sign_extended_load>: > + 0: 9b 18 44 7c lxvrwx vs34,r4,r3 > + 4: 02 16 5a 10 vextsw2d v2,v2 > + 8: 02 16 5b 10 vextsd2q v2,v2 > + c: 20 00 80 4e blr > + > + 0000000000000020 <test_zero_extended_unsigned_load>: > + 20: 9b 18 44 7c lxvrwx vs34,r4,r3 > + 24: 20 00 80 4e blr */ ok. > > #include <altivec.h> > #include <stdio.h> > diff --git a/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-short.c > b/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-short.c > index 837ba79c9ab..a7721318812 100644 > --- a/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-short.c > +++ b/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-short.c > @@ -6,33 +6,26 @@ > /* { dg-do compile { target { ! power10_hw } } } */ > /* { dg-require-effective-target power10_ok } */ > /* { dg-require-effective-target int128 } */ > - > -/* Deliberately set optization to zero for this test to confirm > - the lxvr*x instruction is generated. At higher optimization levels > - the instruction we are looking for is sometimes replaced by other > - load instructions. */ > -/* { dg-options "-mdejagnu-cpu=power10 -O0 -save-temps" } */ > +/* { dg-options "-mdejagnu-cpu=power10 -O2 -save-temps" } */ > > /* { dg-final { scan-assembler-times {\mlxvrhx\M} 2 } } */ > > #define NUM_VEC_ELEMS 8 > #define ITERS 16 > > -/* > -Codegen at time of writing uses lxvrhx for the zero > -extension test and lhax,mtvsrdd,vextsd2q for the > -sign extended test. > - > -0000000010001810 <test_sign_extended_load>: > - 10001810: ae 1a 24 7d lhax r9,r4,r3 > - 10001814: 67 4b 40 7c mtvsrdd vs34,0,r9 > - 10001818: 02 16 5b 10 vextsd2q v2,v2 > - 1000181c: 20 00 80 4e blr > - > -0000000010001830 <test_zero_extended_unsigned_load>: > - 10001830: 5b 18 44 7c lxvrhx vs34,r4,r3 > - 10001834: 20 00 80 4e blr > -*/ > +/* Codegen at time of writing is a single lxvrwx for the zero extended test, > + and a lxvrwx + vexts* sign extension instructions for the sign extended > + test. > + > + 0000000000000000 <test_sign_extended_load>: > + 0: 5b 18 44 7c lxvrhx vs34,r4,r3 > + 4: 02 16 59 10 vextsh2d v2,v2 > + 8: 02 16 5b 10 vextsd2q v2,v2 > + c: 20 00 80 4e blr > + > + 0000000000000020 <test_zero_extended_unsigned_load>: > + 20: 5b 18 44 7c lxvrhx vs34,r4,r3 > + 24: 20 00 80 4e blr */ > ok > #include <altivec.h> > #include <stdio.h> > diff --git a/gcc/testsuite/gcc.target/powerpc/zero-extend-di-ti.c > b/gcc/testsuite/gcc.target/powerpc/zero-extend-di-ti.c > new file mode 100644 > index 00000000000..9b3b9c4dbd0 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/powerpc/zero-extend-di-ti.c > @@ -0,0 +1,62 @@ > +/* { dg-require-effective-target int128 } */ > +/* { dg-require-effective-target power10_ok } */ > +/* { dg-options "-mdejagnu-cpu=power10 -O2" } */ > + > +/* This patch makes sure the various optimization and code paths are done for > + zero extending DImode to TImode on power10. */ > + > +__uint128_t > +gpr_to_gpr (unsigned long long a) > +{ > + /* li 4,0. */ > + return a; > +} > + > +__uint128_t > +mem_to_gpr (unsigned long long *p) > +{ > + /* ld 3,0(3); li 4,0. */ > + return *p; > +} > + > +__uint128_t > +vsx_to_gpr (__uint128_t *p, double d) > +{ > + /* fctiduz 1,1; li 4,0;mfvsrd 3,1. */ > + return (unsigned long long)d; > +} > + > +void > +gpr_to_vsx (__uint128_t *p, unsigned long long a) > +{ > + /* mtvsrdd 0,0,4; stxv 0,0(3). */ > + __uint128_t b = a; > + __asm__ (" # %x0" : "+wa" (b)); > + *p = b; > +} > + > +void > +mem_to_vsx (__uint128_t *p, unsigned long long *q) > +{ > + /* lxvrdx 0,0,4; stxv 0,0(3). */ > + __uint128_t a = *q; > + __asm__ (" # %x0" : "+wa" (a)); > + *p = a; > +} > + > +void > +vsx_to_vsx (__uint128_t *p, double d) > +{ > + /* fctiduz 1,1; xxspltib 0,0; xxpermdi 0,0,1,0; stxv 0,0(3). */ > + __uint128_t a = (unsigned long long)d; > + __asm__ (" # %x0" : "+wa" (a)); > + *p = a; > +} ok thanks -Will > + > +/* { dg-final { scan-assembler-times {\mli\M} 3 } } */ > +/* { dg-final { scan-assembler-times {\mld\M} 1 } } */ > +/* { dg-final { scan-assembler-times {\mlxvrdx\M} 1 } } */ > +/* { dg-final { scan-assembler-times {\mmfvsrd\M} 1 } } */ > +/* { dg-final { scan-assembler-times {\mmtvsrdd\M} 1 } } */ > +/* { dg-final { scan-assembler-times {\mstxv\M} 3 } } */ > +/* { dg-final { scan-assembler-times {\mxxpermdi\M} 1 } } */ > -- > 2.35.1 > >