Re: [PATCH] Add zero_extendditi2. Improve lxvr*x code generation.

will schmidt via Gcc-patches Wed, 06 Apr 2022 13:01:52 -0700

On Wed, 2022-04-06 at 14:21 -0400, Michael Meissner wrote:
> From bf51c49f1481001c7b3223474d261dcbf9365eda Mon Sep 17 00:00:00 2001
> From: Michael Meissner <meiss...@linux.ibm.com>
> Date: Fri, 1 Apr 2022 22:27:13 -0400
> Subject: [PATCH] Add zero_extendditi2.  Improve lxvr*x code generation.
>


Hi,

> This pattern adds zero_extendditi2 so that if we are extending DImode to
> TImode, and we want the result in a vector register, the compiler can
> generate MTVSRDDD.
> 
> In addition the patterns for generating lxvr{b,h,w,d}x were tuned to allow
> loading to gpr registers.  This prevents needlessly doing direct moves to
> get the value into the vector registers if the gpr register was already
> selected.

ok

> 
> In updating the insn counts for two tests due to these changes, I noticed
> the tests were done at -O0.  I changed this so that the tests are now done
> at the normal -O2 optimization level.

Per the comments (which you fixed up later in patch), I note they were
deliberately done at -O0 since under higher optimizations gcc would
generate other load instructions during those tests.  Presumably with
these changes that is no longer the case.  :-)
> 
> I have tested this patch with bootstrap builds and running the regression
> testsuite using this patch on:
> 
>       Little endian power10, --with-cpu=power10
>       Little endian power9, --with-cpu=power9
>       Big endian power8, --with-cpu=power8 (both 64/32-bit tests done).
> 
> There were no regressions.  Can I check this into the master branch?
> 
> 2022-04-06   Michael Meissner  <meiss...@linux.ibm.com>
> 
> gcc/
>       * config/rs6000/vsx.md (vsx_lxvr<wd>x): Add support for loading to
>       GPR registers.
>       (vsx_stxvr<wd>x): Add support for storing from GPR registers.
>       (zero_extendditi2): New insn.
> 
> gcc/testsuite/
>       * gcc.target/powerpc/vsx-load-element-extend-int.c: Use -O2
>       instead of -O0 and update insn counts.
>       * gcc.target/powerpc/vsx-load-element-extend-short.c: Likewise.
>       * gcc.target/powerpc/zero-extend-di-ti.c: New test.
> 
> ---
>  gcc/config/rs6000/vsx.md                      | 82 +++++++++++++++++--
>  .../powerpc/vsx-load-element-extend-int.c     | 36 ++++----
>  .../powerpc/vsx-load-element-extend-short.c   | 35 ++++----
>  .../gcc.target/powerpc/zero-extend-di-ti.c    | 62 ++++++++++++++
>  4 files changed, 164 insertions(+), 51 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/powerpc/zero-extend-di-ti.c
> 
> diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md
> index c091e5e2f47..ad971e3a1de 100644
> --- a/gcc/config/rs6000/vsx.md
> +++ b/gcc/config/rs6000/vsx.md
> @@ -1315,14 +1315,32 @@ (define_expand "vsx_store_<mode>"
>      }
>  })
> 
> -;; Load rightmost element from load_data
> -;; using lxvrbx, lxvrhx, lxvrwx, lxvrdx.
> -(define_insn "vsx_lxvr<wd>x"
> -  [(set (match_operand:TI 0 "vsx_register_operand" "=wa")
> -     (zero_extend:TI (match_operand:INT_ISA3  1 "memory_operand" "Z")))]
> -  "TARGET_POWER10"
> -  "lxvr<wd>x %x0,%y1"
> -  [(set_attr "type" "vecload")])
> +;; Load rightmost element from load_data using lxvrbx, lxvrhx, lxvrwx, 
> lxvrdx.
> +;; Support TImode being in a GPR register to prevent generating lvxr{d,w,b}x
> +;; and then two direct moves if we ultimately need the value in a GPR 
> register.
> +(define_insn_and_split "vsx_lxvr<wd>x"
> +  [(set (match_operand:TI 0 "register_operand" "=r,wa")
> +     (zero_extend:TI (match_operand:INT_ISA3  1 "memory_operand" "m,Z")))]
> +  "TARGET_POWERPC64 && TARGET_POWER10"
> +  "@
> +   #
> +   lxvr<wd>x %x0,%y1"
> +  "&& reload_completed && int_reg_operand (operands[0], TImode)"
> +  [(set (match_dup 2) (match_dup 3))
> +   (set (match_dup 4) (const_int 0))]
> +{
> +  rtx op0 = operands[0];
> +  rtx op1 = operands[1];
> +
> +  operands[2] = gen_lowpart (DImode, op0);
> +  operands[3] = (<MODE>mode == DImode
> +              ? op1
> +              : gen_rtx_ZERO_EXTEND (DImode, op1));
> +
> +  operands[4] = gen_highpart (DImode, op0);
> +}
> +  [(set_attr "type" "load,vecload")
> +   (set_attr "num_insns" "2,*")])
> 
>  ;; Store rightmost element into store_data
>  ;; using stxvrbx, stxvrhx, strvxwx, strvxdx.
> @@ -5019,6 +5037,54 @@ (define_expand "vsignextend_si_v2di"
>    DONE;
>  })
> 
> +;; Zero extend DI to TI.  If we don't have the MTVSRDD instruction (and 
> LXVRDX
> +;; in the case of power10), we use the machine independent code.  If we are
> +;; loading up GPRs, we fall back to the old code.

In this context it's not clear what is the "old code" ?
The mtvsrdd
instruction is referenced in this code path.  I see no direct reference
to lxvrdx here, though I suppose it's assumed somewhere behind the
emit_ calls.


> +(define_insn_and_split "zero_extendditi2"
> +  [(set (match_operand:TI 0 "register_operand"                 "=r,r, 
> wa,&wa")
> +     (zero_extend:TI (match_operand:DI 1 "register_operand"  "r,wa,r,  
> wa")))]
> +  "TARGET_POWERPC64 && TARGET_P9_VECTOR"
> +  "@
> +   #
> +   #
> +   mtvsrdd %x0,0,%1
> +   #"
> +  "&& reload_completed
> +   && (int_reg_operand (operands[0], TImode)
> +       || vsx_register_operand (operands[1], DImode))"
> +  [(pc)]
> +{
> +  rtx dest = operands[0];
> +  rtx src = operands[1];
> +  int dest_regno = reg_or_subregno (dest);
> +
> +  /* Handle conversion to GPR registers.  Load up the low part and then do
> +     zero out the upper part.  */
> +  if (INT_REGNO_P (dest_regno))
> +    {
> +      rtx dest_hi = gen_highpart (DImode, dest);
> +      rtx dest_lo = gen_lowpart (DImode, dest);
> +
> +      emit_move_insn (dest_lo, src);
> +      emit_move_insn (dest_hi, const0_rtx);
> +      DONE;
> +    }
> +
> +  /* For settomg a VSX register from another VSX register, clear the result
> +     register, and use XXPERMDI to shift the value into the lower 64-bits.  
> */

setting

No reference to xxpermdi in the code chunk here, though we are pretty
sure it will be generated via gen_vsx_concat_v2di.  


> +  rtx dest_v2di = gen_rtx_REG (V2DImode, dest_regno);
> +  rtx dest_di = gen_rtx_REG (DImode, dest_regno);
> +
> +  emit_move_insn (dest_v2di, CONST0_RTX (V2DImode));
> +  if (BYTES_BIG_ENDIAN)
> +    emit_insn (gen_vsx_concat_v2di (dest_v2di, dest_di, src));
> +  else
> +    emit_insn (gen_vsx_concat_v2di (dest_v2di, src, dest_di));
> +  DONE;
> +}
> +  [(set_attr "type"   "integer,mfvsr,vecmove,vecperm")
> +   (set_attr "length" "8,      8,    *,      8")])
> +
>  ;; Sign extend DI to TI.  We provide both GPR targets and Altivec targets on
>  ;; power10.  On earlier systems, the machine independent code will generate a
>  ;; shift left to sign extend the 64-bit value to 128-bit.


> diff --git a/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-int.c 
> b/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-int.c
> index c40e1a3a0f7..1f1281d6b75 100644
> --- a/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-int.c
> +++ b/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-int.c
> @@ -6,33 +6,25 @@
>  /* { dg-do compile { target { ! power10_hw } } } */
>  /* { dg-require-effective-target power10_ok } */
>  /* { dg-require-effective-target int128 } */
> -
> -/* Deliberately set optization to zero for this test to confirm
> -   the lxvr*x instruction is generated. At higher optimization levels
> -   the instruction we are looking for is sometimes replaced by other
> -   load instructions. */
> -/* { dg-options "-mdejagnu-cpu=power10 -O0 -save-temps" } */
> -
> +/* { dg-options "-mdejagnu-cpu=power10 -O2 -save-temps" } */
>  /* { dg-final { scan-assembler-times {\mlxvrwx\M} 2 } } */
> 
>  #define NUM_VEC_ELEMS 4
>  #define ITERS 16
> 
> -/*
> -Codegen at time of writing is a single lxvrwx for the zero
> -extended test, and a lwax,mtvsrdd,vextsd2q for the sign
> -extended test.
> -
> -0000000010000c90 <test_sign_extended_load>:
> -    10000c90:        aa 1a 24 7d     lwax    r9,r4,r3
> -    10000c94:        67 4b 40 7c     mtvsrdd vs34,0,r9
> -    10000c98:        02 16 5b 10     vextsd2q v2,v2
> -    10000c9c:        20 00 80 4e     blr
> -
> -0000000010000cb0 <test_zero_extended_unsigned_load>:
> -    10000cb0:        9b 18 44 7c     lxvrwx  vs34,r4,r3
> -    10000cb4:        20 00 80 4e     blr
> -*/
> +/* Codegen at time of writing is a single lxvrwx for the zero extended test,
> +   and a lxvrwx + vexts* sign extension instructions for the sign extended
> +   test.
> +
> +   0000000000000000 <test_sign_extended_load>:
> +     0:   9b 18 44 7c     lxvrwx  vs34,r4,r3
> +     4:   02 16 5a 10     vextsw2d v2,v2
> +     8:   02 16 5b 10     vextsd2q v2,v2
> +     c:   20 00 80 4e     blr
> +
> +   0000000000000020 <test_zero_extended_unsigned_load>:
> +     20:   9b 18 44 7c     lxvrwx  vs34,r4,r3
> +     24:   20 00 80 4e     blr  */

ok.

> 
>  #include <altivec.h>
>  #include <stdio.h>
> diff --git a/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-short.c 
> b/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-short.c
> index 837ba79c9ab..a7721318812 100644
> --- a/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-short.c
> +++ b/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-short.c
> @@ -6,33 +6,26 @@
>  /* { dg-do compile { target { ! power10_hw } } } */
>  /* { dg-require-effective-target power10_ok } */
>  /* { dg-require-effective-target int128 } */
> -
> -/* Deliberately set optization to zero for this test to confirm
> -   the lxvr*x instruction is generated. At higher optimization levels
> -   the instruction we are looking for is sometimes replaced by other
> -   load instructions.  */
> -/* { dg-options "-mdejagnu-cpu=power10 -O0 -save-temps" } */
> +/* { dg-options "-mdejagnu-cpu=power10 -O2 -save-temps" } */
> 
>  /* { dg-final { scan-assembler-times {\mlxvrhx\M} 2 } } */
> 
>  #define NUM_VEC_ELEMS 8
>  #define ITERS 16
> 
> -/*
> -Codegen at time of writing uses lxvrhx for the zero
> -extension test and lhax,mtvsrdd,vextsd2q for the
> -sign extended test.
> -
> -0000000010001810 <test_sign_extended_load>:
> -    10001810:        ae 1a 24 7d     lhax    r9,r4,r3
> -    10001814:        67 4b 40 7c     mtvsrdd vs34,0,r9
> -    10001818:        02 16 5b 10     vextsd2q v2,v2
> -    1000181c:        20 00 80 4e     blr
> -
> -0000000010001830 <test_zero_extended_unsigned_load>:
> -    10001830:        5b 18 44 7c     lxvrhx  vs34,r4,r3
> -    10001834:        20 00 80 4e     blr
> -*/
> +/* Codegen at time of writing is a single lxvrwx for the zero extended test,
> +   and a lxvrwx + vexts* sign extension instructions for the sign extended
> +   test.
> +
> +     0000000000000000 <test_sign_extended_load>:
> +        0:   5b 18 44 7c     lxvrhx  vs34,r4,r3
> +        4:   02 16 59 10     vextsh2d v2,v2
> +        8:   02 16 5b 10     vextsd2q v2,v2
> +        c:   20 00 80 4e     blr
> +
> +     0000000000000020 <test_zero_extended_unsigned_load>:
> +       20:   5b 18 44 7c     lxvrhx  vs34,r4,r3
> +       24:   20 00 80 4e     blr  */
> 

ok

>  #include <altivec.h>
>  #include <stdio.h>
> diff --git a/gcc/testsuite/gcc.target/powerpc/zero-extend-di-ti.c 
> b/gcc/testsuite/gcc.target/powerpc/zero-extend-di-ti.c
> new file mode 100644
> index 00000000000..9b3b9c4dbd0
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/zero-extend-di-ti.c
> @@ -0,0 +1,62 @@
> +/* { dg-require-effective-target int128     } */
> +/* { dg-require-effective-target power10_ok } */
> +/* { dg-options "-mdejagnu-cpu=power10 -O2" } */
> +
> +/* This patch makes sure the various optimization and code paths are done for
> +   zero extending DImode to TImode on power10.  */
> +
> +__uint128_t
> +gpr_to_gpr (unsigned long long a)
> +{
> +  /* li 4,0.  */
> +  return a;
> +}
> +
> +__uint128_t
> +mem_to_gpr (unsigned long long *p)
> +{
> +  /* ld 3,0(3); li 4,0.  */
> +  return *p;
> +}
> +
> +__uint128_t
> +vsx_to_gpr (__uint128_t *p, double d)
> +{
> +  /* fctiduz 1,1; li 4,0;mfvsrd 3,1.  */
> +  return (unsigned long long)d;
> +}
> +
> +void
> +gpr_to_vsx (__uint128_t *p, unsigned long long a)
> +{
> +  /* mtvsrdd 0,0,4; stxv 0,0(3).  */
> +  __uint128_t b = a;
> +  __asm__ (" # %x0" : "+wa" (b));
> +  *p = b;
> +}
> +
> +void
> +mem_to_vsx (__uint128_t *p, unsigned long long *q)
> +{
> +  /* lxvrdx 0,0,4; stxv 0,0(3).  */
> +  __uint128_t a = *q;
> +  __asm__ (" # %x0" : "+wa" (a));
> +  *p = a;
> +}
> +
> +void
> +vsx_to_vsx (__uint128_t *p, double d)
> +{
> +  /* fctiduz 1,1; xxspltib 0,0; xxpermdi 0,0,1,0; stxv 0,0(3).  */
> +  __uint128_t a = (unsigned long long)d;
> +  __asm__ (" # %x0" : "+wa" (a));
> +  *p = a;
> +}

ok

thanks
-Will


> +
> +/* { dg-final { scan-assembler-times {\mli\M}       3 } } */
> +/* { dg-final { scan-assembler-times {\mld\M}       1 } } */
> +/* { dg-final { scan-assembler-times {\mlxvrdx\M}   1 } } */
> +/* { dg-final { scan-assembler-times {\mmfvsrd\M}   1 } } */
> +/* { dg-final { scan-assembler-times {\mmtvsrdd\M}  1 } } */
> +/* { dg-final { scan-assembler-times {\mstxv\M}     3 } } */
> +/* { dg-final { scan-assembler-times {\mxxpermdi\M} 1 } } */
> -- 
> 2.35.1
> 
>

Re: [PATCH] Add zero_extendditi2. Improve lxvr*x code generation.

Reply via email to