103109.

will schmidt via Gcc-patches Fri, 13 May 2022 11:20:45 -0700

On Fri, 2022-05-13 at 12:17 -0400, Michael Meissner wrote:
> Optimize multiply/add of DImode extended to TImode, PR target/103109.
> 
> On power9 and power10 systems, we have instructions that support doing
> 64-bit integers converted to 128-bit integers and producing 128-bit
> results.  This patch adds support to generate these instructions.
> 
> Previously GCC had define_expands to handle conversion of the 64-bit
> extend to 128-bit and multiply.  This patch changes these define_expands
> to define_insn_and_split and then it provides combiner patterns to
> generate thes multiply/add instructions.
> 
> To support using this optimization on power9, this patch extends the sign
> extend DImode to TImode to also run on power9 (added for PR
> target/104698).
> 
> This patch needs the previous patch to add unsigned DImode to TImode
> conversion so that the combiner can combine the extend, multiply, and add
> instructions.
> 
> I have built this patch on little endian power10, little endian power9, and 
> big
> endian power8 systems.  There were no regressions when I ran it.  Can I 
> install
> this patch into the GCC 13 master branch?
> 
> 2022-05-13   Michael Meissner  <meiss...@linux.ibm.com>
> 
> gcc/
>       PR target/103109
>       * config/rs6000/rs6000.md (su_int32): New code attribute.
>       (<u>mul<mode><dmode>3): Convert from define_expand to
>       define_insn_and_split.
>       (maddld<mode>4): Add generator function.


-(define_insn "*maddld<mode>4"
+(define_insn "maddld<mode>4"

Is the removal of the "*" considering adding generator?  (Thats
terminology that I'm not immediately familiar with). 




>       (<u>mulditi3_<u>adddi3): New insn.
>       (<u>mulditi3_add_const): New insn.
>       (<u>mulditi3_<u>adddi3_upper): New insn.
> 
> gcc/testsuite/
>       PR target/103109
>       * gcc.target/powerpc/pr103109.c: New test.


ok


> ---
>  gcc/config/rs6000/rs6000.md                 | 128 +++++++++++++++++++-
>  gcc/testsuite/gcc.target/powerpc/pr103109.c |  62 ++++++++++
>  2 files changed, 184 insertions(+), 6 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/powerpc/pr103109.c
> 
> diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
> index 2aba70393d8..83eacec57ba 100644
> --- a/gcc/config/rs6000/rs6000.md
> +++ b/gcc/config/rs6000/rs6000.md
> @@ -667,6 +667,9 @@ (define_code_attr uns [(fix               "")
>                      (float           "")
>                      (unsigned_float  "uns")])
> 
> +(define_code_attr su_int32 [(sign_extend "s32bit_cint_operand")
> +                         (zero_extend "c32bit_cint_operand")])
> +
>  ; Various instructions that come in SI and DI forms.
>  ; A generic w/d attribute, for things like cmpw/cmpd.
>  (define_mode_attr wd [(QI    "b")
> @@ -3190,13 +3193,16 @@ (define_insn "<su>mulsi3_highpart_64"
>    "mulhw<u> %0,%1,%2"
>    [(set_attr "type" "mul")])
> 
> -(define_expand "<u>mul<mode><dmode>3"
> -  [(set (match_operand:<DMODE> 0 "gpc_reg_operand")
> +(define_insn_and_split "<u>mul<mode><dmode>3"
> +  [(set (match_operand:<DMODE> 0 "gpc_reg_operand" "=&r")
>       (mult:<DMODE> (any_extend:<DMODE>
> -                     (match_operand:GPR 1 "gpc_reg_operand"))
> +                    (match_operand:GPR 1 "gpc_reg_operand" "r"))
>                     (any_extend:<DMODE>
> -                     (match_operand:GPR 2 "gpc_reg_operand"))))]
> +                    (match_operand:GPR 2 "gpc_reg_operand" "r"))))]
>    "!(<MODE>mode == SImode && TARGET_POWERPC64)"
> +  "#"
> +  "&& 1"
> +  [(pc)]
>  {
>    rtx l = gen_reg_rtx (<MODE>mode);
>    rtx h = gen_reg_rtx (<MODE>mode);
> @@ -3205,9 +3211,10 @@ (define_expand "<u>mul<mode><dmode>3"
>    emit_move_insn (gen_lowpart (<MODE>mode, operands[0]), l);
>    emit_move_insn (gen_highpart (<MODE>mode, operands[0]), h);
>    DONE;
> -})
> +}
> +  [(set_attr "length" "8")])
> 

ok


> -(define_insn "*maddld<mode>4"
> +(define_insn "maddld<mode>4"
>    [(set (match_operand:GPR 0 "gpc_reg_operand" "=r")
>       (plus:GPR (mult:GPR (match_operand:GPR 1 "gpc_reg_operand" "r")
>                           (match_operand:GPR 2 "gpc_reg_operand" "r"))

ok

> @@ -3216,6 +3223,115 @@ (define_insn "*maddld<mode>4"
>    "maddld %0,%1,%2,%3"
>    [(set_attr "type" "mul")])
> 
> +(define_insn_and_split "*<u>mulditi3_<u>adddi3"
> +  [(set (match_operand:TI 0 "gpc_reg_operand" "=&r")
> +     (plus:TI
> +      (mult:TI
> +       (any_extend:TI (match_operand:DI 1 "gpc_reg_operand" "r"))
> +       (any_extend:TI (match_operand:DI 2 "gpc_reg_operand" "r")))
> +      (any_extend:TI (match_operand:DI 3 "gpc_reg_operand" "r"))))]
> +  "TARGET_MADDLD && TARGET_POWERPC64"
> +  "#"
> +  "&& 1"
> +  [(pc)]
> +{
> +  rtx dest = operands[0];
> +  rtx dest_hi = gen_highpart (DImode, dest);
> +  rtx dest_lo = gen_lowpart (DImode, dest);
> +  rtx op1 = operands[1];
> +  rtx op2 = operands[2];
> +  rtx op3 = operands[3];
> +  rtx tmp_hi, tmp_lo;
> +
> +  if (can_create_pseudo_p ())
> +    {
> +      tmp_hi = gen_reg_rtx (DImode);
> +      tmp_lo = gen_reg_rtx (DImode);
> +    }
> +  else
> +    {
> +      tmp_hi = dest_hi;
> +      tmp_lo = dest_lo;
> +    }
> +
> +  emit_insn (gen_<u>mulditi3_<u>adddi3_upper (tmp_hi, op1, op2, op3));
> +  emit_insn (gen_maddlddi4 (tmp_lo, op1, op2, op3));
> +
> +  if (can_create_pseudo_p ())
> +    {
> +      emit_move_insn (dest_hi, tmp_hi);
> +      emit_move_insn (dest_lo, tmp_lo);
> +    }
> +  DONE;
> +}
> +  [(set_attr "length" "8")])

Seems OK, I did not closely scrutinize.


> +
> +;; Optimize 128-bit multiply with zero/sign extend and adding a constant.  We
> +;; force the constant into a register to generate li, maddhd, and maddld,
> +;; instead of mulld, mulhd, addic, and addze.  We can't combine this pattern
> +;; with the pattern that handles registers, since constants don't have a sign
> +;; or zero extend around them.

ok.


> +(define_insn_and_split "*<u>mulditi3_add_const"
> +  [(set (match_operand:TI 0 "gpc_reg_operand" "=&r")
> +     (plus:TI
> +      (mult:TI
> +       (any_extend:TI (match_operand:DI 1 "gpc_reg_operand" "r"))
> +       (any_extend:TI (match_operand:DI 2 "gpc_reg_operand" "r")))
> +      (match_operand 3 "<su_int32>" "r")))]
> +  "TARGET_MADDLD && TARGET_POWERPC64
> +"
> +  "#"
> +  "&& 1"
> +  [(pc)]
> +{
> +  rtx dest = operands[0];
> +  rtx dest_hi = gen_highpart (DImode, dest);
> +  rtx dest_lo = gen_lowpart (DImode, dest);
> +  rtx op1 = operands[1];
> +  rtx op2 = operands[2];
> +  rtx op3 = force_reg (DImode, operands[3]);
> +  rtx tmp_hi, tmp_lo;
> +
> +  if (can_create_pseudo_p ())
> +    {
> +      tmp_hi = gen_reg_rtx (DImode);
> +      tmp_lo = gen_reg_rtx (DImode);
> +    }
> +  else
> +    {
> +      tmp_hi = dest_hi;
> +      tmp_lo = dest_lo;
> +    }
> +
> +  emit_insn (gen_<u>mulditi3_<u>adddi3_upper (tmp_hi, op1, op2, op3));
> +  emit_insn (gen_maddlddi4 (tmp_lo, op1, op2, op3));
> +
> +  if (can_create_pseudo_p ())
> +    {
> +      emit_move_insn (dest_hi, tmp_hi);
> +      emit_move_insn (dest_lo, tmp_lo);
> +    }
> +  DONE;
> +}
> +  [(set_attr "length" "8")
> +   (set_attr "type" "mul")
> +   (set_attr "size" "64")])
> +
> +(define_insn "<u>mulditi3_<u>adddi3_upper"
> +  [(set (match_operand:DI 0 "gpc_reg_operand" "=r")
> +     (truncate:DI
> +      (lshiftrt:TI
> +       (plus:TI
> +        (mult:TI
> +         (any_extend:TI (match_operand:DI 1 "gpc_reg_operand" "r"))
> +         (any_extend:TI (match_operand:DI 2 "gpc_reg_operand" "r")))
> +        (any_extend:TI (match_operand:DI 3 "gpc_reg_operand" "r")))
> +       (const_int 64))))]
> +  "TARGET_MADDLD && TARGET_POWERPC64"
> +  "maddhd<u> %0,%1,%2,%3"
> +  [(set_attr "type" "mul")
> +   (set_attr "size" "64")])
> +
>  (define_insn "udiv<mode>3"
>    [(set (match_operand:GPR 0 "gpc_reg_operand" "=r")
>          (udiv:GPR (match_operand:GPR 1 "gpc_reg_operand" "r")


appears OK.  I defer to others for scrutiny :-)


> diff --git a/gcc/testsuite/gcc.target/powerpc/pr103109.c 
> b/gcc/testsuite/gcc.target/powerpc/pr103109.c
> new file mode 100644
> index 00000000000..ae2cfb9eda7
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/pr103109.c
> @@ -0,0 +1,62 @@
> +/* { dg-require-effective-target int128     } */
> +/* { dg-require-effective-target power10_ok } */
> +/* { dg-options "-mdejagnu-cpu=power10 -O2" } */
> +
> +/* This test makes sure that GCC generates the maddhd, maddhdu, and maddld
> +   power9 instructions when doing some forms of 64-bit integers converted to
> +   128-bit integers and used with multiply/add operations.  */
> +
> +__int128_t
> +s_mult_add (long long a,
> +         long long b,
> +         long long c)
> +{
> +  /* maddhd, maddld.  */
> +  return ((__int128_t)a * (__int128_t)b) + (__int128_t)c;
> +}
> +
> +/* Test 32-bit constants that are loaded into GPRs instead of doing the
> +   mulld/mulhd and then addic/addime or addc/addze.  */

addme ?



Thanks,
-Will



> +__int128_t
> +s_mult_add_m10 (long long a,
> +             long long b)
> +{
> +  /* maddhd, maddld.  */
> +  return ((__int128_t)a * (__int128_t)b) - 10;
> +}
> +
> +__int128_t
> +s_mult_add_70000 (long long a,
> +               long long b)
> +{
> +  /* maddhd, maddld.  */
> +  return ((__int128_t)a * (__int128_t)b) + 70000;
> +}
> +
> +__uint128_t
> +u_mult_add (unsigned long long a,
> +         unsigned long long b,
> +         unsigned long long c)
> +{
> +  /* maddhd, maddld.  */
> +  return ((__uint128_t)a * (__uint128_t)b) + (__uint128_t)c;
> +}
> +
> +__uint128_t
> +u_mult_add_0x80000000 (unsigned long long a,
> +                    unsigned long long b)
> +{
> +  /* maddhd, maddld.  */
> +  return ((__uint128_t)a * (__uint128_t)b) + 0x80000000UL;
> +}
> +
> +/* { dg-final { scan-assembler-not   {\maddc\M}      } } */
> +/* { dg-final { scan-assembler-not   {\madde\M}      } } */
> +/* { dg-final { scan-assembler-not   {\maddid\M}     } } */
> +/* { dg-final { scan-assembler-not   {\maddme\M}     } } */
> +/* { dg-final { scan-assembler-not   {\maddze\M}     } } */
> +/* { dg-final { scan-assembler-not   {\mmulhd\M}     } } */
> +/* { dg-final { scan-assembler-not   {\mmulld\M}     } } */
> +/* { dg-final { scan-assembler-times {\mmaddhd\M}  3 } } */
> +/* { dg-final { scan-assembler-times {\mmaddhdu\M} 2 } } */
> +/* { dg-final { scan-assembler-times {\mmaddld\M}  5 } } */
> -- 
> 2.35.3
> 
>

Re: [PATCH] Optimize multiply/add of DImode extended to TImode, PR target/103109.

Reply via email to