On Fri, 2022-05-13 at 12:17 -0400, Michael Meissner wrote: > Optimize multiply/add of DImode extended to TImode, PR target/103109. > > On power9 and power10 systems, we have instructions that support doing > 64-bit integers converted to 128-bit integers and producing 128-bit > results. This patch adds support to generate these instructions. > > Previously GCC had define_expands to handle conversion of the 64-bit > extend to 128-bit and multiply. This patch changes these define_expands > to define_insn_and_split and then it provides combiner patterns to > generate thes multiply/add instructions. > > To support using this optimization on power9, this patch extends the sign > extend DImode to TImode to also run on power9 (added for PR > target/104698). > > This patch needs the previous patch to add unsigned DImode to TImode > conversion so that the combiner can combine the extend, multiply, and add > instructions. > > I have built this patch on little endian power10, little endian power9, and > big > endian power8 systems. There were no regressions when I ran it. Can I > install > this patch into the GCC 13 master branch? > > 2022-05-13 Michael Meissner <meiss...@linux.ibm.com> > > gcc/ > PR target/103109 > * config/rs6000/rs6000.md (su_int32): New code attribute. > (<u>mul<mode><dmode>3): Convert from define_expand to > define_insn_and_split. > (maddld<mode>4): Add generator function.
-(define_insn "*maddld<mode>4" +(define_insn "maddld<mode>4" Is the removal of the "*" considering adding generator? (Thats terminology that I'm not immediately familiar with). > (<u>mulditi3_<u>adddi3): New insn. > (<u>mulditi3_add_const): New insn. > (<u>mulditi3_<u>adddi3_upper): New insn. > > gcc/testsuite/ > PR target/103109 > * gcc.target/powerpc/pr103109.c: New test. ok > --- > gcc/config/rs6000/rs6000.md | 128 +++++++++++++++++++- > gcc/testsuite/gcc.target/powerpc/pr103109.c | 62 ++++++++++ > 2 files changed, 184 insertions(+), 6 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/powerpc/pr103109.c > > diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md > index 2aba70393d8..83eacec57ba 100644 > --- a/gcc/config/rs6000/rs6000.md > +++ b/gcc/config/rs6000/rs6000.md > @@ -667,6 +667,9 @@ (define_code_attr uns [(fix "") > (float "") > (unsigned_float "uns")]) > > +(define_code_attr su_int32 [(sign_extend "s32bit_cint_operand") > + (zero_extend "c32bit_cint_operand")]) > + > ; Various instructions that come in SI and DI forms. > ; A generic w/d attribute, for things like cmpw/cmpd. > (define_mode_attr wd [(QI "b") > @@ -3190,13 +3193,16 @@ (define_insn "<su>mulsi3_highpart_64" > "mulhw<u> %0,%1,%2" > [(set_attr "type" "mul")]) > > -(define_expand "<u>mul<mode><dmode>3" > - [(set (match_operand:<DMODE> 0 "gpc_reg_operand") > +(define_insn_and_split "<u>mul<mode><dmode>3" > + [(set (match_operand:<DMODE> 0 "gpc_reg_operand" "=&r") > (mult:<DMODE> (any_extend:<DMODE> > - (match_operand:GPR 1 "gpc_reg_operand")) > + (match_operand:GPR 1 "gpc_reg_operand" "r")) > (any_extend:<DMODE> > - (match_operand:GPR 2 "gpc_reg_operand"))))] > + (match_operand:GPR 2 "gpc_reg_operand" "r"))))] > "!(<MODE>mode == SImode && TARGET_POWERPC64)" > + "#" > + "&& 1" > + [(pc)] > { > rtx l = gen_reg_rtx (<MODE>mode); > rtx h = gen_reg_rtx (<MODE>mode); > @@ -3205,9 +3211,10 @@ (define_expand "<u>mul<mode><dmode>3" > emit_move_insn (gen_lowpart (<MODE>mode, operands[0]), l); > emit_move_insn (gen_highpart (<MODE>mode, operands[0]), h); > DONE; > -}) > +} > + [(set_attr "length" "8")]) > ok > -(define_insn "*maddld<mode>4" > +(define_insn "maddld<mode>4" > [(set (match_operand:GPR 0 "gpc_reg_operand" "=r") > (plus:GPR (mult:GPR (match_operand:GPR 1 "gpc_reg_operand" "r") > (match_operand:GPR 2 "gpc_reg_operand" "r")) ok > @@ -3216,6 +3223,115 @@ (define_insn "*maddld<mode>4" > "maddld %0,%1,%2,%3" > [(set_attr "type" "mul")]) > > +(define_insn_and_split "*<u>mulditi3_<u>adddi3" > + [(set (match_operand:TI 0 "gpc_reg_operand" "=&r") > + (plus:TI > + (mult:TI > + (any_extend:TI (match_operand:DI 1 "gpc_reg_operand" "r")) > + (any_extend:TI (match_operand:DI 2 "gpc_reg_operand" "r"))) > + (any_extend:TI (match_operand:DI 3 "gpc_reg_operand" "r"))))] > + "TARGET_MADDLD && TARGET_POWERPC64" > + "#" > + "&& 1" > + [(pc)] > +{ > + rtx dest = operands[0]; > + rtx dest_hi = gen_highpart (DImode, dest); > + rtx dest_lo = gen_lowpart (DImode, dest); > + rtx op1 = operands[1]; > + rtx op2 = operands[2]; > + rtx op3 = operands[3]; > + rtx tmp_hi, tmp_lo; > + > + if (can_create_pseudo_p ()) > + { > + tmp_hi = gen_reg_rtx (DImode); > + tmp_lo = gen_reg_rtx (DImode); > + } > + else > + { > + tmp_hi = dest_hi; > + tmp_lo = dest_lo; > + } > + > + emit_insn (gen_<u>mulditi3_<u>adddi3_upper (tmp_hi, op1, op2, op3)); > + emit_insn (gen_maddlddi4 (tmp_lo, op1, op2, op3)); > + > + if (can_create_pseudo_p ()) > + { > + emit_move_insn (dest_hi, tmp_hi); > + emit_move_insn (dest_lo, tmp_lo); > + } > + DONE; > +} > + [(set_attr "length" "8")]) Seems OK, I did not closely scrutinize. > + > +;; Optimize 128-bit multiply with zero/sign extend and adding a constant. We > +;; force the constant into a register to generate li, maddhd, and maddld, > +;; instead of mulld, mulhd, addic, and addze. We can't combine this pattern > +;; with the pattern that handles registers, since constants don't have a sign > +;; or zero extend around them. ok. > +(define_insn_and_split "*<u>mulditi3_add_const" > + [(set (match_operand:TI 0 "gpc_reg_operand" "=&r") > + (plus:TI > + (mult:TI > + (any_extend:TI (match_operand:DI 1 "gpc_reg_operand" "r")) > + (any_extend:TI (match_operand:DI 2 "gpc_reg_operand" "r"))) > + (match_operand 3 "<su_int32>" "r")))] > + "TARGET_MADDLD && TARGET_POWERPC64 > +" > + "#" > + "&& 1" > + [(pc)] > +{ > + rtx dest = operands[0]; > + rtx dest_hi = gen_highpart (DImode, dest); > + rtx dest_lo = gen_lowpart (DImode, dest); > + rtx op1 = operands[1]; > + rtx op2 = operands[2]; > + rtx op3 = force_reg (DImode, operands[3]); > + rtx tmp_hi, tmp_lo; > + > + if (can_create_pseudo_p ()) > + { > + tmp_hi = gen_reg_rtx (DImode); > + tmp_lo = gen_reg_rtx (DImode); > + } > + else > + { > + tmp_hi = dest_hi; > + tmp_lo = dest_lo; > + } > + > + emit_insn (gen_<u>mulditi3_<u>adddi3_upper (tmp_hi, op1, op2, op3)); > + emit_insn (gen_maddlddi4 (tmp_lo, op1, op2, op3)); > + > + if (can_create_pseudo_p ()) > + { > + emit_move_insn (dest_hi, tmp_hi); > + emit_move_insn (dest_lo, tmp_lo); > + } > + DONE; > +} > + [(set_attr "length" "8") > + (set_attr "type" "mul") > + (set_attr "size" "64")]) > + > +(define_insn "<u>mulditi3_<u>adddi3_upper" > + [(set (match_operand:DI 0 "gpc_reg_operand" "=r") > + (truncate:DI > + (lshiftrt:TI > + (plus:TI > + (mult:TI > + (any_extend:TI (match_operand:DI 1 "gpc_reg_operand" "r")) > + (any_extend:TI (match_operand:DI 2 "gpc_reg_operand" "r"))) > + (any_extend:TI (match_operand:DI 3 "gpc_reg_operand" "r"))) > + (const_int 64))))] > + "TARGET_MADDLD && TARGET_POWERPC64" > + "maddhd<u> %0,%1,%2,%3" > + [(set_attr "type" "mul") > + (set_attr "size" "64")]) > + > (define_insn "udiv<mode>3" > [(set (match_operand:GPR 0 "gpc_reg_operand" "=r") > (udiv:GPR (match_operand:GPR 1 "gpc_reg_operand" "r") appears OK. I defer to others for scrutiny :-) > diff --git a/gcc/testsuite/gcc.target/powerpc/pr103109.c > b/gcc/testsuite/gcc.target/powerpc/pr103109.c > new file mode 100644 > index 00000000000..ae2cfb9eda7 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/powerpc/pr103109.c > @@ -0,0 +1,62 @@ > +/* { dg-require-effective-target int128 } */ > +/* { dg-require-effective-target power10_ok } */ > +/* { dg-options "-mdejagnu-cpu=power10 -O2" } */ > + > +/* This test makes sure that GCC generates the maddhd, maddhdu, and maddld > + power9 instructions when doing some forms of 64-bit integers converted to > + 128-bit integers and used with multiply/add operations. */ > + > +__int128_t > +s_mult_add (long long a, > + long long b, > + long long c) > +{ > + /* maddhd, maddld. */ > + return ((__int128_t)a * (__int128_t)b) + (__int128_t)c; > +} > + > +/* Test 32-bit constants that are loaded into GPRs instead of doing the > + mulld/mulhd and then addic/addime or addc/addze. */ addme ? Thanks, -Will > +__int128_t > +s_mult_add_m10 (long long a, > + long long b) > +{ > + /* maddhd, maddld. */ > + return ((__int128_t)a * (__int128_t)b) - 10; > +} > + > +__int128_t > +s_mult_add_70000 (long long a, > + long long b) > +{ > + /* maddhd, maddld. */ > + return ((__int128_t)a * (__int128_t)b) + 70000; > +} > + > +__uint128_t > +u_mult_add (unsigned long long a, > + unsigned long long b, > + unsigned long long c) > +{ > + /* maddhd, maddld. */ > + return ((__uint128_t)a * (__uint128_t)b) + (__uint128_t)c; > +} > + > +__uint128_t > +u_mult_add_0x80000000 (unsigned long long a, > + unsigned long long b) > +{ > + /* maddhd, maddld. */ > + return ((__uint128_t)a * (__uint128_t)b) + 0x80000000UL; > +} > + > +/* { dg-final { scan-assembler-not {\maddc\M} } } */ > +/* { dg-final { scan-assembler-not {\madde\M} } } */ > +/* { dg-final { scan-assembler-not {\maddid\M} } } */ > +/* { dg-final { scan-assembler-not {\maddme\M} } } */ > +/* { dg-final { scan-assembler-not {\maddze\M} } } */ > +/* { dg-final { scan-assembler-not {\mmulhd\M} } } */ > +/* { dg-final { scan-assembler-not {\mmulld\M} } } */ > +/* { dg-final { scan-assembler-times {\mmaddhd\M} 3 } } */ > +/* { dg-final { scan-assembler-times {\mmaddhdu\M} 2 } } */ > +/* { dg-final { scan-assembler-times {\mmaddld\M} 5 } } */ > -- > 2.35.3 > >