Hi Richard, > -----Original Message----- > From: Richard Sandiford <richard.sandif...@arm.com> > Sent: Monday, May 10, 2021 5:49 PM > To: Tamar Christina <tamar.christ...@arm.com> > Cc: gcc-patches@gcc.gnu.org; nd <n...@arm.com>; Richard Earnshaw > <richard.earns...@arm.com>; Marcus Shawcroft > <marcus.shawcr...@arm.com>; Kyrylo Tkachov <kyrylo.tkac...@arm.com> > Subject: Re: [PATCH 2/4]AArch64: Add support for sign differing dot-product > usdot for NEON and SVE. > > Tamar Christina <tamar.christ...@arm.com> writes: > > diff --git a/gcc/config/aarch64/aarch64-simd.md > > b/gcc/config/aarch64/aarch64-simd.md > > index > > > 4edee99051c4e2112b546becca47da32aae21df2..c9fb8e702732dd311fb10de1 > 7126 > > 432e2a63a32b 100644 > > --- a/gcc/config/aarch64/aarch64-simd.md > > +++ b/gcc/config/aarch64/aarch64-simd.md > > @@ -648,6 +648,22 @@ (define_expand "<sur>dot_prod<vsi2qi>" > > DONE; > > }) > > > > +;; Auto-vectorizer pattern for usdot > > +(define_expand "usdot_prod<vsi2qi>" > > + [(set (match_operand:VS 0 "register_operand") > > + (plus:VS (unspec:VS [(match_operand:<VSI2QI> 1 > "register_operand") > > + (match_operand:<VSI2QI> 2 "register_operand")] > > + UNSPEC_USDOT) > > + (match_operand:VS 3 "register_operand")))] > > + "TARGET_I8MM" > > +{ > > + emit_insn ( > > + gen_aarch64_usdot<vsi2qi> (operands[3], operands[3], operands[1], > > + operands[2])); > > + emit_move_insn (operands[0], operands[3]); > > + DONE; > > +}) > > We can't modify operands[3] here; it's an input rather than an output.
Sorry, I should have noticed this. I had blindly copied the existing pattern for dot-product, and that looks like it's wrong. I'll send a different patch to fix that one. > > It looks like this would work with just the {…} removed though. > The pattern will match aarch64_usdot<vsi2qi> on its own accord. > > Even better would be to rename __builtin_aarch64_usdot… to > __builtin_usdot_prod…, change its arguments so that they line up with the > optabs, and change arm_neon.h to match. > > > diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec.c > > b/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec.c > > new file mode 100644 > > index > > > 0000000000000000000000000000000000000000..b99a945903c043c7410becaf6f > 09 > > 496dd038410d > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec.c > > @@ -0,0 +1,38 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-O3 -march=armv8.2-a+i8mm" } */ > > + > > +#define N 480 > > +#define SIGNEDNESS_1 unsigned > > +#define SIGNEDNESS_2 signed > > +#define SIGNEDNESS_3 signed > > +#define SIGNEDNESS_4 unsigned > > + > > +SIGNEDNESS_1 int __attribute__ ((noipa)) f (SIGNEDNESS_1 int res, > > +SIGNEDNESS_3 char *restrict a, > > + SIGNEDNESS_4 char *restrict b) > > +{ > > + for (__INTPTR_TYPE__ i = 0; i < N; ++i) > > + { > > + int av = a[i]; > > + int bv = b[i]; > > + SIGNEDNESS_2 short mult = av * bv; > > + res += mult; > > + } > > + return res; > > +} > > + > > +SIGNEDNESS_1 int __attribute__ ((noipa)) g (SIGNEDNESS_1 int res, > > +SIGNEDNESS_3 char *restrict b, > > + SIGNEDNESS_4 char *restrict a) > > +{ > > + for (__INTPTR_TYPE__ i = 0; i < N; ++i) > > + { > > + int av = a[i]; > > + int bv = b[i]; > > + SIGNEDNESS_2 short mult = av * bv; > > + res += mult; > > + } > > + return res; > > +} > > + > > +/* { dg-final { scan-assembler-times {\tusdot\t} 2 } } */ > > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vusdot-autovec.c > > b/gcc/testsuite/gcc.target/aarch64/sve/vusdot-autovec.c > > 
new file mode 100644 > > index > > > 0000000000000000000000000000000000000000..094dd51cea62e0ba05ec35056 > 57b > > f05320e5fdbb > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/aarch64/sve/vusdot-autovec.c > > @@ -0,0 +1,38 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-O3 -march=armv8.2-a+i8mm+sve" } */ > > + > > +#define N 480 > > +#define SIGNEDNESS_1 unsigned > > +#define SIGNEDNESS_2 signed > > +#define SIGNEDNESS_3 signed > > +#define SIGNEDNESS_4 unsigned > > + > > +SIGNEDNESS_1 int __attribute__ ((noipa)) f (SIGNEDNESS_1 int res, > > +SIGNEDNESS_3 char *restrict a, > > + SIGNEDNESS_4 char *restrict b) > > +{ > > + for (__INTPTR_TYPE__ i = 0; i < N; ++i) > > + { > > + int av = a[i]; > > + int bv = b[i]; > > + SIGNEDNESS_2 short mult = av * bv; > > + res += mult; > > + } > > + return res; > > +} > > + > > +SIGNEDNESS_1 int __attribute__ ((noipa)) g (SIGNEDNESS_1 int res, > > +SIGNEDNESS_3 char *restrict b, > > + SIGNEDNESS_4 char *restrict a) > > +{ > > + for (__INTPTR_TYPE__ i = 0; i < N; ++i) > > + { > > + int av = a[i]; > > + int bv = b[i]; > > + SIGNEDNESS_2 short mult = av * bv; > > + res += mult; > > + } > > + return res; > > +} > > + > > +/* { dg-final { scan-assembler-times {\tusdot\t} 2 } } */ > > Guess this is personal preference, but I don't think the SIGNEDNESS_* > macros add anything when used like this. I remember doing something > similar in the past when including .c files from other .c files(!) in order to > avoid cut-&-paste, but there doesn't seem much benefit for standalone files > like these. If it's the same to you, I do prefer this version, since it's identical to the mid-end tests. It allows anyone familiar with those tests to quickly see what it's testing. Bootstrapped and regtested on aarch64-none-linux-gnu with no issues. Ok for master? Thanks, Tamar gcc/ChangeLog: * config/aarch64/aarch64-simd.md (aarch64_usdot<vsi2qi>): Rename to... (usdot_prod<vsi2qi>): ... This. 
* config/aarch64/aarch64-simd-builtins.def (usdot): Rename to... (usdot_prod): ...This. * config/aarch64/arm_neon.h (vusdot_s32, vusdotq_s32): Likewise. * config/aarch64/aarch64-sve.md (@aarch64_<sur>dot_prod<vsi2qi>): Rename to... (@<sur>dot_prod<vsi2qi>): ...This. * config/aarch64/aarch64-sve-builtins-base.cc (svusdot_impl::expand): Use it. gcc/testsuite/ChangeLog: * gcc.target/aarch64/simd/vusdot-autovec.c: New test. * gcc.target/aarch64/sve/vusdot-autovec.c: New test. > > Thanks, > Richard
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def index b885bd5b38bf7ad83eb9d801284bf9b34db17210..c869ed9a6ab7d63f0e3d5fe393a93c1cc9142e78 100644 --- a/gcc/config/aarch64/aarch64-simd-builtins.def +++ b/gcc/config/aarch64/aarch64-simd-builtins.def @@ -361,10 +361,11 @@ BUILTIN_VSDQ_I_DI (BINOP, srshl, 0, NONE) BUILTIN_VSDQ_I_DI (BINOP_UUS, urshl, 0, NONE) - /* Implemented by aarch64_<sur><dotprod>{_lane}{q}<dot_mode>. */ + /* Implemented by <sur><dotprod>_prod<dot_mode>. */ BUILTIN_VB (TERNOP, sdot, 0, NONE) BUILTIN_VB (TERNOPU, udot, 0, NONE) - BUILTIN_VB (TERNOP_SSUS, usdot, 0, NONE) + BUILTIN_VB (TERNOP_SSUS, usdot_prod, 10, NONE) + /* Implemented by aarch64_<sur><dotprod>_lane{q}<dot_mode>. */ BUILTIN_VB (QUADOP_LANE, sdot_lane, 0, NONE) BUILTIN_VB (QUADOPU_LANE, udot_lane, 0, NONE) BUILTIN_VB (QUADOP_LANE, sdot_laneq, 0, NONE) diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 4edee99051c4e2112b546becca47da32aae21df2..253ddbe25d3a86af4b40b056132e6a86a0392ea6 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -601,7 +601,7 @@ (define_insn "aarch64_<sur>dot<vsi2qi>" ;; These instructions map to the __builtins for the armv8.6a I8MM usdot ;; (vector) Dot Product operation. -(define_insn "aarch64_usdot<vsi2qi>" +(define_insn "usdot_prod<vsi2qi>" [(set (match_operand:VS 0 "register_operand" "=w") (plus:VS (unspec:VS [(match_operand:<VSI2QI> 2 "register_operand" "w") diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc index dfdf0e2fd186389cbddcff51ef52f8778d7fdb24..50adcd5404e97e610485140fdbfe4c8ebbf2f602 100644 --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc @@ -2366,7 +2366,7 @@ public: Hence we do the same rotation on arguments as svdot_impl does. 
*/ e.rotate_inputs_left (0, 3); machine_mode mode = e.vector_mode (0); - insn_code icode = code_for_aarch64_dot_prod (UNSPEC_USDOT, mode); + insn_code icode = code_for_dot_prod (UNSPEC_USDOT, mode); return e.use_exact_insn (icode); } diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md index 7db2938bb84e04d066a7b07574e5cf344a3a8fb6..1278f6f12fadf8eec693cd47fd545ff3277f08f1 100644 --- a/gcc/config/aarch64/aarch64-sve.md +++ b/gcc/config/aarch64/aarch64-sve.md @@ -6870,7 +6870,7 @@ (define_insn "@aarch64_<sur>dot_prod_lane<vsi2qi>" [(set_attr "movprfx" "*,yes")] ) -(define_insn "@aarch64_<sur>dot_prod<vsi2qi>" +(define_insn "@<sur>dot_prod<vsi2qi>" [(set (match_operand:VNx4SI_ONLY 0 "register_operand" "=w, ?&w") (plus:VNx4SI_ONLY (unspec:VNx4SI_ONLY diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index baa30bd5a9d96c1bf04a37fb105091ea56a6444a..373f06a24ea6ce686d7e0cdf53dd364041c61092 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -34384,14 +34384,14 @@ __extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vusdot_s32 (int32x2_t __r, uint8x8_t __a, int8x8_t __b) { - return __builtin_aarch64_usdotv8qi_ssus (__r, __a, __b); + return __builtin_aarch64_usdot_prodv8qi_ssus (__r, __a, __b); } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vusdotq_s32 (int32x4_t __r, uint8x16_t __a, int8x16_t __b) { - return __builtin_aarch64_usdotv16qi_ssus (__r, __a, __b); + return __builtin_aarch64_usdot_prodv16qi_ssus (__r, __a, __b); } __extension__ extern __inline int32x2_t diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec.c b/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec.c new file mode 100644 index 0000000000000000000000000000000000000000..b99a945903c043c7410becaf6f09496dd038410d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec.c @@ 
-0,0 +1,38 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -march=armv8.2-a+i8mm" } */ + +#define N 480 +#define SIGNEDNESS_1 unsigned +#define SIGNEDNESS_2 signed +#define SIGNEDNESS_3 signed +#define SIGNEDNESS_4 unsigned + +SIGNEDNESS_1 int __attribute__ ((noipa)) +f (SIGNEDNESS_1 int res, SIGNEDNESS_3 char *restrict a, + SIGNEDNESS_4 char *restrict b) +{ + for (__INTPTR_TYPE__ i = 0; i < N; ++i) + { + int av = a[i]; + int bv = b[i]; + SIGNEDNESS_2 short mult = av * bv; + res += mult; + } + return res; +} + +SIGNEDNESS_1 int __attribute__ ((noipa)) +g (SIGNEDNESS_1 int res, SIGNEDNESS_3 char *restrict b, + SIGNEDNESS_4 char *restrict a) +{ + for (__INTPTR_TYPE__ i = 0; i < N; ++i) + { + int av = a[i]; + int bv = b[i]; + SIGNEDNESS_2 short mult = av * bv; + res += mult; + } + return res; +} + +/* { dg-final { scan-assembler-times {\tusdot\t} 2 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vusdot-autovec.c b/gcc/testsuite/gcc.target/aarch64/sve/vusdot-autovec.c new file mode 100644 index 0000000000000000000000000000000000000000..094dd51cea62e0ba05ec3505657bf05320e5fdbb --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/vusdot-autovec.c @@ -0,0 +1,38 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -march=armv8.2-a+i8mm+sve" } */ + +#define N 480 +#define SIGNEDNESS_1 unsigned +#define SIGNEDNESS_2 signed +#define SIGNEDNESS_3 signed +#define SIGNEDNESS_4 unsigned + +SIGNEDNESS_1 int __attribute__ ((noipa)) +f (SIGNEDNESS_1 int res, SIGNEDNESS_3 char *restrict a, + SIGNEDNESS_4 char *restrict b) +{ + for (__INTPTR_TYPE__ i = 0; i < N; ++i) + { + int av = a[i]; + int bv = b[i]; + SIGNEDNESS_2 short mult = av * bv; + res += mult; + } + return res; +} + +SIGNEDNESS_1 int __attribute__ ((noipa)) +g (SIGNEDNESS_1 int res, SIGNEDNESS_3 char *restrict b, + SIGNEDNESS_4 char *restrict a) +{ + for (__INTPTR_TYPE__ i = 0; i < N; ++i) + { + int av = a[i]; + int bv = b[i]; + SIGNEDNESS_2 short mult = av * bv; + res += mult; + } + return res; +} + +/* { 
dg-final { scan-assembler-times {\tusdot\t} 2 } } */