On Mon, Jun 20, 2016 at 7:09 PM, Ilya Verbin <iver...@gmail.com> wrote: > Hi! > > This patch emits vrcp28ps and vmulps instructions for ix86_emit_swdivsf. > The relative error is < 2^-23, so no additional iteration is necessary. > Regtested using various benchmarks on an AVX-512ER machine. OK for trunk? > > > gcc/ > * config/i386/i386.c (ix86_emit_swdivsf): Emit vrcp28ps. > gcc/testsuite/ > * gcc.target/i386/avx512er-vrcp28ps-3.c: New test. > * gcc.target/i386/avx512er-vrcp28ps-4.c: New test.
OK. Thanks, Uros. > > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c > index 56a5b9c..8e0bf26 100644 > --- a/gcc/config/i386/i386.c > +++ b/gcc/config/i386/i386.c > @@ -48674,8 +48674,19 @@ void ix86_emit_swdivsf (rtx res, rtx a, rtx b, > machine_mode mode) > > /* x0 = rcp(b) estimate */ > if (mode == V16SFmode || mode == V8DFmode) > - emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b), > - UNSPEC_RCP14))); > + { > + if (TARGET_AVX512ER) > + { > + emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b), > + UNSPEC_RCP28))); > + /* res = a * x0 */ > + emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0))); > + return; > + } > + else > + emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b), > + UNSPEC_RCP14))); > + } > else > emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b), > UNSPEC_RCP))); > diff --git a/gcc/testsuite/gcc.target/i386/avx512er-vrcp28ps-3.c > b/gcc/testsuite/gcc.target/i386/avx512er-vrcp28ps-3.c > new file mode 100644 > index 0000000..e08bea4 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/avx512er-vrcp28ps-3.c > @@ -0,0 +1,50 @@ > +/* { dg-do run } */ > +/* { dg-require-effective-target avx512er } */ > +/* { dg-options "-O2 -ffast-math -ftree-vectorize -mavx512er" } */ > + > +#include "avx512er-check.h" > + > +#define MAX 1000 > +#define EPS 0.00001 > + > +__attribute__ ((noinline, optimize (0))) > +void static > +compute_rcp_ref (float *a, float *b, float *r) > +{ > + for (int i = 0; i < MAX; i++) > + r[i] = a[i] / b[i]; > +} > + > +__attribute__ ((noinline)) > +void static > +compute_rcp_exp (float *a, float *b, float *r) > +{ > + for (int i = 0; i < MAX; i++) > + r[i] = a[i] / b[i]; > +} > + > +void static > +avx512er_test (void) > +{ > + float a[MAX]; > + float b[MAX]; > + float ref[MAX]; > + float exp[MAX]; > + > + for (int i = 0; i < MAX; i++) > + { > + a[i] = 179.345 - 6.5645 * i; > + b[i] = 8765.987 - 8.6756 * i; > + } > + > + compute_rcp_ref (a, b, ref); > + 
compute_rcp_exp (a, b, exp); > + > + for (int i = 0; i < MAX; i++) > + { > + float rel_err = (ref[i] - exp[i]) / ref[i]; > + rel_err = rel_err > 0.0 ? rel_err : -rel_err; > + if (rel_err > EPS) > + abort (); > + } > +} > diff --git a/gcc/testsuite/gcc.target/i386/avx512er-vrcp28ps-4.c > b/gcc/testsuite/gcc.target/i386/avx512er-vrcp28ps-4.c > new file mode 100644 > index 0000000..2c76d96 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/avx512er-vrcp28ps-4.c > @@ -0,0 +1,6 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -ffast-math -ftree-vectorize -mavx512er" } */ > + > +#include "avx512er-vrcp28ps-3.c" > + > +/* { dg-final { scan-assembler-times "vrcp28ps\[^\n\r\]*zmm\[0-9\]+(?:\n|\[ > \\t\]+#)" 1 } } */ > > > -- Ilya