On Thu, Apr 24, 2025 at 12:54 AM Jan Hubicka <hubi...@ucw.cz> wrote: > > > From: "hongtao.liu" <hongtao....@intel.com> > > > > When FMA is available, N-R step can be rewritten with > > > > a / b = (a - (rcp(b) * a * b)) * rcp(b) + rcp(b) * a > > > > which have 2 fma generated.[1] > > > > [1] https://bugs.llvm.org/show_bug.cgi?id=21385 > > > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}. > > Ok for trunk? > > How does this behave on CPUs where FMA has longer latency than addition when > swdivsf is on the critical path through the loop? For the original N-R step, the addition couldn't be on the cross-iteration critical path since it's internal to the N-R step; only the multiplication could be on the critical path.
It's like /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */ x0 = rcp(b) / \ e0 = x0 *b e1 = x0 + x0 | / e0 = x0 * e0 / \ / x1 = e1 - e0 | res = a * x1 (multiplication here) For the new N-R step, even though the last operation is an addition, I don't think it can be on the cross-iteration critical path since there's a multiplication to get either e0 or e2. /* a / b = (a - (rcp(b) * a * b)) * rcp(b) + rcp(b) * a */ x0 = rcp(b) | e0 = x0 * a | e1 = e0 * b | x1 = a - e1 | e2 = x0 * x1 | res = e0 + e2 (addition here) > > Honza > > > > > > gcc/ChangeLog: > > > > * config/i386/i386-expand.cc (ix86_emit_swdivsf): Generate 2 > > FMA instructions when TARGET_FMA. > > > > gcc/testsuite/ChangeLog: > > > > * gcc.target/i386/recip-vec-divf-fma.c: New test. > > --- > > gcc/config/i386/i386-expand.cc | 44 ++++++++++++++----- > > .../gcc.target/i386/recip-vec-divf-fma.c | 12 +++++ > > 2 files changed, 44 insertions(+), 12 deletions(-) > > create mode 100644 gcc/testsuite/gcc.target/i386/recip-vec-divf-fma.c > > > > diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc > > index cdfd94d3c73..4fffbfdd574 100644 > > --- a/gcc/config/i386/i386-expand.cc > > +++ b/gcc/config/i386/i386-expand.cc > > @@ -19256,8 +19256,6 @@ ix86_emit_swdivsf (rtx res, rtx a, rtx b, > > machine_mode mode) > > e1 = gen_reg_rtx (mode); > > x1 = gen_reg_rtx (mode); > > > > - /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */ > > - > > b = force_reg (mode, b); > > > > /* x0 = rcp(b) estimate */ > > @@ -19270,20 +19268,42 @@ ix86_emit_swdivsf (rtx res, rtx a, rtx b, > > machine_mode mode) > > emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b), > > UNSPEC_RCP))); > > > > - /* e0 = x0 * b */ > > - emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b))); > > + unsigned vector_size = GET_MODE_SIZE (mode); > > > > - /* e0 = x0 * e0 */ > > - emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0))); > > + /* (a - (rcp(b) * a * b)) * rcp(b) + rcp(b) * a > > + N-R step 
with 2 fma implementation. */ > > + if (TARGET_FMA > > + || (TARGET_AVX512F && vector_size == 64) > > + || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16))) > > + { > > + /* e0 = x0 * a */ > > + emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a))); > > + /* e1 = e0 * b - a */ > > + emit_insn (gen_rtx_SET (e1, gen_rtx_FMA (mode, e0, b, > > + gen_rtx_NEG (mode, a)))); > > + /* res = - e1 * x0 + e0 */ > > + emit_insn (gen_rtx_SET (res, gen_rtx_FMA (mode, > > + gen_rtx_NEG (mode, e1), > > + x0, e0))); > > + } > > + /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */ > > + else > > + { > > + /* e0 = x0 * b */ > > + emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b))); > > > > - /* e1 = x0 + x0 */ > > - emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0))); > > + /* e1 = x0 + x0 */ > > + emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0))); > > > > - /* x1 = e1 - e0 */ > > - emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0))); > > + /* e0 = x0 * e0 */ > > + emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0))); > > > > - /* res = a * x1 */ > > - emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1))); > > + /* x1 = e1 - e0 */ > > + emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0))); > > + > > + /* res = a * x1 */ > > + emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1))); > > + } > > } > > > > /* Output code to perform a Newton-Rhapson approximation of a > > diff --git a/gcc/testsuite/gcc.target/i386/recip-vec-divf-fma.c > > b/gcc/testsuite/gcc.target/i386/recip-vec-divf-fma.c > > new file mode 100644 > > index 00000000000..ad9e07b1eb6 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/recip-vec-divf-fma.c > > @@ -0,0 +1,12 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-Ofast -mfma -mavx2" } */ > > +/* { dg-final { scan-assembler-times {(?n)vfn?m(add|sub)[1-3]*ps} 2 } } */ > > + > > +typedef float v4sf __attribute__((vector_size(16))); > > +/* (a - (rcp(b) * a * b)) * rcp(b) + rcp(b) * a */ 
> > + > > +v4sf > > +foo (v4sf a, v4sf b) > > +{ > > + return a / b; > > +} > > -- > > 2.34.1 > > -- BR, Hongtao