On Thu, Apr 24, 2025 at 12:54 AM Jan Hubicka <hubi...@ucw.cz> wrote:
>
> > From: "hongtao.liu" <hongtao....@intel.com>
> >
> > When FMA is available, N-R step can be rewritten with
> >
> > a / b = (a - (rcp(b) * a * b)) * rcp(b) + rcp(b) * a
> >
> > which generates 2 FMA instructions.[1]
> >
> > [1] https://bugs.llvm.org/show_bug.cgi?id=21385
> >
> > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> > Ok for trunk?
>
> How does this behave on CPUs where FMA has longer latency than addition when
> swdivsf is on the critical path through the loop?
For the original N-R step, addition couldn't be on the cross-iteration
critical path since it's internal inside the N-R step, only
multiplication could be on the critical path.

It's like
    /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
       x0 = rcp(b)
         /              \
e0 = x0 *b      e1 = x0 + x0
      |                          /
e0 = x0 * e0            /
          \                   /
          x1 = e1 - e0
                |
          res = a * x1 (multiplication here)

For the new N-R step, even though the last operation is an addition, I
don't think it can be on the cross-iteration critical path since there's
a multiplication to get either e0 or e2.
/* a / b = (a - (rcp(b) * a * b)) * rcp(b) + rcp(b) * a */
         x0 = rcp(b)
                |
         e0 = x0 * a
                |
          e1 = e0 * b
                |
         x1 = a - e1
                |
          e2 = x0  * x1
                |
        res = e0  + e2 (addition here)



>
> Honza
> >
> >
> > gcc/ChangeLog:
> >
> >       * config/i386/i386-expand.cc (ix86_emit_swdivsf): Generate 2
> >       FMA instructions when TARGET_FMA.
> >
> > gcc/testsuite/ChangeLog:
> >
> >       * gcc.target/i386/recip-vec-divf-fma.c: New test.
> > ---
> >  gcc/config/i386/i386-expand.cc                | 44 ++++++++++++++-----
> >  .../gcc.target/i386/recip-vec-divf-fma.c      | 12 +++++
> >  2 files changed, 44 insertions(+), 12 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.target/i386/recip-vec-divf-fma.c
> >
> > diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
> > index cdfd94d3c73..4fffbfdd574 100644
> > --- a/gcc/config/i386/i386-expand.cc
> > +++ b/gcc/config/i386/i386-expand.cc
> > @@ -19256,8 +19256,6 @@ ix86_emit_swdivsf (rtx res, rtx a, rtx b, 
> > machine_mode mode)
> >    e1 = gen_reg_rtx (mode);
> >    x1 = gen_reg_rtx (mode);
> >
> > -  /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
> > -
> >    b = force_reg (mode, b);
> >
> >    /* x0 = rcp(b) estimate */
> > @@ -19270,20 +19268,42 @@ ix86_emit_swdivsf (rtx res, rtx a, rtx b, 
> > machine_mode mode)
> >      emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
> >                                               UNSPEC_RCP)));
> >
> > -  /* e0 = x0 * b */
> > -  emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
> > +  unsigned vector_size = GET_MODE_SIZE (mode);
> >
> > -  /* e0 = x0 * e0 */
> > -  emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
> > +  /* (a - (rcp(b) * a * b)) * rcp(b) + rcp(b) * a
> > +     N-R step with 2 fma implementation.  */
> > +  if (TARGET_FMA
> > +      || (TARGET_AVX512F && vector_size == 64)
> > +      || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
> > +    {
> > +      /* e0 = x0 * a  */
> > +      emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
> > +      /* e1 = e0 * b - a  */
> > +      emit_insn (gen_rtx_SET (e1, gen_rtx_FMA (mode, e0, b,
> > +                                            gen_rtx_NEG (mode, a))));
> > +      /* res = - e1 * x0 + e0  */
> > +      emit_insn (gen_rtx_SET (res, gen_rtx_FMA (mode,
> > +                                            gen_rtx_NEG (mode, e1),
> > +                                            x0, e0)));
> > +    }
> > +    /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
> > +  else
> > +    {
> > +      /* e0 = x0 * b */
> > +      emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
> >
> > -  /* e1 = x0 + x0 */
> > -  emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
> > +      /* e1 = x0 + x0 */
> > +      emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
> >
> > -  /* x1 = e1 - e0 */
> > -  emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
> > +      /* e0 = x0 * e0 */
> > +      emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
> >
> > -  /* res = a * x1 */
> > -  emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
> > +      /* x1 = e1 - e0 */
> > +      emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
> > +
> > +      /* res = a * x1 */
> > +      emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
> > +    }
> >  }
> >
> >  /* Output code to perform a Newton-Rhapson approximation of a
> > diff --git a/gcc/testsuite/gcc.target/i386/recip-vec-divf-fma.c 
> > b/gcc/testsuite/gcc.target/i386/recip-vec-divf-fma.c
> > new file mode 100644
> > index 00000000000..ad9e07b1eb6
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/recip-vec-divf-fma.c
> > @@ -0,0 +1,12 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-Ofast -mfma -mavx2" } */
> > +/* { dg-final { scan-assembler-times {(?n)vfn?m(add|sub)[1-3]*ps} 2 } } */
> > +
> > +typedef float v4sf __attribute__((vector_size(16)));
> > +/* (a - (rcp(b) * a * b)) * rcp(b) + rcp(b) * a  */
> > +
> > +v4sf
> > +foo (v4sf a, v4sf b)
> > +{
> > +    return a / b;
> > +}
> > --
> > 2.34.1
> >



-- 
BR,
Hongtao

Reply via email to