https://gcc.gnu.org/g:e1098c7b08d9e6018f60dae7a14c5ad621618223

commit r16-46-ge1098c7b08d9e6018f60dae7a14c5ad621618223
Author: hongtao.liu <hongtao....@intel.com>
Date:   Thu Apr 17 09:07:55 2025 +0200

    Generate 2 FMA instructions in ix86_expand_swdivsf.
    
    When FMA is available, N-R step can be rewritten with
    
    a / b = (a - (rcp(b) * a * b)) * rcp(b) + rcp(b) * a
    
    which have 2 fma generated.
    
    gcc/ChangeLog:
    
            * config/i386/i386-expand.cc (ix86_emit_swdivsf): Generate 2
            FMA instructions when TARGET_FMA.
    
    gcc/testsuite/ChangeLog:
    
            * gcc.target/i386/recip-vec-divf-fma.c: New test.

Diff:
---
 gcc/config/i386/i386-expand.cc                     | 44 ++++++++++++++++------
 gcc/testsuite/gcc.target/i386/recip-vec-divf-fma.c | 12 ++++++
 2 files changed, 44 insertions(+), 12 deletions(-)

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index cdfd94d3c73d..36f71eb4a973 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -19256,8 +19256,6 @@ ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode 
mode)
   e1 = gen_reg_rtx (mode);
   x1 = gen_reg_rtx (mode);
 
-  /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
-
   b = force_reg (mode, b);
 
   /* x0 = rcp(b) estimate */
@@ -19270,20 +19268,42 @@ ix86_emit_swdivsf (rtx res, rtx a, rtx b, 
machine_mode mode)
     emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
                                                UNSPEC_RCP)));
 
-  /* e0 = x0 * b */
-  emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
+  unsigned vector_size = GET_MODE_SIZE (mode);
+
+  /* (a - (rcp(b) * a * b)) * rcp(b) + rcp(b) * a
+     N-R step with 2 fma implementation.  */
+  if (TARGET_FMA
+      || (TARGET_AVX512F && vector_size == 64)
+      || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
+    {
+      /* e0 = x0 * a  */
+      emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
+      /* e1 = e0 * b - a  */
+      emit_insn (gen_rtx_SET (e1, gen_rtx_FMA (mode, e0, b,
+                                              gen_rtx_NEG (mode, a))));
+      /* res = - e1 * x0 + e0  */
+      emit_insn (gen_rtx_SET (res, gen_rtx_FMA (mode,
+                                              gen_rtx_NEG (mode, e1),
+                                              x0, e0)));
+    }
+  else
+    /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
+    {
+      /* e0 = x0 * b */
+      emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
 
-  /* e0 = x0 * e0 */
-  emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
+      /* e1 = x0 + x0 */
+      emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
 
-  /* e1 = x0 + x0 */
-  emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
+      /* e0 = x0 * e0 */
+      emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
 
-  /* x1 = e1 - e0 */
-  emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
+      /* x1 = e1 - e0 */
+      emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
 
-  /* res = a * x1 */
-  emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
+      /* res = a * x1 */
+      emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
+    }
 }
 
 /* Output code to perform a Newton-Rhapson approximation of a
diff --git a/gcc/testsuite/gcc.target/i386/recip-vec-divf-fma.c 
b/gcc/testsuite/gcc.target/i386/recip-vec-divf-fma.c
new file mode 100644
index 000000000000..ad9e07b1eb65
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/recip-vec-divf-fma.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -mfma -mavx2" } */
+/* { dg-final { scan-assembler-times {(?n)vfn?m(add|sub)[1-3]*ps} 2 } } */
+
+typedef float v4sf __attribute__((vector_size(16)));
+/* (a - (rcp(b) * a * b)) * rcp(b) + rcp(b) * a  */
+
+v4sf
+foo (v4sf a, v4sf b)
+{
+    return a / b;
+}

Reply via email to