Re: [PATCH, i386, AVX-512ER] vrcp28ps auto generation

2016-06-20 Thread Uros Bizjak
On Mon, Jun 20, 2016 at 7:09 PM, Ilya Verbin  wrote:
> Hi!
>
> This patch emits vrcp28ps and vmulps instructions for ix86_emit_swdivsf.
> The relative error is < 2^-23, so no additional iteration is necessary.
> Regtested using various benchmarks on an AVX-512ER machine.  OK for trunk?
>
>
> gcc/
> * config/i386/i386.c (ix86_emit_swdivsf): Emit vrcp28ps.
> gcc/testsuite/
> * gcc.target/i386/avx512er-vrcp28ps-3.c: New test.
> * gcc.target/i386/avx512er-vrcp28ps-4.c: New test.

OK.

Thanks,
Uros.

>
> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> index 56a5b9c..8e0bf26 100644
> --- a/gcc/config/i386/i386.c
> +++ b/gcc/config/i386/i386.c
> @@ -48674,8 +48674,19 @@ void ix86_emit_swdivsf (rtx res, rtx a, rtx b, 
> machine_mode mode)
>
>/* x0 = rcp(b) estimate */
>if (mode == V16SFmode || mode == V8DFmode)
> -emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
> -   UNSPEC_RCP14)));
> +{
> +  if (TARGET_AVX512ER)
> +   {
> + emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
> + UNSPEC_RCP28)));
> + /* res = a * x0 */
> + emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
> + return;
> +   }
> +  else
> +   emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
> +   UNSPEC_RCP14)));
> +}
>else
>  emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
> UNSPEC_RCP)));
> diff --git a/gcc/testsuite/gcc.target/i386/avx512er-vrcp28ps-3.c 
> b/gcc/testsuite/gcc.target/i386/avx512er-vrcp28ps-3.c
> new file mode 100644
> index 000..e08bea4
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512er-vrcp28ps-3.c
> @@ -0,0 +1,50 @@
> +/* { dg-do run } */
> +/* { dg-require-effective-target avx512er } */
> +/* { dg-options "-O2 -ffast-math -ftree-vectorize -mavx512er" } */
> +
> +#include "avx512er-check.h"
> +
> +#define MAX 1000
> +#define EPS 0.1
> +
> +__attribute__ ((noinline, optimize (0)))
> +void static
> +compute_rcp_ref (float *a, float *b, float *r)
> +{
> +  for (int i = 0; i < MAX; i++)
> +r[i] = a[i] / b[i];
> +}
> +
> +__attribute__ ((noinline))
> +void static
> +compute_rcp_exp (float *a, float *b, float *r)
> +{
> +  for (int i = 0; i < MAX; i++)
> +r[i] = a[i] / b[i];
> +}
> +
> +void static
> +avx512er_test (void)
> +{
> +  float a[MAX];
> +  float b[MAX];
> +  float ref[MAX];
> +  float exp[MAX];
> +
> +  for (int i = 0; i < MAX; i++)
> +{
> +  a[i] = 179.345 - 6.5645 * i;
> +  b[i] = 8765.987 - 8.6756 * i;
> +}
> +
> +  compute_rcp_ref (a, b, ref);
> +  compute_rcp_exp (a, b, exp);
> +
> +  for (int i = 0; i < MAX; i++)
> +{
> +  float rel_err = (ref[i] - exp[i]) / ref[i];
> +  rel_err = rel_err > 0.0 ? rel_err : -rel_err;
> +  if (rel_err > EPS)
> +   abort ();
> +}
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/avx512er-vrcp28ps-4.c 
> b/gcc/testsuite/gcc.target/i386/avx512er-vrcp28ps-4.c
> new file mode 100644
> index 000..2c76d96
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512er-vrcp28ps-4.c
> @@ -0,0 +1,6 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -ffast-math -ftree-vectorize -mavx512er" } */
> +
> +#include "avx512er-vrcp28ps-3.c"
> +
> +/* { dg-final { scan-assembler-times "vrcp28ps\[^\n\r\]*zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
>
>
>   -- Ilya


[PATCH, i386, AVX-512ER] vrcp28ps auto generation

2016-06-20 Thread Ilya Verbin
Hi!

This patch emits vrcp28ps and vmulps instructions for ix86_emit_swdivsf.
The relative error is < 2^-23, so no additional iteration is necessary.
Regtested using various benchmarks on an AVX-512ER machine.  OK for trunk?


gcc/
* config/i386/i386.c (ix86_emit_swdivsf): Emit vrcp28ps.
gcc/testsuite/
* gcc.target/i386/avx512er-vrcp28ps-3.c: New test.
* gcc.target/i386/avx512er-vrcp28ps-4.c: New test.


diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 56a5b9c..8e0bf26 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -48674,8 +48674,19 @@ void ix86_emit_swdivsf (rtx res, rtx a, rtx b, 
machine_mode mode)
 
   /* x0 = rcp(b) estimate */
   if (mode == V16SFmode || mode == V8DFmode)
-emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
-   UNSPEC_RCP14)));
+{
+  if (TARGET_AVX512ER)
+   {
+ emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
+ UNSPEC_RCP28)));
+ /* res = a * x0 */
+ emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
+ return;
+   }
+  else
+   emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
+   UNSPEC_RCP14)));
+}
   else
 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
UNSPEC_RCP)));
diff --git a/gcc/testsuite/gcc.target/i386/avx512er-vrcp28ps-3.c 
b/gcc/testsuite/gcc.target/i386/avx512er-vrcp28ps-3.c
new file mode 100644
index 000..e08bea4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512er-vrcp28ps-3.c
@@ -0,0 +1,50 @@
+/* { dg-do run } */
+/* { dg-require-effective-target avx512er } */
+/* { dg-options "-O2 -ffast-math -ftree-vectorize -mavx512er" } */
+
+#include "avx512er-check.h"
+
+#define MAX 1000
+#define EPS 0.1
+
+__attribute__ ((noinline, optimize (0)))
+void static
+compute_rcp_ref (float *a, float *b, float *r)
+{
+  for (int i = 0; i < MAX; i++)
+r[i] = a[i] / b[i];
+}
+
+__attribute__ ((noinline))
+void static
+compute_rcp_exp (float *a, float *b, float *r)
+{
+  for (int i = 0; i < MAX; i++)
+r[i] = a[i] / b[i];
+}
+
+void static
+avx512er_test (void)
+{
+  float a[MAX];
+  float b[MAX];
+  float ref[MAX];
+  float exp[MAX];
+
+  for (int i = 0; i < MAX; i++)
+{
+  a[i] = 179.345 - 6.5645 * i;
+  b[i] = 8765.987 - 8.6756 * i;
+}
+
+  compute_rcp_ref (a, b, ref);
+  compute_rcp_exp (a, b, exp);
+
+  for (int i = 0; i < MAX; i++)
+{
+  float rel_err = (ref[i] - exp[i]) / ref[i];
+  rel_err = rel_err > 0.0 ? rel_err : -rel_err;
+  if (rel_err > EPS)
+   abort ();
+}
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512er-vrcp28ps-4.c 
b/gcc/testsuite/gcc.target/i386/avx512er-vrcp28ps-4.c
new file mode 100644
index 000..2c76d96
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512er-vrcp28ps-4.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ffast-math -ftree-vectorize -mavx512er" } */
+
+#include "avx512er-vrcp28ps-3.c"
+
+/* { dg-final { scan-assembler-times "vrcp28ps\[^\n\r\]*zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */


  -- Ilya