Re: [PATCH] AVX512FP16: Optimize _Float16 reciprocal for div and sqrt

2021-10-27 Thread Hongtao Liu via Gcc-patches
On Tue, Oct 26, 2021 at 5:51 PM Hongyu Wang via Gcc-patches
 wrote:
>
> Hi,
>
> For _Float16 type, add insn and expanders to optimize x / y to
> x * rcp (y), and x / sqrt (y) to x * rsqrt (y).
> As half-precision floats have only a minor precision difference between
> div and mul * rcp, there is no need for Newton-Raphson approximation.
>
> Bootstrapped/regtested on x86_64-pc-linux-gnu{-m32,} and sde.
> Ok for master?
Ok.
>
> gcc/ChangeLog:
>
> * config/i386/i386.c (use_rsqrt_p): Add mode parameter, enable
>   HFmode rsqrt without TARGET_SSE_MATH.
> (ix86_optab_supported_p): Refactor rint, adjust floor, ceil,
> btrunc condition to be restricted by -ftrapping-math, adjust
> use_rsqrt_p function call.
> * config/i386/i386.md (rcphf2): New define_insn.
> (rsqrthf2): Likewise.
> * config/i386/sse.md (div<mode>3): Change VF2H to VF2.
> (div<mode>3): New expander for HF mode.
> (rsqrt<mode>2): Likewise.
> (*avx512fp16_vmrcpv8hf2): New define_insn for rpad pass.
> (*avx512fp16_vmrsqrtv8hf2): Likewise.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/i386/avx512fp16-recip-1.c: New test.
> * gcc.target/i386/avx512fp16-recip-2.c: Ditto.
> * gcc.target/i386/pr102464.c: Add -fno-trapping-math.
> ---
>  gcc/config/i386/i386.c| 29 +++---
>  gcc/config/i386/i386.md   | 44 -
>  gcc/config/i386/sse.md| 63 +++-
>  .../gcc.target/i386/avx512fp16-recip-1.c  | 43 
>  .../gcc.target/i386/avx512fp16-recip-2.c  | 97 +++
>  gcc/testsuite/gcc.target/i386/pr102464.c  |  2 +-
>  6 files changed, 258 insertions(+), 20 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-recip-1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-recip-2.c
>
> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> index 299e1ab2621..c5789365d3b 100644
> --- a/gcc/config/i386/i386.c
> +++ b/gcc/config/i386/i386.c
> @@ -18905,9 +18905,10 @@ ix86_vectorize_builtin_scatter (const_tree vectype,
> 1.0/sqrt.  */
>
>  static bool
> -use_rsqrt_p ()
> +use_rsqrt_p (machine_mode mode)
>  {
> -  return (TARGET_SSE && TARGET_SSE_MATH
> +  return ((mode == HFmode
> +  || (TARGET_SSE && TARGET_SSE_MATH))
>   && flag_finite_math_only
>   && !flag_trapping_math
>   && flag_unsafe_math_optimizations);
> @@ -23603,29 +23604,27 @@ ix86_optab_supported_p (int op, machine_mode mode1, 
> machine_mode,
>return opt_type == OPTIMIZE_FOR_SPEED;
>
>  case rint_optab:
> -  if (mode1 == HFmode)
> -   return true;
> -  else if (SSE_FLOAT_MODE_P (mode1)
> -  && TARGET_SSE_MATH
> -  && !flag_trapping_math
> -  && !TARGET_SSE4_1)
> +  if (SSE_FLOAT_MODE_P (mode1)
> + && TARGET_SSE_MATH
> + && !flag_trapping_math
> + && !TARGET_SSE4_1
> + && mode1 != HFmode)
> return opt_type == OPTIMIZE_FOR_SPEED;
>return true;
>
>  case floor_optab:
>  case ceil_optab:
>  case btrunc_optab:
> -  if (mode1 == HFmode)
> -   return true;
> -  else if (SSE_FLOAT_MODE_P (mode1)
> -  && TARGET_SSE_MATH
> -  && !flag_trapping_math
> -  && TARGET_SSE4_1)
> +  if (((SSE_FLOAT_MODE_P (mode1)
> +   && TARGET_SSE_MATH
> +   && TARGET_SSE4_1)
> +  || mode1 == HFmode)
> + && !flag_trapping_math)
> return true;
>return opt_type == OPTIMIZE_FOR_SPEED;
>
>  case rsqrt_optab:
> -  return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p ();
> +  return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
>
>  default:
>return true;
> diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
> index e733a40fc90..11535df5425 100644
> --- a/gcc/config/i386/i386.md
> +++ b/gcc/config/i386/i386.md
> @@ -8417,11 +8417,27 @@
> (match_operand:XF 2 "register_operand")))]
>"TARGET_80387")
>
> +/* There is no more precision loss than Newton-Raphson approximation
> +  when using HFmode rcp/rsqrt, so do the transformation directly under
> +  TARGET_RECIP_DIV and fast-math.  */
>  (define_expand "divhf3"
>[(set (match_operand:HF 0 "register_operand")
> (div:HF (match_operand:HF 1 "register_operand")
>(match_operand:HF 2 "nonimmediate_operand")))]
> -  "TARGET_AVX512FP16")
> +  "TARGET_AVX512FP16"
> +{
> +  if (TARGET_RECIP_DIV
> +  && optimize_insn_for_speed_p ()
> +  && flag_finite_math_only && !flag_trapping_math
> +  && flag_unsafe_math_optimizations)
> +{
> +  rtx op = gen_reg_rtx (HFmode);
> +  operands[2] = force_reg (HFmode, operands[2]);
> +  emit_insn (gen_rcphf2 (op, operands[2]));
> +  emit_insn (gen_mulhf3 (operands[0], operands[1], op));
> +  DONE;
> +}
> +})
>
>  (define_expand "div<mode>3"
>[(set (match

[PATCH] AVX512FP16: Optimize _Float16 reciprocal for div and sqrt

2021-10-26 Thread Hongyu Wang via Gcc-patches
Hi,

For _Float16 type, add insn and expanders to optimize x / y to
x * rcp (y), and x / sqrt (y) to x * rsqrt (y).
As half-precision floats have only a minor precision difference between
div and mul * rcp, there is no need for Newton-Raphson approximation.

Bootstrapped/regtested on x86_64-pc-linux-gnu{-m32,} and sde.
Ok for master?

gcc/ChangeLog:

* config/i386/i386.c (use_rsqrt_p): Add mode parameter, enable
  HFmode rsqrt without TARGET_SSE_MATH.
(ix86_optab_supported_p): Refactor rint, adjust floor, ceil,
btrunc condition to be restricted by -ftrapping-math, adjust
use_rsqrt_p function call.
* config/i386/i386.md (rcphf2): New define_insn.
(rsqrthf2): Likewise.
* config/i386/sse.md (div<mode>3): Change VF2H to VF2.
(div<mode>3): New expander for HF mode.
(rsqrt<mode>2): Likewise.
(*avx512fp16_vmrcpv8hf2): New define_insn for rpad pass.
(*avx512fp16_vmrsqrtv8hf2): Likewise.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx512fp16-recip-1.c: New test.
* gcc.target/i386/avx512fp16-recip-2.c: Ditto.
* gcc.target/i386/pr102464.c: Add -fno-trapping-math.
---
 gcc/config/i386/i386.c| 29 +++---
 gcc/config/i386/i386.md   | 44 -
 gcc/config/i386/sse.md| 63 +++-
 .../gcc.target/i386/avx512fp16-recip-1.c  | 43 
 .../gcc.target/i386/avx512fp16-recip-2.c  | 97 +++
 gcc/testsuite/gcc.target/i386/pr102464.c  |  2 +-
 6 files changed, 258 insertions(+), 20 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-recip-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-recip-2.c

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 299e1ab2621..c5789365d3b 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -18905,9 +18905,10 @@ ix86_vectorize_builtin_scatter (const_tree vectype,
1.0/sqrt.  */
 
 static bool
-use_rsqrt_p ()
+use_rsqrt_p (machine_mode mode)
 {
-  return (TARGET_SSE && TARGET_SSE_MATH
+  return ((mode == HFmode
+  || (TARGET_SSE && TARGET_SSE_MATH))
  && flag_finite_math_only
  && !flag_trapping_math
  && flag_unsafe_math_optimizations);
@@ -23603,29 +23604,27 @@ ix86_optab_supported_p (int op, machine_mode mode1, 
machine_mode,
   return opt_type == OPTIMIZE_FOR_SPEED;
 
 case rint_optab:
-  if (mode1 == HFmode)
-   return true;
-  else if (SSE_FLOAT_MODE_P (mode1)
-  && TARGET_SSE_MATH
-  && !flag_trapping_math
-  && !TARGET_SSE4_1)
+  if (SSE_FLOAT_MODE_P (mode1)
+ && TARGET_SSE_MATH
+ && !flag_trapping_math
+ && !TARGET_SSE4_1
+ && mode1 != HFmode)
return opt_type == OPTIMIZE_FOR_SPEED;
   return true;
 
 case floor_optab:
 case ceil_optab:
 case btrunc_optab:
-  if (mode1 == HFmode)
-   return true;
-  else if (SSE_FLOAT_MODE_P (mode1)
-  && TARGET_SSE_MATH
-  && !flag_trapping_math
-  && TARGET_SSE4_1)
+  if (((SSE_FLOAT_MODE_P (mode1)
+   && TARGET_SSE_MATH
+   && TARGET_SSE4_1)
+  || mode1 == HFmode)
+ && !flag_trapping_math)
return true;
   return opt_type == OPTIMIZE_FOR_SPEED;
 
 case rsqrt_optab:
-  return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p ();
+  return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
 
 default:
   return true;
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index e733a40fc90..11535df5425 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -8417,11 +8417,27 @@
(match_operand:XF 2 "register_operand")))]
   "TARGET_80387")
 
+/* There is no more precision loss than Newton-Raphson approximation
+  when using HFmode rcp/rsqrt, so do the transformation directly under
+  TARGET_RECIP_DIV and fast-math.  */
 (define_expand "divhf3"
   [(set (match_operand:HF 0 "register_operand")
(div:HF (match_operand:HF 1 "register_operand")
   (match_operand:HF 2 "nonimmediate_operand")))]
-  "TARGET_AVX512FP16")
+  "TARGET_AVX512FP16"
+{
+  if (TARGET_RECIP_DIV
+  && optimize_insn_for_speed_p ()
+  && flag_finite_math_only && !flag_trapping_math
+  && flag_unsafe_math_optimizations)
+{
+  rtx op = gen_reg_rtx (HFmode);
+  operands[2] = force_reg (HFmode, operands[2]);
+  emit_insn (gen_rcphf2 (op, operands[2]));
+  emit_insn (gen_mulhf3 (operands[0], operands[1], op));
+  DONE;
+}
+})
 
 (define_expand "div<mode>3"
   [(set (match_operand:MODEF 0 "register_operand")
@@ -16973,6 +16989,19 @@
]
(symbol_ref "true")))])
 
+(define_insn "rcphf2"
+  [(set (match_operand:HF 0 "register_operand" "=v,v")
+   (unspec:HF [(match_operand:HF 1 "nonimmediate_operand" "v,m")]
+  UNSPEC_RCP))]
+  "TARGET_AVX512FP16"