Re: [PATCH] AVX512FP16: Optimize _Float16 reciprocal for div and sqrt
On Tue, Oct 26, 2021 at 5:51 PM Hongyu Wang via Gcc-patches wrote: > > Hi, > > For the _Float16 type, add insn and expanders to optimize x / y to > x * rcp (y), and x / sqrt (y) to x * rsqrt (y). > As half-precision floats have only a minor precision difference between div and > mul * rcp, there is no need for Newton-Raphson approximation. > > Bootstrapped/regtested on x86_64-pc-linux-gnu{-m32,} and sde. > Ok for master? Ok. > > gcc/ChangeLog: > > * config/i386/i386.c (use_rsqrt_p): Add mode parameter, enable > HFmode rsqrt without TARGET_SSE_MATH. > (ix86_optab_supported_p): Refactor rint, adjust floor, ceil, > btrunc condition to be restricted by -ftrapping-math, adjust > use_rsqrt_p function call. > * config/i386/i386.md (rcphf2): New define_insn. > (rsqrthf2): Likewise. > * config/i386/sse.md (div3): Change VF2H to VF2. > (div3): New expander for HF mode. > (rsqrt2): Likewise. > (*avx512fp16_vmrcpv8hf2): New define_insn for rpad pass. > (*avx512fp16_vmrsqrtv8hf2): Likewise. > > gcc/testsuite/ChangeLog: > > * gcc.target/i386/avx512fp16-recip-1.c: New test. > * gcc.target/i386/avx512fp16-recip-2.c: Ditto. > * gcc.target/i386/pr102464.c: Add -fno-trapping-math. > --- > gcc/config/i386/i386.c| 29 +++--- > gcc/config/i386/i386.md | 44 - > gcc/config/i386/sse.md| 63 +++- > .../gcc.target/i386/avx512fp16-recip-1.c | 43 > .../gcc.target/i386/avx512fp16-recip-2.c | 97 +++ > gcc/testsuite/gcc.target/i386/pr102464.c | 2 +- > 6 files changed, 258 insertions(+), 20 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-recip-1.c > create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-recip-2.c > > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c > index 299e1ab2621..c5789365d3b 100644 > --- a/gcc/config/i386/i386.c > +++ b/gcc/config/i386/i386.c > @@ -18905,9 +18905,10 @@ ix86_vectorize_builtin_scatter (const_tree vectype, > 1.0/sqrt. 
*/ > > static bool > -use_rsqrt_p () > +use_rsqrt_p (machine_mode mode) > { > - return (TARGET_SSE && TARGET_SSE_MATH > + return ((mode == HFmode > + || (TARGET_SSE && TARGET_SSE_MATH)) > && flag_finite_math_only > && !flag_trapping_math > && flag_unsafe_math_optimizations); > @@ -23603,29 +23604,27 @@ ix86_optab_supported_p (int op, machine_mode mode1, > machine_mode, >return opt_type == OPTIMIZE_FOR_SPEED; > > case rint_optab: > - if (mode1 == HFmode) > - return true; > - else if (SSE_FLOAT_MODE_P (mode1) > - && TARGET_SSE_MATH > - && !flag_trapping_math > - && !TARGET_SSE4_1) > + if (SSE_FLOAT_MODE_P (mode1) > + && TARGET_SSE_MATH > + && !flag_trapping_math > + && !TARGET_SSE4_1 > + && mode1 != HFmode) > return opt_type == OPTIMIZE_FOR_SPEED; >return true; > > case floor_optab: > case ceil_optab: > case btrunc_optab: > - if (mode1 == HFmode) > - return true; > - else if (SSE_FLOAT_MODE_P (mode1) > - && TARGET_SSE_MATH > - && !flag_trapping_math > - && TARGET_SSE4_1) > + if (((SSE_FLOAT_MODE_P (mode1) > + && TARGET_SSE_MATH > + && TARGET_SSE4_1) > + || mode1 == HFmode) > + && !flag_trapping_math) > return true; >return opt_type == OPTIMIZE_FOR_SPEED; > > case rsqrt_optab: > - return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (); > + return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1); > > default: >return true; > diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md > index e733a40fc90..11535df5425 100644 > --- a/gcc/config/i386/i386.md > +++ b/gcc/config/i386/i386.md > @@ -8417,11 +8417,27 @@ > (match_operand:XF 2 "register_operand")))] >"TARGET_80387") > > +/* There is no more precision loss than Newton-Rhapson approximation > + when using HFmode rcp/rsqrt, so do the transformation directly under > + TARGET_RECIP_DIV and fast-math. 
*/ > (define_expand "divhf3" >[(set (match_operand:HF 0 "register_operand") > (div:HF (match_operand:HF 1 "register_operand") >(match_operand:HF 2 "nonimmediate_operand")))] > - "TARGET_AVX512FP16") > + "TARGET_AVX512FP16" > +{ > + if (TARGET_RECIP_DIV > + && optimize_insn_for_speed_p () > + && flag_finite_math_only && !flag_trapping_math > + && flag_unsafe_math_optimizations) > +{ > + rtx op = gen_reg_rtx (HFmode); > + operands[2] = force_reg (HFmode, operands[2]); > + emit_insn (gen_rcphf2 (op, operands[2])); > + emit_insn (gen_mulhf3 (operands[0], operands[1], op)); > + DONE; > +} > +}) > > (define_expand "div3" >[(set (match
[PATCH] AVX512FP16: Optimize _Float16 reciprocal for div and sqrt
Hi, For the _Float16 type, add insn and expanders to optimize x / y to x * rcp (y), and x / sqrt (y) to x * rsqrt (y). As half-precision floats have only a minor precision difference between div and mul * rcp, there is no need for Newton-Raphson approximation. Bootstrapped/regtested on x86_64-pc-linux-gnu{-m32,} and sde. Ok for master? gcc/ChangeLog: * config/i386/i386.c (use_rsqrt_p): Add mode parameter, enable HFmode rsqrt without TARGET_SSE_MATH. (ix86_optab_supported_p): Refactor rint, adjust floor, ceil, btrunc condition to be restricted by -ftrapping-math, adjust use_rsqrt_p function call. * config/i386/i386.md (rcphf2): New define_insn. (rsqrthf2): Likewise. * config/i386/sse.md (div3): Change VF2H to VF2. (div3): New expander for HF mode. (rsqrt2): Likewise. (*avx512fp16_vmrcpv8hf2): New define_insn for rpad pass. (*avx512fp16_vmrsqrtv8hf2): Likewise. gcc/testsuite/ChangeLog: * gcc.target/i386/avx512fp16-recip-1.c: New test. * gcc.target/i386/avx512fp16-recip-2.c: Ditto. * gcc.target/i386/pr102464.c: Add -fno-trapping-math. --- gcc/config/i386/i386.c| 29 +++--- gcc/config/i386/i386.md | 44 - gcc/config/i386/sse.md| 63 +++- .../gcc.target/i386/avx512fp16-recip-1.c | 43 .../gcc.target/i386/avx512fp16-recip-2.c | 97 +++ gcc/testsuite/gcc.target/i386/pr102464.c | 2 +- 6 files changed, 258 insertions(+), 20 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-recip-1.c create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-recip-2.c diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 299e1ab2621..c5789365d3b 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -18905,9 +18905,10 @@ ix86_vectorize_builtin_scatter (const_tree vectype, 1.0/sqrt. 
*/ static bool -use_rsqrt_p () +use_rsqrt_p (machine_mode mode) { - return (TARGET_SSE && TARGET_SSE_MATH + return ((mode == HFmode + || (TARGET_SSE && TARGET_SSE_MATH)) && flag_finite_math_only && !flag_trapping_math && flag_unsafe_math_optimizations); @@ -23603,29 +23604,27 @@ ix86_optab_supported_p (int op, machine_mode mode1, machine_mode, return opt_type == OPTIMIZE_FOR_SPEED; case rint_optab: - if (mode1 == HFmode) - return true; - else if (SSE_FLOAT_MODE_P (mode1) - && TARGET_SSE_MATH - && !flag_trapping_math - && !TARGET_SSE4_1) + if (SSE_FLOAT_MODE_P (mode1) + && TARGET_SSE_MATH + && !flag_trapping_math + && !TARGET_SSE4_1 + && mode1 != HFmode) return opt_type == OPTIMIZE_FOR_SPEED; return true; case floor_optab: case ceil_optab: case btrunc_optab: - if (mode1 == HFmode) - return true; - else if (SSE_FLOAT_MODE_P (mode1) - && TARGET_SSE_MATH - && !flag_trapping_math - && TARGET_SSE4_1) + if (((SSE_FLOAT_MODE_P (mode1) + && TARGET_SSE_MATH + && TARGET_SSE4_1) + || mode1 == HFmode) + && !flag_trapping_math) return true; return opt_type == OPTIMIZE_FOR_SPEED; case rsqrt_optab: - return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (); + return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1); default: return true; diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index e733a40fc90..11535df5425 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -8417,11 +8417,27 @@ (match_operand:XF 2 "register_operand")))] "TARGET_80387") +/* There is no more precision loss than Newton-Rhapson approximation + when using HFmode rcp/rsqrt, so do the transformation directly under + TARGET_RECIP_DIV and fast-math. 
*/ (define_expand "divhf3" [(set (match_operand:HF 0 "register_operand") (div:HF (match_operand:HF 1 "register_operand") (match_operand:HF 2 "nonimmediate_operand")))] - "TARGET_AVX512FP16") + "TARGET_AVX512FP16" +{ + if (TARGET_RECIP_DIV + && optimize_insn_for_speed_p () + && flag_finite_math_only && !flag_trapping_math + && flag_unsafe_math_optimizations) +{ + rtx op = gen_reg_rtx (HFmode); + operands[2] = force_reg (HFmode, operands[2]); + emit_insn (gen_rcphf2 (op, operands[2])); + emit_insn (gen_mulhf3 (operands[0], operands[1], op)); + DONE; +} +}) (define_expand "div3" [(set (match_operand:MODEF 0 "register_operand") @@ -16973,6 +16989,19 @@ ] (symbol_ref "true")))]) +(define_insn "rcphf2" + [(set (match_operand:HF 0 "register_operand" "=v,v") + (unspec:HF [(match_operand:HF 1 "nonimmediate_operand" "v,m")] + UNSPEC_RCP))] + "TARGET_AVX512FP16"