Also introduce -m[no-]mmxfp-with-sse option to disable trapping V2SF named patterns in order to avoid generation of partial vector V4SFmode trapping instructions.
The new option is enabled by default, because even with sanitization, a small but consistent speedup of 2 to 3% with the Polyhedron capacita benchmark can be achieved vs. scalar code. Using -fno-trapping-math improves Polyhedron capacita runtime by 8 to 9% vs. scalar code. This is what clang does by default, as it defaults to -fno-trapping-math.

	PR target/110832

gcc/ChangeLog:

	* config/i386/i386.h (TARGET_MMXFP_WITH_SSE): New macro.
	* config/i386/i386.opt (mmmxfp-with-sse): New option.
	* config/i386/mmx.md (movq_<mode>_to_sse): Do not sanitize
	upper part of V2SFmode register with -fno-trapping-math.
	(<plusminusmult:insn>v2sf3): Enable for TARGET_MMXFP_WITH_SSE.
	(divv2sf3): Ditto.
	(<smaxmin:code>v2sf3): Ditto.
	(sqrtv2sf2): Ditto.
	(*mmx_haddv2sf3_low): Ditto.
	(*mmx_hsubv2sf3_low): Ditto.
	(vec_addsubv2sf3): Ditto.
	(vec_cmpv2sfv2si): Ditto.
	(vcond<V2FI:mode>v2sf): Ditto.
	(fmav2sf4): Ditto.
	(fmsv2sf4): Ditto.
	(fnmav2sf4): Ditto.
	(fnmsv2sf4): Ditto.
	(fix_truncv2sfv2si2): Ditto.
	(fixuns_truncv2sfv2si2): Ditto.
	(floatv2siv2sf2): Ditto.
	(floatunsv2siv2sf2): Ditto.
	(nearbyintv2sf2): Ditto.
	(rintv2sf2): Ditto.
	(lrintv2sfv2si2): Ditto.
	(ceilv2sf2): Ditto.
	(lceilv2sfv2si2): Ditto.
	(floorv2sf2): Ditto.
	(lfloorv2sfv2si2): Ditto.
	(btruncv2sf2): Ditto.
	(roundv2sf2): Ditto.
	(lroundv2sfv2si2): Ditto.

Bootstrapped and regression tested on x86_64-linux-gnu {,-m32}.

Uros.
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index ef342fcee9b..af72b6c48a9 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -50,6 +50,7 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see #define TARGET_16BIT_P(x) TARGET_CODE16_P(x) #define TARGET_MMX_WITH_SSE (TARGET_64BIT && TARGET_SSE2) +#define TARGET_MMXFP_WITH_SSE (TARGET_MMX_WITH_SSE && ix86_mmxfp_with_sse) #include "config/vxworks-dummy.h" diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt index 1cc8563477a..1b65fed5daf 100644 --- a/gcc/config/i386/i386.opt +++ b/gcc/config/i386/i386.opt @@ -670,6 +670,10 @@ m3dnowa Target Mask(ISA_3DNOW_A) Var(ix86_isa_flags) Save Support Athlon 3Dnow! built-in functions. +mmmxfp-with-sse +Target Var(ix86_mmxfp_with_sse) Init(1) +Enable MMX floating point vectors in SSE registers + msse Target Mask(ISA_SSE) Var(ix86_isa_flags) Save Support MMX and SSE built-in functions and code generation. diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md index 896af76a33f..0555da9022b 100644 --- a/gcc/config/i386/mmx.md +++ b/gcc/config/i386/mmx.md @@ -597,7 +597,18 @@ (define_expand "movq_<mode>_to_sse" (match_operand:V2FI 1 "nonimmediate_operand") (match_dup 2)))] "TARGET_SSE2" - "operands[2] = CONST0_RTX (<MODE>mode);") +{ + if (<MODE>mode == V2SFmode + && !flag_trapping_math) + { + rtx op1 = force_reg (<MODE>mode, operands[1]); + emit_move_insn (operands[0], lowpart_subreg (<mmxdoublevecmode>mode, + op1, <MODE>mode)); + DONE; + } + + operands[2] = CONST0_RTX (<MODE>mode); +}) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; @@ -650,7 +661,7 @@ (define_expand "<insn>v2sf3" (plusminusmult:V2SF (match_operand:V2SF 1 "nonimmediate_operand") (match_operand:V2SF 2 "nonimmediate_operand")))] - "TARGET_MMX_WITH_SSE" + "TARGET_MMXFP_WITH_SSE" { rtx op2 = gen_reg_rtx (V4SFmode); rtx op1 = gen_reg_rtx (V4SFmode); @@ -728,7 +739,7 @@ (define_expand "divv2sf3" [(set (match_operand:V2SF 0 
"register_operand") (div:V2SF (match_operand:V2SF 1 "register_operand") (match_operand:V2SF 2 "register_operand")))] - "TARGET_MMX_WITH_SSE" + "TARGET_MMXFP_WITH_SSE" { rtx op2 = gen_reg_rtx (V4SFmode); rtx op1 = gen_reg_rtx (V4SFmode); @@ -750,7 +761,7 @@ (define_expand "<code>v2sf3" (smaxmin:V2SF (match_operand:V2SF 1 "register_operand") (match_operand:V2SF 2 "register_operand")))] - "TARGET_MMX_WITH_SSE" + "TARGET_MMXFP_WITH_SSE" { rtx op2 = gen_reg_rtx (V4SFmode); rtx op1 = gen_reg_rtx (V4SFmode); @@ -852,7 +863,7 @@ (define_insn "mmx_rcpit2v2sf3" (define_expand "sqrtv2sf2" [(set (match_operand:V2SF 0 "register_operand") (sqrt:V2SF (match_operand:V2SF 1 "nonimmediate_operand")))] - "TARGET_MMX_WITH_SSE" + "TARGET_MMXFP_WITH_SSE" { rtx op1 = gen_reg_rtx (V4SFmode); rtx op0 = gen_reg_rtx (V4SFmode); @@ -933,7 +944,7 @@ (define_insn_and_split "*mmx_haddv2sf3_low" (vec_select:SF (match_dup 1) (parallel [(match_operand:SI 3 "const_0_to_1_operand")]))))] - "TARGET_SSE3 && TARGET_MMX_WITH_SSE + "TARGET_SSE3 && TARGET_MMXFP_WITH_SSE && INTVAL (operands[2]) != INTVAL (operands[3]) && ix86_pre_reload_split ()" "#" @@ -979,7 +990,7 @@ (define_insn_and_split "*mmx_hsubv2sf3_low" (vec_select:SF (match_dup 1) (parallel [(const_int 1)]))))] - "TARGET_SSE3 && TARGET_MMX_WITH_SSE + "TARGET_SSE3 && TARGET_MMXFP_WITH_SSE && ix86_pre_reload_split ()" "#" "&& 1" @@ -1041,7 +1052,7 @@ (define_expand "vec_addsubv2sf3" (match_operand:V2SF 2 "nonimmediate_operand")) (plus:V2SF (match_dup 1) (match_dup 2)) (const_int 1)))] - "TARGET_SSE3 && TARGET_MMX_WITH_SSE" + "TARGET_SSE3 && TARGET_MMXFP_WITH_SSE" { rtx op2 = gen_reg_rtx (V4SFmode); rtx op1 = gen_reg_rtx (V4SFmode); @@ -1104,7 +1115,7 @@ (define_expand "vec_cmpv2sfv2si" (match_operator:V2SI 1 "" [(match_operand:V2SF 2 "nonimmediate_operand") (match_operand:V2SF 3 "nonimmediate_operand")]))] - "TARGET_MMX_WITH_SSE" + "TARGET_MMXFP_WITH_SSE" { rtx ops[4]; ops[3] = gen_reg_rtx (V4SFmode); @@ -1130,7 +1141,7 @@ (define_expand 
"vcond<mode>v2sf" (match_operand:V2SF 5 "nonimmediate_operand")]) (match_operand:V2FI 1 "general_operand") (match_operand:V2FI 2 "general_operand")))] - "TARGET_MMX_WITH_SSE" + "TARGET_MMXFP_WITH_SSE" { rtx ops[6]; ops[5] = gen_reg_rtx (V4SFmode); @@ -1320,7 +1331,7 @@ (define_expand "fmav2sf4" (match_operand:V2SF 2 "nonimmediate_operand") (match_operand:V2SF 3 "nonimmediate_operand")))] "(TARGET_FMA || TARGET_FMA4 || TARGET_AVX512VL) - && TARGET_MMX_WITH_SSE" + && TARGET_MMXFP_WITH_SSE" { rtx op3 = gen_reg_rtx (V4SFmode); rtx op2 = gen_reg_rtx (V4SFmode); @@ -1345,7 +1356,7 @@ (define_expand "fmsv2sf4" (neg:V2SF (match_operand:V2SF 3 "nonimmediate_operand"))))] "(TARGET_FMA || TARGET_FMA4 || TARGET_AVX512VL) - && TARGET_MMX_WITH_SSE" + && TARGET_MMXFP_WITH_SSE" { rtx op3 = gen_reg_rtx (V4SFmode); rtx op2 = gen_reg_rtx (V4SFmode); @@ -1370,7 +1381,7 @@ (define_expand "fnmav2sf4" (match_operand:V2SF 2 "nonimmediate_operand") (match_operand:V2SF 3 "nonimmediate_operand")))] "(TARGET_FMA || TARGET_FMA4 || TARGET_AVX512VL) - && TARGET_MMX_WITH_SSE" + && TARGET_MMXFP_WITH_SSE" { rtx op3 = gen_reg_rtx (V4SFmode); rtx op2 = gen_reg_rtx (V4SFmode); @@ -1396,7 +1407,7 @@ (define_expand "fnmsv2sf4" (neg:V2SF (match_operand:V2SF 3 "nonimmediate_operand"))))] "(TARGET_FMA || TARGET_FMA4 || TARGET_AVX512VL) - && TARGET_MMX_WITH_SSE" + && TARGET_MMXFP_WITH_SSE" { rtx op3 = gen_reg_rtx (V4SFmode); rtx op2 = gen_reg_rtx (V4SFmode); @@ -1422,7 +1433,7 @@ (define_expand "fnmsv2sf4" (define_expand "fix_truncv2sfv2si2" [(set (match_operand:V2SI 0 "register_operand") (fix:V2SI (match_operand:V2SF 1 "nonimmediate_operand")))] - "TARGET_MMX_WITH_SSE" + "TARGET_MMXFP_WITH_SSE" { rtx op1 = gen_reg_rtx (V4SFmode); rtx op0 = gen_reg_rtx (V4SImode); @@ -1438,7 +1449,7 @@ (define_expand "fix_truncv2sfv2si2" (define_expand "fixuns_truncv2sfv2si2" [(set (match_operand:V2SI 0 "register_operand") (unsigned_fix:V2SI (match_operand:V2SF 1 "nonimmediate_operand")))] - "TARGET_AVX512VL && 
TARGET_MMX_WITH_SSE" + "TARGET_AVX512VL && TARGET_MMXFP_WITH_SSE" { rtx op1 = gen_reg_rtx (V4SFmode); rtx op0 = gen_reg_rtx (V4SImode); @@ -1463,7 +1474,7 @@ (define_insn "mmx_fix_truncv2sfv2si2" (define_expand "floatv2siv2sf2" [(set (match_operand:V2SF 0 "register_operand") (float:V2SF (match_operand:V2SI 1 "nonimmediate_operand")))] - "TARGET_MMX_WITH_SSE" + "TARGET_MMXFP_WITH_SSE" { rtx op1 = gen_reg_rtx (V4SImode); rtx op0 = gen_reg_rtx (V4SFmode); @@ -1479,7 +1490,7 @@ (define_expand "floatv2siv2sf2" (define_expand "floatunsv2siv2sf2" [(set (match_operand:V2SF 0 "register_operand") (unsigned_float:V2SF (match_operand:V2SI 1 "nonimmediate_operand")))] - "TARGET_AVX512VL && TARGET_MMX_WITH_SSE" + "TARGET_AVX512VL && TARGET_MMXFP_WITH_SSE" { rtx op1 = gen_reg_rtx (V4SImode); rtx op0 = gen_reg_rtx (V4SFmode); @@ -1756,7 +1767,7 @@ (define_expand "vec_initv2sfsf" (define_expand "nearbyintv2sf2" [(match_operand:V2SF 0 "register_operand") (match_operand:V2SF 1 "nonimmediate_operand")] - "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE" + "TARGET_SSE4_1 && TARGET_MMXFP_WITH_SSE" { rtx op1 = gen_reg_rtx (V4SFmode); rtx op0 = gen_reg_rtx (V4SFmode); @@ -1772,7 +1783,7 @@ (define_expand "nearbyintv2sf2" (define_expand "rintv2sf2" [(match_operand:V2SF 0 "register_operand") (match_operand:V2SF 1 "nonimmediate_operand")] - "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE" + "TARGET_SSE4_1 && TARGET_MMXFP_WITH_SSE" { rtx op1 = gen_reg_rtx (V4SFmode); rtx op0 = gen_reg_rtx (V4SFmode); @@ -1788,8 +1799,8 @@ (define_expand "rintv2sf2" (define_expand "lrintv2sfv2si2" [(match_operand:V2SI 0 "register_operand") (match_operand:V2SF 1 "nonimmediate_operand")] - "TARGET_SSE4_1 && !flag_trapping_math - && TARGET_MMX_WITH_SSE" + "TARGET_SSE4_1 && !flag_trapping_math + && TARGET_MMXFP_WITH_SSE" { rtx op1 = gen_reg_rtx (V4SFmode); rtx op0 = gen_reg_rtx (V4SImode); @@ -1806,7 +1817,7 @@ (define_expand "ceilv2sf2" [(match_operand:V2SF 0 "register_operand") (match_operand:V2SF 1 "nonimmediate_operand")] 
"TARGET_SSE4_1 && !flag_trapping_math - && TARGET_MMX_WITH_SSE" + && TARGET_MMXFP_WITH_SSE" { rtx op1 = gen_reg_rtx (V4SFmode); rtx op0 = gen_reg_rtx (V4SFmode); @@ -1822,8 +1833,8 @@ (define_expand "ceilv2sf2" (define_expand "lceilv2sfv2si2" [(match_operand:V2SI 0 "register_operand") (match_operand:V2SF 1 "nonimmediate_operand")] - "TARGET_SSE4_1 && !flag_trapping_math - && TARGET_MMX_WITH_SSE" + "TARGET_SSE4_1 && !flag_trapping_math + && TARGET_MMXFP_WITH_SSE" { rtx op1 = gen_reg_rtx (V4SFmode); rtx op0 = gen_reg_rtx (V4SImode); @@ -1840,7 +1851,7 @@ (define_expand "floorv2sf2" [(match_operand:V2SF 0 "register_operand") (match_operand:V2SF 1 "nonimmediate_operand")] "TARGET_SSE4_1 && !flag_trapping_math - && TARGET_MMX_WITH_SSE" + && TARGET_MMXFP_WITH_SSE" { rtx op1 = gen_reg_rtx (V4SFmode); rtx op0 = gen_reg_rtx (V4SFmode); @@ -1856,8 +1867,8 @@ (define_expand "floorv2sf2" (define_expand "lfloorv2sfv2si2" [(match_operand:V2SI 0 "register_operand") (match_operand:V2SF 1 "nonimmediate_operand")] - "TARGET_SSE4_1 && !flag_trapping_math - && TARGET_MMX_WITH_SSE" + "TARGET_SSE4_1 && !flag_trapping_math + && TARGET_MMXFP_WITH_SSE" { rtx op1 = gen_reg_rtx (V4SFmode); rtx op0 = gen_reg_rtx (V4SImode); @@ -1874,7 +1885,7 @@ (define_expand "btruncv2sf2" [(match_operand:V2SF 0 "register_operand") (match_operand:V2SF 1 "nonimmediate_operand")] "TARGET_SSE4_1 && !flag_trapping_math - && TARGET_MMX_WITH_SSE" + && TARGET_MMXFP_WITH_SSE" { rtx op1 = gen_reg_rtx (V4SFmode); rtx op0 = gen_reg_rtx (V4SFmode); @@ -1891,7 +1902,7 @@ (define_expand "roundv2sf2" [(match_operand:V2SF 0 "register_operand") (match_operand:V2SF 1 "nonimmediate_operand")] "TARGET_SSE4_1 && !flag_trapping_math - && TARGET_MMX_WITH_SSE" + && TARGET_MMXFP_WITH_SSE" { rtx op1 = gen_reg_rtx (V4SFmode); rtx op0 = gen_reg_rtx (V4SFmode); @@ -1907,8 +1918,8 @@ (define_expand "roundv2sf2" (define_expand "lroundv2sfv2si2" [(match_operand:V2SI 0 "register_operand") (match_operand:V2SF 1 "nonimmediate_operand")] - 
"TARGET_SSE4_1 && !flag_trapping_math - && TARGET_MMX_WITH_SSE" + "TARGET_SSE4_1 && !flag_trapping_math + && TARGET_MMXFP_WITH_SSE" { rtx op1 = gen_reg_rtx (V4SFmode); rtx op0 = gen_reg_rtx (V4SImode);