Broadcasting from memory is better than loading a 128-bit vector and then permuting it into a 256-bit vector.
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}. Ready to push to trunk. gcc/ChangeLog: * config/i386/predicates.md (avx_vbroadcast128_operand): New predicate. * config/i386/sse.md (*avx_vbroadcastf128_<mode>_perm): New pre_reload splitter. gcc/testsuite/ChangeLog: * gcc.target/i386/avx_vbroadcastf128.c: New test. --- gcc/config/i386/predicates.md | 19 +++++++++++++++++++ gcc/config/i386/sse.md | 15 +++++++++++++++ .../gcc.target/i386/avx_vbroadcastf128.c | 17 +++++++++++++++++ 3 files changed, 51 insertions(+) create mode 100644 gcc/testsuite/gcc.target/i386/avx_vbroadcastf128.c diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md index 5dbe444847f..57950d31878 100644 --- a/gcc/config/i386/predicates.md +++ b/gcc/config/i386/predicates.md @@ -2077,6 +2077,25 @@ (define_predicate "avx_vbroadcast_operand" return true; }) +;; Return true if OP is a parallel for a vbroadcastf128 permute. +(define_predicate "avx_vbroadcast128_operand" + (and (match_code "parallel") + (match_code "const_int" "a")) +{ + int i, nelt = XVECLEN (op, 0); + int half = nelt / 2; + + for (i = 0; i < nelt; ++i) + { + int index = INTVAL (XVECEXP (op, 0, i)); + if ((i < half && index != i) + || (i >= half && index != (i - half))) + return false; + } + + return true; +}) + ;; Return true if OP is a parallel for a palignr permute. 
(define_predicate "palignr_operand" (and (match_code "parallel") diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index e87c26fcc07..8b28c8edb19 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -28148,6 +28148,21 @@ (define_insn "avx_vbroadcastf128_<mode>" (set_attr "prefix" "vex,vex,vex,evex,evex,evex,evex") (set_attr "mode" "<sseinsnmode>")]) + +(define_insn_and_split "*avx_vbroadcastf128_<mode>_perm" + [(set (match_operand:V_256 0 "register_operand") + (vec_select:V_256 + (vec_concat:V_256 + (match_operand:<ssehalfvecmode> 1 "memory_operand") + (match_operand:<ssehalfvecmode> 2 "general_operand")) + (match_parallel 3 "avx_vbroadcast128_operand" + [(match_operand 4 "const_int_operand")])))] + "TARGET_AVX && ix86_pre_reload_split ()" + "#" + "&& 1" + [(set (match_dup 0) + (vec_concat: V_256 (match_dup 1) (match_dup 1)))]) + ;; For broadcast[i|f]32x2. Yes there is no v4sf version, only v4si. (define_mode_iterator VI4F_BRCST32x2 [V16SI (V8SI "TARGET_AVX512VL") (V4SI "TARGET_AVX512VL") diff --git a/gcc/testsuite/gcc.target/i386/avx_vbroadcastf128.c b/gcc/testsuite/gcc.target/i386/avx_vbroadcastf128.c new file mode 100644 index 00000000000..e0bda7dda10 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx_vbroadcastf128.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-options "-march=x86-64-v3 -O3" } */ +/* { dg-final { scan-assembler-not "vpermpd"} } */ +/* { dg-final { scan-assembler {(?n)vbroadcastf(?:128|64x2)} } } */ + +void +foo (double* __restrict a, double* b, double* c, int n) +{ + for (int i = 0; i != n; i+=4) + { + a[i] += b[i] * c[i]; + a[i+1] += b[i+1] * c[i+1]; + a[i+2] += b[i] * c[i+2]; + a[i+3] += b[i+1] * c[i+3]; + } + +} -- 2.34.1