The AVX-512 masked epilogue emits a masked load (UNSPEC_MASKLOAD under
a VEC_MERGE) feeding a masked binary op; the wrapper hides the memory
operand from combine, so it is not folded into the operation.
Add four define_insn_and_split patterns that strip the wrapper, for
commutative and non-commutative ops, with and without an outer mask.
Iterator V1248FH_AVX512VLBW covers all maskable modes; a new
noncommutative_binary_operator predicate handles minus and div.
gcc/ChangeLog:
PR target/123997
* config/i386/predicates.md (noncommutative_binary_operator):
New predicate.
* config/i386/sse.md (V1248FH_AVX512VLBW): New mode iterator.
(*comm_maskload_fold_unmasked_op): New pattern.
(*comm_maskload_fold_masked_op): New pattern.
(*noncomm_maskload_fold_unmasked_op): New pattern.
(*noncomm_maskload_fold_masked_op): New pattern.
gcc/testsuite/ChangeLog:
PR target/123997
* gcc.target/i386/avx512-maskload-fold-1.c: New test.
* gcc.target/i386/avx512-maskload-fold-2.c: New test.
* gcc.target/i386/avx512-maskload-fold-3.c: New test.
* gcc.target/i386/avx512-maskload-fold-4.c: New test.
* gcc.target/i386/avx512-maskload-fold-5.c: New test.
* gcc.target/i386/avx512-maskload-fold-6.c: New test.
* gcc.target/i386/avx512-maskload-fold-7.c: New test.
* gcc.target/i386/avx512-maskload-fold-8.c: New test.
* gcc.target/i386/avx512-maskload-fold-9.c: New test.
* gcc.target/i386/avx512-maskload-fold-10.c: New test.
* gcc.target/i386/avx512-maskload-fold-11.c: New test.
* gcc.target/i386/avx512-maskload-fold-12.c: New test.
* gcc.target/i386/avx512-maskload-fold-13.c: New test.
* gcc.target/i386/avx512-maskload-fold-14.c: New test.
* gcc.target/i386/avx512-maskload-fold-15.c: New test.
* gcc.target/i386/avx512-maskload-fold-16.c: New test.
Co-authored-by: Venkataramanan Kumar <[email protected]>
Signed-off-by: Sarvesh Chandra <[email protected]>
---
RFC: The generic approach to avoid the explosion of patterns here would be
to hoist the vec_merge at the root of the RTL tree during combine; that
would target unary, binary and ternary operations, including operations
with a complex RTL pattern such as vpavgb. The following vpavgb case
results in no fold at -O3/-Ofast (avx512-maskload-fold-15.c, -16.c):
void vector_avg_uint8 (unsigned char * __restrict dst,
const unsigned char * __restrict a,
const unsigned char * __restrict b, int n)
{
for (int i = 0; i < n; ++i)
dst[i] = (a[i] + b[i] + 1) >> 1;
}
Bootstrapped and regression-tested on x86_64-pc-linux-gnu.
gcc/config/i386/predicates.md | 4 +
gcc/config/i386/sse.md | 116 ++++++++++++++++++
.../gcc.target/i386/avx512-maskload-fold-1.c | 12 ++
.../gcc.target/i386/avx512-maskload-fold-10.c | 12 ++
.../gcc.target/i386/avx512-maskload-fold-11.c | 12 ++
.../gcc.target/i386/avx512-maskload-fold-12.c | 15 +++
.../gcc.target/i386/avx512-maskload-fold-13.c | 14 +++
.../gcc.target/i386/avx512-maskload-fold-14.c | 12 ++
.../gcc.target/i386/avx512-maskload-fold-15.c | 15 +++
.../gcc.target/i386/avx512-maskload-fold-16.c | 15 +++
.../gcc.target/i386/avx512-maskload-fold-2.c | 12 ++
.../gcc.target/i386/avx512-maskload-fold-3.c | 12 ++
.../gcc.target/i386/avx512-maskload-fold-4.c | 12 ++
.../gcc.target/i386/avx512-maskload-fold-5.c | 15 +++
.../gcc.target/i386/avx512-maskload-fold-6.c | 14 +++
.../gcc.target/i386/avx512-maskload-fold-7.c | 12 ++
.../gcc.target/i386/avx512-maskload-fold-8.c | 12 ++
.../gcc.target/i386/avx512-maskload-fold-9.c | 12 ++
18 files changed, 328 insertions(+)
create mode 100644 gcc/testsuite/gcc.target/i386/avx512-maskload-fold-1.c
create mode 100644 gcc/testsuite/gcc.target/i386/avx512-maskload-fold-10.c
create mode 100644 gcc/testsuite/gcc.target/i386/avx512-maskload-fold-11.c
create mode 100644 gcc/testsuite/gcc.target/i386/avx512-maskload-fold-12.c
create mode 100644 gcc/testsuite/gcc.target/i386/avx512-maskload-fold-13.c
create mode 100644 gcc/testsuite/gcc.target/i386/avx512-maskload-fold-14.c
create mode 100644 gcc/testsuite/gcc.target/i386/avx512-maskload-fold-15.c
create mode 100644 gcc/testsuite/gcc.target/i386/avx512-maskload-fold-16.c
create mode 100644 gcc/testsuite/gcc.target/i386/avx512-maskload-fold-2.c
create mode 100644 gcc/testsuite/gcc.target/i386/avx512-maskload-fold-3.c
create mode 100644 gcc/testsuite/gcc.target/i386/avx512-maskload-fold-4.c
create mode 100644 gcc/testsuite/gcc.target/i386/avx512-maskload-fold-5.c
create mode 100644 gcc/testsuite/gcc.target/i386/avx512-maskload-fold-6.c
create mode 100644 gcc/testsuite/gcc.target/i386/avx512-maskload-fold-7.c
create mode 100644 gcc/testsuite/gcc.target/i386/avx512-maskload-fold-8.c
create mode 100644 gcc/testsuite/gcc.target/i386/avx512-maskload-fold-9.c
diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index cf3a68a9fe8..c98f77e6ad5 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -1740,6 +1740,10 @@
(define_predicate "commutative_operator"
(match_code "plus,mult,and,ior,xor,smin,smax,umin,umax"))
+;; Return true for non-commutative binary operators (minus, div).
+(define_predicate "noncommutative_binary_operator"
+ (match_code "minus,div"))
+
;; Return true if OP is a binary operator that can be promoted to wider mode.
(define_predicate "promotable_binary_operator"
(ior (match_code "plus,minus,and,ior,xor,ashift")
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 51d1e9b455a..2538b39c0bf 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -2792,6 +2792,122 @@
(set_attr "btver2_decode" "direct,double")
(set_attr "mode" "<MODE>")])
+;; AVX-512 modes that support masked loads.
+
+(define_mode_iterator V1248FH_AVX512VLBW
+ [V16SF (V8SF "TARGET_AVX512VL") (V4SF "TARGET_AVX512VL")
+ V8DF (V4DF "TARGET_AVX512VL") (V2DF "TARGET_AVX512VL")
+ (V32HF "TARGET_AVX512FP16")
+ (V16HF "TARGET_AVX512FP16 && TARGET_AVX512VL")
+ (V8HF "TARGET_AVX512FP16 && TARGET_AVX512VL")
+ V16SI (V8SI "TARGET_AVX512VL") (V4SI "TARGET_AVX512VL")
+ V8DI (V4DI "TARGET_AVX512VL") (V2DI "TARGET_AVX512VL")
+ (V64QI "TARGET_AVX512BW")
+ (V32QI "TARGET_AVX512VL && TARGET_AVX512BW")
+ (V16QI "TARGET_AVX512VL && TARGET_AVX512BW")
+ (V32HI "TARGET_AVX512BW")
+ (V16HI "TARGET_AVX512VL && TARGET_AVX512BW")
+ (V8HI "TARGET_AVX512VL && TARGET_AVX512BW")])
+
+;; Fold masked load into commutative op, no outer mask.
+(define_insn_and_split "*comm_maskload_fold_unmasked_op"
+ [(set (match_operand:V1248FH_AVX512VLBW 0 "register_operand")
+ (match_operator:V1248FH_AVX512VLBW 1 "commutative_operator"
+ [(vec_merge:V1248FH_AVX512VLBW
+ (unspec:V1248FH_AVX512VLBW
+ [(match_operand:V1248FH_AVX512VLBW 2 "memory_operand")]
+ UNSPEC_MASKLOAD)
+ (match_operand:V1248FH_AVX512VLBW 3 "const0_operand")
+ (match_operand:<avx512fmaskmode> 4 "register_operand"))
+ (match_operand:V1248FH_AVX512VLBW 5 "register_operand")]))]
+ "TARGET_AVX512F"
+ "#"
+ "&& 1"
+ [(const_int 0)]
+{
+ rtx op = gen_rtx_fmt_ee (GET_CODE (operands[1]), <MODE>mode,
+ operands[5], operands[2]);
+ rtx vm = gen_rtx_VEC_MERGE (<MODE>mode, op, operands[3], operands[4]);
+ emit_insn (gen_rtx_SET (operands[0], vm));
+ DONE;
+})
+
+;; Fold masked load into commutative op, outer mask.
+(define_insn_and_split "*comm_maskload_fold_masked_op"
+ [(set (match_operand:V1248FH_AVX512VLBW 0 "register_operand")
+ (vec_merge:V1248FH_AVX512VLBW
+ (match_operator:V1248FH_AVX512VLBW 1 "commutative_operator"
+ [(vec_merge:V1248FH_AVX512VLBW
+ (unspec:V1248FH_AVX512VLBW
+ [(match_operand:V1248FH_AVX512VLBW 2 "memory_operand")]
+ UNSPEC_MASKLOAD)
+ (match_operand:V1248FH_AVX512VLBW 3 "const0_operand")
+ (match_operand:<avx512fmaskmode> 4 "register_operand"))
+ (match_operand:V1248FH_AVX512VLBW 5 "register_operand")])
+ (match_operand:V1248FH_AVX512VLBW 6 "const0_operand")
+ (match_dup 4)))]
+ "TARGET_AVX512F"
+ "#"
+ "&& 1"
+ [(const_int 0)]
+{
+ rtx op = gen_rtx_fmt_ee (GET_CODE (operands[1]), <MODE>mode,
+ operands[5], operands[2]);
+ rtx vm = gen_rtx_VEC_MERGE (<MODE>mode, op, operands[6], operands[4]);
+ emit_insn (gen_rtx_SET (operands[0], vm));
+ DONE;
+})
+
+;; Fold masked load into non-commutative op, no outer mask.
+(define_insn_and_split "*noncomm_maskload_fold_unmasked_op"
+ [(set (match_operand:V1248FH_AVX512VLBW 0 "register_operand")
+ (match_operator:V1248FH_AVX512VLBW 1 "noncommutative_binary_operator"
+ [(match_operand:V1248FH_AVX512VLBW 5 "register_operand")
+ (vec_merge:V1248FH_AVX512VLBW
+ (unspec:V1248FH_AVX512VLBW
+ [(match_operand:V1248FH_AVX512VLBW 2 "memory_operand")]
+ UNSPEC_MASKLOAD)
+ (match_operand:V1248FH_AVX512VLBW 3 "const0_operand")
+ (match_operand:<avx512fmaskmode> 4 "register_operand"))]))]
+ "TARGET_AVX512F"
+ "#"
+ "&& 1"
+ [(const_int 0)]
+{
+ rtx op = gen_rtx_fmt_ee (GET_CODE (operands[1]), <MODE>mode,
+ operands[5], operands[2]);
+ rtx vm = gen_rtx_VEC_MERGE (<MODE>mode, op, operands[3], operands[4]);
+ emit_insn (gen_rtx_SET (operands[0], vm));
+ DONE;
+})
+
+;; Fold masked load into non-commutative op, outer mask.
+(define_insn_and_split "*noncomm_maskload_fold_masked_op"
+ [(set (match_operand:V1248FH_AVX512VLBW 0 "register_operand")
+ (vec_merge:V1248FH_AVX512VLBW
+ (match_operator:V1248FH_AVX512VLBW 1 "noncommutative_binary_operator"
+ [(match_operand:V1248FH_AVX512VLBW 5 "register_operand")
+ (vec_merge:V1248FH_AVX512VLBW
+ (unspec:V1248FH_AVX512VLBW
+ [(match_operand:V1248FH_AVX512VLBW 2 "memory_operand")]
+ UNSPEC_MASKLOAD)
+ (match_operand:V1248FH_AVX512VLBW 3 "const0_operand")
+ (match_operand:<avx512fmaskmode> 4 "register_operand"))])
+ (match_operand:V1248FH_AVX512VLBW 6 "const0_operand")
+ (match_dup 4)))]
+ "TARGET_AVX512F"
+ "#"
+ "&& 1"
+ [(const_int 0)]
+{
+ rtx op = gen_rtx_fmt_ee (GET_CODE (operands[1]), <MODE>mode,
+ operands[5], operands[2]);
+ rtx vm = gen_rtx_VEC_MERGE (<MODE>mode, op, operands[6], operands[4]);
+ emit_insn (gen_rtx_SET (operands[0], vm));
+ DONE;
+})
+
+
;; Standard scalar operation patterns which preserve the rest of the
;; vector for combiner.
(define_insn "*<sse>_vm<multdiv_mnemonic><mode>3"
diff --git a/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-1.c
b/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-1.c
new file mode 100644
index 00000000000..78125cad545
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-1.c
@@ -0,0 +1,12 @@
+/* { dg-do compile }*/
+/* { dg-options "-O3 -march=x86-64-v4 --param vect-partial-vector-usage=1" } */
+/* { dg-final { scan-assembler "vaddpd\[
\\t\]+\[^\n\]*\\(\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}" } } */
+
+void vector_add_fp(double *__restrict dst,
+ const double *__restrict a,
+ const double *__restrict b,
+ int n)
+{
+ for (int i = 0; i < n; ++i)
+ dst[i] = a[i] + b[i];
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-10.c
b/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-10.c
new file mode 100644
index 00000000000..839693bdf94
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-10.c
@@ -0,0 +1,12 @@
+/* { dg-do compile }*/
+/* { dg-options "-Ofast -march=x86-64-v4 --param vect-partial-vector-usage=1"
} */
+/* { dg-final { scan-assembler "vpaddd\[
\\t\]+\[^\n\]*\\(\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}" } } */
+
+void vector_add_int(int *__restrict dst,
+ const int *__restrict a,
+ const int *__restrict b,
+ int n)
+{
+ for (int i = 0; i < n; ++i)
+ dst[i] = a[i] + b[i];
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-11.c
b/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-11.c
new file mode 100644
index 00000000000..2841cf758a9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-11.c
@@ -0,0 +1,12 @@
+/* { dg-do compile }*/
+/* { dg-options "-Ofast -march=x86-64-v4 --param vect-partial-vector-usage=1"
} */
+/* { dg-final { scan-assembler "vpmulld\[
\\t\]+\[^\\n\]*\\(\[^\\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}" } } */
+
+void vector_mul_int(int *__restrict dst,
+ const int *__restrict a,
+ const int *__restrict b,
+ int n)
+{
+ for (int i = 0; i < n; ++i)
+ dst[i] = a[i] * b[i];
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-12.c
b/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-12.c
new file mode 100644
index 00000000000..64b3870bb42
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-12.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -march=x86-64-v4 --param vect-partial-vector-usage=1"
} */
+/* { dg-final { scan-assembler "vsubps\[
\\t\]+\[^\n\]*\\(\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}" } } */
+
+void
+vector_sub_fp (float * __restrict dst,
+ const float * __restrict a,
+ const float * __restrict b,
+ int n)
+{
+ for (int i = 0; i < n; ++i)
+ dst[i] = a[i] - b[i];
+}
+
+
diff --git a/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-13.c
b/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-13.c
new file mode 100644
index 00000000000..d18aa4552bb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-13.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -march=x86-64-v4 --param vect-partial-vector-usage=1"
} */
+/* { dg-final { scan-assembler "vpsubq\[
\\t\]+\[^\n\]*\\(\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}" } } */
+
+void
+vector_sub_int64 (long long * __restrict dst,
+ const long long * __restrict a,
+ const long long * __restrict b,
+ int n)
+{
+ for (int i = 0; i < n; ++i)
+ dst[i] = a[i] - b[i];
+}
+
diff --git a/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-14.c
b/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-14.c
new file mode 100644
index 00000000000..ea0a90596a1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-14.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -march=x86-64-v4 --param vect-partial-vector-usage=1"
} */
+/* { dg-final { scan-assembler-not "vsubps\[
\\t\]+\[^\n\]*\\(\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}" } } */
+
+void vector_sub_no_fold(float *__restrict dst,
+ const float *__restrict a,
+ float b,
+ int n)
+{
+ for (int i = 0; i < n; i++)
+ dst[i] = a[i] - b;
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-15.c
b/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-15.c
new file mode 100644
index 00000000000..658f0a6cc8b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-15.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64-v4 --param vect-partial-vector-usage=1" } */
+/* vpavgb is a complex RTL pattern, not a plain binary operator, so the masked
+ load is not folded into it. A generic combine approach that hoists the
+ vec_merge to the root of the RTL tree would also cover this case. */
+/* { dg-final { scan-assembler "vpavgb\[
\\t\]+\[^\n\]*\\(\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}" { xfail *-*-* } } } */
+
+void vector_avg_uint8 (unsigned char * __restrict dst,
+ const unsigned char * __restrict a,
+ const unsigned char * __restrict b,
+ int n)
+{
+ for (int i = 0; i < n; ++i)
+ dst[i] = (a[i] + b[i] + 1) >> 1;
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-16.c
b/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-16.c
new file mode 100644
index 00000000000..f06c6ff33f2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-16.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -march=x86-64-v4 --param vect-partial-vector-usage=1"
} */
+/* vpavgb is a complex RTL pattern, not a plain binary operator, so the masked
+ load is not folded into it. A generic combine approach that hoists the
+ vec_merge to the root of the RTL tree would also cover this case. */
+/* { dg-final { scan-assembler "vpavgb\[
\\t\]+\[^\n\]*\\(\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}" { xfail *-*-* } } } */
+
+void vector_avg_uint8 (unsigned char * __restrict dst,
+ const unsigned char * __restrict a,
+ const unsigned char * __restrict b,
+ int n)
+{
+ for (int i = 0; i < n; ++i)
+ dst[i] = (a[i] + b[i] + 1) >> 1;
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-2.c
b/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-2.c
new file mode 100644
index 00000000000..885b349e176
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-2.c
@@ -0,0 +1,12 @@
+/* { dg-do compile }*/
+/* { dg-options "-O3 -march=x86-64-v4 --param vect-partial-vector-usage=1" } */
+/* { dg-final { scan-assembler "vmulpd\[
\\t\]+\[^\n\]*\\(\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}" } } */
+
+void vector_mul_fp(double *__restrict dst,
+ const double *__restrict a,
+ const double *__restrict b,
+ int n)
+{
+ for (int i = 0; i < n; ++i)
+ dst[i] = a[i] * b[i];
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-3.c
b/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-3.c
new file mode 100644
index 00000000000..9fb0066fdb1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-3.c
@@ -0,0 +1,12 @@
+/* { dg-do compile }*/
+/* { dg-options "-O3 -march=x86-64-v4 --param vect-partial-vector-usage=1" } */
+/* { dg-final { scan-assembler "vpaddd\[
\\t\]+\[^\n\]*\\(\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}" } } */
+
+void vector_add_int(int *__restrict dst,
+ const int *__restrict a,
+ const int *__restrict b,
+ int n)
+{
+ for (int i = 0; i < n; ++i)
+ dst[i] = a[i] + b[i];
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-4.c
b/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-4.c
new file mode 100644
index 00000000000..48a658d1d89
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-4.c
@@ -0,0 +1,12 @@
+/* { dg-do compile }*/
+/* { dg-options "-O3 -march=x86-64-v4 --param vect-partial-vector-usage=1" } */
+/* { dg-final { scan-assembler "vpmulld\[
\\t\]+\[^\\n\]*\\(\[^\\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}" } } */
+
+void vector_mul_int(int *__restrict dst,
+ const int *__restrict a,
+ const int *__restrict b,
+ int n)
+{
+ for (int i = 0; i < n; ++i)
+ dst[i] = a[i] * b[i];
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-5.c
b/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-5.c
new file mode 100644
index 00000000000..8663b32f2c7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-5.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64-v4 --param vect-partial-vector-usage=1" } */
+/* { dg-final { scan-assembler "vsubps\[
\\t\]+\[^\n\]*\\(\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}" } } */
+
+void
+vector_sub_fp (float * __restrict dst,
+ const float * __restrict a,
+ const float * __restrict b,
+ int n)
+{
+ for (int i = 0; i < n; ++i)
+ dst[i] = a[i] - b[i];
+}
+
+
diff --git a/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-6.c
b/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-6.c
new file mode 100644
index 00000000000..20c86bce0fc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-6.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64-v4 --param vect-partial-vector-usage=1" } */
+/* { dg-final { scan-assembler "vpsubq\[
\\t\]+\[^\n\]*\\(\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}" } } */
+
+void
+vector_sub_int64 (long long * __restrict dst,
+ long long * __restrict a,
+ long long * __restrict b,
+ int n)
+{
+ for (int i = 0; i < n; ++i)
+ dst[i] = a[i] - b[i];
+}
+
diff --git a/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-7.c
b/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-7.c
new file mode 100644
index 00000000000..be0397bfa43
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-7.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=x86-64-v4 --param vect-partial-vector-usage=1" } */
+/* { dg-final { scan-assembler-not "vsubps\[
\\t\]+\[^\n\]*\\(\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}" } } */
+
+void vector_sub_no_fold(float * __restrict dst,
+ const float * __restrict a,
+ float b,
+ int n)
+{
+ for (int i = 0; i < n; i++)
+ dst[i] = a[i] - b;
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-8.c
b/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-8.c
new file mode 100644
index 00000000000..9743d2902c4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-8.c
@@ -0,0 +1,12 @@
+/* { dg-do compile }*/
+/* { dg-options "-Ofast -march=x86-64-v4 --param vect-partial-vector-usage=1"
} */
+/* { dg-final { scan-assembler "vaddpd\[
\\t\]+\[^\n\]*\\(\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}" } } */
+
+void vector_add_fp(double *__restrict dst,
+ const double *__restrict a,
+ const double *__restrict b,
+ int n)
+{
+ for (int i = 0; i < n; ++i)
+ dst[i] = a[i] + b[i];
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-9.c
b/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-9.c
new file mode 100644
index 00000000000..5a5c37fb4a8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-9.c
@@ -0,0 +1,12 @@
+/* { dg-do compile }*/
+/* { dg-options "-Ofast -march=x86-64-v4 --param vect-partial-vector-usage=1"
} */
+/* { dg-final { scan-assembler "vmulpd\[
\\t\]+\[^\n\]*\\(\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}" } } */
+
+void vector_mul_fp(double *__restrict dst,
+ const double *__restrict a,
+ const double *__restrict b,
+ int n)
+{
+ for (int i = 0; i < n; ++i)
+ dst[i] = a[i] * b[i];
+}
--
2.34.1