On Fri, 4 Oct 2019 at 16:08, Richard Biener <richard.guent...@gmail.com> wrote:
>
> On Thu, Oct 3, 2019 at 1:42 AM Prathamesh Kulkarni
> <prathamesh.kulka...@linaro.org> wrote:
> >
> > On Wed, 25 Sep 2019 at 09:17, Prathamesh Kulkarni
> > <prathamesh.kulka...@linaro.org> wrote:
> > >
> > > On Mon, 16 Sep 2019 at 08:54, Prathamesh Kulkarni
> > > <prathamesh.kulka...@linaro.org> wrote:
> > > >
> > > > On Mon, 9 Sep 2019 at 09:36, Prathamesh Kulkarni
> > > > <prathamesh.kulka...@linaro.org> wrote:
> > > > >
> > > > > On Mon, 9 Sep 2019 at 16:45, Richard Sandiford
> > > > > <richard.sandif...@arm.com> wrote:
> > > > > >
> > > > > > Prathamesh Kulkarni <prathamesh.kulka...@linaro.org> writes:
> > > > > > > With patch, the only following FAIL remains for aarch64-sve.exp:
> > > > > > > FAIL: gcc.target/aarch64/sve/cond_unary_2.c -march=armv8.2-a+sve
> > > > > > > scan-assembler-times \\tmovprfx\\t 6
> > > > > > > which now contains 14.
> > > > > > > Should I adjust the test, assuming the change isn't a regression ?
> > > > > >
> > > > > > Well, it is kind-of a regression, but it really just means that the
> > > > > > integer code is now consistent with the floating-point code in 
> > > > > > having
> > > > > > an unnecessary MOVPRFX.  So I think adjusting the count is fine.
> > > > > > Presumably any future fix for the existing redundant MOVPRFXs will
> > > > > > apply to the new ones as well.
> > > > > >
> > > > > > The patch looks good to me, just some very minor nits:
> > > > > >
> > > > > > > @@ -8309,11 +8309,12 @@ vect_double_mask_nunits (tree type)
> > > > > > >
> > > > > > >  /* Record that a fully-masked version of LOOP_VINFO would need 
> > > > > > > MASKS to
> > > > > > >     contain a sequence of NVECTORS masks that each control a 
> > > > > > > vector of type
> > > > > > > -   VECTYPE.  */
> > > > > > > +   VECTYPE. SCALAR_MASK if non-null, represents the mask used 
> > > > > > > for corresponding
> > > > > > > +   load/store stmt.  */
> > > > > >
> > > > > > Should be two spaces between sentences.  Maybe:
> > > > > >
> > > > > >    VECTYPE.  If SCALAR_MASK is nonnull, the fully-masked loop would 
> > > > > > AND
> > > > > >    these vector masks with the vector version of SCALAR_MASK.  */
> > > > > >
> > > > > > since the mask isn't necessarily for a load or store statement.
> > > > > >
> > > > > > > [...]
> > > > > > > @@ -1879,7 +1879,8 @@ static tree permute_vec_elements (tree, 
> > > > > > > tree, tree, stmt_vec_info,
> > > > > > >     says how the load or store is going to be implemented and 
> > > > > > > GROUP_SIZE
> > > > > > >     is the number of load or store statements in the containing 
> > > > > > > group.
> > > > > > >     If the access is a gather load or scatter store, GS_INFO 
> > > > > > > describes
> > > > > > > -   its arguments.
> > > > > > > +   its arguments. SCALAR_MASK is the scalar mask used for 
> > > > > > > corresponding
> > > > > > > +   load or store stmt.
> > > > > >
> > > > > > Maybe:
> > > > > >
> > > > > >    its arguments.  If the load or store is conditional, SCALAR_MASK 
> > > > > > is the
> > > > > >    condition under which it occurs.
> > > > > >
> > > > > > since SCALAR_MASK can be null here too.
> > > > > >
> > > > > > > [...]
> > > > > > > @@ -9975,6 +9978,31 @@ vectorizable_condition (stmt_vec_info 
> > > > > > > stmt_info, gimple_stmt_iterator *gsi,
> > > > > > >    /* Handle cond expr.  */
> > > > > > >    for (j = 0; j < ncopies; j++)
> > > > > > >      {
> > > > > > > +      tree loop_mask = NULL_TREE;
> > > > > > > +      bool swap_cond_operands = false;
> > > > > > > +
> > > > > > > +      if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
> > > > > > > +     {
> > > > > > > +       scalar_cond_masked_key cond (cond_expr, ncopies);
> > > > > > > +       if (loop_vinfo->scalar_cond_masked_set.contains (cond))
> > > > > > > +         {
> > > > > > > +           vec_loop_masks *masks = &LOOP_VINFO_MASKS 
> > > > > > > (loop_vinfo);
> > > > > > > +           loop_mask = vect_get_loop_mask (gsi, masks, ncopies, 
> > > > > > > vectype, j);
> > > > > > > +         }
> > > > > > > +       else
> > > > > > > +         {
> > > > > > > +           cond.code = invert_tree_comparison (cond.code,
> > > > > > > +                                               HONOR_NANS 
> > > > > > > (TREE_TYPE (cond.op0)));
> > > > > >
> > > > > > Long line.  Maybe just split it out into a separate assignment:
> > > > > >
> > > > > >               bool honor_nans = HONOR_NANS (TREE_TYPE (cond.op0));
> > > > > >               cond.code = invert_tree_comparison (cond.code, 
> > > > > > honor_nans);
> > > > > >
> > > > > > > +           if (loop_vinfo->scalar_cond_masked_set.contains 
> > > > > > > (cond))
> > > > > > > +             {
> > > > > > > +               vec_loop_masks *masks = &LOOP_VINFO_MASKS 
> > > > > > > (loop_vinfo);
> > > > > > > +               loop_mask = vect_get_loop_mask (gsi, masks, 
> > > > > > > ncopies, vectype, j);
> > > > > >
> > > > > > Long line here too.
> > > > > >
> > > > > > > [...]
> > > > > > > @@ -10090,6 +10121,26 @@ vectorizable_condition (stmt_vec_info 
> > > > > > > stmt_info, gimple_stmt_iterator *gsi,
> > > > > > >                   }
> > > > > > >               }
> > > > > > >           }
> > > > > > > +
> > > > > > > +       if (loop_mask)
> > > > > > > +         {
> > > > > > > +           if (COMPARISON_CLASS_P (vec_compare))
> > > > > > > +             {
> > > > > > > +               tree tmp = make_ssa_name (vec_cmp_type);
> > > > > > > +               gassign *g = gimple_build_assign (tmp,
> > > > > > > +                                                 TREE_CODE 
> > > > > > > (vec_compare),
> > > > > > > +                                                 TREE_OPERAND 
> > > > > > > (vec_compare, 0),
> > > > > > > +                                                 TREE_OPERAND 
> > > > > > > (vec_compare, 1));
> > > > > >
> > > > > > Two long lines.
> > > > > >
> > > > > > > +               vect_finish_stmt_generation (stmt_info, g, gsi);
> > > > > > > +               vec_compare = tmp;
> > > > > > > +             }
> > > > > > > +
> > > > > > > +           tree tmp2 = make_ssa_name (vec_cmp_type);
> > > > > > > +           gassign *g = gimple_build_assign (tmp2, BIT_AND_EXPR, 
> > > > > > > vec_compare, loop_mask);
> > > > > >
> > > > > > Long line here too.
> > > > > >
> > > > > > > [...]
> > > > > > > diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c
> > > > > > > index dc181524744..c4b2d8e8647 100644
> > > > > > > --- a/gcc/tree-vectorizer.c
> > > > > > > +++ b/gcc/tree-vectorizer.c
> > > > > > > @@ -1513,3 +1513,39 @@ make_pass_ipa_increase_alignment 
> > > > > > > (gcc::context *ctxt)
> > > > > > >  {
> > > > > > >    return new pass_ipa_increase_alignment (ctxt);
> > > > > > >  }
> > > > > > > +
> > > > > > > +/* If code(T) is comparison op or def of comparison stmt,
> > > > > > > +   extract it's operands.
> > > > > > > +   Else return <NE_EXPR, T, 0>.  */
> > > > > > > +
> > > > > > > +void
> > > > > > > +scalar_cond_masked_key::get_cond_ops_from_tree (tree t)
> > > > > > > +{
> > > > > > > +  if (TREE_CODE_CLASS (TREE_CODE (t)) == tcc_comparison)
> > > > > > > +    {
> > > > > > > +      this->code = TREE_CODE (t);
> > > > > > > +      this->op0 = TREE_OPERAND (t, 0);
> > > > > > > +      this->op1 = TREE_OPERAND (t, 1);
> > > > > > > +      return;
> > > > > > > +    }
> > > > > > > +
> > > > > > > +  if (TREE_CODE (t) == SSA_NAME)
> > > > > > > +    {
> > > > > > > +      gassign *stmt = dyn_cast<gassign *> (SSA_NAME_DEF_STMT 
> > > > > > > (t));
> > > > > > > +      if (stmt)
> > > > > > > +        {
> > > > > >
> > > > > > Might as well do this as:
> > > > > >
> > > > > >   if (TREE_CODE (t) == SSA_NAME)
> > > > > >     if (gassign *stmt = dyn_cast<gassign *> (SSA_NAME_DEF_STMT (t)))
> > > > > >       {
> > > > > >
> > > > > > The patch (as hoped) introduces some XPASSes:
> > > > > >
> > > > > > XPASS: gcc.target/aarch64/sve/cond_cnot_2.c scan-assembler-not 
> > > > > > \\tsel\\t
> > > > > > XPASS: gcc.target/aarch64/sve/vcond_4.c scan-assembler-times 
> > > > > > \\tfcmge\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d, #0\\.0\\n 21
> > > > > > XPASS: gcc.target/aarch64/sve/vcond_4.c scan-assembler-times 
> > > > > > \\tfcmge\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d, z[0-9]+\\.d\\n 42
> > > > > > XPASS: gcc.target/aarch64/sve/vcond_4.c scan-assembler-times 
> > > > > > \\tfcmge\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s, #0\\.0\\n 15
> > > > > > XPASS: gcc.target/aarch64/sve/vcond_4.c scan-assembler-times 
> > > > > > \\tfcmge\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s, z[0-9]+\\.s\\n 30
> > > > > > XPASS: gcc.target/aarch64/sve/vcond_4.c scan-assembler-times 
> > > > > > \\tfcmgt\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d, #0\\.0\\n 21
> > > > > > XPASS: gcc.target/aarch64/sve/vcond_4.c scan-assembler-times 
> > > > > > \\tfcmgt\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d, z[0-9]+\\.d\\n 42
> > > > > > XPASS: gcc.target/aarch64/sve/vcond_4.c scan-assembler-times 
> > > > > > \\tfcmgt\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s, #0\\.0\\n 15
> > > > > > XPASS: gcc.target/aarch64/sve/vcond_4.c scan-assembler-times 
> > > > > > \\tfcmgt\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s, z[0-9]+\\.s\\n 30
> > > > > > XPASS: gcc.target/aarch64/sve/vcond_4.c scan-assembler-times 
> > > > > > \\tfcmle\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d, #0\\.0\\n 21
> > > > > > XPASS: gcc.target/aarch64/sve/vcond_4.c scan-assembler-times 
> > > > > > \\tfcmle\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d, z[0-9]+\\.d\\n 42
> > > > > > XPASS: gcc.target/aarch64/sve/vcond_4.c scan-assembler-times 
> > > > > > \\tfcmle\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s, #0\\.0\\n 15
> > > > > > XPASS: gcc.target/aarch64/sve/vcond_4.c scan-assembler-times 
> > > > > > \\tfcmle\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s, z[0-9]+\\.s\\n 30
> > > > > > XPASS: gcc.target/aarch64/sve/vcond_4.c scan-assembler-times 
> > > > > > \\tfcmlt\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d, #0\\.0\\n 21
> > > > > > XPASS: gcc.target/aarch64/sve/vcond_4.c scan-assembler-times 
> > > > > > \\tfcmlt\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d, z[0-9]+\\.d\\n 42
> > > > > > XPASS: gcc.target/aarch64/sve/vcond_4.c scan-assembler-times 
> > > > > > \\tfcmlt\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s, #0\\.0\\n 15
> > > > > > XPASS: gcc.target/aarch64/sve/vcond_4.c scan-assembler-times 
> > > > > > \\tfcmlt\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s, z[0-9]+\\.s\\n 30
> > > > > > XPASS: gcc.target/aarch64/sve/vcond_4.c scan-assembler-times 
> > > > > > \\tfcmuo\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d, z[0-9]+\\.d\\n 252
> > > > > > XPASS: gcc.target/aarch64/sve/vcond_4.c scan-assembler-times 
> > > > > > \\tfcmuo\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s, z[0-9]+\\.s\\n 180
> > > > > > XPASS: gcc.target/aarch64/sve/vcond_5.c scan-assembler-times 
> > > > > > \\tfcmge\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d, #0\\.0 21
> > > > > > XPASS: gcc.target/aarch64/sve/vcond_5.c scan-assembler-times 
> > > > > > \\tfcmge\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d, z[0-9]+\\.d 42
> > > > > > XPASS: gcc.target/aarch64/sve/vcond_5.c scan-assembler-times 
> > > > > > \\tfcmge\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s, #0\\.0 15
> > > > > > XPASS: gcc.target/aarch64/sve/vcond_5.c scan-assembler-times 
> > > > > > \\tfcmge\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s, z[0-9]+\\.s 30
> > > > > > XPASS: gcc.target/aarch64/sve/vcond_5.c scan-assembler-times 
> > > > > > \\tfcmle\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d, #0\\.0 21
> > > > > > XPASS: gcc.target/aarch64/sve/vcond_5.c scan-assembler-times 
> > > > > > \\tfcmle\\tp[0-9]+\\.d, p[0-7]/z, z[0-9]+\\.d, z[0-9]+\\.d 42
> > > > > > XPASS: gcc.target/aarch64/sve/vcond_5.c scan-assembler-times 
> > > > > > \\tfcmle\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s, #0\\.0 15
> > > > > > XPASS: gcc.target/aarch64/sve/vcond_5.c scan-assembler-times 
> > > > > > \\tfcmle\\tp[0-9]+\\.s, p[0-7]/z, z[0-9]+\\.s, z[0-9]+\\.s 30
> > > > > >
> > > > > > Could you remove the associated xfails (and comments above them 
> > > > > > where
> > > > > > appropriate)?
> > > > > >
> > > > > > OK with those changes from my POV, but please give Richi a day or so
> > > > > > to object.
> > > > > >
> > > > > > Thanks for doing this.
> > > > > Thanks for the suggestions, I have updated the patch accordingly.
> > > > > Bootstrap+test in progress on x86_64-unknown-linux-gnu and 
> > > > > aarch64-linux-gnu.
> > > > > Richi, does the patch look OK to you ?
> > > > ping https://gcc.gnu.org/ml/gcc-patches/2019-09/msg00573.html
> > > ping * 2: https://gcc.gnu.org/ml/gcc-patches/2019-09/msg00573.html
> > ping * 3: https://gcc.gnu.org/ml/gcc-patches/2019-09/msg00573.html
>
> It looks reasonable but the vectorizable_condition totally lack
> comments...
Hi Richard,
I rebased the patch on top of trunk and added some comments to
vectorizable_condition and scalar_cond_masked_key.
Does it look OK ?
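
To illustrate, the kind of loop this affects looks like the following
(a minimal sketch based on the example in the new comment, not one of
the testsuite cases):

  void
  f (int *restrict x, int *restrict y, int *restrict z)
  {
    for (int i = 0; i < 100; ++i)
      x[i] = y[i] ? z[i] : 10;  /* load of z[i] guarded by same cond  */
  }

Here the .MASK_LOAD of z[i] and the VEC_COND_EXPR both test
y[i] != 0; with the patch the VEC_COND_EXPR reuses the comparison
that was already ANDed with the loop mask for the load, so fre4 can
CSE the two conditions.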

Thanks,
Prathamesh
>
> Richard.
>
> > Thanks,
> > Prathamesh
> > >
> > > Thanks,
> > > Prathamesh
> > > >
> > > > Thanks,
> > > > Prathamesh
> > > > >
> > > > > Thanks,
> > > > > Prathamesh
> > > > > >
> > > > > > Richard
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_2.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_2.c
index d689e21dc11..3df2431be38 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_2.c
@@ -32,4 +32,4 @@ TEST_ALL (DEF_LOOP)
 /* { dg-final { scan-assembler-not {\tmov\tz} } } */
 /* { dg-final { scan-assembler-not {\tmovprfx\t} } } */
 /* Currently we canonicalize the ?: so that !b[i] is the "false" value.  */
-/* { dg-final { scan-assembler-not {\tsel\t} { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-not {\tsel\t} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_1.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_1.c
index dcc30768f88..86064ebfcba 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_1.c
@@ -11,7 +11,10 @@
 		   INT_TYPE *__restrict pred, int n)		\
   {								\
     for (int i = 0; i < n; ++i)					\
-      r[i] = pred[i] ? (FLOAT_TYPE) a[i] : b[i];		\
+      {								\
+	FLOAT_TYPE bi = b[i];					\
+	r[i] = pred[i] ? (FLOAT_TYPE) a[i] : bi;		\
+      }								\
   }
 
 #define TEST_ALL(T) \
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_4.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_4.c
index 7e5f2a73ed9..e3a947b2698 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_4.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_4.c
@@ -11,7 +11,10 @@
 		   INT_TYPE *__restrict pred, int n)		\
   {								\
     for (int i = 0; i < n; ++i)					\
-      r[i] = pred[i] ? (INT_TYPE) a[i] : b[i];			\
+      {								\
+	INT_TYPE bi = b[i];					\
+	r[i] = pred[i] ? (INT_TYPE) a[i] : bi;			\
+      }								\
   }
 
 #define TEST_ALL(T) \
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_2.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_2.c
index 991ccf016d1..97d1b8f5d45 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_2.c
@@ -13,7 +13,10 @@
 		      TYPE *__restrict pred, int n)		\
   {								\
     for (int i = 0; i < n; ++i)					\
-      r[i] = pred[i] ? OP (a[i]) : b[i];			\
+      {								\
+	TYPE bi = b[i];						\
+	r[i] = pred[i] ? OP (a[i]) : bi;			\
+      }								\
   }
 
 #define TEST_INT_TYPE(T, TYPE) \
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fmla_2.c b/gcc/testsuite/gcc.target/aarch64/sve/fmla_2.c
index 5c04bcdb3f5..a1b0667dab5 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/fmla_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/fmla_2.c
@@ -15,5 +15,9 @@ f (double *restrict a, double *restrict b, double *restrict c,
     }
 }
 
-/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */
+/* See https://gcc.gnu.org/ml/gcc-patches/2019-08/msg01644.html
+   for why the test below is XFAILed.  */
+
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 2 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 3 } } */
 /* { dg-final { scan-assembler-not {\tfmad\t} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vcond_4.c b/gcc/testsuite/gcc.target/aarch64/sve/vcond_4.c
index 00d84760a19..b38f23e87ba 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/vcond_4.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/vcond_4.c
@@ -98,24 +98,24 @@ TEST_CMP (nugt)
 /* { dg-final { scan-assembler-times {\tfcmne\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 { xfail *-*-* } } } */
 
 /* 5 for lt, 5 for ult and 5 for nult.  */
-/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 } } */
+/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 } } */
 
 /* 5 for le, 5 for ule and 5 for nule.  */
-/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 } } */
+/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 } } */
 
 /* 5 for gt, 5 for ugt and 5 for nugt.  */
-/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 } } */
+/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 } } */
 
 /* 5 for ge, 5 for uge and 5 for nuge.  */
-/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 } } */
+/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 } } */
 
 /* { dg-final { scan-assembler-not {\tfcmuo\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} } } */
 /* 3 loops * 5 invocations for all 12 unordered comparisons.  */
-/* { dg-final { scan-assembler-times {\tfcmuo\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 180 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfcmuo\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 180 } } */
 
 /* { dg-final { scan-assembler-times {\tfcmeq\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 7 { xfail *-*-* } } } */
 /* { dg-final { scan-assembler-times {\tfcmeq\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 14 { xfail *-*-* } } } */
@@ -123,19 +123,19 @@ TEST_CMP (nugt)
 /* { dg-final { scan-assembler-times {\tfcmne\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 { xfail *-*-* } } } */
 /* { dg-final { scan-assembler-times {\tfcmne\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 { xfail *-*-* } } } */
 
-/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 } } */
+/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 } } */
 
-/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 } } */
+/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 } } */
 
-/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 } } */
+/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 } } */
 
-/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 } } */
+/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 } } */
 
 /* { dg-final { scan-assembler-not {\tfcmuo\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} } } */
 /* 3 loops * 5 invocations, with 2 invocations having ncopies == 2,
    for all 12 unordered comparisons.  */
-/* { dg-final { scan-assembler-times {\tfcmuo\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 252 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfcmuo\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 252 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vcond_5.c b/gcc/testsuite/gcc.target/aarch64/sve/vcond_5.c
index 23bfb7b2649..2f16fbff522 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/vcond_5.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/vcond_5.c
@@ -19,16 +19,16 @@
 /* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s} 40 { xfail *-*-* } } } */
 
 /* 5 for le, 5 for ule and 5 for nule.  */
-/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0} 15 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s} 30 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0} 15 } } */
+/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s} 30 } } */
 
 /* 5 for gt, 5 for ugt, 5 for nueq and 5 for nugt.  */
 /* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0} 20 { xfail *-*-* } } } */
 /* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s} 40 { xfail *-*-* } } } */
 
 /* 5 for ge, 5 for uge and 5 for nuge.  */
-/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0} 15 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s} 30 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0} 15 } } */
+/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s} 30 } } */
 
 /* { dg-final { scan-assembler-not {\tfcmuo\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0} } } */
 /* 3 loops * 5 invocations for ordered, unordered amd ueq.  */
@@ -43,14 +43,14 @@
 /* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0} 28 { xfail *-*-* } } } */
 /* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d} 56 { xfail *-*-* } } } */
 
-/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0} 21 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d} 42 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0} 21 } } */
+/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d} 42 } } */
 
 /* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0} 28 { xfail *-*-* } } } */
 /* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d} 56 { xfail *-*-* } } } */
 
-/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0} 21 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d} 42 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0} 21 } } */
+/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d} 42 } } */
 
 /* { dg-final { scan-assembler-not {\tfcmuo\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0} } } */
 /* 3 loops * 5 invocations, with 2 invocations having ncopies == 2,
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index 3db4a5cdf78..da952645759 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -6603,7 +6603,7 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
 	}
       else
 	vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
-			       vectype_in);
+			       vectype_in, NULL);
     }
   if (dump_enabled_p ()
       && reduction_type == FOLD_LEFT_REDUCTION)
@@ -8005,7 +8005,7 @@ vectorizable_live_operation (stmt_vec_info stmt_info,
 	      gcc_assert (ncopies == 1 && !slp_node);
 	      vect_record_loop_mask (loop_vinfo,
 				     &LOOP_VINFO_MASKS (loop_vinfo),
-				     1, vectype);
+				     1, vectype, NULL);
 	    }
 	}
       return true;
@@ -8204,11 +8204,12 @@ vect_double_mask_nunits (tree type)
 
 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
    contain a sequence of NVECTORS masks that each control a vector of type
-   VECTYPE.  */
+   VECTYPE.  If SCALAR_MASK is nonnull, the fully-masked loop would AND
+   these vector masks with the vector version of SCALAR_MASK.  */
 
 void
 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
-		       unsigned int nvectors, tree vectype)
+		       unsigned int nvectors, tree vectype, tree scalar_mask)
 {
   gcc_assert (nvectors != 0);
   if (masks->length () < nvectors)
@@ -8219,6 +8220,13 @@ vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
   unsigned int nscalars_per_iter
     = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
 		 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
+
+  if (scalar_mask)
+    {
+      scalar_cond_masked_key cond (scalar_mask, nvectors);
+      loop_vinfo->scalar_cond_masked_set.add (cond);
+    }
+
   if (rgm->max_nscalars_per_iter < nscalars_per_iter)
     {
       rgm->max_nscalars_per_iter = nscalars_per_iter;
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index cac7410387b..4db8e24ccd1 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -1879,7 +1879,8 @@ static tree permute_vec_elements (tree, tree, tree, stmt_vec_info,
    says how the load or store is going to be implemented and GROUP_SIZE
    is the number of load or store statements in the containing group.
    If the access is a gather load or scatter store, GS_INFO describes
-   its arguments.
+   its arguments.  If the load or store is conditional, SCALAR_MASK is the
+   condition under which it occurs.
 
    Clear LOOP_VINFO_CAN_FULLY_MASK_P if a fully-masked loop is not
    supported, otherwise record the required mask types.  */
@@ -1888,7 +1889,7 @@ static void
 check_load_store_masking (loop_vec_info loop_vinfo, tree vectype,
 			  vec_load_store_type vls_type, int group_size,
 			  vect_memory_access_type memory_access_type,
-			  gather_scatter_info *gs_info)
+			  gather_scatter_info *gs_info, tree scalar_mask)
 {
   /* Invariant loads need no special support.  */
   if (memory_access_type == VMAT_INVARIANT)
@@ -1912,7 +1913,7 @@ check_load_store_masking (loop_vec_info loop_vinfo, tree vectype,
 	  return;
 	}
       unsigned int ncopies = vect_get_num_copies (loop_vinfo, vectype);
-      vect_record_loop_mask (loop_vinfo, masks, ncopies, vectype);
+      vect_record_loop_mask (loop_vinfo, masks, ncopies, vectype, scalar_mask);
       return;
     }
 
@@ -1936,7 +1937,7 @@ check_load_store_masking (loop_vec_info loop_vinfo, tree vectype,
 	  return;
 	}
       unsigned int ncopies = vect_get_num_copies (loop_vinfo, vectype);
-      vect_record_loop_mask (loop_vinfo, masks, ncopies, vectype);
+      vect_record_loop_mask (loop_vinfo, masks, ncopies, vectype, scalar_mask);
       return;
     }
 
@@ -1974,7 +1975,7 @@ check_load_store_masking (loop_vec_info loop_vinfo, tree vectype,
   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
   unsigned int nvectors;
   if (can_div_away_from_zero_p (group_size * vf, nunits, &nvectors))
-    vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype);
+    vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype, scalar_mask);
   else
     gcc_unreachable ();
 }
@@ -3436,7 +3437,9 @@ vectorizable_call (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
 	  unsigned int nvectors = (slp_node
 				   ? SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node)
 				   : ncopies);
-	  vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype_out);
+	  tree scalar_mask = gimple_call_arg (stmt_info->stmt, mask_opno);
+	  vect_record_loop_mask (loop_vinfo, masks, nvectors,
+				 vectype_out, scalar_mask);
 	}
       return true;
     }
@@ -7390,7 +7393,7 @@ vectorizable_store (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
       if (loop_vinfo
 	  && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
 	check_load_store_masking (loop_vinfo, vectype, vls_type, group_size,
-				  memory_access_type, &gs_info);
+				  memory_access_type, &gs_info, mask);
 
       STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
       vect_model_store_cost (stmt_info, ncopies, rhs_dt, memory_access_type,
@@ -8637,7 +8640,7 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
       if (loop_vinfo
 	  && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
 	check_load_store_masking (loop_vinfo, vectype, VLS_LOAD, group_size,
-				  memory_access_type, &gs_info);
+				  memory_access_type, &gs_info, mask);
 
       STMT_VINFO_TYPE (stmt_info) = load_vec_info_type;
       vect_model_load_cost (stmt_info, ncopies, memory_access_type,
@@ -9774,6 +9777,10 @@ vect_is_simple_cond (tree cond, vec_info *vinfo,
 
    When STMT_INFO is vectorized as a nested cycle, for_reduction is true.
 
+   For COND_EXPR<C, T, E>, if T comes from a masked load and is conditional
+   on C, we apply the loop mask, if present, to the result of the vector
+   comparison.  Similarly for E, if it is conditional on !C.
+
    Return true if STMT_INFO is vectorizable in this way.  */
 
 bool
@@ -9999,6 +10006,35 @@ vectorizable_condition (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
   /* Handle cond expr.  */
   for (j = 0; j < ncopies; j++)
     {
+      tree loop_mask = NULL_TREE;
+      bool swap_cond_operands = false;
+
+      /* Check whether a loop mask is associated with the
+	 scalar condition, or with its inverse.  */
+
+      if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+	{
+	  scalar_cond_masked_key cond (cond_expr, ncopies);
+	  if (loop_vinfo->scalar_cond_masked_set.contains (cond))
+	    {
+	      vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
+	      loop_mask = vect_get_loop_mask (gsi, masks, ncopies, vectype, j);
+	    }
+	  else
+	    {
+	      bool honor_nans = HONOR_NANS (TREE_TYPE (cond.op0));
+	      cond.code = invert_tree_comparison (cond.code, honor_nans);
+	      if (loop_vinfo->scalar_cond_masked_set.contains (cond))
+		{
+		  vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
+		  loop_mask = vect_get_loop_mask (gsi, masks, ncopies,
+						  vectype, j);
+		  cond_code = cond.code;
+		  swap_cond_operands = true;
+		}
+	    }
+	}
+
       stmt_vec_info new_stmt_info = NULL;
       if (j == 0)
 	{
@@ -10076,6 +10112,9 @@ vectorizable_condition (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
           vec_then_clause = vec_oprnds2[i];
           vec_else_clause = vec_oprnds3[i];
 
+	  if (swap_cond_operands)
+	    std::swap (vec_then_clause, vec_else_clause);
+
 	  if (masked)
 	    vec_compare = vec_cond_lhs;
 	  else
@@ -10114,6 +10153,47 @@ vectorizable_condition (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
 		    }
 		}
 	    }
+
+	  /* If a loop mask is present, AND it with the result of the
+	     vector comparison, so that later passes (fre4) can reuse
+	     the same condition used in the masked load.
+
+	     For example:
+	     for (int i = 0; i < 100; ++i)
+	       x[i] = y[i] ? z[i] : 10;
+
+	     results in the following optimized GIMPLE:
+
+	     mask__35.8_43 = vect__4.7_41 != { 0, ... };
+	     vec_mask_and_46 = loop_mask_40 & mask__35.8_43;
+	     _19 = &MEM[base: z_12(D), index: ivtmp_56, step: 4, offset: 0B];
+	     vect_iftmp.11_47 = .MASK_LOAD (_19, 4B, vec_mask_and_46);
+	     vect_iftmp.12_52 = VEC_COND_EXPR <vec_mask_and_46,
+					       vect_iftmp.11_47, { 10, ... }>;
+
+	     instead of recomputing vec != { 0, ... } in the VEC_COND_EXPR.  */
+
+	  if (loop_mask)
+	    {
+	      if (COMPARISON_CLASS_P (vec_compare))
+		{
+		  tree tmp = make_ssa_name (vec_cmp_type);
+		  tree op0 = TREE_OPERAND (vec_compare, 0);
+		  tree op1 = TREE_OPERAND (vec_compare, 1);
+		  gassign *g = gimple_build_assign (tmp,
+						    TREE_CODE (vec_compare),
+						    op0, op1);
+		  vect_finish_stmt_generation (stmt_info, g, gsi);
+		  vec_compare = tmp;
+		}
+
+	      tree tmp2 = make_ssa_name (vec_cmp_type);
+	      gassign *g = gimple_build_assign (tmp2, BIT_AND_EXPR,
+						vec_compare, loop_mask);
+	      vect_finish_stmt_generation (stmt_info, g, gsi);
+	      vec_compare = tmp2;
+	    }
+
 	  if (reduction_type == EXTRACT_LAST_REDUCTION)
 	    {
 	      if (!is_gimple_val (vec_compare))
diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c
index 800c99fea26..20945a39c84 100644
--- a/gcc/tree-vectorizer.c
+++ b/gcc/tree-vectorizer.c
@@ -1516,3 +1516,36 @@ make_pass_ipa_increase_alignment (gcc::context *ctxt)
 {
   return new pass_ipa_increase_alignment (ctxt);
 }
+
+/* If T is a comparison, or the SSA name of a comparison statement,
+   extract its operands.
+   Otherwise record <NE_EXPR, T, 0>.  */
+
+void
+scalar_cond_masked_key::get_cond_ops_from_tree (tree t)
+{
+  if (TREE_CODE_CLASS (TREE_CODE (t)) == tcc_comparison)
+    {
+      this->code = TREE_CODE (t);
+      this->op0 = TREE_OPERAND (t, 0);
+      this->op1 = TREE_OPERAND (t, 1);
+      return;
+    }
+
+  if (TREE_CODE (t) == SSA_NAME)
+    if (gassign *stmt = dyn_cast<gassign *> (SSA_NAME_DEF_STMT (t)))
+      {
+	tree_code code = gimple_assign_rhs_code (stmt);
+	if (TREE_CODE_CLASS (code) == tcc_comparison)
+	  {
+	    this->code = code;
+	    this->op0 = gimple_assign_rhs1 (stmt);
+	    this->op1 = gimple_assign_rhs2 (stmt);
+	    return;
+	  }
+      }
+
+  this->code = NE_EXPR;
+  this->op0 = t;
+  this->op1 = build_zero_cst (TREE_TYPE (t));
+}
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 837fb5ab525..632f12a30dc 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -26,6 +26,7 @@ typedef class _stmt_vec_info *stmt_vec_info;
 #include "tree-data-ref.h"
 #include "tree-hash-traits.h"
 #include "target.h"
+#include "hash-set.h"
 
 /* Used for naming of new temporaries.  */
 enum vect_var_kind {
@@ -177,7 +178,78 @@ public:
 #define SLP_TREE_TWO_OPERATORS(S)		 (S)->two_operators
 #define SLP_TREE_DEF_TYPE(S)			 (S)->def_type
 
+/* Key for the set that records which scalar conditions used in
+   masked loads have a loop mask associated with them.  The set is
+   populated by vect_record_loop_mask.  vectorizable_condition uses
+   it to check whether a scalar condition (or its inverse) has a
+   loop mask associated with it, and if so applies the loop mask to
+   the result of the vector comparison.  */
+
+struct scalar_cond_masked_key
+{
+  scalar_cond_masked_key (tree t, unsigned ncopies_)
+    : ncopies (ncopies_)
+  {
+    get_cond_ops_from_tree (t);
+  }
+
+  void get_cond_ops_from_tree (tree);
+
+  unsigned ncopies;
+  tree_code code;
+  tree op0;
+  tree op1;
+};
 
+template<>
+struct default_hash_traits<scalar_cond_masked_key>
+{
+  typedef scalar_cond_masked_key compare_type;
+  typedef scalar_cond_masked_key value_type;
+
+  static inline hashval_t
+  hash (value_type v)
+  {
+    inchash::hash h;
+    h.add_int (v.code);
+    inchash::add_expr (v.op0, h, 0);
+    inchash::add_expr (v.op1, h, 0);
+    h.add_int (v.ncopies);
+    return h.end ();
+  }
+
+  static inline bool
+  equal (value_type existing, value_type candidate)
+  {
+    return (existing.ncopies == candidate.ncopies
+	    && existing.code == candidate.code
+	    && operand_equal_p (existing.op0, candidate.op0, 0)
+	    && operand_equal_p (existing.op1, candidate.op1, 0));
+  }
+
+  static inline void
+  mark_empty (value_type &v)
+  {
+    v.ncopies = 0;
+  }
+
+  static inline bool
+  is_empty (value_type v)
+  {
+    return v.ncopies == 0;
+  }
+
+  static inline void mark_deleted (value_type &) {}
+
+  static inline bool is_deleted (const value_type &)
+  {
+    return false;
+  }
+
+  static inline void remove (value_type &) {}
+};
+
+typedef hash_set<scalar_cond_masked_key> scalar_cond_masked_set_type;
 
 /* Describes two objects whose addresses must be unequal for the vectorized
    loop to be valid.  */
@@ -258,6 +330,9 @@ public:
   /* Cost data used by the target cost model.  */
   void *target_cost_data;
 
+  /* Set of scalar conditions that have loop mask applied.  */
+  scalar_cond_masked_set_type scalar_cond_masked_set;
+
 private:
   stmt_vec_info new_stmt_vec_info (gimple *stmt);
   void set_vinfo_for_stmt (gimple *, stmt_vec_info);
@@ -1641,7 +1716,7 @@ extern void vect_gen_vector_loop_niters (loop_vec_info, tree, tree *,
 extern tree vect_halve_mask_nunits (tree);
 extern tree vect_double_mask_nunits (tree);
 extern void vect_record_loop_mask (loop_vec_info, vec_loop_masks *,
-				   unsigned int, tree);
+				   unsigned int, tree, tree);
 extern tree vect_get_loop_mask (gimple_stmt_iterator *, vec_loop_masks *,
 				unsigned int, tree, unsigned int);
 
