Hi,

This patch fixes incorrect vectorization of comparisons of booleans.
The problem is that a regular comparison, which works for scalar
booleans, doesn't work for boolean vectors because they may use a
different binary representation.  Such comparisons also never work
for scalar masks.

This patch replaces such comparisons with bitwise operations
which work correctly for both vector and scalar masks.

Bootstrapped and regtested on x86_64-unknown-linux-gnu.  Is it
OK for trunk?  What should be done for gcc-6-branch?  Port this
patch or just restrict vectorization for comparison of booleans?

Thanks,
Ilya
--
gcc/

2016-06-15  Ilya Enkovich  <ilya.enkov...@intel.com>

        PR middle-end/71488
        * tree-vect-patterns.c (vect_recog_mask_conversion_pattern): Support
        comparison of boolean vectors.
        * tree-vect-stmts.c (vectorizable_comparison): Vectorize comparison
        of boolean vectors using bitwise operations.

gcc/testsuite/

2016-06-15  Ilya Enkovich  <ilya.enkov...@intel.com>

        PR middle-end/71488
        * g++.dg/pr71488.C: New test.
        * gcc.dg/vect/vect-bool-cmp.c: New test.


diff --git a/gcc/testsuite/g++.dg/pr71488.C b/gcc/testsuite/g++.dg/pr71488.C
new file mode 100644
index 0000000..d7d657e
--- /dev/null
+++ b/gcc/testsuite/g++.dg/pr71488.C
@@ -0,0 +1,24 @@
+// PR middle-end/71488
+// { dg-do run }
+// { dg-options "-O3 -std=c++11" }
+// { dg-additional-options "-march=westmere" { target i?86-*-* x86_64-*-* } }
+// { dg-require-effective-target c++11 }
+
+#include <valarray>
+
+int var_4 = 1;
+long long var_9 = 0;
+
+int main() {
+  
+  std::valarray<std::valarray<long long>> v10;
+
+  v10.resize(1);
+  v10[0].resize(4);
+
+  for (int i = 0; i < 4; i++)
+    v10[0][i] = ((var_9 == 0) > unsigned (var_4 == 0)) + (var_9 == 0);
+
+  if (v10[0][0] != 2)
+    __builtin_abort ();
+}
diff --git a/gcc/testsuite/gcc.dg/vect/vect-bool-cmp.c 
b/gcc/testsuite/gcc.dg/vect/vect-bool-cmp.c
new file mode 100644
index 0000000..a1e2a24
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-bool-cmp.c
@@ -0,0 +1,252 @@
+/* PR71488 */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target vect_pack_trunc } */
+/* { dg-additional-options "-msse4" { target { i?86-*-* x86_64-*-* } } } */
+
+int i1, i2;
+
+void __attribute__((noclone,noinline))
+fn1 (int * __restrict__ p1, int * __restrict__ p2, int * __restrict__ p3, int 
size)
+{
+  int i;
+
+  for (i = 0; i < size; i++)
+    p1[i] = ((p2[i] == 0) > (unsigned)(p3[i] == 0)) + (p2[i] == 0);
+}
+
+void __attribute__((noclone,noinline))
+fn2 (int * __restrict__ p1, int * __restrict__ p2, short * __restrict__ p3, 
int size)
+{
+  int i;
+
+  for (i = 0; i < size; i++)
+    p1[i] = ((p2[i] == 0) > (unsigned)(p3[i] == 0)) + (p2[i] == 0);
+}
+
+void __attribute__((noclone,noinline))
+fn3 (int * __restrict__ p1, int * __restrict__ p2, long long * __restrict__ 
p3, int size)
+{
+  int i;
+
+  for (i = 0; i < size; i++)
+    p1[i] = ((p2[i] == 0) > (unsigned)(p3[i] == 0)) + (p2[i] == 0);
+}
+
+void __attribute__((noclone,noinline))
+fn4 (int * __restrict__ p1, int * __restrict__ p2, int * __restrict__ p3, int 
size)
+{
+  int i;
+
+  for (i = 0; i < size; i++)
+    p1[i] = ((p2[i] == 0) >= (unsigned)(p3[i] == 0)) + (p2[i] == 0);
+}
+
+void __attribute__((noclone,noinline))
+fn5 (int * __restrict__ p1, int * __restrict__ p2, short * __restrict__ p3, 
int size)
+{
+  int i;
+
+  for (i = 0; i < size; i++)
+    p1[i] = ((p2[i] == 0) >= (unsigned)(p3[i] == 0)) + (p2[i] == 0);
+}
+
+void __attribute__((noclone,noinline))
+fn6 (int * __restrict__ p1, int * __restrict__ p2, long long * __restrict__ 
p3, int size)
+{
+  int i;
+
+  for (i = 0; i < size; i++)
+    p1[i] = ((p2[i] == 0) >= (unsigned)(p3[i] == 0)) + (p2[i] == 0);
+}
+
+void __attribute__((noclone,noinline))
+fn7 (int * __restrict__ p1, int * __restrict__ p2, int * __restrict__ p3, int 
size)
+{
+  int i;
+
+  for (i = 0; i < size; i++)
+    p1[i] = ((p2[i] == 0) < (unsigned)(p3[i] == 0)) + (p2[i] == 0);
+}
+
+void __attribute__((noclone,noinline))
+fn8 (int * __restrict__ p1, int * __restrict__ p2, short * __restrict__ p3, 
int size)
+{
+  int i;
+
+  for (i = 0; i < size; i++)
+    p1[i] = ((p2[i] == 0) < (unsigned)(p3[i] == 0)) + (p2[i] == 0);
+}
+
+void __attribute__((noclone,noinline))
+fn9 (int * __restrict__ p1, int * __restrict__ p2, long long * __restrict__ 
p3, int size)
+{
+  int i;
+
+  for (i = 0; i < size; i++)
+    p1[i] = ((p2[i] == 0) < (unsigned)(p3[i] == 0)) + (p2[i] == 0);
+}
+
+void __attribute__((noclone,noinline))
+fn10 (int * __restrict__ p1, int * __restrict__ p2, int * __restrict__ p3, int 
size)
+{
+  int i;
+
+  for (i = 0; i < size; i++)
+    p1[i] = ((p2[i] == 0) <= (unsigned)(p3[i] == 0)) + (p2[i] == 0);
+}
+
+void __attribute__((noclone,noinline))
+fn11 (int * __restrict__ p1, int * __restrict__ p2, short * __restrict__ p3, 
int size)
+{
+  int i;
+
+  for (i = 0; i < size; i++)
+    p1[i] = ((p2[i] == 0) <= (unsigned)(p3[i] == 0)) + (p2[i] == 0);
+}
+
+void __attribute__((noclone,noinline))
+fn12 (int * __restrict__ p1, int * __restrict__ p2, long long * __restrict__ 
p3, int size)
+{
+  int i;
+
+  for (i = 0; i < size; i++)
+    p1[i] = ((p2[i] == 0) <= (unsigned)(p3[i] == 0)) + (p2[i] == 0);
+}
+
+void __attribute__((noclone,noinline))
+fn13 (int * __restrict__ p1, int * __restrict__ p2, int * __restrict__ p3, int 
size)
+{
+  int i;
+
+  for (i = 0; i < size; i++)
+    p1[i] = ((p2[i] == 0) == (unsigned)(p3[i] == 0)) + (p2[i] == 0);
+}
+
+void __attribute__((noclone,noinline))
+fn14 (int * __restrict__ p1, int * __restrict__ p2, short * __restrict__ p3, 
int size)
+{
+  int i;
+
+  for (i = 0; i < size; i++)
+    p1[i] = ((p2[i] == 0) == (unsigned)(p3[i] == 0)) + (p2[i] == 0);
+}
+
+void __attribute__((noclone,noinline))
+fn15 (int * __restrict__ p1, int * __restrict__ p2, long long * __restrict__ 
p3, int size)
+{
+  int i;
+
+  for (i = 0; i < size; i++)
+    p1[i] = ((p2[i] == 0) == (unsigned)(p3[i] == 0)) + (p2[i] == 0);
+}
+
+void __attribute__((noclone,noinline))
+fn16 (int * __restrict__ p1, int * __restrict__ p2, int * __restrict__ p3, int 
size)
+{
+  int i;
+
+  for (i = 0; i < size; i++)
+    p1[i] = ((p2[i] == 0) != (unsigned)(p3[i] == 0)) + (p2[i] == 0);
+}
+
+void __attribute__((noclone,noinline))
+fn17 (int * __restrict__ p1, int * __restrict__ p2, short * __restrict__ p3, 
int size)
+{
+  int i;
+
+  for (i = 0; i < size; i++)
+    p1[i] = ((p2[i] == 0) != (unsigned)(p3[i] == 0)) + (p2[i] == 0);
+}
+
+void __attribute__((noclone,noinline))
+fn18 (int * __restrict__ p1, int * __restrict__ p2, long long * __restrict__ 
p3, int size)
+{
+  int i;
+
+  for (i = 0; i < size; i++)
+    p1[i] = ((p2[i] == 0) != (unsigned)(p3[i] == 0)) + (p2[i] == 0);
+}
+
+int eq (int i1, int i2) { return i1 == i2; }
+int ne (int i1, int i2) { return i1 != i2; }
+int lt (int i1, int i2) { return i1 < i2; }
+int le (int i1, int i2) { return i1 <= i2; }
+int gt (int i1, int i2) { return i1 > i2; }
+int ge (int i1, int i2) { return i1 >= i2; }
+
+typedef int (*cmp_fn)(int, int);
+
+void
+check (int *p, cmp_fn fn)
+{
+  int i;
+
+  for (i = 0; i < 32; i++)
+    {
+      int t1 = ((i % 4) > 1) == 0;
+      int t2 = (i % 2) == 0;
+      int res = fn (t1, t2) + t1;
+      if (p[i] != res)
+       __builtin_abort ();
+    }
+}
+
+int
+main (int argc, char **argv)
+{
+  int i1[32], i2[32], res[32];
+  short s2[32];
+  long long l2[32];
+  int i;
+
+  for (i = 0; i < 32; i++)
+    {
+      l2[i] = i2[i] = s2[i] = i % 2;
+      i1[i] = (i % 4) > 1;
+      asm ("":::"memory");
+    }
+
+  fn1 (res, i1, i2, 32);
+  check (res, gt);
+  fn2 (res, i1, s2, 32);
+  check (res, gt);
+  fn3 (res, i1, l2, 32);
+  check (res, gt);
+
+  fn4 (res, i1, i2, 32);
+  check (res, ge);
+  fn5 (res, i1, s2, 32);
+  check (res, ge);
+  fn6 (res, i1, l2, 32);
+  check (res, ge);
+
+  fn7 (res, i1, i2, 32);
+  check (res, lt);
+  fn8 (res, i1, s2, 32);
+  check (res, lt);
+  fn9 (res, i1, l2, 32);
+  check (res, lt);
+
+  fn10 (res, i1, i2, 32);
+  check (res, le);
+  fn11 (res, i1, s2, 32);
+  check (res, le);
+  fn12 (res, i1, l2, 32);
+  check (res, le);
+
+  fn13 (res, i1, i2, 32);
+  check (res, eq);
+  fn14 (res, i1, s2, 32);
+  check (res, eq);
+  fn15 (res, i1, l2, 32);
+  check (res, eq);
+
+  fn16 (res, i1, i2, 32);
+  check (res, ne);
+  fn17 (res, i1, s2, 32);
+  check (res, ne);
+  fn18 (res, i1, l2, 32);
+  check (res, ne);
+}
+
+/* { dg-final { scan-tree-dump-times "VECTORIZED" 18 "vect" { target { 
i?86-*-* x86_64-*-* } } } } */
diff --git a/gcc/tree-vect-patterns.c b/gcc/tree-vect-patterns.c
index 8a2221f..f0c515d 100644
--- a/gcc/tree-vect-patterns.c
+++ b/gcc/tree-vect-patterns.c
@@ -3763,7 +3763,8 @@ vect_recog_mask_conversion_pattern (vec<gimple *> *stmts, 
tree *type_in,
 
   if (rhs_code != BIT_IOR_EXPR
       && rhs_code != BIT_XOR_EXPR
-      && rhs_code != BIT_AND_EXPR)
+      && rhs_code != BIT_AND_EXPR
+      && TREE_CODE_CLASS (rhs_code) != tcc_comparison)
     return NULL;
 
   rhs2 = gimple_assign_rhs2 (last_stmt);
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index c74f14f..5c65502 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -7756,7 +7756,7 @@ vectorizable_comparison (gimple *stmt, 
gimple_stmt_iterator *gsi,
   enum vect_def_type dts[2] = {vect_unknown_def_type, vect_unknown_def_type};
   unsigned nunits;
   int ncopies;
-  enum tree_code code;
+  enum tree_code code, bitop1 = NOP_EXPR, bitop2 = NOP_EXPR;
   stmt_vec_info prev_stmt_info = NULL;
   int i, j;
   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
@@ -7829,11 +7829,74 @@ vectorizable_comparison (gimple *stmt, 
gimple_stmt_iterator *gsi,
   else if (nunits != TYPE_VECTOR_SUBPARTS (vectype))
     return false;
 
+  /* Can't compare mask and non-mask types.  */
+  if (vectype1 && vectype2
+      && (VECTOR_BOOLEAN_TYPE_P (vectype1) ^ VECTOR_BOOLEAN_TYPE_P (vectype2)))
+    return false;
+
+  /* Boolean values may have another representation in vectors
+     and therefore we prefer bit operations over comparison for
+     them (which also works for scalar masks).  We store opcodes
+     to use in bitop1 and bitop2.  Statement is vectorized as
+       BITOP2 (rhs1 BITOP1 rhs2) or
+       rhs1 BITOP2 (BITOP1 rhs2)
+     depending on bitop1 and bitop2 arity.  */
+  if (VECTOR_BOOLEAN_TYPE_P (vectype))
+    {
+      if (code == GT_EXPR)
+       {
+         bitop1 = BIT_NOT_EXPR;
+         bitop2 = BIT_AND_EXPR;
+       }
+      else if (code == GE_EXPR)
+       {
+         bitop1 = BIT_NOT_EXPR;
+         bitop2 = BIT_IOR_EXPR;
+       }
+      else if (code == LT_EXPR)
+       {
+         bitop1 = BIT_NOT_EXPR;
+         bitop2 = BIT_AND_EXPR;
+         std::swap (rhs1, rhs2);
+       }
+      else if (code == LE_EXPR)
+       {
+         bitop1 = BIT_NOT_EXPR;
+         bitop2 = BIT_IOR_EXPR;
+         std::swap (rhs1, rhs2);
+       }
+      else
+       {
+         bitop1 = BIT_XOR_EXPR;
+         if (code == EQ_EXPR)
+           bitop2 = BIT_NOT_EXPR;
+       }
+    }
+
   if (!vec_stmt)
     {
       STMT_VINFO_TYPE (stmt_info) = comparison_vec_info_type;
-      vect_model_simple_cost (stmt_info, ncopies, dts, NULL, NULL);
-      return expand_vec_cmp_expr_p (vectype, mask_type);
+      vect_model_simple_cost (stmt_info, ncopies * (1 + (bitop2 != NOP_EXPR)),
+                             dts, NULL, NULL);
+      if (bitop1 == NOP_EXPR)
+       return expand_vec_cmp_expr_p (vectype, mask_type);
+      else
+       {
+         machine_mode mode = TYPE_MODE (vectype);
+         optab optab;
+
+         optab = optab_for_tree_code (bitop1, vectype, optab_default);
+         if (!optab || optab_handler (optab, mode) == CODE_FOR_nothing)
+           return false;
+
+         if (bitop2 != NOP_EXPR)
+           {
+             optab = optab_for_tree_code (bitop2, vectype, optab_default);
+             if (!optab || optab_handler (optab, mode) == CODE_FOR_nothing)
+               return false;
+           }
+         return true;
+       }
     }
 
   /* Transform.  */
@@ -7890,8 +7953,31 @@ vectorizable_comparison (gimple *stmt, 
gimple_stmt_iterator *gsi,
          vec_rhs2 = vec_oprnds1[i];
 
          new_temp = make_ssa_name (mask);
-         new_stmt = gimple_build_assign (new_temp, code, vec_rhs1, vec_rhs2);
-         vect_finish_stmt_generation (stmt, new_stmt, gsi);
+         if (bitop1 == NOP_EXPR)
+           {
+             new_stmt = gimple_build_assign (new_temp, code,
+                                             vec_rhs1, vec_rhs2);
+             vect_finish_stmt_generation (stmt, new_stmt, gsi);
+           }
+         else
+           {
+             if (bitop1 == BIT_NOT_EXPR)
+               new_stmt = gimple_build_assign (new_temp, bitop1, vec_rhs2);
+             else
+               new_stmt = gimple_build_assign (new_temp, bitop1, vec_rhs1,
+                                               vec_rhs2);
+             vect_finish_stmt_generation (stmt, new_stmt, gsi);
+             if (bitop2 != NOP_EXPR)
+               {
+                 tree res = make_ssa_name (mask);
+                 if (bitop2 == BIT_NOT_EXPR)
+                   new_stmt = gimple_build_assign (res, bitop2, new_temp);
+                 else
+                   new_stmt = gimple_build_assign (res, bitop2, vec_rhs1,
+                                                   new_temp);
+                 vect_finish_stmt_generation (stmt, new_stmt, gsi);
+               }
+           }
          if (slp_node)
            SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
        }

Reply via email to