Hi All,

Here’s a respin of the patch.

The following example

void f5(float * restrict z0, float * restrict z1, float *restrict x,
        float * restrict y, float c, int n)
{
    for (int i = 0; i < n; i++) {
        float a = x[i];
        float b = y[i];
        if (a > b) {
            z0[i] = a + b;
            if (a > c) {
                z1[i] = a - b;
            }
        }
    }
}

currently generates:

        ptrue   p3.b, all
        ld1w    z1.s, p1/z, [x2, x5, lsl 2]
        ld1w    z2.s, p1/z, [x3, x5, lsl 2]
        fcmgt   p0.s, p3/z, z1.s, z0.s
        fcmgt   p2.s, p1/z, z1.s, z2.s
        fcmgt   p0.s, p0/z, z1.s, z2.s
        and     p0.b, p0/z, p1.b, p1.b

The conditions for a > b and a > c become separate comparisons.

After this patch we generate:

        ld1w    z1.s, p0/z, [x2, x5, lsl 2]
        ld1w    z2.s, p0/z, [x3, x5, lsl 2]
        fcmgt   p1.s, p0/z, z1.s, z2.s
        fcmgt   p1.s, p1/z, z1.s, z0.s

Here the conditions a > b and a > c are folded by using the predicate result of
the previous compare, which allows one of the compares to be removed.

Whenever a mask is generated from a BIT_AND, we mask the operands of the AND
instead and then simply AND the results.
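
As a rough scalar sketch of the idea (illustration only, not the GIMPLE the
vectorizer actually emits; loop_mask here stands for the loop predicate):

/* Before: the two compares are combined unpredicated and only the
   final result is ANDed with the loop mask.  */
int combine_before (int loop_mask, float a, float b, float c)
{
  int cmp1 = a > b;
  int cmp2 = a > c;
  return (cmp1 & cmp2) & loop_mask;
}

/* After: each operand of the AND is masked individually.  The masked
   a > b is the same expression already computed for the outer if, so
   it can be CSE'd and the second compare ends up predicated on it.  */
int combine_after (int loop_mask, float a, float b, float c)
{
  int masked_cmp1 = loop_mask & (a > b);
  int masked_cmp2 = loop_mask & (a > c);
  return masked_cmp1 & masked_cmp2;
}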

This allows us to CSE the masks and generate the right combination.
However, because re-assoc will try to re-order the operands of the &, we now
have to perform a small local CSE on the vectorized loop if vectorization is
successful.
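
To illustrate the re-association problem (a toy sketch with made-up names, not
GCC's implementation): once the operands of the & have been swapped, the two
masked compares only match if the lookup treats the & as commutative, which is
what the local CSE over the vectorized loop has to handle:

#include <stdio.h>

/* Toy model of a commutative CSE lookup: an AND of two masks is keyed
   on its operand ids; sorting the ids makes "m1 & m2" and "m2 & m1"
   (the re-assoc'd form) produce the same key, so the duplicate can be
   detected and removed.  */

struct and_key { int op0, op1; };

static struct and_key make_key (int op0, int op1)
{
  struct and_key k = { op0, op1 };
  if (k.op0 > k.op1)            /* canonical operand order  */
    { int t = k.op0; k.op0 = k.op1; k.op1 = t; }
  return k;
}

static int same_key (struct and_key a, struct and_key b)
{
  return a.op0 == b.op0 && a.op1 == b.op1;
}

int main (void)
{
  struct and_key original = make_key (1, 2);   /* mask1 & mask2 */
  struct and_key reordered = make_key (2, 1);  /* mask2 & mask1 */
  printf ("duplicates detected: %d\n", same_key (original, reordered));
  return 0;
}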

Note: This patch series is working incrementally towards generating the most
      efficient code for this and other loops in small steps.

Bootstrapped and regtested on aarch64-none-linux-gnu and x86_64-linux-gnu with
no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

        * tree-vect-stmts.c (vectorizable_operation): When combining two
        masks, mask the operands instead of the combined operation.

gcc/testsuite/ChangeLog:

        * gcc.target/aarch64/sve/pred-combine-and.c: New test.

--- inline copy of patch ---

diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pred-combine-and.c b/gcc/testsuite/gcc.target/aarch64/sve/pred-combine-and.c
new file mode 100644
index 0000000000000000000000000000000000000000..ed7fb591ec69dbdafe27fc9aa08a0b0910c94003
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pred-combine-and.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 --save-temps" } */
+
+void f5(float * restrict z0, float * restrict z1, float *restrict x, float * restrict y, float c, int n)
+{
+    for (int i = 0; i < n; i++) {
+        float a = x[i];
+        float b = y[i];
+        if (a > b) {
+            z0[i] = a + b;
+            if (a > c) {
+                z1[i] = a - b;
+            }
+        }
+    }
+}
+
+/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-9]+/z, z[0-9]+\.s, z[0-9]+\.s} 2 } } */
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 1f56e10709e8f27d768c04f7ef914e2cd9347c36..27ee48aea429810a37777d907435a92b8fd1817d 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -6302,10 +6302,39 @@ vectorizable_operation (vec_info *vinfo,
        }
       else
        {
+         /* When combining two masks check if either of them has already been
+            combined with a loop mask.  If so, we can mark that the new
+            combined mask doesn't need to be combined with a loop mask.  */
+         if (masked_loop_p && code == BIT_AND_EXPR)
+           {
+             scalar_cond_masked_key cond1 (op0, ncopies);
+             if (loop_vinfo->scalar_cond_masked_set.contains (cond1))
+               {
+                 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
+                                                 vectype, i);
+
+                 vop0 = prepare_load_store_mask (TREE_TYPE (mask), mask, vop0, gsi);
+                 scalar_cond_masked_key cond (scalar_dest, ncopies);
+                 loop_vinfo->scalar_cond_masked_set.add (cond);
+               }
+
+             scalar_cond_masked_key cond2 (op1, ncopies);
+             if (loop_vinfo->scalar_cond_masked_set.contains (cond2))
+               {
+                 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
+                                                 vectype, i);
+
+                 vop1 = prepare_load_store_mask (TREE_TYPE (mask), mask, vop1, gsi);
+                 scalar_cond_masked_key cond (scalar_dest, ncopies);
+                 loop_vinfo->scalar_cond_masked_set.add (cond);
+               }
+           }
+
          new_stmt = gimple_build_assign (vec_dest, code, vop0, vop1, vop2);
          new_temp = make_ssa_name (vec_dest, new_stmt);
          gimple_assign_set_lhs (new_stmt, new_temp);
          vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
+
          if (vec_cvt_dest)
            {
              new_temp = build1 (VIEW_CONVERT_EXPR, vectype_out, new_temp);
