The example

float *e;
/* Accumulate f[i] * m over i in [0, n) and store the sum through the
   global pointer e.  For each i, m is g[c + i * d] when h[i] is nonzero
   and g[b + i] otherwise.  */
void f (float *f, float *g, char *h, int n,
        int b, int c, int d)
{
  float a = 0;
  for (int i = 0; i < n; ++i) {
    /* j advances by 1 per iteration starting at b; k advances by d per
       iteration starting at c.  */
    int j = b + i, k = c + i * d;
    /* g[j] is read on every iteration; g[k] is read only when h[i] is
       nonzero.  */
    float l = g[j], m = h[i] ? g[k] : l;
    a += f[i] * m;
  }
  *e = a;
}

gets vectorized using gathers for the access to g:

.L5:
        ld1b    z4.s, p7/z, [x2, x6]
        cmpne   p6.b, p7/z, z4.b, #0
        ld1w    z2.s, p7/z, [x0, x6, lsl 2]
        add     z7.s, z30.s, z16.s
        add     z6.s, z16.s, z18.s
        add     x6, x6, x7
        ld1w    z5.s, p7/z, [x1, z6.s, sxtw 2]
        ld1w    z3.s, p6/z, [x1, z7.s, sxtw 2]
        incw    z16.s
        sel     z3.s, p6, z3.s, z5.s
        fmla    z17.s, p7/m, z2.s, z3.s
        whilelo p7.s, w6, w3
        b.any   .L5

However, the first access to g is g[b + i] and the second is g[c + i * d].

Since b is loop invariant, the access to g[b + i] is actually linear.  Likewise,
since c is loop invariant, the second access g[c + i * d] can be simplified by
recognizing its base as g + c.

Today however SCEV fails to analyze these accesses as affine and as a
consequence we end up with gathers:

: missed:  failed: evolution of base is not affine.
        base_address:
        offset from base address:
        constant offset from base address:
        step:
        base alignment: 0
        base misalignment: 0
        offset alignment: 0
        step alignment: 0
        base_object: *_63

Looking at SCEV this is because of an outer cast around the CHREC:

)
(set_scalar_evolution
  instantiated_below = 25
  (scalar = _65)
  (scalar_evolution = (long unsigned int) {b_22(D), +, 1}_2))
)
(instantiate_scev
  (instantiate_below = 25 -> 12)
  (evolution_loop = 2)
  (chrec = (long unsigned int) {b_22(D), +, 1}_2)

(instantiate_scev
  (instantiate_below = 25 -> 12)
  (evolution_loop = 2)
  (chrec = g_27(D))
  (res = g_27(D)))

  which corresponds to

  j_66 = b_22(D) + i_67;
  _65 = (long unsigned int) j_66;
  _64 = _65 * 4;
  _63 = g_27(D) + _64;
  l_62 = *_63;

and the _64 is deemed to not be affine:

(instantiate_scev
  (instantiate_below = 25 -> 12)
  (evolution_loop = 2)
  (chrec = _64)
(analyze_scalar_evolution
  (loop_nb = 2)
  (scalar = _64)
(get_scalar_evolution
  (scalar = _64)
  (scalar_evolution = _64))
)
  (res = scev_not_known))

This patch fixes it by (very carefully) folding a multiply on an unsigned affine
CHREC into the CHREC itself.

which results in

(instantiate_scev
  (instantiate_below = 25 -> 12)
  (evolution_loop = 2)
  (chrec = 4)
  (res = 4))
(set_scalar_evolution
  instantiated_below = 25
  (scalar = _64)
  (scalar_evolution = {(long unsigned int) b_22(D) * 4, +, 4}_2))
)
(instantiate_scev
  (instantiate_below = 25 -> 12)
  (evolution_loop = 2)
  (chrec = g_27(D))
  (res = g_27(D)))
(instantiate_scev
  (instantiate_below = 25 -> 12)
  (evolution_loop = 2)
  (chrec = {(long unsigned int) b_22(D) * 4, +, 4}_2)
  (res = {(long unsigned int) b_22(D) * 4, +, 4}_2))
(set_scalar_evolution
  instantiated_below = 25
  (scalar = _63)
  (scalar_evolution = {g_27(D) + (long unsigned int) b_22(D) * 4, +, 4}_2))
)

and dataref now correctly analyzes the base

        base_address: g_27(D) + (sizetype) b_22(D) * 4
        offset from base address: 0
        constant offset from base address: 0
        step: 4
        base alignment: 4
        base misalignment: 0
        offset alignment: 128
        step alignment: 4
        base_object: *g_27(D) + (sizetype) b_22(D) * 4
        Access function 0: {0B, +, 4}_2

producing the final codegen:

.L7:
        ld1b    z4.s, p7/z, [x2, x6]
        cmpne   p6.b, p7/z, z4.b, #0
        ld1w    z29.s, p7/z, [x4, x6, lsl 2]
        ld1w    z2.s, p7/z, [x0, x6, lsl 2]
        ld1w    z3.s, p6/z, [x5]
        add     x6, x6, x7
        sel     z3.s, p6, z3.s, z29.s
        add     x5, x5, x1
        fmla    z30.s, p7/m, z2.s, z3.s
        whilelo p7.s, w6, w3
        b.any   .L7
        faddv   s31, p5, z30.s

Bootstrapped and regtested on aarch64-none-linux-gnu,
arm-none-linux-gnueabihf, and x86_64-pc-linux-gnu (-m32 and -m64)
with no issues.
Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

        * tree-chrec.cc (chrec_fold_multiply): Fold unsigned CHREC mult.

gcc/testsuite/ChangeLog:

        * gcc.dg/vect/vect-scev-affine_1.c: New test.

---
diff --git a/gcc/testsuite/gcc.dg/vect/vect-scev-affine_1.c b/gcc/testsuite/gcc.dg/vect/vect-scev-affine_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..929012184e0a2595af826d3d06284d0a6a510119
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-scev-affine_1.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_float } */
+
+float *e;
+void f (float *f, float *g, char *h, int n,
+        int b, int c, int d)
+{
+  float a = 0;
+  for (int i = 0; i < n; ++i) {
+    int j = b + i, k = c + i * d;
+    float l = g[j], m = h[i] ? g[k] : l;
+    a += f[i] * m;
+  }
+  *e = a;
+}
+
+/* { dg-final { scan-tree-dump-not {failed: evolution of base is not affine} "vect" { target aarch64*-*-* } } } */
diff --git a/gcc/tree-chrec.cc b/gcc/tree-chrec.cc
index 09dd81900bce70138f975c68b77c4ba6d0e45fc3..ff77c7a6c2397f65f3ee17a386408b5ceec4676d 100644
--- a/gcc/tree-chrec.cc
+++ b/gcc/tree-chrec.cc
@@ -508,9 +508,52 @@ chrec_fold_multiply (tree type,
     CASE_CONVERT:
       if (tree_contains_chrecs (op0, NULL))
        {
+         tree inner = TREE_OPERAND (op0, 0);
+         tree inner_type = TREE_TYPE (inner);
+
+         /* Keep widening unsigned multiplies of affine CHRECs affine.
+            This handles byte-offset computations such as
+            (unsigned T) {base, +, step} * C and folds these into
+            {(unsigned T) base * C, +, (unsigned T) step * C}.  */
+         if (evolution_function_is_affine_p (inner)
+             /* The CHREC we're trying to distribute the cast into must be
+                affine already.  */
+             && tree_does_not_contain_chrecs (op1)
+             && INTEGRAL_TYPE_P (type)
+             && INTEGRAL_TYPE_P (inner_type)
+             /* Must be unsigned so we don't introduce any UB.  */
+             && TYPE_UNSIGNED (type)
+             /* The outer type must be at least as wide as the inner type so
+                we don't truncate when we fold, and the inner CHREC must be
+                non-wrapping so we don't change the behavior when folding to
+                a wider type.  */
+             && TYPE_PRECISION (type) >= TYPE_PRECISION (inner_type)
+             && (!TYPE_UNSIGNED (inner_type)
+                 || TYPE_PRECISION (type) == TYPE_PRECISION (inner_type)
+                 || nonwrapping_chrec_p (inner))
+             /* The component we are multiplying must be loop invariant
+                otherwise the base expression can't be simplified and the
+                resulting CHREC won't be affine.  */
+             && evolution_function_is_invariant_p (op1,
+                                                   CHREC_VARIABLE (inner)))
+           {
+             tree top1 = chrec_convert (type, op1, NULL);
+             tree left
+               = chrec_fold_multiply (type,
+                                      chrec_convert (type, CHREC_LEFT (inner),
+                                                     NULL), top1);
+             tree right
+               = chrec_fold_multiply (type,
+                                      chrec_convert_rhs (type,
+                                                         CHREC_RIGHT (inner),
+                                                         NULL), top1);
+             return build_polynomial_chrec (CHREC_VARIABLE (inner),
+                                            left, right);
+           }
+
          /* We can strip sign-conversions to signed by performing the
             operation in unsigned.  */
-         tree optype = TREE_TYPE (TREE_OPERAND (op0, 0));
+         tree optype = inner_type;
          if (INTEGRAL_TYPE_P (type)
              && INTEGRAL_TYPE_P (optype)
              && tree_nop_conversion_p (type, optype)


-- 
diff --git a/gcc/testsuite/gcc.dg/vect/vect-scev-affine_1.c b/gcc/testsuite/gcc.dg/vect/vect-scev-affine_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..929012184e0a2595af826d3d06284d0a6a510119
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-scev-affine_1.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_float } */
+
+float *e;
+void f (float *f, float *g, char *h, int n,
+        int b, int c, int d)
+{
+  float a = 0;
+  for (int i = 0; i < n; ++i) {
+    int j = b + i, k = c + i * d;
+    float l = g[j], m = h[i] ? g[k] : l;
+    a += f[i] * m;
+  }
+  *e = a;
+}
+
+/* { dg-final { scan-tree-dump-not {failed: evolution of base is not affine} "vect" { target aarch64*-*-* } } } */
diff --git a/gcc/tree-chrec.cc b/gcc/tree-chrec.cc
index 09dd81900bce70138f975c68b77c4ba6d0e45fc3..ff77c7a6c2397f65f3ee17a386408b5ceec4676d 100644
--- a/gcc/tree-chrec.cc
+++ b/gcc/tree-chrec.cc
@@ -508,9 +508,52 @@ chrec_fold_multiply (tree type,
     CASE_CONVERT:
       if (tree_contains_chrecs (op0, NULL))
 	{
+	  tree inner = TREE_OPERAND (op0, 0);
+	  tree inner_type = TREE_TYPE (inner);
+
+	  /* Keep widening unsigned multiplies of affine CHRECs affine.
+	     This handles byte-offset computations such as
+	     (unsigned T) {base, +, step} * C and folds these into
+	     {(unsigned T) base * C, +, (unsigned T) step * C}.  */
+	  if (evolution_function_is_affine_p (inner)
+	      /* The CHREC we're trying to distribute the cast into must be
+		 affine already.  */
+	      && tree_does_not_contain_chrecs (op1)
+	      && INTEGRAL_TYPE_P (type)
+	      && INTEGRAL_TYPE_P (inner_type)
+	      /* Must be unsigned so we don't introduce any UB.  */
+	      && TYPE_UNSIGNED (type)
+	      /* The outer type must be at least as wide as the inner type so
+		 we don't truncate when we fold, and the inner CHREC must be
+		 non-wrapping so we don't change the behavior when folding to
+		 a wider type.  */
+	      && TYPE_PRECISION (type) >= TYPE_PRECISION (inner_type)
+	      && (!TYPE_UNSIGNED (inner_type)
+		  || TYPE_PRECISION (type) == TYPE_PRECISION (inner_type)
+		  || nonwrapping_chrec_p (inner))
+	      /* The component we are multiplying must be loop invariant
+		 otherwise the base expression can't be simplified and the
+		 resulting CHREC won't be affine.  */
+	      && evolution_function_is_invariant_p (op1,
+						    CHREC_VARIABLE (inner)))
+	    {
+	      tree top1 = chrec_convert (type, op1, NULL);
+	      tree left
+		= chrec_fold_multiply (type,
+				       chrec_convert (type, CHREC_LEFT (inner),
+						      NULL), top1);
+	      tree right
+		= chrec_fold_multiply (type,
+				       chrec_convert_rhs (type,
+							  CHREC_RIGHT (inner),
+							  NULL), top1);
+	      return build_polynomial_chrec (CHREC_VARIABLE (inner),
+					     left, right);
+	    }
+
 	  /* We can strip sign-conversions to signed by performing the
 	     operation in unsigned.  */
-	  tree optype = TREE_TYPE (TREE_OPERAND (op0, 0));
+	  tree optype = inner_type;
 	  if (INTEGRAL_TYPE_P (type)
 	      && INTEGRAL_TYPE_P (optype)
 	      && tree_nop_conversion_p (type, optype)

Reply via email to