Author: pallavimathew
Date: 2011-06-01 14:06:08 -0400 (Wed, 01 Jun 2011)
New Revision: 3636

Modified:
   trunk/osprey/be/lno/simd.cxx
Log:

Sample testcases that fail at -O3:

#include <stdio.h>
int a[1000], b[1000], c[1000];
void init (void) {
    int i;
    for (i = 0; i < 1000; i++) {
        a[i] = 1;
        b[i] = 2;
        c[i] = 3;
    }
}

int
main (int argc, char** argv) {
    int i; 
    init ();
    for (i = 2; i < 1000; i++) {
        b[i] = a[i-2];
        a[i] = c[i];
    }
    fprintf (stderr, "%d\n", b[6]);
    return 0;
}
---
#include <stdio.h>

#define N 1000
double d1[N], d2[N], d3[N];
float f1[N], f2[N];

void foo (void) {
    int i;
    for (i = 4; i < N; i++) {
        d1[i] = d2[i];         /* s1 */
        f1[i] = f2[i];         /* s2 */
        d3[i] = d1[i+3];       /* s3 */
    }
}

void
init (void) {
    int i;
    for (i = 0; i < N; i++) {
        d1[i] = 1.0;
        d2[i] = 2.0;
        d3[i] = 3.0;
    }
}

int
main (int argc, char** argv) {
    init ();
    foo ();
    fprintf (stderr, "%f\n", (float)d3[4]);
    return 0;
} 

---
#include <stdio.h>

#define N 1000
double d1[N], d2[N], d3[N];
char f1[N], f2[N];

void foo (void) {
    int i;
    for (i = 0; i < N - 4; i++) {
        d1[i] = d2[i] + d3[i+1];         /* s1 */
        f1[i] = f2[i];         /* s2 */
        d3[i] = d1[i+3];       /* s3 */
    }
}

void init (void) {
    int i;
    for (i = 0; i < N; i++) {
        d1[i] = 1.0 * i;
        d2[i] = 2.0 * i;
        d3[i] = 3.0 * i;
    }
}

int main (int argc, char** argv) {
    int i;
    init ();
    foo ();
    for (i=0; i<10; i++){
        fprintf (stderr, "%f ", (float)d3[i]);
    }
    fprintf(stderr, "\n");
    return 0;
}
---
#include <stdio.h>
#define N 1000
double d1[N], d2[N], d3[N];
char f1[N], f2[N];

int main (int argc, char** argv) {
  int i;
  for (i = 0; i < N; i++) {
    d3[i] = 3.0 * i;       /* s1 */
    f1[i] = f2[i];         /* s2 */
  }
  for (i=0; i<10; i++){
    fprintf (stderr, "%f ", (float)d3[i]);
  }
  fprintf(stderr, "\n");
  return 0;
}

---
Problem/Fix-Description:
 The central issue is incorrect ordering of vectorized statements.

 In the first example, the vector length is 4. Therefore, we need to 
 re-order the two statements in order to perform vectorization correctly.

 b[i] = a[i-2];
 a[i] = c[i];

 To fix the problem, we topologically reorder the statements right
 before performing vectorization.
 
 In the other examples the problem is caused by incorrect order of certain 
 unrolled statements during vectorization, coupled sometimes (last two 
examples) 
 with incorrect handling of induction variables in loop unrolling during SIMD. 
 Before this fix, copies of unrolled statement are inserted right after current 
statement, 
 without checking dependencies. This causes problems to the following example 
code:

 d1[i] = d2[i] + d3[i+1]; /* s1 */
 f1[i] = f2[i]; /* s2 */
 d3[i] = d1[i+3]; /* s3 */

 Due to the presence of s2, s1 and s3 need to be unrolled by 2. So,
 the code will become the following:

 d1[i:i+1] = d2[i:i+1] + d3[i+1:i+2];/* s1 */
 d1[i+2:i+3] = d2[i+2:i+3] + d3[i+3:i+4];/* s1' */
 f1[i:i+3] = f2[i:i+3]; /* s2 */
 d3[i:i+1] = d1[i+3:i+4]; /* s3 */
 d3[i+2:i+3] = d1[i+5:i+6]; /* s3' */

 Due to s1', d1[i+3] is written before read which is not so in original
 code. The fix is to insert unrolled copies of statements in the order
 of their appearence in the original loop.

 In the last example, when vectorizing `3.0 * I' (use of index variable in non 
array
 subscript operation), the first loop is changed to (conceptually):

 II = [start:start+2]
 INC= [16, 16, 16, 16]
 for (i = start; i < end; i+=16) {
    d3[i:i+1] = 3.0 * II;      /* vectorized version of s1 */
    f1[i:i+15]= f2[i:i+15];    /* vectorized version of s2 */
    II += INC;                 /* s3, created by SIMD */
 }

 Then, unrolling is performed. Since the loop is unrolled 16 times, we
 need to fill remaining copies of s1. SIMD does this by creating the
 following unrolled copy and then insert it to the loop.

 KK = [2, 2, 2, 2]
 d3[i+2:i+3] = 3.0 * (II + KK);

 In the original code, unrolled copy was inserted after the last
 statement of the loop. However, in this case, it should be s2, not
 s3. The other problem is that, vectorized version of s1 needs to be
 unrolled 8 times, so the increment to base should be 2, 4, 6, 8, 10,
 12, 14. However, the original code only deals with increment being 1,
 2, 4, 8 case (hence the assertion failure).

CR by Mei Ye.



Modified: trunk/osprey/be/lno/simd.cxx
===================================================================
--- trunk/osprey/be/lno/simd.cxx        2011-05-30 03:02:36 UTC (rev 3635)
+++ trunk/osprey/be/lno/simd.cxx        2011-06-01 18:06:08 UTC (rev 3636)
@@ -124,6 +124,17 @@
 static INT Last_Vectorizable_Loop_Id = 0;
 SIMD_VECTOR_CONF Simd_vect_conf;
 
+typedef struct {
+    INT unroll_times;
+    INT add_to_base;
+    INT base_incr;
+    WN  *innerloop;
+    WN  *vec_index_preg_store;
+    TYPE_ID index_type;
+} UNROLL_PARAMS;
+
+static WN_MAP unroll_map; 
+
 // Return TRUE iff there are too few iterations to generate a single 
 // vectorized iteration.
 //
@@ -2409,12 +2420,10 @@
 }
 
 // Have we created a vector type preg to create unroll copies for the use of 
-// induction variable. Note that the index variable can only be updated by 
-// factors : 1, 2, 4, 8. So, we only need to create these 4 types of pregs 
-// utmost and can reuse every time the loop induction variable is used inside
-// the loop.
-BOOL vec_unroll_preg_created[4]; 
-WN *vec_unroll_preg_store[4]; 
+// induction variable. 
+// So far, the maximum unroll times is 128/8 = 16 times
+BOOL vec_unroll_preg_created[16]; 
+WN *vec_unroll_preg_store[16]; 
 
 // Descend unroll copy and update the index into the last dimension for all 
 // arrays. Assumes all operators inside WN copy are vectorizable.
@@ -2479,14 +2488,11 @@
       TYPE_ID vec_type = WN_desc(copy);
       INT unroll_type;
 
-      switch(add_to_base) {
-      case 1: unroll_type = 0; break;
-      case 2: unroll_type = 1; break;
-      case 4: unroll_type = 2; break;
-      case 8: unroll_type = 3; break;
-      default: FmtAssert(FALSE, ("NYI"));
-      }
+      FmtAssert((add_to_base < 16),
+              ("Loop unrolled more than 16 times, need to expand 
vec_unroll_preg_created array"));
 
+      unroll_type = add_to_base;
+
       if (!vec_unroll_preg_created[unroll_type]) {
        WN* body = WN_do_body(loop);
        // Create the const (..., add_to_base, add_to_base, ...) in a vector 
preg
@@ -3434,6 +3440,11 @@
       return FALSE;
     }
   }
+
+  // at this point, we know that the loop can be vectorized
+  // reorder the loop statement according to their dependencies
+  toplogical_reordering(innerloop, 1, adg);
+
   // separate the loop and expand scalars which is expandable and has
   // references in different fissions loops
   if (needs_scalar_expansion)
@@ -4402,8 +4413,19 @@
       // Parentize copy
       LWN_Parentize(copy);
 
-      // Now, insert the new copy of the istore after istore inside innerloop.
-      LWN_Insert_Block_After(LWN_Get_Parent(istore),istore,copy);
+      // Now, insert the new copy of the istore at the end of current loop
+      // current loop should not be empty since we are performing 
vectorization on it
+      WN *body = WN_do_body(innerloop);
+      Is_True((body && WN_last(body)),
+              ("Loop body should not be empty for unrolling"));
+      if (vec_index_preg_store != NULL &&
+              SYMBOL(WN_last(body)) == SYMBOL(vec_index_preg_store)) {
+          Is_True((WN_first(body) != WN_last(body)),
+                  ("Loop body was empty before we inserted an increment 
operation in Simd_Vectorize_Induction_Variables"));
+          LWN_Insert_Block_After(body, WN_prev(WN_last(body)), copy);
+      } else {
+          LWN_Insert_Block_After(body, WN_last(body),copy);
+      }
 
       // Add the vertices of copy to array dependence graph.
       Add_Vertices(copy);
@@ -4428,7 +4450,36 @@
     }
 }
 
+// unroll statements that are necessary according to their appearance
+// in the loop to keep the dependencies
+static void Simd_Unroll_Necessary_Loop_Statements(WN *wn)
+{
+    Is_True((WN_opcode(wn) == OPC_BLOCK),
+            ("This function should only be called with do loop body"));
+    BOOL changed = FALSE; 
 
+    do {
+        changed = FALSE;
+        WN *kid = WN_first(wn);
+        while (kid) {
+            if (WN_MAP_Get(unroll_map, kid) != NULL) {
+                // we need to unroll this wn
+                UNROLL_PARAMS *params = (UNROLL_PARAMS*) 
WN_MAP_Get(unroll_map, kid);
+                if (params->unroll_times > 1) {
+                    Simd_Unroll_Statement(2, params->add_to_base, kid, 
params->vec_index_preg_store,
+                            params->innerloop, params->index_type);
+                    params->unroll_times -= 1;
+                    params->add_to_base += params->base_incr;
+                } else {
+                    WN_MAP_Set(unroll_map, kid, NULL);
+                }
+                changed = TRUE;
+            }
+            kid = WN_next(kid);
+        }
+    } while(changed);
+}
+
 static BOOL Simd_Good_Reduction_Load(WN *innerloop, WN *load)
 {
   WN* stmt = Find_Stmt_Under(load,WN_do_body(innerloop));
@@ -5774,11 +5825,9 @@
 //START: Vectorization Module 
   WN* reduction_node= NULL;
   WN *vec_index_preg_store = NULL;
-  for(INT ii=0; ii<4; ii++){
-    vec_unroll_preg_created[ii] = FALSE;
-    vec_unroll_preg_store[ii] = NULL;
-  }
 
+  unroll_map = WN_MAP_Create(&SIMD_default_pool);
+
   for (INT i=vec_simd_ops->Elements()-1; i >= 0; i--){
     simd_op=vec_simd_ops->Top_nth(i); 
 
@@ -5877,17 +5926,31 @@
   INT unroll_times = vect/stmt_unroll; //copies of statement needed
   INT add_to_base = unroll_times>1?vect/unroll_times:0; //dim index incr
   
-  if(unroll_times > 1 && WN_operator(istore) == OPR_ISTORE)
-     Simd_Unroll_Statement( unroll_times, add_to_base,
-                            istore,
-                            vec_index_preg_store,
-                            innerloop, index_type);
+  if(unroll_times > 1 && WN_operator(istore) == OPR_ISTORE) {
+      //record unroll parameters for now, use them later
+      //so that we don't depend on which direction we use to perform 
vectorization for the loop statements
+      UNROLL_PARAMS *params = TYPE_MEM_POOL_ALLOC(UNROLL_PARAMS, 
&SIMD_default_pool);
+
+      params->unroll_times = unroll_times;
+      params->add_to_base = add_to_base;
+      params->base_incr   = add_to_base;
+      params->innerloop = innerloop;
+      params->vec_index_preg_store = vec_index_preg_store;
+      params->index_type = index_type;
+
+      WN_MAP_Set(unroll_map, istore, params);
+  }
  
   //Finalize innerloop and remainderloop
    if (simd_op_last_in_loop[i])
       Simd_Finalize_Loops(innerloop, remainderloop, vect, reduction_node);
   }
+
+  Simd_Unroll_Necessary_Loop_Statements(WN_do_body(innerloop));
+
   dli->Loop_Vectorized = TRUE;
+
+  WN_MAP_Delete(unroll_map);
  }
  MEM_POOL_Pop(&SIMD_default_pool);
 


------------------------------------------------------------------------------
Simplify data backup and recovery for your virtual environment with vRanger. 
Installation's a snap, and flexible recovery options mean your data is safe,
secure and there when you need it. Data protection magic?
Nope - It's vRanger. Get your free trial download today. 
http://p.sf.net/sfu/quest-sfdev2dev
_______________________________________________
Open64-devel mailing list
Open64-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/open64-devel

Reply via email to