This patch improves the auto loop partitioning algorithm in  2 ways.

1) rather than try and assign just the outer loop to the outer partition and then all innermost loops to partitioning axis, this changes the algorithm to assign the innermost loop to the innermost partition and then the outermost loop nests to the outermost partitions. The difference will be seen when the loop nest exceeds the number of available partitioning axes. Now the unpartitioned loops will be the loops just outside the innermost loop -- rather than the loops just inside the outermost loop.

2) If the loop nest is shallower than the number of available partitions, we attempt to assign an outer loop to two partitions. This piece of the algorithm isn't fully generalized as there are only 3 paritioning axes. The interesting cases are nests of 1 or 2 loops. In the latter case, the outer loop will get gang+worker partitioning and the inner loop will get vector partitioning. In the former case the loop will get gang+vector partitioning. Whether it's worth extending this case to assign all 3 axes is something to investigate further.

This patch gives a 5-fold speedup of 304.olbm and a 16-fold speedup of 360.ilbdc over the current implementation.

committed to gomp4.

nathan
2016-05-06  Nathan Sidwell  <nat...@codesourcery.com>

	gcc/
	* omp-low.c (lower_oacc_head_mark): Ensure 2 levels for auto
	loops.
	(oacc_loop_auto_partitions): Add outer_assign parm. Assign all but
	vector partitioning to outer loops.  Assign 2 partitions to loops
	when available.
	(oacc_loop_partition): Adjust oacc_loop_auto_partitions call.

	gcc/testsuite/
	* c-c++-common/goacc/loop-auto-1.c: Adjust and add additionnal
	case.

	libgomp/
	* testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c: Adjust and
	add additional case.

Index: gcc/omp-low.c
===================================================================
--- gcc/omp-low.c	(revision 235775)
+++ gcc/omp-low.c	(working copy)
@@ -6396,9 +6396,9 @@ lower_oacc_head_mark (location_t loc, tr
 	       | OLF_SEQ)))
       tag |= OLF_AUTO;
 
-  /* Ensure at least one level.  */
-  if (!levels)
-    levels++;
+  /* Ensure at least one level, or 2 for AUTO partitioning  */
+  if (levels < 1 + ((tag & OLF_AUTO) != 0))
+    levels = 1 + ((tag & OLF_AUTO) != 0);
 
   args.quick_push (build_int_cst (integer_type_node, levels));
   args.quick_push (build_int_cst (integer_type_node, tag));
@@ -19682,11 +19682,13 @@ oacc_loop_fixed_partitions (oacc_loop *l
 
 /* Walk the OpenACC loop heirarchy to assign auto-partitioned loops.
    OUTER_MASK is the partitioning this loop is contained within.
+   OUTER_ASSIGN is true if an outer loop is being auto-partitioned.
    Return the cumulative partitioning used by this loop, siblings and
    children.  */
 
 static unsigned
-oacc_loop_auto_partitions (oacc_loop *loop, unsigned outer_mask)
+oacc_loop_auto_partitions (oacc_loop *loop, unsigned outer_mask,
+			   bool outer_assign)
 {
   bool assign = (loop->flags & OLF_AUTO) && (loop->flags & OLF_INDEPENDENT);
   bool noisy = true;
@@ -19697,31 +19699,34 @@ oacc_loop_auto_partitions (oacc_loop *lo
   noisy = false;
 #endif
 
-  if (assign && outer_mask < GOMP_DIM_MASK (GOMP_DIM_MAX - 1))
+  if (assign && (!outer_assign | loop->inner))
     {
-      /* Allocate the outermost loop at the outermost available
-	 level.  */
+      /* Allocate outermost and non-innermost loops at the outermost
+	 non-innermost available level.  */
       unsigned this_mask = outer_mask + 1;
 
-      if (!(this_mask & loop->inner))
+      /* Make sure it's the single outermost available partition.  */
+      while (this_mask != (this_mask & -this_mask))
+	this_mask += this_mask & -this_mask;
+
+      if (!(this_mask & (loop->inner | GOMP_DIM_MASK (GOMP_DIM_MAX)
+			 | GOMP_DIM_MASK (GOMP_DIM_MAX - 1))))
 	loop->mask = this_mask;
     }
 
   if (loop->child)
-    {
-      unsigned child_mask = outer_mask | loop->mask;
-
-      if (loop->mask || assign)
-	child_mask |= GOMP_DIM_MASK (GOMP_DIM_MAX);
-
-      loop->inner = oacc_loop_auto_partitions (loop->child, child_mask);
-    }
-
-  if (assign && !loop->mask)
-    {
-      /* Allocate the loop at the innermost available level.  */
+    loop->inner = oacc_loop_auto_partitions (loop->child,
+					     outer_mask | loop->mask,
+					     outer_assign | assign);
+
+  if (assign && (!loop->mask || !outer_assign))
+    {
+      /* Allocate the loop at the innermost available level.  Note
+	 that we do this even if we already assigned this loop the
+	 outermost available level above.  That way we'll partition
+	 this along 2 axes, if they are available.  */
       unsigned this_mask = 0;
-      
+
       /* Determine the outermost partitioning used within this loop. */
       this_mask = loop->inner | GOMP_DIM_MASK (GOMP_DIM_MAX);
       this_mask = (this_mask & -this_mask);
@@ -19732,11 +19737,11 @@ oacc_loop_auto_partitions (oacc_loop *lo
       /* And avoid picking one use by an outer loop. */
       this_mask &= ~outer_mask;
 
-      if (!this_mask && noisy)
+      if (!this_mask && !loop->mask && noisy)
 	warning_at (loop->loc, 0,
 		    "insufficient partitioning available to parallelize loop");
 
-      loop->mask = this_mask;
+      loop->mask |= this_mask;
     }
 
   if (assign && dump_file)
@@ -19747,7 +19752,8 @@ oacc_loop_auto_partitions (oacc_loop *lo
   unsigned inner_mask = 0;
   
   if (loop->sibling)
-    inner_mask |= oacc_loop_auto_partitions (loop->sibling, outer_mask);
+    inner_mask |= oacc_loop_auto_partitions (loop->sibling,
+					     outer_mask, outer_assign);
   
   inner_mask |= loop->inner | loop->mask;
 
@@ -19765,7 +19771,7 @@ oacc_loop_partition (oacc_loop *loop, un
   if (mask_all & GOMP_DIM_MASK (GOMP_DIM_MAX))
     {
       mask_all ^= GOMP_DIM_MASK (GOMP_DIM_MAX);
-      mask_all |= oacc_loop_auto_partitions (loop, outer_mask);
+      mask_all |= oacc_loop_auto_partitions (loop, outer_mask, false);
     }
   return mask_all;
 }
Index: gcc/testsuite/c-c++-common/goacc/loop-auto-1.c
===================================================================
--- gcc/testsuite/c-c++-common/goacc/loop-auto-1.c	(revision 235775)
+++ gcc/testsuite/c-c++-common/goacc/loop-auto-1.c	(working copy)
@@ -74,6 +74,21 @@ void Foo ()
 	    for (int kx = 0; kx < 10; kx++) {}
 	  }
       }
+
+#pragma acc loop auto
+    for (int ix = 0; ix < 10; ix++)
+      {
+#pragma acc loop auto
+	for (int jx = 0; jx < 10; jx++)
+	  {
+#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */
+	    for (int kx = 0; kx < 10; kx++)
+	      {
+#pragma acc loop auto
+		for (int lx = 0; lx < 10; lx++) {}
+	      }
+	  }
+      }
   }
 }
 
@@ -214,10 +229,10 @@ void Vector (void)
 #pragma acc loop auto
     for (int ix = 0; ix < 10; ix++) {}
 
-#pragma acc loop auto
+#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */
     for (int ix = 0; ix < 10; ix++)
       {
-#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */
+#pragma acc loop auto
 	for (int jx = 0; jx < 10; jx++) {}
       }
 }
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c	(revision 235775)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c	(working copy)
@@ -102,7 +102,6 @@ int vector_1 (int *ary, int size)
   clear (ary, size);
   
 #pragma acc parallel num_workers (32) vector_length(32) copy(ary[0:size]) firstprivate (size)
-  /* { dg-warning "region is worker partitioned but does not contain worker partitioned code" "worker" { target *-*-* } 104 } */
   {
 #pragma acc loop gang
     for (int jx = 0; jx < 1; jx++)
@@ -111,7 +110,7 @@ int vector_1 (int *ary, int size)
 	ary[ix] = place ();
   }
 
-  return check (ary, size, 0, 0, 1);
+  return check (ary, size, 0, 1, 1);
 }
 
 int vector_2 (int *ary, int size)
@@ -153,7 +152,7 @@ int gang_1 (int *ary, int size)
   clear (ary, size);
   
 #pragma acc parallel num_gangs (32) num_workers (32) vector_length(32) copy(ary[0:size]) firstprivate (size)
-  /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "vector" { target *-*-* } 155 } */
+  /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "vector" { target *-*-* } 154 } */
   {
 #pragma acc loop auto
     for (int jx = 0; jx <  size  / 64; jx++)
@@ -188,7 +187,6 @@ int gang_3 (int *ary, int size)
   clear (ary, size);
   
 #pragma acc parallel num_workers (32) vector_length(32) copy(ary[0:size]) firstprivate (size)
-  /* { dg-warning "region is worker partitioned but does not contain worker partitioned code" "worker" { target *-*-* } 190 } */
   {
 #pragma acc loop auto
     for (int jx = 0; jx <  size  / 64; jx++)
@@ -197,10 +195,24 @@ int gang_3 (int *ary, int size)
 	ary[ix + jx * 64] = place ();
   }
 
+  return check (ary, size, 1, 1, 1);
+}
+
+int gang_4 (int *ary, int size)
+{
+  clear (ary, size);
+  
+#pragma acc parallel vector_length(32) copy(ary[0:size]) firstprivate (size)
+  {
+#pragma acc loop auto
+    for (int jx = 0; jx <  size; jx++)
+      ary[jx] = place ();
+  }
+
   return check (ary, size, 1, 0, 1);
 }
 
-#define N (32*32*32)
+#define N (32*32*32*2)
 int main ()
 {
   int ondev = 0;
@@ -228,6 +240,8 @@ int main ()
     return 1;
   if (gang_3 (ary,  N))
     return 1;
+  if (gang_4 (ary,  N))
+    return 1;
 
   return 0;
 }

Reply via email to