This preliminary patch changes PTX's loop discovery to use a recursive DFS walk, rather than a worklist.

The significant changes from the POC of the next patch I'll be committing are:

1) always insert a 'fork' insn, which keeps the entry to a partitioned region as a possible Single-Entry-Single-Exit exit node.

2) don't add the ENTRY bb to the null loop that contains the entire function.

Both #1 and #2 help with the SESE optimization I've been working on.

nathan
2015-09-11  Nathan Sidwell  <nat...@codesourcery.com>

	* config/nvptx/nvptx.c (nvptx_emit_forking): Always emit fork.
	(bb_par_t, bb_par_vec_t): Delete.
	(nvptx_find_par): New recursive function. Broken out of ...
	(nvptx_discover_pars): ... here.  Call it.
	(nvptx_optimize_inner): Write to dump_file.

Index: gcc/config/nvptx/nvptx.c
===================================================================
--- gcc/config/nvptx/nvptx.c	(revision 227683)
+++ gcc/config/nvptx/nvptx.c	(working copy)
@@ -308,8 +308,8 @@ nvptx_emit_forking (unsigned mask, bool
     {
       rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));
       
-      /* Emit fork for worker level.  */
-      if (!is_call && mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
+      /* Emit fork at all levels; this helps form SESE regions.  */
+      if (!is_call)
 	emit_insn (gen_nvptx_fork (op));
       emit_insn (gen_nvptx_forked (op));
     }
@@ -2591,7 +2591,7 @@ nvptx_discover_pre (basic_block block, i
   return pre_insn;
 }
 
-/*  Dump this parallel and all its inner parallels.  */
+/* Dump this parallel and all its inner parallels.  */
 
 static void
 nvptx_dump_pars (parallel *par, unsigned depth)
@@ -2614,110 +2614,114 @@ nvptx_dump_pars (parallel *par, unsigned
     nvptx_dump_pars (par->next, depth);
 }
 
-typedef std::pair<basic_block, parallel *> bb_par_t;
-typedef auto_vec<bb_par_t> bb_par_vec_t;
-
-/* Walk the BBG looking for fork & join markers.  Construct a
-   loop structure for the function.  MAP is a mapping of basic blocks
-   to head & taiol markers, discoveded when splitting blocks.  This
-   speeds up the discovery.  We rely on the BB visited flag having
-   been cleared when splitting blocks.  */
+/* If BLOCK contains a fork/join marker, process it to create or
+   terminate a loop structure.  Add this block to the current loop,
+   and then walk successor blocks.  */
 
 static parallel *
-nvptx_discover_pars (bb_insn_map_t *map)
+nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
 {
-  parallel *outer_par = new parallel (0, 0);
-  bb_par_vec_t worklist;
-  basic_block block;
-
-  // Mark entry and exit blocks as visited.
-  block = EXIT_BLOCK_PTR_FOR_FN (cfun);
+  if (block->flags & BB_VISITED)
+    return par;
   block->flags |= BB_VISITED;
-  block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
-  worklist.safe_push (bb_par_t (block, outer_par));
 
-  while (worklist.length ())
+  if (rtx_insn **endp = map->get (block))
     {
-      bb_par_t bb_par = worklist.pop ();
-      parallel *l = bb_par.second;
+      rtx_insn *end = *endp;
 
-      block = bb_par.first;
-
-      // Have we met this block?
-      if (block->flags & BB_VISITED)
-	continue;
-      block->flags |= BB_VISITED;
-      
-      rtx_insn **endp = map->get (block);
-      if (endp)
+      /* This is a block head or tail, or return instruction.  */
+      switch (recog_memoized (end))
 	{
-	  rtx_insn *end = *endp;
-	  
-	  /* This is a block head or tail, or return instruction.  */
-	  switch (recog_memoized (end))
-	    {
-	    case CODE_FOR_return:
-	      /* Return instructions are in their own block, and we
-		 don't need to do anything more.  */
-	      continue;
+	case CODE_FOR_return:
+	  /* Return instructions are in their own block, and we
+	     don't need to do anything more.  */
+	  return par;
 
-	    case CODE_FOR_nvptx_forked:
-	      /* Loop head, create a new inner loop and add it into
-		 our parent's child list.  */
-	      {
-		unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
-
-		gcc_assert (mask);
-		l = new parallel (l, mask);
-		l->forked_block = block;
-		l->forked_insn = end;
-		if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
-		    && (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
-		  l->fork_insn
-		    = nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
-	      }
-	      break;
+	case CODE_FOR_nvptx_forked:
+	  /* Loop head, create a new inner loop and add it into
+	     our parent's child list.  */
+	  {
+	    unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
 
-	    case CODE_FOR_nvptx_join:
-	      /* A loop tail.  Finish the current loop and return to
-		 parent.  */
-	      {
-		unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
-
-		gcc_assert (l->mask == mask);
-		l->join_block = block;
-		l->join_insn = end;
-		if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
-		    && (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
-		  l->joining_insn
-		    = nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
-		l = l->parent;
-	      }
-	      break;
+	    gcc_assert (mask);
+	    par = new parallel (par, mask);
+	    par->forked_block = block;
+	    par->forked_insn = end;
+	    if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
+		&& (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
+	      par->fork_insn
+		= nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
+	  }
+	  break;
 
-	    default:
-	      gcc_unreachable ();
-	    }
-	}
+	case CODE_FOR_nvptx_join:
+	  /* A loop tail.  Finish the current loop and return to
+	     parent.  */
+	  {
+	    unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
 
-      /* Add this block onto the current loop's list of blocks.  */
-      l->blocks.safe_push (block);
+	    gcc_assert (par->mask == mask);
+	    par->join_block = block;
+	    par->join_insn = end;
+	    if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
+		&& (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
+	      par->joining_insn
+		= nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
+	    par = par->parent;
+	  }
+	  break;
 
-      /* Push each destination block onto the work list.  */
-      edge e;
-      edge_iterator ei;
-      FOR_EACH_EDGE (e, ei, block->succs)
-	worklist.safe_push (bb_par_t (e->dest, l));
+	default:
+	  gcc_unreachable ();
+	}
     }
 
+  if (par)
+    /* Add this block onto the current loop's list of blocks.  */
+    par->blocks.safe_push (block);
+  else
+    /* This must be the entry block.  Create a NULL parallel.  */
+    par = new parallel (0, 0);
+
+  /* Walk successor blocks.  */
+  edge e;
+  edge_iterator ei;
+
+  FOR_EACH_EDGE (e, ei, block->succs)
+    nvptx_find_par (map, par, e->dest);
+
+  return par;
+}
+
+/* DFS walk the CFG looking for fork & join markers.  Construct
+   loop structures as we go.  MAP is a mapping of basic blocks
+   to head & tail markers, discovered when splitting blocks.  This
+   speeds up the discovery.  We rely on the BB visited flag having
+   been cleared when splitting blocks.  */
+
+static parallel *
+nvptx_discover_pars (bb_insn_map_t *map)
+{
+  basic_block block;
+
+  /* Mark exit blocks as visited.  */
+  block = EXIT_BLOCK_PTR_FOR_FN (cfun);
+  block->flags |= BB_VISITED;
+
+  /* And mark the entry block as not visited.  */
+  block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
+  block->flags &= ~BB_VISITED;
+
+  parallel *par = nvptx_find_par (map, 0, block);
+
   if (dump_file)
     {
       fprintf (dump_file, "\nLoops\n");
-      nvptx_dump_pars (outer_par, 0);
+      nvptx_dump_pars (par, 0);
       fprintf (dump_file, "\n");
     }
   
-  return outer_par;
+  return par;
 }
 
 /* Propagate live state at the start of a partitioned region.  BLOCK
@@ -3107,6 +3111,12 @@ nvptx_optimize_inner (parallel *par)
       return;
 
   /* Preconditions met.  Swallow the inner par.  */
+  if (dump_file)
+    fprintf (dump_file, "Merging loop %x [%d,%d] into %x [%d,%d]\n",
+	     inner->mask, inner->forked_block->index,
+	     inner->join_block->index,
+	     par->mask, par->forked_block->index, par->join_block->index);
+
   par->mask |= inner->mask & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1);
 
   par->blocks.reserve (inner->blocks.length ());

Reply via email to