This uses the patch I committed yesterday which introduces warp broadcasts to implement the vector-single predication needed for OpenACC. Outside a loop with vector parallelism, only one of the threads representing a vector must execute, the others follow along. So we skip the real work in each basic block for the inactive threads, then broadcast the direction to take in the control flow graph from the active one, and jump as a group.

This will get extended with similar functionality for worker-single. Julian is working on some patches on top of that to ensure the later optimizers don't destroy the control flow - we really need the threads to reconverge and perform the broadcast/jump in lockstep.

Committed on gomp-4_0-branch.


Bernd
Index: gcc/ChangeLog.gomp
===================================================================
--- gcc/ChangeLog.gomp	(revision 223444)
+++ gcc/ChangeLog.gomp	(working copy)
@@ -1,5 +1,15 @@
 2015-05-20  Bernd Schmidt  <ber...@codesourcery.com>
 
+	* omp-low.c (struct omp_region): Add a gwv_this field.
+	(bb_region_map): New variable.
+	(find_omp_for_region_data, find_omp_target_region_data): New static
+	functions.
+	(build_omp_regions_1): Call them.  Build the bb_region_map.
+	(enclosing_target_region, requires_vector_predicate,
+	generate_vector_broadcast, predicate_bb, find_predicatable_bbs,
+	predicate_omp_regions): New static functions.
+	(execute_expand_omp): Allocate and free bb_region_map.
+
 	* config/nvptx/nvptx.c: Include "dumpfile,h".
 	(condition_unidirectional_p): New static function.
 	(nvptx_print_operand): Use it for new 'U' handling.
Index: gcc/omp-low.c
===================================================================
--- gcc/omp-low.c	(revision 223442)
+++ gcc/omp-low.c	(working copy)
@@ -159,6 +159,9 @@ struct omp_region
 
   /* True if this is a combined parallel+workshare region.  */
   bool is_combined_parallel;
+
+  /* For an OpenACC loop, the level of parallelism requested.  */
+  int gwv_this;
 };
 
 /* Levels of parallelism as defined by OpenACC.  Increasing numbers
@@ -9961,7 +9964,6 @@ expand_omp_target (struct omp_region *re
     update_ssa (TODO_update_ssa_only_virtuals);
 }
 
-
 /* Expand the parallel region tree rooted at REGION.  Expansion
    proceeds in depth-first order.  Innermost regions are expanded
    first.  This way, parallel regions that require a new function to
@@ -9984,7 +9986,7 @@ expand_omp (struct omp_region *region)
       if (region->type == GIMPLE_OMP_FOR
 	  && gimple_omp_for_combined_p (last_stmt (region->entry)))
 	inner_stmt = last_stmt (region->inner->entry);
-
+     
       if (region->inner)
 	expand_omp (region->inner);
 
@@ -10041,6 +10043,44 @@ expand_omp (struct omp_region *region)
     }
 }
 
+/* Map each basic block to an omp_region.  */
+static hash_map<basic_block, omp_region *> *bb_region_map;
+
+/* Fill in additional data for a region REGION associated with an
+   OMP_FOR STMT.  */
+
+static void
+find_omp_for_region_data (struct omp_region *region, gimple stmt)
+{
+  if (!is_gimple_omp_oacc (stmt))
+    return;
+
+  tree clauses = gimple_omp_for_clauses (stmt);
+  if (find_omp_clause (clauses, OMP_CLAUSE_GANG))
+    region->gwv_this |= MASK_GANG;
+  if (find_omp_clause (clauses, OMP_CLAUSE_WORKER))
+    region->gwv_this |= MASK_WORKER;
+  if (find_omp_clause (clauses, OMP_CLAUSE_VECTOR))
+    region->gwv_this |= MASK_VECTOR;
+}
+
+/* Fill in additional data for a region REGION associated with an
+   OMP_TARGET STMT.  */
+
+static void
+find_omp_target_region_data (struct omp_region *region, gimple stmt)
+{
+  if (!is_gimple_omp_oacc (stmt))
+    return;
+
+  tree clauses = gimple_omp_target_clauses (stmt);
+  if (find_omp_clause (clauses, OMP_CLAUSE_NUM_GANGS))
+    region->gwv_this |= MASK_GANG;
+  if (find_omp_clause (clauses, OMP_CLAUSE_NUM_WORKERS))
+    region->gwv_this |= MASK_WORKER;
+  if (find_omp_clause (clauses, OMP_CLAUSE_VECTOR_LENGTH))
+    region->gwv_this |= MASK_VECTOR;
+}
 
 /* Helper for build_omp_regions.  Scan the dominator tree starting at
    block BB.  PARENT is the region that contains BB.  If SINGLE_TREE is
@@ -10055,6 +10095,8 @@ build_omp_regions_1 (basic_block bb, str
   gimple stmt;
   basic_block son;
 
+  bb_region_map->put (bb, parent);
+
   gsi = gsi_last_bb (bb);
   if (!gsi_end_p (gsi) && is_gimple_omp (gsi_stmt (gsi)))
     {
@@ -10107,6 +10149,7 @@ build_omp_regions_1 (basic_block bb, str
 		case GF_OMP_TARGET_KIND_OACC_PARALLEL:
 		case GF_OMP_TARGET_KIND_OACC_KERNELS:
 		case GF_OMP_TARGET_KIND_OACC_DATA:
+		  find_omp_target_region_data (region, stmt);
 		  break;
 		case GF_OMP_TARGET_KIND_UPDATE:
 		case GF_OMP_TARGET_KIND_OACC_UPDATE:
@@ -10118,6 +10161,8 @@ build_omp_regions_1 (basic_block bb, str
 		  gcc_unreachable ();
 		}
 	    }
+	  else if (code == GIMPLE_OMP_FOR)
+	    find_omp_for_region_data (region, stmt);
 	  /* ..., this directive becomes the parent for a new region.  */
 	  if (region)
 	    parent = region;
@@ -10156,7 +10201,7 @@ omp_expand_local (basic_block head)
       dump_omp_region (dump_file, root_omp_region, 0);
       fprintf (dump_file, "\n");
     }
-
+  
   remove_exit_barriers (root_omp_region);
   expand_omp (root_omp_region);
 
@@ -10174,31 +10219,264 @@ build_omp_regions (void)
   build_omp_regions_1 (ENTRY_BLOCK_PTR_FOR_FN (cfun), NULL, false);
 }
 
+/* Walk the tree upwards from region until a target region is found
+   or we reach the end, then return it.  */
+static omp_region *
+enclosing_target_region (omp_region *region)
+{
+  while (region != NULL
+	 && region->type != GIMPLE_OMP_TARGET)
+    region = region->outer;
+  return region;
+}
+
+/* Return true if basic blocks in REGION require OpenACC vector
+   predication.  */
+static bool
+requires_vector_predicate (struct omp_region *region)
+{
+  while (region
+	 && region->type != GIMPLE_OMP_FOR && region->type != GIMPLE_OMP_TARGET)
+    region = region->outer;
+  if (!region)
+    return false;
+  omp_region *outer_target = enclosing_target_region (region);
+  if (!outer_target || (outer_target->gwv_this & MASK_VECTOR) == 0)
+    return false;
+  if (region->type == GIMPLE_OMP_FOR && (region->gwv_this & MASK_VECTOR) == 0)
+    return true;
+  return false;
+}
+
+/* Generate a broadcast across OpenACC vector threads (a warp on GPUs)
+   so that VAR is broadcast to DEST_VAR.  The new statements are
+   added after WHERE.  */
+static void
+generate_vector_broadcast (tree dest_var, tree var,
+			   gimple_stmt_iterator &where)
+{
+  tree vartype = TREE_TYPE (var);
+  tree call_arg_type = unsigned_type_node;
+  enum built_in_function fn = BUILT_IN_GOACC_THREAD_BROADCAST;
+  if (TYPE_PRECISION (vartype) > TYPE_PRECISION (call_arg_type))
+    {
+      fn = BUILT_IN_GOACC_THREAD_BROADCAST_LL;
+      call_arg_type = long_long_unsigned_type_node;
+    }
+  bool need_conversion = !types_compatible_p (vartype, call_arg_type);
+  tree casted_var = var;
+  if (need_conversion)
+    {
+      casted_var = create_tmp_var (call_arg_type);
+      gassign *conv1 = gimple_build_assign (casted_var, NOP_EXPR, var);
+      gsi_insert_after (&where, conv1, GSI_CONTINUE_LINKING);
+    }
+
+  tree decl = builtin_decl_explicit (fn);
+  gimple call = gimple_build_call (decl, 1, casted_var);
+  gsi_insert_after (&where, call, GSI_NEW_STMT);
+  tree casted_dest = dest_var;
+  if (need_conversion)
+    {
+      casted_dest = create_tmp_var (call_arg_type);
+      create_tmp_var (call_arg_type);
+      gassign *conv2 = gimple_build_assign (dest_var, NOP_EXPR,
+					    casted_dest);
+      gsi_insert_after (&where, conv2, GSI_CONTINUE_LINKING);
+    }
+  gimple_call_set_lhs (call, casted_dest);
+
+}
+
+/* Apply OpenACC vector predication to basic block BB which is in
+   region PARENT.  */
+
+static void
+predicate_bb (basic_block bb, struct omp_region *parent)
+{
+  if (!requires_vector_predicate (parent))
+    return;
+
+  gimple_stmt_iterator gsi;
+  gimple stmt;
+
+  gsi = gsi_last_bb (bb);
+  stmt = gsi_stmt (gsi);
+  if (stmt == NULL)
+    return;
+
+  basic_block skip_dest_bb = NULL;
+  basic_block *adjust_bb_ptr = NULL;
+
+  if (gimple_code (stmt) == GIMPLE_COND)
+    {
+      tree cond_var = create_tmp_var (boolean_type_node);
+      tree broadcast_cond = create_tmp_var (boolean_type_node);
+      gassign *asgn = gimple_build_assign (cond_var,
+					   gimple_cond_code (stmt),
+					   gimple_cond_lhs (stmt),
+					   gimple_cond_rhs (stmt));
+      gsi_insert_before (&gsi, asgn, GSI_CONTINUE_LINKING);
+      gimple_stmt_iterator gsi_asgn = gsi_for_stmt (asgn);
+
+      generate_vector_broadcast (broadcast_cond, cond_var, gsi_asgn);
+
+      edge e = split_block (bb, asgn);
+      skip_dest_bb = e->dest;
+
+      gimple_cond_set_condition (as_a <gcond *> (stmt), EQ_EXPR,
+				 broadcast_cond, boolean_true_node);
+    }
+  else if (gimple_code (stmt) == GIMPLE_SWITCH)
+    {
+      gswitch *sstmt = as_a <gswitch *> (stmt);
+      tree var = gimple_switch_index (sstmt);
+      tree new_var = create_tmp_var (TREE_TYPE (var));
+
+      gassign *asgn = gimple_build_assign (new_var, var);
+      gsi_insert_before (&gsi, asgn, GSI_CONTINUE_LINKING);
+      gimple_stmt_iterator gsi_asgn = gsi_for_stmt (asgn);
+
+      generate_vector_broadcast (new_var, var, gsi_asgn);
+
+      edge e = split_block (bb, asgn);
+      skip_dest_bb = e->dest;
+
+      gimple_switch_set_index (sstmt, new_var);
+    }
+  else if (is_gimple_omp (stmt))
+    {
+      gsi_prev (&gsi);
+      /* Only a few statements need special treatment.  */
+      if (gimple_code (stmt) != GIMPLE_OMP_FOR
+	  && gimple_code (stmt) != GIMPLE_OMP_CONTINUE)
+	{
+	  edge e = single_succ_edge (bb);
+	  skip_dest_bb = e->dest;
+	  if (gimple_code (stmt) == GIMPLE_OMP_RETURN)
+	    {
+	      gcc_assert (parent->exit == bb);
+	      adjust_bb_ptr = &parent->exit;
+	    }
+	}
+      else
+	{
+	  gimple split_stmt = gsi_stmt (gsi);
+	  if (!split_stmt)
+	    return;
+	  edge e = split_block (bb, split_stmt);
+	  skip_dest_bb = e->dest;
+	  if (gimple_code (stmt) == GIMPLE_OMP_CONTINUE)
+	    {
+	      gcc_assert (parent->cont == bb);
+	      parent->cont = skip_dest_bb;
+	    }
+	  else if (gimple_code (stmt) == GIMPLE_OMP_FOR)
+	    {
+	      omp_region *inner;
+	      inner = *bb_region_map->get (FALLTHRU_EDGE (skip_dest_bb)->dest);
+	      gcc_assert (inner->entry == bb);
+	      inner->entry = skip_dest_bb;
+	    }
+	}
+    }
+  else if (single_succ_p (bb))
+    {
+      edge e = single_succ_edge (bb);
+      skip_dest_bb = e->dest;
+      if (gimple_code (stmt) == GIMPLE_GOTO)
+	gsi_prev (&gsi);
+      if (gsi_stmt (gsi) == 0)
+	return;
+    }
+
+  if (skip_dest_bb != NULL)
+    {
+      gimple_stmt_iterator head_gsi = gsi_start_bb (bb);
+      gsi_prev (&head_gsi);
+      edge e2 = split_block (bb, gsi_stmt (head_gsi));
+      basic_block cond_bb = e2->src;
+
+      if (adjust_bb_ptr)
+	*adjust_bb_ptr = e2->dest;
+
+      gimple_stmt_iterator tmp_gsi = gsi_last_bb (cond_bb);
+
+      tree decl = builtin_decl_explicit (BUILT_IN_GOACC_TID);
+      gimple call = gimple_build_call (decl, 1, integer_zero_node);
+      tree tmp_var = create_tmp_var (unsigned_type_node);
+      gimple_call_set_lhs (call, tmp_var);
+
+      gsi_insert_after (&tmp_gsi, call, GSI_NEW_STMT);
+
+      tree cond = build2 (EQ_EXPR, boolean_type_node, tmp_var,
+			  fold_convert (unsigned_type_node, integer_zero_node));
+      gimple cond_stmt = gimple_build_cond_empty (cond);
+      gsi_insert_after (&tmp_gsi, cond_stmt, GSI_CONTINUE_LINKING);
+
+      e2->flags = EDGE_TRUE_VALUE;
+      make_edge (cond_bb, skip_dest_bb, EDGE_FALSE_VALUE);
+    }
+}
+
+/* Walk the dominator tree starting at BB to collect basic blocks in
+   WORKLIST which need OpenACC vector predication applied to them.  */
+static void
+find_predicatable_bbs (basic_block bb, vec<basic_block> &worklist)
+{
+  struct omp_region *parent = *bb_region_map->get (bb);
+  if (requires_vector_predicate (parent))
+    worklist.safe_push (bb);
+  basic_block son;
+  for (son = first_dom_son (CDI_DOMINATORS, bb);
+       son;
+       son = next_dom_son (CDI_DOMINATORS, son))
+    find_predicatable_bbs (son, worklist);
+}
+
+/* Apply OpenACC vector predication to all basic blocks.  HEAD_BB is the
+   first.  */
+static void
+predicate_omp_regions (basic_block head_bb)
+{
+  vec<basic_block> worklist = vNULL;
+  find_predicatable_bbs (head_bb, worklist);
+  int i;
+  basic_block bb;
+  FOR_EACH_VEC_ELT (worklist, i, bb)
+    predicate_bb (bb, *bb_region_map->get (bb));
+}
+
 /* Main entry point for expanding OMP-GIMPLE into runtime calls.  */
 
 static unsigned int
 execute_expand_omp (void)
 {
-  build_omp_regions ();
+  bb_region_map = new hash_map<basic_block, omp_region *>;
 
-  if (!root_omp_region)
-    return 0;
+  build_omp_regions ();
 
-  if (dump_file)
+  if (root_omp_region)
     {
-      fprintf (dump_file, "\nOMP region tree\n\n");
-      dump_omp_region (dump_file, root_omp_region, 0);
-      fprintf (dump_file, "\n");
-    }
+      if (dump_file)
+	{
+	  fprintf (dump_file, "\nOMP region tree\n\n");
+	  dump_omp_region (dump_file, root_omp_region, 0);
+	  fprintf (dump_file, "\n");
+	}
 
-  remove_exit_barriers (root_omp_region);
+      predicate_omp_regions (ENTRY_BLOCK_PTR_FOR_FN (cfun));
 
-  expand_omp (root_omp_region);
+      remove_exit_barriers (root_omp_region);
 
-  cleanup_tree_cfg ();
+      expand_omp (root_omp_region);
 
-  free_omp_regions ();
+      cleanup_tree_cfg ();
 
+      free_omp_regions ();
+    }
+
+  delete bb_region_map;
   return 0;
 }
 

Reply via email to