Re: [PATCH] [og10] OpenACC: Remove unnecessary barriers (gimple worker partitioning/broadcast)

2021-09-17 Thread Thomas Schwinge
Hi!

On 2020-06-29T13:17:30-0700, Julian Brown  wrote:
> This is an optimisation for middle-end worker-partitioning support (used
> to support multiple workers on AMD GCN).  At present, [...]

Thanks.  (... but I have not verified the algorithmic/behavioral
changes.)

I've removed (trivial) now-untrue 'ARG_UNUSED' markers.

Already earlier, I had settled on using "gang-private" (instead of
"gangprivate", "gang private", "ganglocal", "gang-local", "gang local",
etc.), so I've made such (trivial) changes.

Pushed to master branch commit 2961ac45b9e19523958757e607d11c5893d6368b
"openacc: Remove unnecessary barriers (gimple worker
partitioning/broadcast)", see attached.


Grüße
 Thomas


-
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 
München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas 
Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht 
München, HRB 106955
>From 2961ac45b9e19523958757e607d11c5893d6368b Mon Sep 17 00:00:00 2001
From: Julian Brown 
Date: Mon, 29 Jun 2020 13:17:30 -0700
Subject: [PATCH] openacc: Remove unnecessary barriers (gimple worker
 partitioning/broadcast)

This is an optimisation for middle-end worker-partitioning support (used
to support multiple workers on AMD GCN).  At present, barriers may be
emitted in cases where they aren't needed and cannot be optimised away.
This patch stops the extraneous barriers from being emitted in the
first place.

One exception to the above (where the barrier is still needed) is for
predicated blocks of code that perform a write to gang-private shared
memory from one worker.  We must execute a barrier before other workers
read that shared memory location.

gcc/
	* config/gcn/gcn.c (gimple.h): Include.
	(gcn_fork_join): Emit barrier for worker-level joins.
	* omp-oacc-neuter-broadcast.cc (find_local_vars_to_propagate): Add
	writes_gang_private bitmap parameter. Set bit for blocks
	containing gang-private variable writes.
	(worker_single_simple): Don't emit barrier after predicated block.
	(worker_single_copy): Don't emit barrier if we're not broadcasting
	anything and the block contains no gang-private writes.
	(neuter_worker_single): Don't predicate blocks that only contain
	NOPs or internal marker functions.  Pass has_gang_private_write
	argument to worker_single_copy.
	(oacc_do_neutering): Add writes_gang_private bitmap handling.
---
 gcc/config/gcn/gcn.c |  11 ++-
 gcc/omp-oacc-neuter-broadcast.cc | 112 ---
 2 files changed, 94 insertions(+), 29 deletions(-)

diff --git a/gcc/config/gcn/gcn.c b/gcc/config/gcn/gcn.c
index b1bfdeac7b6..2a3fc96c1ee 100644
--- a/gcc/config/gcn/gcn.c
+++ b/gcc/config/gcn/gcn.c
@@ -51,6 +51,7 @@
 #include "intl.h"
 #include "rtl-iter.h"
 #include "dwarf2.h"
+#include "gimple.h"
 
 /* This file should be included last.  */
 #include "target-def.h"
@@ -5133,10 +5134,14 @@ gcn_oacc_dim_pos (int dim)
 /* Implement TARGET_GOACC_FORK_JOIN.  */
 
 static bool
-gcn_fork_join (gcall *ARG_UNUSED (call), const int *ARG_UNUSED (dims),
-	   bool ARG_UNUSED (is_fork))
+gcn_fork_join (gcall *call, const int dims[], bool is_fork)
 {
-  /* GCN does not need to expand fork/join markers at the RTL level.  */
+  tree arg = gimple_call_arg (call, 2);
+  unsigned axis = TREE_INT_CST_LOW (arg);
+
+  if (!is_fork && axis == GOMP_DIM_WORKER && dims[axis] != 1)
+return true;
+
   return false;
 }
 
diff --git a/gcc/omp-oacc-neuter-broadcast.cc b/gcc/omp-oacc-neuter-broadcast.cc
index e0bd01311ee..aa5990ed7a1 100644
--- a/gcc/omp-oacc-neuter-broadcast.cc
+++ b/gcc/omp-oacc-neuter-broadcast.cc
@@ -769,16 +769,19 @@ static void
 find_local_vars_to_propagate (parallel_g *par, unsigned outer_mask,
 			  hash_set *partitioned_var_uses,
 			  hash_set *gang_private_vars,
+			  bitmap writes_gang_private,
 			  vec *prop_set)
 {
   unsigned mask = outer_mask | par->mask;
 
   if (par->inner)
 find_local_vars_to_propagate (par->inner, mask, partitioned_var_uses,
-  gang_private_vars, prop_set);
+  gang_private_vars, writes_gang_private,
+  prop_set);
   if (par->next)
 find_local_vars_to_propagate (par->next, outer_mask, partitioned_var_uses,
-  gang_private_vars, prop_set);
+  gang_private_vars, writes_gang_private,
+  prop_set);
 
   if (!(mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
 {
@@ -799,8 +802,7 @@ find_local_vars_to_propagate (parallel_g *par, unsigned outer_mask,
 		  if (!VAR_P (var)
 		  || is_global_var (var)
 		  || AGGREGATE_TYPE_P (TREE_TYPE (var))
-		  || !partitioned_var_uses->contains (var)
-		  || gang_private_vars->contains (var))
+		  || !partitioned_var_uses->contains (var))
 		continue;
 
 		  if (stmt_may_clobber_ref_p (stmt, var))
@@ -814,6 +816,14 @@ find_local_vars_to_propagate (parallel_g *par, unsigned outer_mask,
 			  fprintf (dump_file, "\n");
 			}
 
+		  if (gang_private_vars->contains (var))
+			{
+			  /* If we 

[PATCH] [og10] OpenACC: Remove unnecessary barriers (gimple worker partitioning/broadcast)

2020-06-29 Thread Julian Brown
This is an optimisation for middle-end worker-partitioning support (used
to support multiple workers on AMD GCN).  At present, barriers may be
emitted in cases where they aren't needed and cannot be optimised away.
This patch stops the extraneous barriers from being emitted in the
first place.

One exception to the above (where the barrier is still needed) is for
predicated blocks of code that perform a write to gang-private shared
memory from one worker.  We must execute a barrier before other workers
read that shared memory location.

OK for og10 branch?

Julian

ChangeLog

gcc/
* config/gcn/gcn.c (gimple.h): Include.
(gcn_fork_join): Emit barrier for worker-level joins.
* omp-sese.c (find_local_vars_to_propagate): Add writes_gangprivate
bitmap parameter. Set bit for blocks containing gang-private variable
writes.
(worker_single_simple): Don't emit barrier after predicated block.
(worker_single_copy): Don't emit barrier if we're not broadcasting
anything and the block contains no gang-private writes.
(neuter_worker_single): Don't predicate blocks that only contain NOPs
or internal marker functions.  Pass has_gangprivate_write argument to
worker_single_copy.
(oacc_do_neutering): Add writes_gangprivate bitmap handling.
---
 gcc/config/gcn/gcn.c |   9 +++-
 gcc/omp-sese.c   | 115 +--
 2 files changed, 97 insertions(+), 27 deletions(-)

diff --git a/gcc/config/gcn/gcn.c b/gcc/config/gcn/gcn.c
index bf996b461547..35b2ef5e752b 100644
--- a/gcc/config/gcn/gcn.c
+++ b/gcc/config/gcn/gcn.c
@@ -50,6 +50,7 @@
 #include "varasm.h"
 #include "intl.h"
 #include "rtl-iter.h"
+#include "gimple.h"
 
 /* This file should be included last.  */
 #include "target-def.h"
@@ -4898,9 +4899,15 @@ gcn_oacc_dim_pos (int dim)
 /* Implement TARGET_GOACC_FORK_JOIN.  */
 
 static bool
-gcn_fork_join (gcall *ARG_UNUSED (call), const int *ARG_UNUSED (dims),
+gcn_fork_join (gcall *call, const int *ARG_UNUSED (dims),
   bool ARG_UNUSED (is_fork))
 {
+  tree arg = gimple_call_arg (call, 2);
+  unsigned axis = TREE_INT_CST_LOW (arg);
+
+  if (!is_fork && axis == GOMP_DIM_WORKER && dims[axis] != 1)
+return true;
+
   return false;
 }
 
diff --git a/gcc/omp-sese.c b/gcc/omp-sese.c
index 4dd3417066c6..80697358efec 100644
--- a/gcc/omp-sese.c
+++ b/gcc/omp-sese.c
@@ -768,16 +768,19 @@ static void
 find_local_vars_to_propagate (parallel_g *par, unsigned outer_mask,
  hash_set *partitioned_var_uses,
  hash_set *gangprivate_vars,
+ bitmap writes_gangprivate,
  vec *prop_set)
 {
   unsigned mask = outer_mask | par->mask;
 
   if (par->inner)
 find_local_vars_to_propagate (par->inner, mask, partitioned_var_uses,
- gangprivate_vars, prop_set);
+ gangprivate_vars, writes_gangprivate,
+ prop_set);
   if (par->next)
 find_local_vars_to_propagate (par->next, outer_mask, partitioned_var_uses,
- gangprivate_vars, prop_set);
+ gangprivate_vars, writes_gangprivate,
+ prop_set);
 
   if (!(mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
 {
@@ -798,8 +801,7 @@ find_local_vars_to_propagate (parallel_g *par, unsigned 
outer_mask,
  if (!VAR_P (var)
  || is_global_var (var)
  || AGGREGATE_TYPE_P (TREE_TYPE (var))
- || !partitioned_var_uses->contains (var)
- || gangprivate_vars->contains (var))
+ || !partitioned_var_uses->contains (var))
continue;
 
  if (stmt_may_clobber_ref_p (stmt, var))
@@ -813,6 +815,14 @@ find_local_vars_to_propagate (parallel_g *par, unsigned 
outer_mask,
  fprintf (dump_file, "\n");
}
 
+ if (gangprivate_vars->contains (var))
+   {
+ /* If we write a gang-private variable, we want a
+barrier at the end of the block.  */
+ bitmap_set_bit (writes_gangprivate, block->index);
+ continue;
+   }
+
  if (!(*prop_set)[block->index])
(*prop_set)[block->index] = new propagation_set;
 
@@ -924,14 +934,6 @@ worker_single_simple (basic_block from, basic_block to,
}
}
 }
-
-  gsi = gsi_start_bb (skip_block);
-
-  decl = builtin_decl_explicit (BUILT_IN_GOACC_BARRIER);
-  gimple *acc_bar = gimple_build_call (decl, 0);
-
-  gsi_insert_before (&gsi, acc_bar, GSI_SAME_STMT);
-  update_stmt (acc_bar);
 }
 
 /* This is a copied and renamed omp-low.c:omp_build_component_ref.  *