Re: [og7] vector_length extension part 2: Generalize state propagation and synchronization

Tom de Vries Fri, 23 Mar 2018 07:18:28 -0700

On 03/02/2018 05:55 PM, Cesar Philippidis wrote:

diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md
index 28ae263c867..ac2731233dd 100644
--- a/gcc/config/nvptx/nvptx.md
+++ b/gcc/config/nvptx/nvptx.md
@@ -1418,10 +1418,16 @@
    [(set_attr "atomic" "true")])

(define_insn "nvptx_barsync"

-  [(unspec_volatile [(match_operand:SI 0 "const_int_operand" "")]
+  [(unspec_volatile [(match_operand:SI 0 "nvptx_nonmemory_operand" "Ri")
+                    (match_operand:SI 1 "const_int_operand")]
                    UNSPECV_BARSYNC)]
    ""
-  "\\tbar.sync\\t%0;"
+  {
+    if (!REG_P (operands[0]))
+      return "\\tbar.sync\\t%0;";
+    else
+      return "\\tbar.sync\\t%0, %1;";
+  }
    [(set_attr "predicable" "false")])

This is wrong. The first operand can be a register or a constant, and the second operand is independent. Whether or not we print the second operand is independent of whether the first is a register.

In this patch I've reserved INTVAL (operands[1]) == 0 for the "no second operand" case.


Committed.

Thanks,
- Tom

[nvptx] Add thread count parm to bar.sync

2018-03-23  Tom de Vries  <t...@codesourcery.com>

	* config/nvptx/nvptx.md (nvptx_barsync): Add and handle operand.
	* config/nvptx/nvptx.c (nvptx_cta_sync): Change arguments to take in a
	lock and thread count.  Update call to gen_nvptx_barsync.
	(nvptx_single, nvptx_process_pars): Update calls to nvptx_cta_sync.

---
 gcc/config/nvptx/nvptx.c  | 22 ++++++++++++++--------
 gcc/config/nvptx/nvptx.md | 10 ++++++++--
 3 files changed, 29 insertions(+), 10 deletions(-)

diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index 12441cb..32f2efb 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -3939,13 +3939,14 @@ nvptx_shared_propagate (bool pre_p, bool is_call, basic_block block,
   return empty;
 }
 
-/* Emit a CTA-level synchronization barrier.  We use different
-   markers for before and after synchronizations.  */
+/* Emit a CTA-level synchronization barrier (bar.sync).  LOCK is the
+   barrier number, which is an integer or a register.  THREADS is the
+   number of threads controlled by the barrier.  */
 
 static rtx
-nvptx_cta_sync (bool after)
+nvptx_cta_sync (rtx lock, int threads)
 {
-  return gen_nvptx_barsync (GEN_INT (after));
+  return gen_nvptx_barsync (lock, GEN_INT (threads));
 }
 
 #if WORKAROUND_PTXJIT_BUG
@@ -4195,6 +4196,8 @@ nvptx_single (unsigned mask, basic_block from, basic_block to)
 	  /* Includes worker mode, do spill & fill.  By construction
 	     we should never have worker mode only. */
 	  broadcast_data_t data;
+	  rtx barrier = GEN_INT (0);
+	  int threads = 0;
 
 	  data.base = oacc_bcast_sym;
 	  data.ptr = 0;
@@ -4207,14 +4210,14 @@ nvptx_single (unsigned mask, basic_block from, basic_block to)
 						    false),
 			    before);
 	  /* Barrier so other workers can see the write.  */
-	  emit_insn_before (nvptx_cta_sync (false), tail);
+	  emit_insn_before (nvptx_cta_sync (barrier, threads), tail);
 	  data.offset = 0;
 	  emit_insn_before (nvptx_gen_shared_bcast (pvar, PM_write, 0, &data,
 						    false), tail);
 	  /* This barrier is needed to avoid worker zero clobbering
 	     the broadcast buffer before all the other workers have
 	     had a chance to read this instance of it.  */
-	  emit_insn_before (nvptx_cta_sync (false), tail);
+	  emit_insn_before (nvptx_cta_sync (barrier, threads), tail);
 	}
 
       extract_insn (tail);
@@ -4331,12 +4334,15 @@ nvptx_process_pars (parallel *par)
       bool empty = nvptx_shared_propagate (true, is_call,
 					   par->forked_block, par->fork_insn,
 					   false);
+      rtx barrier = GEN_INT (0);
+      int threads = 0;
 
       if (!empty || !is_call)
 	{
 	  /* Insert begin and end synchronizations.  */
-	  emit_insn_before (nvptx_cta_sync (false), par->forked_insn);
-	  emit_insn_before (nvptx_cta_sync (false), par->join_insn);
+	  emit_insn_before (nvptx_cta_sync (barrier, threads),
+			    par->forked_insn);
+	  emit_insn_before (nvptx_cta_sync (barrier, threads), par->join_insn);
 	}
     }
   else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md
index 2b4bcb3a..2609222 100644
--- a/gcc/config/nvptx/nvptx.md
+++ b/gcc/config/nvptx/nvptx.md
@@ -1421,10 +1421,16 @@
   [(set_attr "atomic" "true")])
 
 (define_insn "nvptx_barsync"
-  [(unspec_volatile [(match_operand:SI 0 "const_int_operand" "")]
+  [(unspec_volatile [(match_operand:SI 0 "nvptx_nonmemory_operand" "Ri")
+		     (match_operand:SI 1 "const_int_operand")]
 		    UNSPECV_BARSYNC)]
   ""
-  "\\tbar.sync\\t%0;"
+  {
+    if (INTVAL (operands[1]) == 0)
+      return "\\tbar.sync\\t%0;";
+    else
+      return "\\tbar.sync\\t%0, %1;";
+  }
   [(set_attr "predicable" "false")])
 
 (define_insn "nvptx_nounroll"

Re: [og7] vector_length extension part 2: Generalize state propagation and synchronization

Reply via email to