On 03/22/2018 09:18 AM, Tom de Vries wrote:

> That's obviously not good enough.
> 
> When I compile this test-case:
> ...
> int
> main (void)
> {
>   int a[10];
> #pragma acc parallel num_workers (16)
> #pragma acc loop worker
>   for (int i = 0; i < 10; i++)
>     a[i] = i;
> 
>   return 0;
> }
> ...
> 
> I get:
> ...
>  .maxntid 32, 16, 1
> ...
> 
> That's the change you need to isolate.

I attached an updated patch which incorporates the
cfun->machine->axis_dim changes. It now generates more precise arguments
for maxntid.

Cesar
>From 11035dc92884146dc4d974156adcb260568db785 Mon Sep 17 00:00:00 2001
From: Cesar Philippidis <ce...@codesourcery.com>
Date: Thu, 22 Mar 2018 08:05:53 -0700
Subject: [PATCH] emit .maxntid hint

---
 gcc/config/nvptx/nvptx.c | 19 +++++++++++++++++++
 gcc/config/nvptx/nvptx.h |  2 ++
 2 files changed, 21 insertions(+)

diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index eff87732c4b..3958f71e995 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -76,6 +76,7 @@
 #include "target-def.h"
 
 #define WORKAROUND_PTXJIT_BUG 1
+#define WORKAROUND_PTXJIT_BUG_3 1
 
 /* Define dimension sizes for known hardware.  */
 #define PTX_VECTOR_LENGTH 32
@@ -1219,6 +1220,16 @@ nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
      stream, in order to share the prototype writing code.  */
   std::stringstream s;
   write_fn_proto (s, true, name, decl);
+
+#if WORKAROUND_PTXJIT_BUG_3
+  /* Emitting a .maxntid seems to have the effect of encouraging the
+     PTX JIT emit SYNC branches.  */
+  if (lookup_attribute ("omp target entrypoint", DECL_ATTRIBUTES (decl))
+      && lookup_attribute ("oacc function", DECL_ATTRIBUTES (decl)))
+      s << ".maxntid " << cfun->machine->axis_dim[0] << ", "
+	<< cfun->machine->axis_dim[1] << ", 1\n";
+#endif
+
   s << "{\n";
 
   bool return_in_mem = write_return_type (s, false, result_type);
@@ -2831,6 +2842,11 @@ struct offload_attrs
   int max_workers;
 };
 
+/* Define entries for cfun->machine->axis_dim.  */
+
+#define MACH_VECTOR_LENGTH 0
+#define MACH_MAX_WORKERS 1
+
 struct parallel
 {
   /* Parent parallel.  */
@@ -4525,6 +4541,9 @@ nvptx_reorg (void)
 
       populate_offload_attrs (&oa);
 
+      cfun->machine->axis_dim[MACH_VECTOR_LENGTH] = oa.vector_length;
+      cfun->machine->axis_dim[MACH_MAX_WORKERS] = oa.max_workers;
+
       /* If there is worker neutering, there must be vector
 	 neutering.  Otherwise the hardware will fail.  */
       gcc_assert (!(oa.mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
diff --git a/gcc/config/nvptx/nvptx.h b/gcc/config/nvptx/nvptx.h
index 8a14507c88a..958516da604 100644
--- a/gcc/config/nvptx/nvptx.h
+++ b/gcc/config/nvptx/nvptx.h
@@ -226,6 +226,8 @@ struct GTY(()) machine_function
   int return_mode; /* Return mode of current fn.
 		      (machine_mode not defined yet.) */
   rtx axis_predicate[2]; /* Neutering predicates.  */
+  int axis_dim[2]; /* Maximum number of threads on each axis, dim[0] is
+		      vector_length, dim[1] is num_workers.   */
   rtx unisimt_master; /* 'Master lane index' for -muniform-simt.  */
   rtx unisimt_predicate; /* Predicate for -muniform-simt.  */
   rtx unisimt_location; /* Mask location for -muniform-simt.  */
-- 
2.14.3

Reply via email to