On 03/22/2018 09:18 AM, Tom de Vries wrote: > That's obviously not good enough. > > When I compile this test-case: > ... > int > main (void) > { > int a[10]; > #pragma acc parallel num_workers (16) > #pragma acc loop worker > for (int i = 0; i < 10; i++) > a[i] = i; > > return 0; > } > ... > > I get: > ... > .maxntid 32, 16, 1 > ... > > That's the change you need to isolate.
I attached an updated patch which incorporates the cfun->machine->axis_dim changes. It now generates more precise arguments for maxntid. Cesar
>From 11035dc92884146dc4d974156adcb260568db785 Mon Sep 17 00:00:00 2001 From: Cesar Philippidis <ce...@codesourcery.com> Date: Thu, 22 Mar 2018 08:05:53 -0700 Subject: [PATCH] emit .maxntid hint --- gcc/config/nvptx/nvptx.c | 19 +++++++++++++++++++ gcc/config/nvptx/nvptx.h | 2 ++ 2 files changed, 21 insertions(+) diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c index eff87732c4b..3958f71e995 100644 --- a/gcc/config/nvptx/nvptx.c +++ b/gcc/config/nvptx/nvptx.c @@ -76,6 +76,7 @@ #include "target-def.h" #define WORKAROUND_PTXJIT_BUG 1 +#define WORKAROUND_PTXJIT_BUG_3 1 /* Define dimension sizes for known hardware. */ #define PTX_VECTOR_LENGTH 32 @@ -1219,6 +1220,16 @@ nvptx_declare_function_name (FILE *file, const char *name, const_tree decl) stream, in order to share the prototype writing code. */ std::stringstream s; write_fn_proto (s, true, name, decl); + +#if WORKAROUND_PTXJIT_BUG_3 + /* Emitting a .maxntid seems to have the effect of encouraging the + PTX JIT emit SYNC branches. */ + if (lookup_attribute ("omp target entrypoint", DECL_ATTRIBUTES (decl)) + && lookup_attribute ("oacc function", DECL_ATTRIBUTES (decl))) + s << ".maxntid " << cfun->machine->axis_dim[0] << ", " + << cfun->machine->axis_dim[1] << ", 1\n"; +#endif + s << "{\n"; bool return_in_mem = write_return_type (s, false, result_type); @@ -2831,6 +2842,11 @@ struct offload_attrs int max_workers; }; +/* Define entries for cfun->machine->axis_dim. */ + +#define MACH_VECTOR_LENGTH 0 +#define MACH_MAX_WORKERS 1 + struct parallel { /* Parent parallel. */ @@ -4525,6 +4541,9 @@ nvptx_reorg (void) populate_offload_attrs (&oa); + cfun->machine->axis_dim[MACH_VECTOR_LENGTH] = oa.vector_length; + cfun->machine->axis_dim[MACH_MAX_WORKERS] = oa.max_workers; + /* If there is worker neutering, there must be vector neutering. Otherwise the hardware will fail. */ gcc_assert (!(oa.mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) diff --git a/gcc/config/nvptx/nvptx.h b/gcc/config/nvptx/nvptx.h index 8a14507c88a..958516da604 100644 --- a/gcc/config/nvptx/nvptx.h +++ b/gcc/config/nvptx/nvptx.h @@ -226,6 +226,8 @@ struct GTY(()) machine_function int return_mode; /* Return mode of current fn. (machine_mode not defined yet.) */ rtx axis_predicate[2]; /* Neutering predicates. */ + int axis_dim[2]; /* Maximum number of threads on each axis, dim[0] is + vector_length, dim[1] is num_workers. */ rtx unisimt_master; /* 'Master lane index' for -muniform-simt. */ rtx unisimt_predicate; /* Predicate for -muniform-simt. */ rtx unisimt_location; /* Mask location for -muniform-simt. */ -- 2.14.3