Re: [og7] vector_length extension part 4: target hooks and automatic parallelism

2018-04-05 Thread Tom de Vries

On 03/02/2018 08:18 PM, Cesar Philippidis wrote:

The attached patch adjusts the existing goacc validate_dims target hook
and introduces a new goacc adjust_parallelism target hook.


The attached patch now just introduces the nvptx_adjust_parallelism 
target hook implementation, which enables test-cases to start using the 
feature.


Build x86_64 with nvptx accelerator and tested libgomp.

Committed.

Thanks,
- Tom
[nvptx] Enable large vectors

2018-04-05  Cesar Philippidis  
	Tom de Vries  

	* omp-offload.c (oacc_get_default_dim): New function.
	* omp-offload.h (oacc_get_default_dim): Declare.
	* config/nvptx/nvptx.c (NVPTX_GOACC_VL_WARP): Define.
	(nvptx_goacc_needs_vl_warp): New function.
	(nvptx_goacc_validate_dims): Take larger vector lengths into
	account.
	(nvptx_adjust_parallelism): New function.
	(TARGET_GOACC_ADJUST_PARALLELISM): Define.
	(populate_offload_attrs): Handle the situation where the default
	runtime geometry has not been initialized yet for reductions.

	* testsuite/libgomp.oacc-c-c++-common/vector-length-128-1.c: Expect
	vector length to be 128.
	* testsuite/libgomp.oacc-c-c++-common/vector-length-128-10.c: Same.
	* testsuite/libgomp.oacc-c-c++-common/vector-length-128-2.c: Same.
	* testsuite/libgomp.oacc-c-c++-common/vred2d-128.c: Same.
	* testsuite/libgomp.oacc-fortran/gemm.f90: Same.

---
 gcc/config/nvptx/nvptx.c   | 148 +++--
 gcc/omp-offload.c  |   7 +
 gcc/omp-offload.h  |   2 +
 .../vector-length-128-1.c  |   5 +-
 .../vector-length-128-10.c |   1 -
 .../vector-length-128-2.c  |   5 +-
 .../libgomp.oacc-c-c++-common/vred2d-128.c |   2 -
 libgomp/testsuite/libgomp.oacc-fortran/gemm.f90|   1 -
 8 files changed, 153 insertions(+), 18 deletions(-)

diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index 51bd69d..595413a 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -71,6 +71,7 @@
 #include "fold-const.h"
 #include "intl.h"
 #include "tree-hash-traits.h"
+#include "omp-offload.h"
 
 /* This file should be included last.  */
 #include "target-def.h"
@@ -4634,15 +4635,20 @@ populate_offload_attrs (offload_attrs *oa)
   if (oa->vector_length == 0)
 {
   /* FIXME: Need a more graceful way to handle large vector
-	 lengths in OpenACC routines.  */
+	 lengths in OpenACC routines and also -fopenacc-dims.  */
   if (!lookup_attribute ("omp target entrypoint",
 			 DECL_ATTRIBUTES (current_function_decl)))
 	oa->vector_length = PTX_WARP_SIZE;
-  else
+  else if (PTX_VECTOR_LENGTH != PTX_WARP_SIZE)
 	oa->vector_length = PTX_VECTOR_LENGTH;
 }
   if (oa->num_workers == 0)
-oa->max_workers = PTX_CTA_SIZE / oa->vector_length;
+{
+  if (oa->vector_length == 0)
+	oa->max_workers = PTX_WORKER_LENGTH;
+  else
+	oa->max_workers = PTX_CTA_SIZE / oa->vector_length;
+}
   else
 oa->max_workers = oa->num_workers;
 }
@@ -5193,6 +5199,19 @@ nvptx_simt_vf ()
   return PTX_WARP_SIZE;
 }
 
+#define NVPTX_GOACC_VL_WARP "nvptx vl warp"
+
+/* Return true of the offloaded function needs a vector_length of
+   PTX_WARP_SIZE.  */
+
+static bool
+nvptx_goacc_needs_vl_warp ()
+{
+  tree attr = lookup_attribute (NVPTX_GOACC_VL_WARP,
+DECL_ATTRIBUTES (current_function_decl));
+  return attr != NULL_TREE;
+}
+
 /* Validate compute dimensions of an OpenACC offload or routine, fill
in non-unity defaults.  FN_LEVEL indicates the level at which a
routine might spawn a loop.  It is negative for non-routines.  If
@@ -5201,6 +5220,14 @@ nvptx_simt_vf ()
 static bool
 nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level)
 {
+  int default_vector_length = PTX_VECTOR_LENGTH;
+
+  /* For capability reasons, fallback to vl = 32 for runtime values.  */
+  if (dims[GOMP_DIM_VECTOR] == 0)
+default_vector_length = PTX_WARP_SIZE;
+  else if (decl)
+default_vector_length = oacc_get_default_dim (GOMP_DIM_VECTOR);
+
   /* Detect if a function is unsuitable for offloading.  */
   if (!flag_offload_force && decl)
 {
@@ -5225,18 +5252,20 @@ nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level)
 
   bool changed = false;
 
-  /* The vector size must be 32, unless this is a SEQ routine.  */
+  /* The vector size must be a positive multiple of the warp size,
+ unless this is a SEQ routine.  */
   if (fn_level <= GOMP_DIM_VECTOR && fn_level >= -1
   && dims[GOMP_DIM_VECTOR] >= 0
-  && dims[GOMP_DIM_VECTOR] != PTX_VECTOR_LENGTH)
+  && (dims[GOMP_DIM_VECTOR] % 32 != 0
+	  || dims[GOMP_DIM_VECTOR] == 0))
 {
   if (fn_level < 0 && dims[GOMP_DIM_VECTOR] >= 0)
 	warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
 		dims[GOMP_DIM_VECTOR]
 		? G_("using vector_length (%d), ignoring %d")
 		: G_("using vector_length (%d), 

Re: [og7] vector_length extension part 4: target hooks and automatic parallelism

2018-03-27 Thread Tom de Vries

On 03/26/2018 06:33 PM, Tom de Vries wrote:

+  loop->mask = targetm.goacc.adjust_parallelism (loop->mask, outer_mask);
loop->mask |= this_mask;


I committed the above, but the original:
...

@@ -1397,6 +1407,8 @@ oacc_loop_auto_partitions (oacc_loop *loop, unsigned 
outer_mask,
}
 
   loop->mask |= this_mask;

+  loop->mask = targetm.goacc.adjust_parallelism (loop->mask, outer_mask);
+
   if (!loop->mask && noisy)
warning_at (loop->loc, 0,
tiling

...
has the two loop->mask lines in the reverse order.

Fixed in attached patch.

Committed.

Thanks,
- Tom
[openacc] Fix adjust_parallism usage in oacc_loop_auto_partitions

2018-03-27  Tom de Vries  

	* omp-offload.c (oacc_loop_auto_partitions): Fix adjust_parallism usage.

---
 gcc/omp-offload.c | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/gcc/omp-offload.c b/gcc/omp-offload.c
index aa4de24..ed17160 100644
--- a/gcc/omp-offload.c
+++ b/gcc/omp-offload.c
@@ -1404,8 +1404,8 @@ oacc_loop_auto_partitions (oacc_loop *loop, unsigned outer_mask,
 			" to parallelize element loop");
 	}
 
-  loop->mask = targetm.goacc.adjust_parallelism (loop->mask, outer_mask);
   loop->mask |= this_mask;
+  loop->mask = targetm.goacc.adjust_parallelism (loop->mask, outer_mask);
 
   if (!loop->mask && noisy)
 	warning_at (loop->loc, 0,


Re: [og7] vector_length extension part 4: target hooks and automatic parallelism

2018-03-26 Thread Tom de Vries

On 03/02/2018 08:18 PM, Cesar Philippidis wrote:

The attached patch adjusts the existing goacc validate_dims target hook


This is overkill. All we need is a function
"int oacc_get_default_dim (int dim)".

Thanks,
- Tom


Re: [og7] vector_length extension part 4: target hooks and automatic parallelism

2018-03-26 Thread Tom de Vries

On 03/02/2018 08:18 PM, Cesar Philippidis wrote:

introduces a new goacc adjust_parallelism target hook.


That's another separate patch.

Committed.

Thanks,
- Tom
[openacc] Add target hook TARGET_GOACC_ADJUST_PARALLELISM

2018-03-26  Cesar Philippidis  
	Tom de Vries  

	* doc/tm.texi.in: Add placeholder for TARGET_GOACC_ADJUST_PARALLELISM.
	* doc/tm.texi: Regenerate.
	* omp-offload.c (oacc_loop_fixed_partitions): Use the adjust_parallelism
	hook to modify this_mask.
	(oacc_loop_auto_partitions): Use the adjust_parallelism hook to modify
	this_mask and loop->mask.
	(default_goacc_adjust_parallelism): New function.
	* target.def (adjust_parallelism): New hook.
	* targhooks.h (default_goacc_adjust_parallelism): Declare.

---
 gcc/doc/tm.texi   |  6 ++
 gcc/doc/tm.texi.in|  2 ++
 gcc/omp-offload.c | 19 +++
 gcc/target.def|  8 
 gcc/targhooks.h   |  1 +
 6 files changed, 49 insertions(+)

diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index 0fcb9c6..271eb4d 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -5883,6 +5883,12 @@ This hook should return the maximum size of a particular dimension,
 or zero if unbounded.
 @end deftypefn
 
+@deftypefn {Target Hook} unsigned TARGET_GOACC_ADJUST_PARALLELISM (unsigned @var{this_mask}, unsigned @var{outer_mask})
+This hook allows the accelerator compiler to remove any unused
+parallelism exposed in the current loop @var{THIS_MASK}, and the
+enclosing loop @var{OUTER_MASK}.  It returns an adjusted mask.
+@end deftypefn
+
 @deftypefn {Target Hook} bool TARGET_GOACC_FORK_JOIN (gcall *@var{call}, const int *@var{dims}, bool @var{is_fork})
 This hook can be used to convert IFN_GOACC_FORK and IFN_GOACC_JOIN
 function calls to target-specific gimple, or indicate whether they
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index 4187da1..fc73ad1 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -4298,6 +4298,8 @@ address;  but often a machine-dependent strategy can generate better code.
 
 @hook TARGET_GOACC_DIM_LIMIT
 
+@hook TARGET_GOACC_ADJUST_PARALLELISM
+
 @hook TARGET_GOACC_FORK_JOIN
 
 @hook TARGET_GOACC_REDUCTION
diff --git a/gcc/omp-offload.c b/gcc/omp-offload.c
index ba3f431..aa4de24 100644
--- a/gcc/omp-offload.c
+++ b/gcc/omp-offload.c
@@ -1258,6 +1258,13 @@ oacc_loop_fixed_partitions (oacc_loop *loop, unsigned outer_mask)
 	}
 }
 
+  /* FIXME: Ideally, we should be coalescing parallelism here if the
+ hardware supports it.  E.g. Instead of partitioning a loop
+ across worker and vector axes, sometimes the hardware can
+ execute those loops together without resorting to placing
+ extra thread barriers.  */
+  this_mask = targetm.goacc.adjust_parallelism (this_mask, outer_mask);
+
   mask_all |= this_mask;
 
   if (loop->flags & OLF_TILE)
@@ -1349,6 +1356,7 @@ oacc_loop_auto_partitions (oacc_loop *loop, unsigned outer_mask,
 	  this_mask ^= loop->e_mask;
 	}
 
+  this_mask = targetm.goacc.adjust_parallelism (this_mask, outer_mask);
   loop->mask |= this_mask;
 }
 
@@ -1396,7 +1404,9 @@ oacc_loop_auto_partitions (oacc_loop *loop, unsigned outer_mask,
 			" to parallelize element loop");
 	}
 
+  loop->mask = targetm.goacc.adjust_parallelism (loop->mask, outer_mask);
   loop->mask |= this_mask;
+
   if (!loop->mask && noisy)
 	warning_at (loop->loc, 0,
 		tiling
@@ -1774,6 +1784,15 @@ default_goacc_dim_limit (int ARG_UNUSED (axis))
 #endif
 }
 
+/* Default adjustment of loop parallelism is not required.  */
+
+unsigned
+default_goacc_adjust_parallelism (unsigned this_mask,
+  unsigned ARG_UNUSED (outer_mask))
+{
+  return this_mask;
+}
+
 namespace {
 
 const pass_data pass_data_oacc_device_lower =
diff --git a/gcc/target.def b/gcc/target.def
index b302d36..c878fee 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -1697,6 +1697,14 @@ int, (int axis),
 default_goacc_dim_limit)
 
 DEFHOOK
+(adjust_parallelism,
+"This hook allows the accelerator compiler to remove any unused\n\
+parallelism exposed in the current loop @var{THIS_MASK}, and the\n\
+enclosing loop @var{OUTER_MASK}.  It returns an adjusted mask.",
+unsigned, (unsigned this_mask, unsigned outer_mask),
+default_goacc_adjust_parallelism)
+
+DEFHOOK
 (fork_join,
 "This hook can be used to convert IFN_GOACC_FORK and IFN_GOACC_JOIN\n\
 function calls to target-specific gimple, or indicate whether they\n\
diff --git a/gcc/targhooks.h b/gcc/targhooks.h
index 18070df..f4f6864 100644
--- a/gcc/targhooks.h
+++ b/gcc/targhooks.h
@@ -115,6 +115,7 @@ extern bool default_goacc_validate_dims (tree, int [], int);
 extern int default_goacc_dim_limit (int);
 extern bool default_goacc_fork_join (gcall *, const int [], bool);
 extern void default_goacc_reduction (gcall *);
+extern unsigned default_goacc_adjust_parallelism (unsigned, unsigned);
 
 /* These are here, and not in hooks.[ch], because not all users of
hooks.h include tm.h, and thus we 

Re: [og7] vector_length extension part 4: target hooks and automatic parallelism

2018-03-26 Thread Cesar Philippidis
On 03/26/2018 07:14 AM, Tom de Vries wrote:
> On 03/02/2018 08:18 PM, Cesar Philippidis wrote:
>> diff --git a/gcc/omp-offload.c b/gcc/omp-offload.c
>> index ba3f4317f4e..f15ce6b8f8d 100644
>> --- a/gcc/omp-offload.c
>> +++ b/gcc/omp-offload.c
>> @@ -626,7 +626,8 @@ oacc_parse_default_dims (const char *dims)
>>  function.  */
>>     static void
>> -oacc_validate_dims (tree fn, tree attrs, int *dims, int level,
>> unsigned used)
>> +oacc_validate_dims (tree fn, tree attrs, int *dims, int level,
>> unsigned used,
>> +    int * ARG_UNUSED (default_dims))
>>   {
>>     tree purpose[GOMP_DIM_MAX];
>>     unsigned ix;
> 
>> @@ -1604,7 +1616,8 @@ execute_oacc_device_lower ()
>>   }
>>       int dims[GOMP_DIM_MAX];
>> -  oacc_validate_dims (current_function_decl, attrs, dims, fn_level,
>> used_mask);
>> +  oacc_validate_dims (current_function_decl, attrs, dims, fn_level,
>> used_mask,
>> +  NULL);
>>       if (dump_file)
>>   {
> 
> What's the purpose of this unused parameter default_dims, that only ever
> gets to be NULL?

That's stale and can be removed. In an earlier, and more complicated,
version of the patch I was still trying to get large vector lengths to
work with multiple workers.

I'll remove it from my patch.

Thanks,
Cesar


Re: [og7] vector_length extension part 4: target hooks and automatic parallelism

2018-03-26 Thread Tom de Vries

On 03/02/2018 08:18 PM, Cesar Philippidis wrote:

diff --git a/gcc/omp-offload.c b/gcc/omp-offload.c
index ba3f4317f4e..f15ce6b8f8d 100644
--- a/gcc/omp-offload.c
+++ b/gcc/omp-offload.c
@@ -626,7 +626,8 @@ oacc_parse_default_dims (const char *dims)
 function.  */
  
  static void

-oacc_validate_dims (tree fn, tree attrs, int *dims, int level, unsigned used)
+oacc_validate_dims (tree fn, tree attrs, int *dims, int level, unsigned used,
+   int * ARG_UNUSED (default_dims))
  {
tree purpose[GOMP_DIM_MAX];
unsigned ix;



@@ -1604,7 +1616,8 @@ execute_oacc_device_lower ()
  }
  
int dims[GOMP_DIM_MAX];

-  oacc_validate_dims (current_function_decl, attrs, dims, fn_level, used_mask);
+  oacc_validate_dims (current_function_decl, attrs, dims, fn_level, used_mask,
+ NULL);
  
if (dump_file)

  {


What's the purpose of this unused parameter default_dims, that only ever 
gets to be NULL?


Thanks,
- Tom


Re: [og7] vector_length extension part 4: target hooks and automatic parallelism

2018-03-21 Thread Cesar Philippidis
On 03/21/2018 08:49 AM, Tom de Vries wrote:
> On 03/02/2018 08:18 PM, Cesar Philippidis wrote:
> 
>> og7-vl-part4-hooks.diff
> 
>> diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
>> index 5642941c6a3..507c8671704 100644
>> --- a/gcc/config/nvptx/nvptx.c
>> +++ b/gcc/config/nvptx/nvptx.c
>> @@ -5205,14 +5205,36 @@ nvptx_simt_vf ()
>>     return PTX_WARP_SIZE;
>>   }
>>   +#define NVPTX_GOACC_VL_WARP "nvptx vl warp"
>> +
>> +/* Return true of the offloaded function needs a vector_length of
>> +   PTX_WARP_SIZE.  */
>> +
>> +static bool
>> +nvptx_goacc_needs_vl_warp ()
>> +{
>> +  tree attr = lookup_attribute (NVPTX_GOACC_VL_WARP,
>> +    DECL_ATTRIBUTES (current_function_decl));
>> +  return attr == NULL_TREE;
>> +}
>> +
> 
> I just wrote an example using "#pragma acc parallel vector_length (128)"
> and looked at the generated code. I found that the actual vector_length
> was still 32. I tracked this back to this function returning true.
> 
> I think we need "return attr != NULL_TREE".

Yes. Good catch. I've added another test case for this.

Thanks,
Cesar


Re: [og7] vector_length extension part 4: target hooks and automatic parallelism

2018-03-21 Thread Tom de Vries

On 03/02/2018 08:18 PM, Cesar Philippidis wrote:


og7-vl-part4-hooks.diff



diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index 5642941c6a3..507c8671704 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -5205,14 +5205,36 @@ nvptx_simt_vf ()
return PTX_WARP_SIZE;
  }
  
+#define NVPTX_GOACC_VL_WARP "nvptx vl warp"

+
+/* Return true of the offloaded function needs a vector_length of
+   PTX_WARP_SIZE.  */
+
+static bool
+nvptx_goacc_needs_vl_warp ()
+{
+  tree attr = lookup_attribute (NVPTX_GOACC_VL_WARP,
+   DECL_ATTRIBUTES (current_function_decl));
+  return attr == NULL_TREE;
+}
+


I just wrote an example using "#pragma acc parallel vector_length (128)" 
and looked at the generated code. I found that the actual vector_length 
was still 32. I tracked this back to this function returning true.


I think we need "return attr != NULL_TREE".

Thanks,
- Tom



[og7] vector_length extension part 4: target hooks and automatic parallelism

2018-03-02 Thread Cesar Philippidis
The attached patch adjusts the existing goacc validate_dims target hook
and introduces a new goacc adjust_parallelism target hook. Now that
vector length is no longer hard-coded to 32, there are four different
ways to set it:

  1) compiler default
  2) explicitly via the vector_length clause
  3) compile time using -fopenacc-dim or the GOMP_OPENACC_DIM
 environment variable
  4) fallback to vector_length = 32 due to insufficient parallelism

The compiler default is activated in the absence of 2) and 3). It is
controlled by the macro PTX_VECTOR_LENGTH in nvptx.c. While working on
this patch set, I had it set to 128 to get more test coverage. But in
order to maintain backwards compatibility with acc routines (which is
still a work in progress), I've kept the default vector length to 32.
Besides, large vector reductions are expected to run slower until the
parallel reduction finalizer is ready.

The new default_dims arguments to validate_dims represents is necessary
to accommodate option 3) from above. validate_dims is called after
oaccdevlow has assigned parallelism to each acc loop.

Prior to this patch, oaccdevlow automatically assigned parallelism to
acc loops using oacc_loop_fixed_partitions and
oacc_loop_auto_partitions. Both of those functions were
processor-agnostic. In the case of nvptx, due to the current limitations
in this patch set, the nvptx BE needs to fallback to using a
vector_length of 32 whenever a vector loop is nested inside a worker
loop. By supplying the parallelism mask for both the current loop and
the outer loops, the goacc adjust_parallelism hook allows the back ends
to fine tune any parallelism as necessary.

Inside the nvptx BE, nvptx_goacc_adjust_parallelism uses a new "nvptx vl
warp" function attribute to denote that the offloaded function must
fallback to using a vector length of 32. Later,
nvptx_goacc_validate_dims uses the attribute to adjust vector_length
accordingly.

Going forward, in addition to adding a new parallel reduction finalizer,
the nvptx BE would benefit from merging synchronization and reduction
code for combined worker-reduction loops, e.g.

  #pragma acc loop worker vector

At present, GCC partitions acc loops with internal function markers for
each level of parallelism associated with the loop. If a loop has both
worker and vector level parallelism, it will have a dummy outer worker
loop, and dummy inner vector loop. On CUDA hardware, there's no strong
difference between workers and vectors as CUDA blocks are a loose
collection of warps. Therefore, it would make more sense to merge the
two loops together into a special WV loop. That would at least require
some changes in the BE in addition to oacc_loop_{auto,fixed}_partitions.
There were some problems in the past where CUDA hardware would lock up
because the synchronization requirements for those two levels of
parallelism. Merging them ought to simplify the synchronization code and
enable the PTX JIT to generate better code.

Overall, the changes in this patch are mild. I'll apply it to
openacc-gcc-7-branch after Tom approves the reduction patch.

Cesar

2018-03-02  Cesar Philippidis  

	gcc/
	* config/nvptx/nvptx.c (NVPTX_GOACC_VL_WARP): Define.
	(nvptx_goacc_needs_vl_warp): New function.
	(nvptx_goacc_validate_dims): Add new default_dims argument and take
	larger vector lengths into account.
	(nvptx_adjust_parallelism): New function.
	(TARGET_GOACC_ADJUST_PARALLELISM): Define.
	* doc/tm.texi: Regenerate.
	* doc/tm.texi.in: Add placeholder for TARGET_GOACC_ADJUST_PARALLELISM.
	* omp-offload.c (oacc_parse_default_dims): Update usage of the
	targetm.goacc_valdate_dims hook.
	(oacc_validate_dims): Add default_dims argument.
	(oacc_loop_fixed_partitions): Use the adjust_parallelism hook to
	modify this_mask.
	(oacc_loop_auto_partitions): Use the adjust_parallelism hook to
	modify this_mask and loop->mask.
	(execute_oacc_device_lower): Update call to oacc_validate_dims.
	(default_goacc_adjust_parallelism): New function.
	* target.def (validate_dims): Add new default_dims argument.
	(adjust_parallelism): New hook.
	* targhooks.h (default_goacc_validate_dims): Add new argument.
	(default_goacc_adjust_parallelism): Declare.

>From 1ee16b267dfbb0a148e8ec3b83ca463c21cbac1d Mon Sep 17 00:00:00 2001
From: Cesar Philippidis 
Date: Fri, 2 Mar 2018 10:08:23 -0800
Subject: [PATCH] New target hooks

---
 gcc/config/nvptx/nvptx.c | 139 +--
 gcc/doc/tm.texi  |  15 +++--
 gcc/doc/tm.texi.in   |   2 +
 gcc/omp-offload.c|  35 ++--
 gcc/target.def   |  17 --
 gcc/targhooks.h  |   3 +-
 6 files changed, 190 insertions(+), 21 deletions(-)

diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index 5642941c6a3..507c8671704 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -5205,14 +5205,36 @@ nvptx_simt_vf ()
   return PTX_WARP_SIZE;
 }
 
+#define