Re: [og7] vector_length extension part 1: generalize function and variable names

2018-03-09 Thread Cesar Philippidis
On 03/09/2018 07:29 AM, Thomas Schwinge wrote:

> On Thu, 1 Mar 2018 13:17:01 -0800, Cesar Philippidis  
> wrote:
>> To reduce the size of the final patch,
>> I've separated all of the misc. function and variable renaming into this
>> patch.
> 
> Yes, please always do such refactoring changes independently of other
> functionality changes.
> 
> 
>> This patch also introduces a new populate_offload_attrs function.
> 
> I'm seeing:
> 
> [...]/gcc/config/nvptx/nvptx.c: In function 'void nvptx_reorg()':
> [...]/gcc/config/nvptx/nvptx.c:4451:3: warning: 
> 'oa.offload_attrs::vector_length' may be used uninitialized in this function 
> [-Wmaybe-uninitialized]
>if (oa->vector_length == 0)
>^
> [...]/gcc/config/nvptx/nvptx.c:4516:21: note: 
> 'oa.offload_attrs::vector_length' was declared here
>offload_attrs oa;
>  ^
> 
> That must be "populate_offload_attrs" inlined into "nvptx_reorg".  I
> can't tell yet why it complains about "vector_length" only but not about
> the others.

I got lazy and just merged that function as-is. That warning will go
away once the reset of the vector length changes are in.

Cesar


Re: [og7] vector_length extension part 1: generalize function and variable names

2018-03-09 Thread Thomas Schwinge
Hi!

On Thu, 1 Mar 2018 13:17:01 -0800, Cesar Philippidis  
wrote:
> To reduce the size of the final patch,
> I've separated all of the misc. function and variable renaming into this
> patch.

Yes, please always do such refactoring changes independently of other
functionality changes.


> This patch also introduces a new populate_offload_attrs function.

I'm seeing:

[...]/gcc/config/nvptx/nvptx.c: In function 'void nvptx_reorg()':
[...]/gcc/config/nvptx/nvptx.c:4451:3: warning: 
'oa.offload_attrs::vector_length' may be used uninitialized in this function 
[-Wmaybe-uninitialized]
   if (oa->vector_length == 0)
   ^
[...]/gcc/config/nvptx/nvptx.c:4516:21: note: 
'oa.offload_attrs::vector_length' was declared here
   offload_attrs oa;
 ^

That must be "populate_offload_attrs" inlined into "nvptx_reorg".  I
can't tell yet why it complains about "vector_length" only but not about
the others.

For reference:

> --- a/gcc/config/nvptx/nvptx.c
> +++ b/gcc/config/nvptx/nvptx.c

> +/* Offloading function attributes.  */
> +
> +struct offload_attrs
> +{
> +  unsigned mask;
> +  int num_gangs;
> +  int num_workers;
> +  int vector_length;
> +  int max_workers;
> +};

> +static void
> +populate_offload_attrs (offload_attrs *oa)
> +{
> +  tree attr = oacc_get_fn_attrib (current_function_decl);
> +  tree dims = TREE_VALUE (attr);
> +  unsigned ix;
> +
> +  oa->mask = 0;
> +
> +  for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
> +{
> +  tree t = TREE_VALUE (dims);
> +  int size = (t == NULL_TREE) ? 0 : TREE_INT_CST_LOW (t);
> +  tree allowed = TREE_PURPOSE (dims);
> +
> +  if (size != 1 && !(allowed && integer_zerop (allowed)))
> + oa->mask |= GOMP_DIM_MASK (ix);
> +
> +  switch (ix)
> + {
> + case GOMP_DIM_GANG:
> +   oa->num_gangs = size;
> +   break;
> +
> + case GOMP_DIM_WORKER:
> +   oa->num_workers = size;
> +   break;
> +
> + case GOMP_DIM_VECTOR:
> +   oa->vector_length = size;
> +   break;
> + }
> +}
> +
> +  if (oa->vector_length == 0)
> +{
> +  /* FIXME: Need a more graceful way to handle large vector
> +  lengths in OpenACC routines.  */
> +  if (!lookup_attribute ("omp target entrypoint",
> +  DECL_ATTRIBUTES (current_function_decl)))
> + oa->vector_length = PTX_WARP_SIZE;
> +  else
> + oa->vector_length = PTX_VECTOR_LENGTH;
> +}
> +  if (oa->num_workers == 0)
> +oa->max_workers = PTX_CTA_SIZE / oa->vector_length;
> +  else
> +oa->max_workers = oa->num_workers;
> +}

> @@ -4435,27 +4513,19 @@ nvptx_reorg (void)
>  {
>/* If we determined this mask before RTL expansion, we could
>elide emission of some levels of forks and joins.  */
> -  unsigned mask = 0;
> -  tree dims = TREE_VALUE (attr);
> -  unsigned ix;
> +  offload_attrs oa;
>  
> -  for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
> - {
> -   int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
> -   tree allowed = TREE_PURPOSE (dims);
> +  populate_offload_attrs ();
>  
> -   if (size != 1 && !(allowed && integer_zerop (allowed)))
> - mask |= GOMP_DIM_MASK (ix);
> - }
>/* If there is worker neutering, there must be vector
>neutering.  Otherwise the hardware will fail.  */
> -  gcc_assert (!(mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
> -   || (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
> +  gcc_assert (!(oa.mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
> +   || (oa.mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
>  
>/* Discover & process partitioned regions.  */
>parallel *pars = nvptx_discover_pars (_insn_map);
>nvptx_process_pars (pars);
> -  nvptx_neuter_pars (pars, mask, 0);
> +  nvptx_neuter_pars (pars, oa.mask, 0);
>delete pars;
>  }


Grüße
 Thomas


[og7] vector_length extension part 1: generalize function and variable names

2018-03-01 Thread Cesar Philippidis
Right now, I'm in the process of adding support for larger
vector_lengths in the nvptx BE. To reduce the size of the final patch,
I've separated all of the misc. function and variable renaming into this
patch. Once the nvptx BE is extended to support multiple CUDA
warps-sized vector lengths, in certain respect vectors will act like
workers with regards to state propagation and reductions (i.e., large
vectors will use shared-memory to propagate state between vector-single
to vector-partitioned modes and also for reductions). To that end, this
patch renames worker functions and variables from worker-something to
shared-something. Likewise, vector specific functions have been renamed
as warp-something.

This patch also introduces a new populate_offload_attrs function. At
present, that function is only used in nvptx_reorg, and it is actually
overkill for that. However, later one it will be used in other places,
including nvptx_validate_dims and the nvptx reduction handling code.

This patch has been committed to openacc-gcc-7-branch.

Cesar
2018-03-01  Cesar Philippidis  

	gcc/
	* config/nvptx/nvptx.c (PTX_VECTOR_LENGTH, PTX_WORKER_LENGTH,
	PTX_DEFAULT_RUNTIME_DIM): Move to the top of the file.
	(PTX_WARP_SIZE): Define.
	(PTX_CTA_SIZE): Define.
	(worker_bcast_size): Rename to oacc_bcast_size.
	(worker_bcast_align): Rename to oacc_bcast_align.
	(worker_bcast_sym): Rename to oacc_bcast_sym.
	(nvptx_option_override): Update usage of oacc_bcast_*.
	(nvptx_gen_wcast): Rename to nvptx_gen_warp_bcast.
	(struct wcast_data_t): Rename to broadcast_data_t.
	(nvptx_gen_wcast): Rename to nvptx_gen_shared_bcast.  Update to use
	oacc_bcast_* variables.
	(struct offload_attrs): New.
	(propagator_fn): Add bool argument.
	(nvptx_propagate): New bool argument.  Pass bool argument to fn.
	(vprop_gen): Rename to warp_prop_gen.  Update call to
	nvptx_gen_warp_bcast.
	(nvptx_vpropagate): Rename to nvptx_warp_propagate. Update call to
	nvptx_propagate.
	(wprop_gen): Rename to shared_prop_gen.  Update usage of oacc_bcast_*
	variables and call to nvptx_gen_shared_bcast.
	(nvptx_wpropagate): Rename to nvptx_shared_propagate.  Update usage
	of oacc_bcast_* variables and call to nvptx_propagate.
	(nvptx_wsync): Rename to nvptx_cta_sync.
	(nvptx_single): Update usage of oacc_bcast_* vars and calls to
	nvptx_gen_warp_bcast, nvptx_gen_shared_bcast and nvptx_cta_sync.
	(nvptx_process_pars): Likewise.
	(nvptx_neuter_pars): Whitespace.
	(populate_offload_attrs): New function.
	(nvptx_reorg): Use it to extract partitioning mask.
	(write_worker_buffer): Rename to write_shared_buffer.
	(nvptx_file_end): Update calls to write_shared_buffer.
	(nvptx_expand_worker_addr): Rename to nvptx_expand_shared_addr.
	(nvptx_expand_builtin): Update call to nvptx_expand_shared_addr.
	(nvptx_simt_vf): Return PTX_WARP_SIZE instead of PTX_VECTOR_LENGTH.
	(nvptx_get_worker_red_addr): Rename to nvptx_get_shared_red_addr.
	(nvptx_goacc_reduction_setup): Update call to
	nvptx_get_shared_red_addr.
	(nvptx_goacc_reduction_fini): Likewise.
	(nvptx_goacc_reduction_teardown): Likewise.


diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index 1d510a7bb7d..b16cf59575c 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -77,6 +77,14 @@
 
 #define WORKAROUND_PTXJIT_BUG 1
 
+/* Define dimension sizes for known hardware.  */
+#define PTX_VECTOR_LENGTH 32
+//#define PTX_VECTOR_LENGTH 128
+#define PTX_WORKER_LENGTH 32
+#define PTX_DEFAULT_RUNTIME_DIM 0 /* Defer to runtime.  */
+#define PTX_WARP_SIZE 32
+#define PTX_CTA_SIZE 1024
+
 /* The various PTX memory areas an object might reside in.  */
 enum nvptx_data_area
 {
@@ -118,14 +126,15 @@ struct tree_hasher : ggc_cache_ptr_hash
 static GTY((cache)) hash_table *declared_fndecls_htab;
 static GTY((cache)) hash_table *needed_fndecls_htab;
 
-/* Buffer needed to broadcast across workers.  This is used for both
-   worker-neutering and worker broadcasting.  It is shared by all
-   functions emitted.  The buffer is placed in shared memory.  It'd be
-   nice if PTX supported common blocks, because then this could be
-   shared across TUs (taking the largest size).  */
-static unsigned worker_bcast_size;
-static unsigned worker_bcast_align;
-static GTY(()) rtx worker_bcast_sym;
+/* Buffer needed to broadcast across workers and vectors.  This is
+   used for both worker-neutering and worker broadcasting, and
+   vector-neutering and boardcasting when vector_length > 32.  It is
+   shared by all functions emitted.  The buffer is placed in shared
+   memory.  It'd be nice if PTX supported common blocks, because then
+   this could be shared across TUs (taking the largest size).  */
+static unsigned oacc_bcast_size;
+static unsigned oacc_bcast_align;
+static GTY(()) rtx oacc_bcast_sym;
 
 /* Buffer needed for worker reductions.  This has to be distinct from
the worker broadcast array, as both may be live concurrently.  */
@@ -198,9 +207,9 @@ nvptx_option_override (void)