Re: [og7] vector_length extension part 3: reductions

2018-04-05 Thread Tom de Vries

On 03/02/2018 06:51 PM, Cesar Philippidis wrote:

This patch teaches the nvptx BE how to process vector reductions with
large vector lengths.


As with the "[nvptx] Generalize state propagation and synchronization"
patch:

- added use of MAX and ROUND_UP
- added missing initialization of vector_red_partition
- added assert checking vector_red_partition and vector_red_size

Also:
- added FIXME for hack in nvptx_declare_function_name


Built x86_64 with nvptx accelerator and tested libgomp.

Committed.

Thanks,
- Tom
[nvptx] Handle large vector reductions

2018-04-05  Cesar Philippidis  
	Tom de Vries  

	* config/nvptx/nvptx-protos.h (nvptx_output_red_partition): Declare.
	* config/nvptx/nvptx.c (vector_red_size, vector_red_align,
	vector_red_partition, vector_red_sym): New global variables.
	(nvptx_option_override): Initialize vector_red_sym.
	(nvptx_declare_function_name): Restore red_partition register.
	(nvptx_file_end): Emit code to declare the vector reduction variables.
	(nvptx_output_red_partition): New function.
	(nvptx_expand_shared_addr): Add vector argument. Use it to handle
	large vector reductions.
	(enum nvptx_builtins): Add NVPTX_BUILTIN_VECTOR_ADDR.
	(nvptx_init_builtins): Add VECTOR_ADDR.
	(nvptx_expand_builtin): Update call to nvptx_expand_shared_addr.
	Handle nvptx_expand_shared_addr.
	(nvptx_get_shared_red_addr): Add vector argument and handle large
	vectors.
	(nvptx_goacc_reduction_setup): Add offload_attrs argument and handle
	large vectors.
	(nvptx_goacc_reduction_init): Likewise.
	(nvptx_goacc_reduction_fini): Likewise.
	(nvptx_goacc_reduction_teardown): Likewise.
	(nvptx_goacc_reduction): Update calls to nvptx_goacc_reduction_{setup,
	init,fini,teardown}.
	(nvptx_init_axis_predicate): Initialize vector_red_partition.
	(nvptx_set_current_function): Init vector_red_partition.
	* config/nvptx/nvptx.md (UNSPECV_RED_PART): New unspecv.
	(nvptx_red_partition): New insn.
	* config/nvptx/nvptx.h (struct machine_function): Add red_partition.

---
 gcc/config/nvptx/nvptx-protos.h |   1 +
 gcc/config/nvptx/nvptx.c| 154 
 gcc/config/nvptx/nvptx.h|   2 +
 gcc/config/nvptx/nvptx.md   |  12 
 4 files changed, 140 insertions(+), 29 deletions(-)

diff --git a/gcc/config/nvptx/nvptx-protos.h b/gcc/config/nvptx/nvptx-protos.h
index 16b316f..326c38c 100644
--- a/gcc/config/nvptx/nvptx-protos.h
+++ b/gcc/config/nvptx/nvptx-protos.h
@@ -55,5 +55,6 @@ extern const char *nvptx_output_return (void);
 extern const char *nvptx_output_set_softstack (unsigned);
 extern const char *nvptx_output_simt_enter (rtx, rtx, rtx);
 extern const char *nvptx_output_simt_exit (rtx);
+extern const char *nvptx_output_red_partition (rtx, rtx);
 #endif
 #endif
diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index 009ca59..51bd69d 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -143,6 +143,14 @@ static unsigned worker_red_size;
 static unsigned worker_red_align;
 static GTY(()) rtx worker_red_sym;
 
+/* Buffer needed for vector reductions, when vector_length >
+   PTX_WARP_SIZE.  This has to be distinct from the worker broadcast
+   array, as both may be live concurrently.  */
+static unsigned vector_red_size;
+static unsigned vector_red_align;
+static unsigned vector_red_partition;
+static GTY(()) rtx vector_red_sym;
+
 /* Shared memory block for gang-private variables.  */
 static unsigned gangprivate_shared_size;
 static unsigned gangprivate_shared_align;
@@ -219,6 +227,11 @@ nvptx_option_override (void)
   SET_SYMBOL_DATA_AREA (worker_red_sym, DATA_AREA_SHARED);
   worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
 
+  vector_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__vector_red");
+  SET_SYMBOL_DATA_AREA (vector_red_sym, DATA_AREA_SHARED);
+  vector_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
+  vector_red_partition = 0;
+
   gangprivate_shared_sym = gen_rtx_SYMBOL_REF (Pmode, "__gangprivate_shared");
   SET_SYMBOL_DATA_AREA (gangprivate_shared_sym, DATA_AREA_SHARED);
   gangprivate_shared_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
@@ -1096,8 +1109,25 @@ nvptx_init_axis_predicate (FILE *file, int regno, const char *name)
 {
   fprintf (file, "\t{\n");
   fprintf (file, "\t\t.reg.u32\t%%%s;\n", name);
+  if (strcmp (name, "x") == 0 && cfun->machine->red_partition)
+{
+  fprintf (file, "\t\t.reg.u64\t%%t_red;\n");
+  fprintf (file, "\t\t.reg.u64\t%%y64;\n");
+}
   fprintf (file, "\t\tmov.u32\t%%%s, %%tid.%s;\n", name, name);
   fprintf (file, "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n", regno, name);
+  if (strcmp (name, "x") == 0 && cfun->machine->red_partition)
+{
+  fprintf (file, "\t\tcvt.u64.u32\t%%y64, %%tid.y;\n");
+  fprintf (file, "\t\tcvta.shared.u64\t%%t_red, __vector_red;\n");
+  fprintf (file, "\t\tmad.lo.u64\t%%r%d, %%y64, %d, %%t_red; "
+	   "// vector reduction buffer\n",
+	   REGNO 

Re: [og7] vector_length extension part 3: reductions

2018-04-05 Thread Tom de Vries

On 03/02/2018 06:51 PM, Cesar Philippidis wrote:

This patch teaches the nvptx BE how to process vector reductions with
large vector lengths.


Committed test-case exercising large vector length with reductions.

Thanks,
- Tom
[openacc] Add vector-length-128-10.c

2018-04-05  Tom de Vries  

	* testsuite/libgomp.oacc-c-c++-common/vector-length-128-10.c: New test.

---
 .../vector-length-128-10.c | 40 ++
 1 file changed, 40 insertions(+)

diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/vector-length-128-10.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/vector-length-128-10.c
new file mode 100644
index 000..e46b5cf
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/vector-length-128-10.c
@@ -0,0 +1,40 @@
+/* { dg-do run } */
+
+#include <stdlib.h>
+
+#define N 1024
+
+unsigned int a[N];
+unsigned int b[N];
+unsigned int c[N];
+unsigned int n = N;
+
+int
+main (void)
+{
+  for (unsigned int i = 0; i < n; ++i)
+{
+  a[i] = i % 3;
+  b[i] = i % 5;
+}
+
+  unsigned int res = 1;
+  unsigned long long res2 = 1;
+#pragma acc parallel vector_length (128) copyin (a,b) reduction (+:res, res2) copy (res, res2)
+  {
+#pragma acc loop vector reduction (+:res, res2)
+for (unsigned int i = 0; i < n; i++)
+  {
+	res += ((a[i] + b[i]) % 2);
+	res2 += ((a[i] + b[i]) % 2);
+  }
+  }
+
+  if (res != 478)
+abort ();
+  if (res2 != 478)
+abort ();
+
+  return 0;
+}
+/* { dg-prune-output "using vector_length \\(32\\), ignoring 128" } */


[og7] vector_length extension part 3: reductions

2018-03-02 Thread Cesar Philippidis
This patch teaches the nvptx BE how to process vector reductions with
large vector lengths. The original vector reduction finalizer won't work
because it uses warp shuffle operations. Now that vectors may contain
multiple warps, they need to store the partial reductions into
shared-memory like workers. Once the reduction variable is placed in
shared-memory, it will use the same atomic finalizer to update it as the
workers.

Much like the shared-memory spill-and-fill vector state propagation
extension, the nvptx BE needs to reserve enough shared-memory for each
worker that may encounter a vector reduction. That's why the reduction
functions have been augmented with an offload_attrs argument. The
offload_attrs contains a max_workers field. Unlike vector_length, which
is fixed as a compile-time constant, num_workers can be altered
dynamically at runtime. Given that the size of a CUDA block is fixed,
max_workers is set to max_block_size / vector_length. This will be
discussed further in the next patch.

Effectively, the nvptx BE will now maintain a shared-memory reduction
buffer, named vector_red_sym, that contains max_workers logical
reduction partitions, where each partition contains enough shared-memory
for all of the reductions used by a single vector. By design, OpenACC
reductions are expanded relatively early during oaccdevlow. Because
accessing the reduction partition is a common operation, the partition
offset is placed in a register stored in cfun->machine->red_partition and
initialized in nvptx_init_axis_predicate. Due to how late that register
becomes available, nvptx_expand_shared_addr emits a
gen_nvptx_red_partition instruction to acquire the shared-memory address for
the reduction variable indirectly.

You may notice a hack in nvptx_declare_function_name. I observed that
sometimes GCC will mark red_partition as dead and not emit PTX code to
declare it. That's why nvptx_declare_function_name manually inserts it
into regno_reg_rtx prior to declaring all of the PTX registers. I think
there might be something wrong with the nvptx_red_partition instruction.
Tom, can you take a look at it?

Ultimately, I suspect that large workers would greatly benefit by using
a new parallel tree reduction finalizer. Whereas the atomic finalizer
may have been suitable for a maximum of 32 workers, vector_length can be
up to 1024 threads, and a sequential finalizer will be slow. However,
that's a project for another day.

I'll commit this patch to openacc-gcc-7-branch after Tom reviews the new
nvptx_red_partition insn.

Cesar
2018-03-02  Cesar Philippidis  

	gcc/
	* config/nvptx/nvptx-protos.h (nvptx_output_red_partition): Declare.
	* config/nvptx/nvptx.c (vector_red_size, vector_red_align,
	vector_red_partition, vector_red_sym): New global variables.
	(nvptx_option_override): Initialize vector_red_sym.
	(nvptx_declare_function_name): Restore red_partition register.
	(nvptx_file_end): Emit code to declare the vector reduction variables.
	(nvptx_output_red_partition): New function.
	(nvptx_expand_shared_addr): Add vector argument. Use it to handle
	large vector reductions.
	(enum nvptx_builtins): Add NVPTX_BUILTIN_VECTOR_ADDR.
	(nvptx_init_builtins): Add VECTOR_ADDR.
	(nvptx_expand_builtin): Update call to nvptx_expand_shared_addr.
	Handle nvptx_expand_shared_addr.
	(nvptx_get_shared_red_addr): Add vector argument and handle large
	vectors.
	(nvptx_goacc_reduction_setup): Add offload_attrs argument and handle
	large vectors.
	(nvptx_goacc_reduction_init): Likewise.
	(nvptx_goacc_reduction_fini): Likewise.
	(nvptx_goacc_reduction_teardown): Likewise.
	(nvptx_goacc_reduction): Update calls to nvptx_goacc_reduction_{setup,
	init,fini,teardown}.
	* config/nvptx/nvptx.md (UNSPECV_RED_PART): New unspecv.
	(nvptx_red_partition): New insn.

>From 3834101d5144666f30d8798e983e276bd2c66636 Mon Sep 17 00:00:00 2001
From: Cesar Philippidis 
Date: Fri, 2 Mar 2018 07:36:11 -0800
Subject: [PATCH] reductions

---
 gcc/config/nvptx/nvptx-protos.h |   1 +
 gcc/config/nvptx/nvptx.c| 146 +++-
 gcc/config/nvptx/nvptx.md   |  12 
 3 files changed, 128 insertions(+), 31 deletions(-)

diff --git a/gcc/config/nvptx/nvptx-protos.h b/gcc/config/nvptx/nvptx-protos.h
index 16b316f12b8..326c38c5dc7 100644
--- a/gcc/config/nvptx/nvptx-protos.h
+++ b/gcc/config/nvptx/nvptx-protos.h
@@ -55,5 +55,6 @@ extern const char *nvptx_output_return (void);
 extern const char *nvptx_output_set_softstack (unsigned);
 extern const char *nvptx_output_simt_enter (rtx, rtx, rtx);
 extern const char *nvptx_output_simt_exit (rtx);
+extern const char *nvptx_output_red_partition (rtx, rtx);
 #endif
 #endif
diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index 4a48d44f44c..9d77176c638 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -142,6 +142,14 @@ static unsigned worker_red_size;
 static unsigned worker_red_align;
 static GTY(()) rtx