Re: [og7] vector_length extension part 3: reductions
On 03/02/2018 06:51 PM, Cesar Philippidis wrote: This patch teaches the nvptx BE how to process vector reductions with large vector lengths. As with the "[nvptx] Generalize state propagation and synchronization" patch: - added use of MAX and ROUND_UP - added missing initialization of vector_red_partition - added assert checking vector_red_partition and vector_red_size Also: - added FIXME for hack in nvptx_declare_function_name Built x86_64 with nvptx accelerator and tested libgomp. Committed. Thanks, - Tom [nvptx] Handle large vector reductions 2018-04-05 Cesar Philippidis, Tom de Vries * config/nvptx/nvptx-protos.h (nvptx_output_red_partition): Declare. * config/nvptx/nvptx.c (vector_red_size, vector_red_align, vector_red_partition, vector_red_sym): New global variables. (nvptx_option_override): Initialize vector_red_sym. (nvptx_declare_function_name): Restore red_partition register. (nvptx_file_end): Emit code to declare the vector reduction variables. (nvptx_output_red_partition): New function. (nvptx_expand_shared_addr): Add vector argument. Use it to handle large vector reductions. (enum nvptx_builtins): Add NVPTX_BUILTIN_VECTOR_ADDR. (nvptx_init_builtins): Add VECTOR_ADDR. (nvptx_expand_builtin): Update call to nvptx_expand_shared_addr. Handle nvptx_expand_shared_addr. (nvptx_get_shared_red_addr): Add vector argument and handle large vectors. (nvptx_goacc_reduction_setup): Add offload_attrs argument and handle large vectors. (nvptx_goacc_reduction_init): Likewise. (nvptx_goacc_reduction_fini): Likewise. (nvptx_goacc_reduction_teardown): Likewise. (nvptx_goacc_reduction): Update calls to nvptx_goacc_reduction_{setup, init,fini,teardown}. (nvptx_init_axis_predicate): Initialize vector_red_partition. (nvptx_set_current_function): Init vector_red_partition. * config/nvptx/nvptx.md (UNSPECV_RED_PART): New unspecv. (nvptx_red_partition): New insn. * config/nvptx/nvptx.h (struct machine_function): Add red_partition. 
--- gcc/config/nvptx/nvptx-protos.h | 1 + gcc/config/nvptx/nvptx.c| 154 gcc/config/nvptx/nvptx.h| 2 + gcc/config/nvptx/nvptx.md | 12 4 files changed, 140 insertions(+), 29 deletions(-) diff --git a/gcc/config/nvptx/nvptx-protos.h b/gcc/config/nvptx/nvptx-protos.h index 16b316f..326c38c 100644 --- a/gcc/config/nvptx/nvptx-protos.h +++ b/gcc/config/nvptx/nvptx-protos.h @@ -55,5 +55,6 @@ extern const char *nvptx_output_return (void); extern const char *nvptx_output_set_softstack (unsigned); extern const char *nvptx_output_simt_enter (rtx, rtx, rtx); extern const char *nvptx_output_simt_exit (rtx); +extern const char *nvptx_output_red_partition (rtx, rtx); #endif #endif diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c index 009ca59..51bd69d 100644 --- a/gcc/config/nvptx/nvptx.c +++ b/gcc/config/nvptx/nvptx.c @@ -143,6 +143,14 @@ static unsigned worker_red_size; static unsigned worker_red_align; static GTY(()) rtx worker_red_sym; +/* Buffer needed for vector reductions, when vector_length > + PTX_WARP_SIZE. This has to be distinct from the worker broadcast + array, as both may be live concurrently. */ +static unsigned vector_red_size; +static unsigned vector_red_align; +static unsigned vector_red_partition; +static GTY(()) rtx vector_red_sym; + /* Shared memory block for gang-private variables. 
*/ static unsigned gangprivate_shared_size; static unsigned gangprivate_shared_align; @@ -219,6 +227,11 @@ nvptx_option_override (void) SET_SYMBOL_DATA_AREA (worker_red_sym, DATA_AREA_SHARED); worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT; + vector_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__vector_red"); + SET_SYMBOL_DATA_AREA (vector_red_sym, DATA_AREA_SHARED); + vector_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT; + vector_red_partition = 0; + gangprivate_shared_sym = gen_rtx_SYMBOL_REF (Pmode, "__gangprivate_shared"); SET_SYMBOL_DATA_AREA (gangprivate_shared_sym, DATA_AREA_SHARED); gangprivate_shared_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT; @@ -1096,8 +1109,25 @@ nvptx_init_axis_predicate (FILE *file, int regno, const char *name) { fprintf (file, "\t{\n"); fprintf (file, "\t\t.reg.u32\t%%%s;\n", name); + if (strcmp (name, "x") == 0 && cfun->machine->red_partition) +{ + fprintf (file, "\t\t.reg.u64\t%%t_red;\n"); + fprintf (file, "\t\t.reg.u64\t%%y64;\n"); +} fprintf (file, "\t\tmov.u32\t%%%s, %%tid.%s;\n", name, name); fprintf (file, "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n", regno, name); + if (strcmp (name, "x") == 0 && cfun->machine->red_partition) +{ + fprintf (file, "\t\tcvt.u64.u32\t%%y64, %%tid.y;\n"); + fprintf (file, "\t\tcvta.shared.u64\t%%t_red, __vector_red;\n"); + fprintf (file, "\t\tmad.lo.u64\t%%r%d, %%y64, %d, %%t_red; " + "// vector reduction buffer\n", + REGNO
Re: [og7] vector_length extension part 3: reductions
On 03/02/2018 06:51 PM, Cesar Philippidis wrote: This patch teaches the nvptx BE how to process vector reductions with large vector lengths. Committed a test-case exercising large vector length with reductions. Thanks, - Tom [openacc] Add vector-length-128-10.c 2018-04-05 Tom de Vries * testsuite/libgomp.oacc-c-c++-common/vector-length-128-10.c: New test. --- .../vector-length-128-10.c | 40 ++ 1 file changed, 40 insertions(+) diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/vector-length-128-10.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/vector-length-128-10.c new file mode 100644 index 000..e46b5cf --- /dev/null +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/vector-length-128-10.c @@ -0,0 +1,40 @@ +/* { dg-do run } */ + +#include <stdlib.h> + +#define N 1024 + +unsigned int a[N]; +unsigned int b[N]; +unsigned int c[N]; +unsigned int n = N; + +int +main (void) +{ + for (unsigned int i = 0; i < n; ++i) +{ + a[i] = i % 3; + b[i] = i % 5; +} + + unsigned int res = 1; + unsigned long long res2 = 1; +#pragma acc parallel vector_length (128) copyin (a,b) reduction (+:res, res2) copy (res, res2) + { +#pragma acc loop vector reduction (+:res, res2) +for (unsigned int i = 0; i < n; i++) + { + res += ((a[i] + b[i]) % 2); + res2 += ((a[i] + b[i]) % 2); + } + } + + if (res != 478) +abort (); + if (res2 != 478) +abort (); + + return 0; +} +/* { dg-prune-output "using vector_length \\(32\\), ignoring 128" } */
[og7] vector_length extension part 3: reductions
This patch teaches the nvptx BE how to process vector reductions with large vector lengths. The original vector reduction finalizer won't work because it uses warp shuffle operations. Now that vectors may contain multiple warps, they need to store the partial reductions into shared-memory like workers. Once the reduction variable is placed in shared-memory, it will use the same atomic finalizer to update it as the workers. Much like the shared-memory spill-and-fill vector state propagation extension, the nvptx BE needs to reserve enough shared-memory for each worker that may encounter a vector reduction. That's why the reduction functions have been augmented with an offload_attrs argument. The offload_attrs contains a max_workers field. Unlike vector_length, which is fixed as a compile-time constant, num_workers can be altered dynamically at runtime. Given that the size of a CUDA block is fixed, max_workers is set to max_block_size / vector_length. This will be discussed further in the next patch. Effectively, the nvptx BE will now maintain a shared-memory reduction buffer, named vector_red_sym, that contains max_workers logical reduction partitions, where each partition contains enough shared-memory for all of the reductions used by a single vector. By design, OpenACC reductions are expanded relatively early during oaccdevlow. Because accessing the reduction partition is a common operation, the partition offset is placed in a register stored in cfun->machine->red_partition and initialized in nvptx_init_axis_predicate. Due to how late that register becomes available, nvptx_expand_shared_addr emits a gen_nvptx_red_partition instruction to acquire the shared-memory address for the reduction variable indirectly. You may notice a hack in nvptx_declare_function_name. I observed that sometimes GCC will mark red_partition as dead and not emit PTX code to declare it. 
That's why nvptx_declare_function_name manually inserts it into regno_reg_rtx prior to declaring all of the PTX registers. I think there might be something wrong with the nvptx_red_partition instruction. Tom, can you take a look at it? Ultimately, I suspect that large workers would greatly benefit from using a new parallel tree reduction finalizer. Whereas the atomic finalizer may have been suitable for a maximum of 32 workers, vector_length can be up to 1024 threads, and a sequential finalizer will be slow. However, that's a project for another day. I'll commit this patch to openacc-gcc-7-branch after Tom reviews the new nvptx_red_partition insn. Cesar 2018-03-02 Cesar Philippidis gcc/ * config/nvptx/nvptx-protos.h (nvptx_output_red_partition): Declare. * config/nvptx/nvptx.c (vector_red_size, vector_red_align, vector_red_partition, vector_red_sym): New global variables. (nvptx_option_override): Initialize vector_red_sym. (nvptx_declare_function_name): Restore red_partition register. (nvptx_file_end): Emit code to declare the vector reduction variables. (nvptx_output_red_partition): New function. (nvptx_expand_shared_addr): Add vector argument. Use it to handle large vector reductions. (enum nvptx_builtins): Add NVPTX_BUILTIN_VECTOR_ADDR. (nvptx_init_builtins): Add VECTOR_ADDR. (nvptx_expand_builtin): Update call to nvptx_expand_shared_addr. Handle nvptx_expand_shared_addr. (nvptx_get_shared_red_addr): Add vector argument and handle large vectors. (nvptx_goacc_reduction_setup): Add offload_attrs argument and handle large vectors. (nvptx_goacc_reduction_init): Likewise. (nvptx_goacc_reduction_fini): Likewise. (nvptx_goacc_reduction_teardown): Likewise. (nvptx_goacc_reduction): Update calls to nvptx_goacc_reduction_{setup, init,fini,teardown}. * config/nvptx/nvptx.md (UNSPECV_RED_PART): New unspecv. (nvptx_red_partition): New insn. 
>From 3834101d5144666f30d8798e983e276bd2c66636 Mon Sep 17 00:00:00 2001 From: Cesar Philippidis Date: Fri, 2 Mar 2018 07:36:11 -0800 Subject: [PATCH] reductions --- gcc/config/nvptx/nvptx-protos.h | 1 + gcc/config/nvptx/nvptx.c| 146 +++- gcc/config/nvptx/nvptx.md | 12 3 files changed, 128 insertions(+), 31 deletions(-) diff --git a/gcc/config/nvptx/nvptx-protos.h b/gcc/config/nvptx/nvptx-protos.h index 16b316f12b8..326c38c5dc7 100644 --- a/gcc/config/nvptx/nvptx-protos.h +++ b/gcc/config/nvptx/nvptx-protos.h @@ -55,5 +55,6 @@ extern const char *nvptx_output_return (void); extern const char *nvptx_output_set_softstack (unsigned); extern const char *nvptx_output_simt_enter (rtx, rtx, rtx); extern const char *nvptx_output_simt_exit (rtx); +extern const char *nvptx_output_red_partition (rtx, rtx); #endif #endif diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c index 4a48d44f44c..9d77176c638 100644 --- a/gcc/config/nvptx/nvptx.c +++ b/gcc/config/nvptx/nvptx.c @@ -142,6 +142,14 @@ static unsigned worker_red_size; static unsigned worker_red_align; static GTY(()) rtx