At present, due bugs in the nvptx state propagation code, worker partitioning must be deactivated when the user specifies a large vector_length. However, the code that is responsible for deactivating worker partitioning was not considering the case where the vector_length is the size of a warp. Consequently, not all of the CUDA threads were being utilized when vector_length = 32 (which is the default case).
I've committed this patch to openacc-gcc-7-branch which allows warp-sized vectors to nest inside worker-partitioned loops. Cesar
2018-04-10 Cesar Philippidis <ce...@codesourcery.com> gcc/ * config/nvptx/nvptx.c (populate_offload_attrs): Ensure that oa->vector_length is set. gcc/testsuite/ * gcc.target/nvptx/oacc-autopar.c: New test. diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c index b5e6dce165c..cd89d1738aa 100644 --- a/gcc/config/nvptx/nvptx.c +++ b/gcc/config/nvptx/nvptx.c @@ -4639,7 +4639,7 @@ populate_offload_attrs (offload_attrs *oa) if (!lookup_attribute ("omp target entrypoint", DECL_ATTRIBUTES (current_function_decl))) oa->vector_length = PTX_WARP_SIZE; - else if (PTX_VECTOR_LENGTH != PTX_WARP_SIZE) + else oa->vector_length = PTX_VECTOR_LENGTH; } if (oa->num_workers == 0) diff --git a/gcc/testsuite/gcc.target/nvptx/oacc-autopar.c b/gcc/testsuite/gcc.target/nvptx/oacc-autopar.c new file mode 100644 index 00000000000..915053afcbf --- /dev/null +++ b/gcc/testsuite/gcc.target/nvptx/oacc-autopar.c @@ -0,0 +1,120 @@ +/* Verify the default assignment of parallelism. */ + +/* { dg-do compile } */ +/* { dg-additional-options "-fopenacc -misa=sm_35 -fopt-info-note-omp" } */ + +void +gang_independent () +{ + int i, j; + + #pragma acc parallel + #pragma acc loop gang /* { dg-message "Detected parallelism <acc loop gang>" } */ + for (i = 0; i < 10; i++) + #pragma acc loop /* { dg-message "Detected parallelism <acc loop worker vector>" } */ + for (j = 0; j < 10; j++) + ; + + #pragma acc parallel vector_length (128) + #pragma acc loop gang /* { dg-message "Detected parallelism <acc loop gang>" } */ + for (i = 0; i < 10; i++) + #pragma acc loop /* { dg-message "Detected parallelism <acc loop vector>" } */ + for (j = 0; j < 10; j++) + ; +} + +void +gang_independent_seq () +{ + int i, j, k; + + #pragma acc parallel + #pragma acc loop gang /* { dg-message "Detected parallelism <acc loop gang>" } */ + for (i = 0; i < 10; i++) + #pragma acc loop /* { dg-message "Detected parallelism <acc loop worker vector>" } */ + for (j = 0; j < 10; j++) + #pragma acc loop seq /* { dg-message "Detected parallelism <acc loop seq>" } */ + for (k = 0; k < 10; k++) + ; + + #pragma acc parallel vector_length (128) + #pragma acc loop gang /* { dg-message "Detected parallelism <acc loop gang>" } */ + for (i = 0; i < 10; i++) + #pragma acc loop /* { dg-message "Detected parallelism <acc loop vector>" } */ + for (j = 0; j < 10; j++) + #pragma acc loop seq /* { dg-message "Detected parallelism <acc loop seq>" } */ + for (k = 0; k < 10; k++) + ; +} + +void +worker () +{ + int i, j; + + #pragma acc parallel + #pragma acc loop worker /* { dg-message "Detected parallelism <acc loop worker>" } */ + for (i = 0; i < 10; i++) + #pragma acc loop /* { dg-message "Detected parallelism <acc loop vector>" } */ + for (j = 0; j < 10; j++) + ; + + #pragma acc parallel vector_length (128) + #pragma acc loop worker /* { dg-message "Detected parallelism <acc loop worker>" } */ + for (i = 0; i < 10; i++) + #pragma acc loop /* { dg-message "Detected parallelism <acc loop vector>" } */ + for (j = 0; j < 10; j++) + ; +} + +void +fully_independent () +{ + int i, j, k; + + #pragma acc parallel + #pragma acc loop /* { dg-message "Detected parallelism <acc loop gang vector>" } */ + for (i = 0; i < 10; i++) + ; + + #pragma acc parallel vector_length (128) + #pragma acc loop /* { dg-message "Detected parallelism <acc loop gang vector>" } */ + for (i = 0; i < 10; i++) + ; + + #pragma acc parallel + #pragma acc loop /* { dg-message "Detected parallelism <acc loop gang worker>" } */ + for (i = 0; i < 10; i++) + #pragma acc loop /* { dg-message "Detected parallelism <acc loop vector>" } */ + for (j = 0; j < 10; j++) + ; + + /* FIXME: Should the outer loop only be gang partitioned so that the + inner loopp can utilize a large vector_length? */ + #pragma acc parallel vector_length (128) + #pragma acc loop /* { dg-message "Detected parallelism <acc loop gang worker>" } */ + for (i = 0; i < 10; i++) + #pragma acc loop /* { dg-message "Detected parallelism <acc loop vector>" } */ + for (j = 0; j < 10; j++) + ; + + #pragma acc parallel + #pragma acc loop /* { dg-message "Detected parallelism <acc loop gang>" } */ + for (i = 0; i < 10; i++) + #pragma acc loop /* { dg-message "Detected parallelism <acc loop worker>" } */ + for (j = 0; j < 10; j++) + #pragma acc loop /* { dg-message "Detected parallelism <acc loop vector>" } */ + for (k = 0; k < 10; k++) + ; + + /* FIXME: Should the middle loop be seq-partitioned in order to + respect vector_length = 128 on the innermost loop? */ + #pragma acc parallel vector_length (128) + #pragma acc loop /* { dg-message "Detected parallelism <acc loop gang>" } */ + for (i = 0; i < 10; i++) + #pragma acc loop /* { dg-message "Detected parallelism <acc loop worker>" } */ + for (j = 0; j < 10; j++) + #pragma acc loop /* { dg-message "Detected parallelism <acc loop vector>" } */ + for (k = 0; k < 10; k++) + ; +}