Hi,

Currently, when we enable -mlong-vector-in-workers in gemm.f90, we get:
...
  {
    .reg.u32        %tidy;
    .reg.u64        %t_bcast;
    .reg.u64        %y64;
    mov.u32         %tidy, %tid.y;
    cvt.u64.u32     %y64, %tidy;
    add.u64         %y64, %y64, 1;
    cvta.shared.u64 %t_bcast, __oacc_bcast;
    mad.lo.u64      %r166, %y64, 104, %t_bcast;
  }

  @ %r179 bra.uni $L28;
  @ %r174 bra $L29;
  ...
  setp.le.s32 %r114,%r113,0;
  selp.u32 %r182,1,0,%r114;
  st.u32 [%r166],%r182;
 $L29:
 $L28:

  bar.sync %r167,128;

  ld.u32 %r183,[%r166];
  setp.ne.u32 %r114,%r183,0;

  bar.sync %r167,128;

  @ %r114 bra.uni $L1
...

The branch condition %r114 is computed in a W0V0 region, and then broadcast to a WAVA region. The broadcast is done using a partition of the broadcast buffer at %r166, but this is a worker-specific buffer.

Since the buffer is written by worker 0 only, the reads in workers other than 0 read uninitialized memory.

This patch fixes this by using the generic broadcast buffer in this case, rather than a worker-specific one.

Built x86_64 with nvptx accelerator and tested libgomp.

Committed to og7.

Thanks,
- Tom
[nvptx] Fix propagation of branch cond in vw-neutered code

2018-04-12  Tom de Vries  <t...@codesourcery.com>

	PR target/85246
	* config/nvptx/nvptx.c (nvptx_single): Don't use partitioning when
	propagating branch condition calculated in vector-worker-neutered code.

	* testsuite/libgomp.oacc-fortran/gemm.f90: Use
	-foffload=-mlong-vector-in-workers.

---
 gcc/config/nvptx/nvptx.c                        | 3 ++-
 libgomp/testsuite/libgomp.oacc-fortran/gemm.f90 | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index 547022e..9d011eb 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -4306,13 +4306,14 @@ nvptx_single (unsigned mask, basic_block from, basic_block to)
 	  broadcast_data_t data;
 	  unsigned size = GET_MODE_SIZE (SImode);
 	  bool vector = (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask) != 0;
+	  bool worker = (GOMP_DIM_MASK (GOMP_DIM_WORKER) == mask) != 0;
 	  rtx barrier = GEN_INT (0);
 	  int threads = 0;
 
 	  data.base = oacc_bcast_sym;
 	  data.ptr = 0;
 
-	  bool use_partitioning_p = (vector
+	  bool use_partitioning_p = (vector && !worker
 				     && nvptx_mach_max_workers () > 1
 				     && cfun->machine->bcast_partition);
 	  if (use_partitioning_p)
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/gemm.f90 b/libgomp/testsuite/libgomp.oacc-fortran/gemm.f90
index ad67dce..744d21e 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/gemm.f90
+++ b/libgomp/testsuite/libgomp.oacc-fortran/gemm.f90
@@ -1,6 +1,7 @@
 ! Exercise three levels of parallelism using SGEMM from BLAS.
 
 ! { dg-additional-options "-fopenacc-dim=-:-:128" }
+! { dg-additional-options "-foffload=-mlong-vector-in-workers" }
 
 ! Implicitly set vector_length to 128 using -fopenacc-dim.
 subroutine openacc_sgemm (m, n, k, alpha, a, b, beta, c)

Reply via email to