[nvptx, committed] Add extra initialization of broadcasted condition variables

Tom de Vries Tue, 11 Jul 2017 05:38:14 -0700

Hi,

we've run into a PTX JIT bug with cuda driver version 381.22 for sm_61at -O1 and higher. This patch adds a workaround, guarded by a macro,enabling the workaround by default.


Tested on x86_64 with nvidia accelerator.

Committed.

Thanks,
- Tom

Add extra initialization of broadcasted condition variables

2017-07-11  Tom de Vries  <t...@codesourcery.com>

	* config/nvptx/nvptx.c (WORKAROUND_PTXJIT_BUG): New macro.
	(bb_first_real_insn): New function.
	(nvptx_single): Add extra initialization of broadcasted condition
	variables.

---
 gcc/config/nvptx/nvptx.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)

diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index daeec27..c8847a5 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -74,6 +74,8 @@
 /* This file should be included last.  */
 #include "target-def.h"
 
+#define WORKAROUND_PTXJIT_BUG 1
+
 /* The various PTX memory areas an object might reside in.  */
 enum nvptx_data_area
 {
@@ -3844,6 +3846,24 @@ nvptx_wsync (bool after)
   return gen_nvptx_barsync (GEN_INT (after));
 }
 
+#if WORKAROUND_PTXJIT_BUG
+/* Return first real insn in BB, or return NULL_RTX if BB does not contain
+   real insns.  */
+
+static rtx_insn *
+bb_first_real_insn (basic_block bb)
+{
+  rtx_insn *insn;
+
+  /* Find first insn of from block.  */
+  FOR_BB_INSNS (bb, insn)
+    if (INSN_P (insn))
+      return insn;
+
+  return 0;
+}
+#endif
+
 /* Single neutering according to MASK.  FROM is the incoming block and
    TO is the outgoing block.  These may be the same block. Insert at
    start of FROM:
@@ -3958,6 +3978,39 @@ nvptx_single (unsigned mask, basic_block from, basic_block to)
       if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask)
 	{
 	  /* Vector mode only, do a shuffle.  */
+#if WORKAROUND_PTXJIT_BUG
+	  /* The branch condition %rcond is propagated like this:
+
+		{
+		    .reg .u32 %x;
+		    mov.u32 %x,%tid.x;
+		    setp.ne.u32 %rnotvzero,%x,0;
+		 }
+
+		 @%rnotvzero bra Lskip;
+		 setp.<op>.<type> %rcond,op1,op2;
+		 Lskip:
+		 selp.u32 %rcondu32,1,0,%rcond;
+		 shfl.idx.b32 %rcondu32,%rcondu32,0,31;
+		 setp.ne.u32 %rcond,%rcondu32,0;
+
+	     There seems to be a bug in the ptx JIT compiler (observed at driver
+	     version 381.22, at -O1 and higher for sm_61), that drops the shfl
+	     unless %rcond is initialized to something before 'bra Lskip'.  The
+	     bug is not observed with ptxas from cuda 8.0.61.
+
+	     It is true that the code is non-trivial: at Lskip, %rcond is
+	     uninitialized in threads 1-31, and after the selp the same holds
+	     for %rcondu32.  But shfl propagates the defined value in thread 0
+	     to threads 1-31, so after the shfl %rcondu32 is defined in threads
+	     0-31, and after the setp.ne %rcond is defined in threads 0-31.
+
+	     There is nothing in the PTX spec to suggest that this is wrong, or
+	     to explain why the extra initialization is needed.  So, we classify
+	     it as a JIT bug, and the extra initialization as workaround.  */
+	  emit_insn_before (gen_movbi (pvar, const0_rtx),
+			    bb_first_real_insn (from));
+#endif
 	  emit_insn_before (nvptx_gen_vcast (pvar), tail);
 	}
       else

[nvptx, committed] Add extra initialization of broadcasted condition variables

Reply via email to