Hi,
we've run into a PTX JIT bug with cuda driver version 381.22 for sm_61
at -O1 and higher. This patch adds a workaround, guarded by the macro
WORKAROUND_PTXJIT_BUG, which is enabled by default.
Tested on x86_64 with nvidia accelerator.
Committed.
Thanks,
- Tom
Add extra initialization of broadcasted condition variables
2017-07-11 Tom de Vries <t...@codesourcery.com>
* config/nvptx/nvptx.c (WORKAROUND_PTXJIT_BUG): New macro.
(bb_first_real_insn): New function.
(nvptx_single): Add extra initialization of broadcasted condition
variables.
---
gcc/config/nvptx/nvptx.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 53 insertions(+)
diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index daeec27..c8847a5 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -74,6 +74,8 @@
/* This file should be included last. */
#include "target-def.h"
+#define WORKAROUND_PTXJIT_BUG 1
+
/* The various PTX memory areas an object might reside in. */
enum nvptx_data_area
{
@@ -3844,6 +3846,24 @@ nvptx_wsync (bool after)
return gen_nvptx_barsync (GEN_INT (after));
}
+#if WORKAROUND_PTXJIT_BUG
+/* Return the first real insn in BB, or a null pointer if BB does not
+ contain any real insns. */
+
+static rtx_insn *
+bb_first_real_insn (basic_block bb)
+{
+ rtx_insn *insn;
+
+ /* Walk the insns of BB; the first one satisfying INSN_P is the result. */
+ FOR_BB_INSNS (bb, insn)
+ if (INSN_P (insn))
+ return insn;
+
+ return 0;
+}
+#endif
+
/* Single neutering according to MASK. FROM is the incoming block and
TO is the outgoing block. These may be the same block. Insert at
start of FROM:
@@ -3958,6 +3978,39 @@ nvptx_single (unsigned mask, basic_block from, basic_block to)
if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask)
{
/* Vector mode only, do a shuffle. */
+#if WORKAROUND_PTXJIT_BUG
+ /* The branch condition %rcond is propagated like this:
+
+ {
+ .reg .u32 %x;
+ mov.u32 %x,%tid.x;
+ setp.ne.u32 %rnotvzero,%x,0;
+ }
+
+ @%rnotvzero bra Lskip;
+ setp.<op>.<type> %rcond,op1,op2;
+ Lskip:
+ selp.u32 %rcondu32,1,0,%rcond;
+ shfl.idx.b32 %rcondu32,%rcondu32,0,31;
+ setp.ne.u32 %rcond,%rcondu32,0;
+
+ There seems to be a bug in the ptx JIT compiler (observed at driver
+ version 381.22, at -O1 and higher for sm_61), that drops the shfl
+ unless %rcond is initialized to something before 'bra Lskip'. The
+ bug is not observed with ptxas from cuda 8.0.61.
+
+ It is true that the code is non-trivial: at Lskip, %rcond is
+ uninitialized in threads 1-31, and after the selp the same holds
+ for %rcondu32. But shfl propagates the defined value in thread 0
+ to threads 1-31, so after the shfl %rcondu32 is defined in threads
+ 0-31, and after the setp.ne %rcond is defined in threads 0-31.
+
+ There is nothing in the PTX spec to suggest that this is wrong, or
+ to explain why the extra initialization is needed. So, we classify
+ it as a JIT bug, and the extra initialization as workaround. */
+ emit_insn_before (gen_movbi (pvar, const0_rtx),
+ bb_first_real_insn (from));
+#endif
emit_insn_before (nvptx_gen_vcast (pvar), tail);
}
else