https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81069

--- Comment #3 from Tom de Vries <vries at gcc dot gnu.org> ---
The immediate reason for the failure is a bug in the ptx JIT compiler.

This piece of ptx code (containing two functionally equivalent but differently
laid out variants, DETOUR=0 and DETOUR=1) is a minimal example, and is
translated correctly for DETOUR=0 and incorrectly for DETOUR=1:
...
// Minimal reproducer for a warp-reconvergence problem in the ptx JIT compiler.
// The #if lines need a preprocessing step (presumably cpp) with DETOUR set to
// 0 or 1. Both settings describe the same control flow and differ only in
// where the $L3/$L20 block is placed in the text.
//
// Control flow: lane 0 goes to $L3 and sets %ra to 1, every other lane goes
// straight to $L20. All lanes then continue at $L8, where %ra is broadcast
// from lane 0. Each lane returns if its %ra is 1 and traps otherwise.
.version 3.1
.target sm_30
.address_size 64

// Forward declaration of the kernel defined below.
.entry test_function (.param .u64 %in_ar0);

// The parameter %in_ar0 is declared but never read in the body.
.entry test_function (.param .u64 %in_ar0)
{
        // Init %ra to 0 in all lanes.
        .reg .u32 %ra;
        mov.u32 %ra,0;

        // Branch around if not lane 0.
        // This is the divergent branch: lanes other than lane 0 jump to
        // $L20 and so skip the write of 1 to %ra at $L3.
        {
                .reg .u32 %l;
                mov.u32 %l,%laneid;
                .reg .pred %lane0;
                setp.eq.u32 %lane0,%l,0;
                @ ! %lane0 bra $L20;
        }
        // Only lane 0 gets here. With DETOUR == 0 the target is the next
        // label; with DETOUR == 1 it is the block at the end of the function.
        bra $L3;

// Variant 0: the $L3/$L20 block sits directly in front of $L8.
#if DETOUR == 0
$L3:
        // Set %ra to 1 in lane zero
        mov.u32 %ra,1;
        // Lane 0 falls through into $L20, where the other lanes arrive too.
$L20:   
        bra $L8;
#endif

$L8:
        // Broadcast %ra from lane 0 to all lanes
        // The third operand (0) selects lane 0 as the source lane. The
        // fourth operand (31) is presumably the usual 0x1f clamp/segment-mask
        // value for a full-warp shuffle -- TODO confirm against the PTX ISA.
        // The lanes split by the branch on %lane0 need to have reconverged
        // before this instruction executes.
        shfl.idx.b32 %ra,%ra,0,31;

        // Branch to trap if %ra not 1 in all lanes
        {
                .reg .pred %ra_is_one;
                setp.eq.u32 %ra_is_one,%ra,1;
                @ ! %ra_is_one bra $L6;
        }

        // Expected outcome: every lane received 1 from lane 0.
        ret;

$L6:
        // Reached by any lane whose %ra is not 1 after the shuffle.
        trap;

// Variant 1: the same $L3/$L20 block as in variant 0, but placed after the
// trap at the end of the function, so both paths jump back up to $L8.
#if DETOUR == 1
$L3:
        // Set %ra to 1 in lane zero
        mov.u32 %ra,1;
        // Lane 0 falls through into $L20, where the other lanes arrive too.
$L20:   
        bra $L8;
#endif
}
...

The problem for the DETOUR=1 case is that while the divergent branch '@ !
%lane0 bra $L20' is post-dominated by $L20, the synchronization point is
inserted much later, after the shfl. This means the shfl is executed in
divergent state, which invokes undefined behaviour, causing an undefined result
in %ra in one of the threads in the warp, which causes the example to call the
trap.

Reply via email to