https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85381

--- Comment #3 from Tom de Vries <vries at gcc dot gnu.org> ---
(In reply to Tom de Vries from comment #2)
> I minimized this failure, and ran into PR 80035/81069, so I've backported
> the fix from trunk: https://gcc.gnu.org/ml/gcc-patches/2018-04/msg00774.html

With that fixed, I run into another failure. Minimized:
...
int
main (void)
{
  long long v1;
#pragma acc parallel num_gangs (640) num_workers(1) vector_length (128)
#pragma acc loop
  for (v1 = 0; v1 < 20; v1 += 2)
    ;

  return 0;
}
...

This results in the following PTX:
...
// BEGIN PREAMBLE                                                               
.version 3.1
.target sm_30
.address_size 64
// END PREAMBLE                                                                 

// BEGIN FUNCTION DECL: main$_omp_fn$0                                          
.entry main$_omp_fn$0;

// BEGIN FUNCTION DEF: main$_omp_fn$0                                           
.entry main$_omp_fn$0
{
  .reg .u64 %r23;
  .reg .u64 %r24;
  // fork 4;                                                                    
  bar.sync 0;
  // forked 4;                                                                  
  // joining 4;                                                                 
  bar.sync 0;
  // join 4;                                                                    
  ret;
}
//:FUNC_MAP "main$_omp_fn$0", 0x280, 0x1, 0x80                                  
^@
...

An intriguing detail about this example: it passes with GOMP_NVPTX_JIT=-O0 and
hangs with GOMP_NVPTX_JIT=-O1, even though seemingly the same SASS is generated:
...
//--------------------- .text.main$_omp_fn$0      --------------------------
        .section        .text.main$_omp_fn$0,"ax",@progbits
        .sectionflags   @"SHF_BARRIERS=1"
        .sectioninfo    @"SHI_REGISTERS=2"
        .align  32
.text.main$_omp_fn$0:
        .type           main$_omp_fn$0,@function
        .size           main$_omp_fn$0,(.L_8 - main$_omp_fn$0)
        .other          main$_omp_fn$0,@"STO_CUDA_ENTRY STV_DEFAULT"
main$_omp_fn$0:
        /*0008*/                   MOV R1, c[0x0][0x20];
        /*0010*/                   BAR.SYNC 0x0;
        /*0018*/                   BAR.SYNC 0x0;
        /*0028*/                   EXIT;
.L_1:
        /*0030*/                   BRA `(.L_1);
.L_8:
...

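The only difference is in the unlabeled 64-bit word at offset 0x0020 (between
the second BAR.SYNC and EXIT), which is not shown in the listing above and is
only visible in the raw encodings printed by cuobjdump: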
...
/*0010*/                   BAR.SYNC 0x0;          /* 0xf0a81b8000070000 */
/*0018*/                   BAR.SYNC 0x0;          /* 0xf0a81b8000070000 */
                                                  /* 0x001f8000ffe01fef */
/*0028*/                   EXIT;                  /* 0xe30000000007000f */
...
vs
...
/*0010*/                   BAR.SYNC 0x0;          /* 0xf0a81b8000070000 */
/*0018*/                   BAR.SYNC 0x0;          /* 0xf0a81b8000070000 */
                                                  /* 0x001f8000ffe007ff */
/*0028*/                   EXIT;                  /* 0xe30000000007000f */
...

Reply via email to