https://bugs.llvm.org/show_bug.cgi?id=36361

            Bug ID: 36361
           Summary: Ignored memory fence after 64bit __shfl_*_sync
                    intrinsics
           Product: clang
           Version: trunk
          Hardware: PC
                OS: Linux
            Status: NEW
          Severity: normal
          Priority: P
         Component: CUDA
          Assignee: unassignedclangb...@nondot.org
          Reporter: j....@gmx.com
                CC: llvm-bugs@lists.llvm.org

Clang ignores the implicit memory fence after 64bit __shfl_*_sync intrinsics
which leads to incorrect behaviour.

Testing platform: clang version 7.0.0 (trunk 324341), CUDA 9.1,
--cuda-gpu-arch=sm_60

Repro code:

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

#include <stdio.h>                                          

template <typename T>                                       
__device__                                                  
T warpReduceSum(T val)                                      
{                                                           
    for (int offset = warpSize / 2; offset > 0; offset /= 2)
        val += __shfl_down_sync(0xFFFFFFFF, val, offset);   
    return val;                                             
}                                                           

template <typename T>                                       
__global__ void kernel()                                    
{                                                           
    T i = 1;                                                
    T sum = warpReduceSum( i );                             

    if( threadIdx.x == 0 )                                  
        printf( "sum = %d \n", (int) sum );                 
}                                                           

int main()                                                  
{                                                           
    printf("float: ");                                      
    kernel< float ><<< 1, 32 >>>();                         
    cudaDeviceSynchronize();                                

    printf("double: ");                                     
    kernel< double ><<< 1, 32 >>>();                        
    cudaDeviceSynchronize();                                

    printf("long: ");                                       
    kernel< long ><<< 1, 32 >>>();                          
    cudaDeviceSynchronize();                                
}                                                           

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

Typical output when compiled with clang:

float: sum = 32
double: sum = -2147483648
long: sum = -746962023

Expected output (accomplished with nvcc):

float: sum = 32
double: sum = 32
long: sum = 32

PTX for the float kernel, which seems correct:

                Function : _Z6kernelIfEvv
        .headerflags    @"EF_CUDA_SM60 EF_CUDA_PTX_SM(EF_CUDA_SM60)"
                                                                              
/* 0x001c5000fe0007f6 */
        /*0008*/                   MOV R1, c[0x0][0x20];                      
/* 0x4c98078000870001 */
        /*0010*/         {         MOV32I R7, 0x3f800000;                     
/* 0x0103f8000007f007 */
        /*0018*/                   S2R R6, SR_TID.X;        }                 
/* 0xf0c8000002170006 */
                                                                              
/* 0x009fd0002be007f0 */
        /*0028*/         {         IADD32I R1, R1, -0x8;                      
/* 0x1c0fffffff870101 */
        /*0030*/                   SHFL.DOWN PT, R0, R7, 0x10, 0x1f;        } 
/* 0xef17007cb1070700 */
        /*0038*/                   FADD R0, R0, 1;                            
/* 0x3858003f80070000 */
                                                                              
/* 0x001cc402fe80073f */
        /*0048*/                   SHFL.DOWN PT, R2, R0, 0x8, 0x1f;           
/* 0xef17007cb0870002 */
        /*0050*/                   FADD R2, R0, R2;                           
/* 0x5c58000000270002 */
        /*0058*/                   SHFL.DOWN PT, R3, R2, 0x4, 0x1f;           
/* 0xef17007cb0470203 */
                                                                              
/* 0x001fd000ffa00fed */
        /*0068*/                   ISETP.NE.AND P0, PT, R6, RZ, PT;           
/* 0x5b6b03800ff70607 */
        /*0070*/                   DEPBAR {1};                                
/* 0xf0f0000000070002 */
        /*0078*/                   FADD R3, R2, R3;                           
/* 0x5c58000000370203 */
                                                                              
/* 0x001ff400fda00711 */
        /*0088*/                   SHFL.DOWN PT, R4, R3, 0x2, 0x1f;           
/* 0xef17007cb0270304 */
        /*0090*/                   IADD R6.CC, R1, c[0x0][0x4];               
/* 0x4c10800000170106 */
        /*0098*/                   IADD.X R7, RZ, c[0x0][0x104];              
/* 0x4c1008000417ff07 */
                                                                              
/* 0x001ff40002200ff4 */
        /*00a8*/                   FADD R4, R3, R4;                           
/* 0x5c58000000470304 */
        /*00b0*/                   SHFL.DOWN PT, R5, R4, 0x1, 0x1f;           
/* 0xef17007cb0170405 */
        /*00b8*/               @P0 EXIT;                                      
/* 0xe30000000000000f */
                                                                              
/* 0x001fc400e2200ff2 */
        /*00c8*/                   FADD R0, R4, R5;                           
/* 0x5c58000000570400 */
        /*00d0*/                   F2I.S32.F32.TRUNC R0, R0;                  
/* 0x5cb0018000071a00 */
        /*00d8*/                   MOV32I R4, 0x0;                            
/* 0x010000000007f004 */
                                                                              
/* 0x003ff4011e4007fd */
        /*00e8*/                   MOV32I R5, 0x0;                            
/* 0x010000000007f005 */
        /*00f0*/                   STL [R1], R0;                              
/* 0xef54000000070100 */
        /*00f8*/                   JCAL 0x0;                                  
/* 0xe220000000000040 */
                                                                              
/* 0x001f8400fde007ef */
        /*0108*/                   NOP;                                       
/* 0x50b0000000070f00 */
        /*0110*/                   NOP;                                       
/* 0x50b0000000070f00 */
        /*0118*/                   NOP;                                       
/* 0x50b0000000070f00 */
                                                                              
/* 0x001f8000ffe007ff */
        /*0128*/                   EXIT;                                      
/* 0xe30000000007000f */
        /*0130*/                   BRA 0x130;                                 
/* 0xe2400fffff87000f */
        /*0138*/                   NOP;                                       
/* 0x50b0000000070f00 */

PTX for the double kernel, which is definitely wrong:

                Function : _Z6kernelIdEvv
        .headerflags    @"EF_CUDA_SM60 EF_CUDA_PTX_SM(EF_CUDA_SM60)"
                                                                              
/* 0x001c4400fe0007f6 */
        /*0008*/                   MOV R1, c[0x0][0x20];                      
/* 0x4c98078000870001 */
        /*0010*/         {         IADD32I R1, R1, -0x8;                      
/* 0x1c0fffffff870101 */
        /*0018*/                   S2R R6, SR_TID.X;        }                 
/* 0xf0c8000002170006 */
                                                                              
/* 0x0004f40026200131 */
        /*0028*/                   SHFL.DOWN PT, R2, R0, 0x10, 0x1f;          
/* 0xef17007cb1070002 */
        /*0030*/                   SHFL.DOWN PT, R3, R0, 0x10, 0x1f;          
/* 0xef17007cb1070003 */
        /*0038*/                   SHFL.DOWN PT, R4, R0, 0x8, 0x1f;           
/* 0xef17007cb0870004 */
                                                                              
/* 0x001fc00026200ff0 */
        /*0048*/         {         ISETP.NE.AND P0, PT, R6, RZ, PT;           
/* 0x5b6b03800ff70607 */
        /*0050*/                   SHFL.DOWN PT, R5, R0, 0x8, 0x1f;        }  
/* 0xef17007cb0870005 */
        /*0058*/         {         IADD R6.CC, R1, c[0x0][0x4];               
/* 0x4c10800000170106 */
        /*0068*/                   SHFL.DOWN PT, R8, R0, 0x4, 0x1f;        }  
/* 0x0004c400fe000136 */
                                                                              
/* 0xef17007cb0470008 */
        /*0070*/         {         IADD.X R7, RZ, c[0x0][0x104];              
/* 0x4c1008000417ff07 */
        /*0078*/                   SHFL.DOWN PT, R9, R0, 0x4, 0x1f;        }  
/* 0xef17007cb0470009 */
                                                                              
/* 0x0004c40026200131 */
        /*0088*/                   SHFL.DOWN PT, R10, R0, 0x2, 0x1f;          
/* 0xef17007cb027000a */
        /*0090*/                   SHFL.DOWN PT, R11, R0, 0x2, 0x1f;          
/* 0xef17007cb027000b */
        /*0098*/                   SHFL.DOWN PT, R12, R0, 0x1, 0x1f;          
/* 0xef17007cb017000c */
                                                                              
/* 0x001fbc00fde00132 */
        /*00a8*/                   SHFL.DOWN PT, R13, R0, 0x1, 0x1f;          
/* 0xef17007cb017000d */
        /*00b0*/                   NOP;                                       
/* 0x50b0000000070f00 */
        /*00b8*/                   NOP;                                       
/* 0x50b0000000070f00 */
                                                                              
/* 0x0020c802e3c007fd */
        /*00c8*/               @P0 EXIT;                                      
/* 0xe30000000000000f */
        /*00d0*/                   DADD R2, R2, 1;                            
/* 0x3870003ff0070202 */
        /*00d8*/                   DADD R2, R2, R4;                           
/* 0x5c70000000470202 */
                                                                              
/* 0x0040c800fec00ff1 */
        /*00e8*/                   MOV R4, R8;                                
/* 0x5c98078000870004 */
        /*00f0*/                   MOV R5, R9;                                
/* 0x5c98078000970005 */
        /*00f8*/                   DADD R2, R2, R4;                           
/* 0x5c70000000470202 */
                                                                              
/* 0x0040c800fcc00ff1 */
        /*0108*/                   MOV R4, R10;                               
/* 0x5c98078000a70004 */
        /*0110*/                   MOV R5, R11;                               
/* 0x5c98078000b70005 */
        /*0118*/                   DADD R2, R2, R4;                           
/* 0x5c70000000470202 */
                                                                              
/* 0x00407800fec00ff1 */
        /*0128*/                   MOV R4, R12;                               
/* 0x5c98078000c70004 */
        /*0130*/                   MOV R5, R13;                               
/* 0x5c98078000d70005 */
        /*0138*/                   DADD R2, R2, R4;                           
/* 0x5c70000000470202 */
                                                                              
/* 0x0023c400fe000f14 */
        /*0148*/                   F2I.S32.F64.TRUNC R2, R2;                  
/* 0x5cb0018000271e02 */
        /*0150*/         {         MOV32I R4, 0x0;                            
/* 0x010000000007f004 */
        /*0158*/                   STL [R1], R2;        }                     
/* 0xef54000000070102 */
                                                                              
/* 0x001ffc01ffa007e6 */
        /*0168*/                   MOV32I R5, 0x0;                            
/* 0x010000000007f005 */
        /*0170*/                   JCAL 0x0;                                  
/* 0xe220000000000040 */
        /*0178*/                   EXIT;                                      
/* 0xe30000000007000f */
                                                                              
/* 0x001f8000fc0007ff */
        /*0188*/                   BRA 0x180;                                 
/* 0xe2400fffff07000f */
        /*0190*/                   NOP;                                       
/* 0x50b0000000070f00 */
        /*0198*/                   NOP;                                       
/* 0x50b0000000070f00 */
                                                                              
/* 0x001f8000fc0007e0 */
        /*01a8*/                   NOP;                                       
/* 0x50b0000000070f00 */
        /*01b0*/                   NOP;                                       
/* 0x50b0000000070f00 */
        /*01b8*/                   NOP;                                       
/* 0x50b0000000070f00 */

-- 
You are receiving this mail because:
You are on the CC list for the bug.
_______________________________________________
llvm-bugs mailing list
llvm-bugs@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs

Reply via email to