[PATCH] D44985: Remove initializer for CUDA shared varirable
yaxunl updated this revision to Diff 140310. yaxunl added a comment. Revised by John's comments. Also simplified the test by Artem's comments. https://reviews.llvm.org/D44985 Files: lib/CodeGen/CGDecl.cpp test/CodeGenCUDA/address-spaces.cu test/CodeGenCUDA/device-var-init.cu Index: test/CodeGenCUDA/device-var-init.cu === --- test/CodeGenCUDA/device-var-init.cu +++ test/CodeGenCUDA/device-var-init.cu @@ -1,10 +1,14 @@ // REQUIRES: nvptx-registered-target +// REQUIRES: amdgpu-registered-target // Make sure we don't allow dynamic initialization for device // variables, but accept empty constructors allowed by CUDA. // RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fcuda-is-device -std=c++11 \ -// RUN: -fno-threadsafe-statics -emit-llvm -o - %s | FileCheck %s +// RUN: -fno-threadsafe-statics -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK,NVPTX %s + +// RUN: %clang_cc1 -triple amdgcn -fcuda-is-device -std=c++11 \ +// RUN: -fno-threadsafe-statics -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK,AMDGCN %s #ifdef __clang__ #include "Inputs/cuda.h" @@ -105,68 +109,120 @@ __constant__ EC_I_EC c_ec_i_ec; // CHECK: @c_ec_i_ec = addrspace(4) externally_initialized global %struct.EC_I_EC zeroinitializer, +// CHECK: @_ZZ2dfvE4s_ec = internal addrspace(3) global %struct.EC undef +// CHECK: @_ZZ2dfvE5s_etc = internal addrspace(3) global %struct.ETC undef + // We should not emit global initializers for device-side variables. // CHECK-NOT: @__cxx_global_var_init // Make sure that initialization restrictions do not apply to local // variables. __device__ void df() { + // NVPTX: %[[ec:.*]] = alloca %struct.EC + // NVPTX: %[[ed:.*]] = alloca %struct.ED + // NVPTX: %[[ecd:.*]] = alloca %struct.ECD + // NVPTX: %[[etc:.*]] = alloca %struct.ETC + // NVPTX: %[[uc:.*]] = alloca %struct.UC + // NVPTX: %[[ud:.*]] = alloca %struct.UD + // NVPTX: %[[eci:.*]] = alloca %struct.ECI + // NVPTX: %[[nec:.*]] = alloca %struct.NEC + // NVPTX: %[[ned:.*]] = alloca %struct.NED + // NVPTX: %[[ncv:.*]] = alloca %struct.NCV + // NVPTX: %[[vd:.*]] = alloca %struct.VD + // NVPTX: %[[ncf:.*]] = alloca %struct.NCF + // NVPTX: %[[ncfs:.*]] = alloca %struct.NCFS + // NVPTX: %[[utc:.*]] = alloca %struct.UTC + // NVPTX: %[[netc:.*]] = alloca %struct.NETC + // NVPTX: %[[ec_i_ec:.*]] = alloca %struct.EC_I_EC + // NVPTX: %[[ec_i_ec1:.*]] = alloca %struct.EC_I_EC1 + // NVPTX: %[[t_v_t:.*]] = alloca %struct.T_V_T + // NVPTX: %[[t_b_nec:.*]] = alloca %struct.T_B_NEC + // NVPTX: %[[t_f_nec:.*]] = alloca %struct.T_F_NEC + // NVPTX: %[[t_fa_nec:.*]] = alloca %struct.T_FA_NEC + // NVPTX: %[[t_b_ned:.*]] = alloca %struct.T_B_NED + // NVPTX: %[[t_f_ned:.*]] = alloca %struct.T_F_NED + // NVPTX: %[[t_fa_ned:.*]] = alloca %struct.T_FA_NED + // AMDGCN: %[[ec:.*]] = addrspacecast %struct.EC addrspace(5)* %ec to %struct.EC* + // AMDGCN: %[[ed:.*]] = addrspacecast %struct.ED addrspace(5)* %ed to %struct.ED* + // AMDGCN: %[[ecd:.*]] = addrspacecast %struct.ECD addrspace(5)* %ecd to %struct.ECD* + // AMDGCN: %[[etc:.*]] = addrspacecast %struct.ETC addrspace(5)* %etc to %struct.ETC* + // AMDGCN: %[[uc:.*]] = addrspacecast %struct.UC addrspace(5)* %uc to %struct.UC* + // AMDGCN: %[[ud:.*]] = addrspacecast %struct.UD addrspace(5)* %ud to %struct.UD* + // AMDGCN: %[[eci:.*]] = addrspacecast %struct.ECI addrspace(5)* %eci to %struct.ECI* + // AMDGCN: %[[nec:.*]] = addrspacecast %struct.NEC addrspace(5)* %nec to %struct.NEC* + // AMDGCN: %[[ned:.*]] = addrspacecast %struct.NED addrspace(5)* %ned to %struct.NED* + // AMDGCN: %[[ncv:.*]] = addrspacecast %struct.NCV addrspace(5)* %ncv to %struct.NCV* + // AMDGCN: %[[vd:.*]] = addrspacecast %struct.VD addrspace(5)* %vd to %struct.VD* + // AMDGCN: %[[ncf:.*]] = addrspacecast %struct.NCF addrspace(5)* %ncf to %struct.NCF* + // AMDGCN: %[[ncfs:.*]] = addrspacecast %struct.NCFS addrspace(5)* %ncfs to %struct.NCFS* + // AMDGCN: %[[utc:.*]] = addrspacecast %struct.UTC addrspace(5)* %utc to %struct.UTC* + // AMDGCN: %[[netc:.*]] = addrspacecast %struct.NETC addrspace(5)* %netc to %struct.NETC* + // AMDGCN: %[[ec_i_ec:.*]] = addrspacecast %struct.EC_I_EC addrspace(5)* %ec_i_ec to %struct.EC_I_EC* + // AMDGCN: %[[ec_i_ec1:.*]] = addrspacecast %struct.EC_I_EC1 addrspace(5)* %ec_i_ec1 to %struct.EC_I_EC1* + // AMDGCN: %[[t_v_t:.*]] = addrspacecast %struct.T_V_T addrspace(5)* %t_v_t to %struct.T_V_T* + // AMDGCN: %[[t_b_nec:.*]] = addrspacecast %struct.T_B_NEC addrspace(5)* %t_b_nec to %struct.T_B_NEC* + // AMDGCN: %[[t_f_nec:.*]] = addrspacecast %struct.T_F_NEC addrspace(5)* %t_f_nec to %struct.T_F_NEC* + // AMDGCN: %[[t_fa_nec:.*]] = addrspacecast %struct.T_FA_NEC addrspace(5)* %t_fa_nec to %struct.T_FA_NEC* + // AMDGCN: %[[t_b_ned:.*]] = addrspacecast %struct.T_B_NED addrspace(5)* %t_b_ned to %struct.T_B_NED* + // AMDGCN: %[[t_f_ned:.*]]
[PATCH] D44985: Remove initializer for CUDA shared varirable
rjmccall added a comment. In https://reviews.llvm.org/D44985#1051840, @yaxunl wrote: > In https://reviews.llvm.org/D44985#1050876, @rjmccall wrote: > > > In https://reviews.llvm.org/D44985#1050674, @yaxunl wrote: > > > > > In https://reviews.llvm.org/D44985#1050670, @rjmccall wrote: > > > > > > > What exactly are you trying to express here? Are you just trying to > > > > make these external declarations when compiling for the device because > > > > `__shared__` variables are actually defined on the host? That should > > > > be handled by the frontend by setting up the AST so that these > > > > declarations are not definitions. > > > > > > > > > No. These variables are not like external symbols defined on the host. > > > They behave like global variables in the kernel code but never > > > initialized. Currently no targets are able to initialize them and it is > > > users' responsibility to initialize them explicitly. > > > > > > Giving them an initial value will cause error in some backends since they > > > cannot handle them, therefore put undef as initializer. > > > > > > So undef is being used as a special marker to the backends that it's okay > > not to try to initialize these variables? > > > I think undef as the initializer tells the llvm passes and backend that this > global variable contains undefined value. I am not sure if this is better > than without an initializer. I saw code in > CodeGenModule::getOrCreateStaticVarDecl > > // Local address space cannot have an initializer. > llvm::Constant *Init = nullptr; > if (Ty.getAddressSpace() != LangAS::opencl_local) > Init = EmitNullConstant(Ty); > else > Init = llvm::UndefValue::get(LTy); > > > > which means OpenCL static variable in local address space (equivalent to CUDA > shared address space) gets an undef initializer. > > For CUDA shared variable, in CodeGenFunction::EmitStaticVarDecl, it first > goes through call of CodeGenModule::getOrCreateStaticVarDecl and gets a > zeroinitializer, then it reaches line 400 > > // Whatever initializer such variable may have when it gets here is > // a no-op and should not be emitted. > bool isCudaSharedVar = getLangOpts().CUDA && getLangOpts().CUDAIsDevice && >D.hasAttr(); > // If this value has an initializer, emit it. > if (D.getInit() && !isCudaSharedVar) > var = AddInitializerToStaticVarDecl(D, var); > > > > Although this disables adding initializer from D, var already has a > zeroinitializer from CodeGenModule::getOrCreateStaticVarDecl, therefore its > initializer needs to be overwritten by undef. > > Probably a better solution would be do it in > CodeGenModule::getOrCreateStaticVarDecl, side by side by the OpenCL code. Yes, I agree, just updating the condition to trigger if either language mode is set is the right fix. https://reviews.llvm.org/D44985 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D44985: Remove initializer for CUDA shared varirable
yaxunl added a comment. In https://reviews.llvm.org/D44985#1050876, @rjmccall wrote: > In https://reviews.llvm.org/D44985#1050674, @yaxunl wrote: > > > In https://reviews.llvm.org/D44985#1050670, @rjmccall wrote: > > > > > What exactly are you trying to express here? Are you just trying to make > > > these external declarations when compiling for the device because > > > `__shared__` variables are actually defined on the host? That should be > > > handled by the frontend by setting up the AST so that these declarations > > > are not definitions. > > > > > > No. These variables are not like external symbols defined on the host. They > > behave like global variables in the kernel code but never initialized. > > Currently no targets are able to initialize them and it is users' > > responsibility to initialize them explicitly. > > > > Giving them an initial value will cause error in some backends since they > > cannot handle them, therefore put undef as initializer. > > > So undef is being used as a special marker to the backends that it's okay not > to try to initialize these variables? I think undef as the initializer tells the llvm passes and backend that this global variable contains undefined value. I am not sure if this is better than without an initializer. I saw code in CodeGenModule::getOrCreateStaticVarDecl // Local address space cannot have an initializer. llvm::Constant *Init = nullptr; if (Ty.getAddressSpace() != LangAS::opencl_local) Init = EmitNullConstant(Ty); else Init = llvm::UndefValue::get(LTy); which means OpenCL static variable in local address space (equivalent to CUDA shared address space) gets an undef initializer. For CUDA shared variable, in CodeGenFunction::EmitStaticVarDecl, it first goes through call of CodeGenModule::getOrCreateStaticVarDecl and gets a zeroinitializer, then it reaches line 400 // Whatever initializer such variable may have when it gets here is // a no-op and should not be emitted. bool isCudaSharedVar = getLangOpts().CUDA && getLangOpts().CUDAIsDevice && D.hasAttr(); // If this value has an initializer, emit it. if (D.getInit() && !isCudaSharedVar) var = AddInitializerToStaticVarDecl(D, var); Although this disables adding initializer from D, var already has a zeroinitializer from CodeGenModule::getOrCreateStaticVarDecl, therefore its initializer needs to be overwritten by undef. Probably a better solution would be do it in CodeGenModule::getOrCreateStaticVarDecl, side by side by the OpenCL code. https://reviews.llvm.org/D44985 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D44985: Remove initializer for CUDA shared varirable
rjmccall added a comment. In https://reviews.llvm.org/D44985#1050674, @yaxunl wrote: > In https://reviews.llvm.org/D44985#1050670, @rjmccall wrote: > > > What exactly are you trying to express here? Are you just trying to make > > these external declarations when compiling for the device because > > `__shared__` variables are actually defined on the host? That should be > > handled by the frontend by setting up the AST so that these declarations > > are not definitions. > > > No. These variables are not like external symbols defined on the host. They > behave like global variables in the kernel code but never initialized. > Currently no targets are able to initialize them and it is users' > responsibility to initialize them explicitly. > > Giving them an initial value will cause error in some backends since they > cannot handle them, therefore put undef as initializer. So undef is being used as a special marker to the backends that it's okay not to try to initialize these variables? https://reviews.llvm.org/D44985 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D44985: Remove initializer for CUDA shared varirable
yaxunl added a comment. In https://reviews.llvm.org/D44985#1050670, @rjmccall wrote: > What exactly are you trying to express here? Are you just trying to make > these external declarations when compiling for the device because > `__shared__` variables are actually defined on the host? That should be > handled by the frontend by setting up the AST so that these declarations are > not definitions. No. These variables are not like external symbols defined on the host. They behave like global variables in the kernel code but never initialized. Currently no targets are able to initialize them and it is users' responsibility to initialize them explicitly. Giving them an initial value will cause error in some backends since they cannot handle them, therefore put undef as initializer. https://reviews.llvm.org/D44985 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D44985: Remove initializer for CUDA shared varirable
tra added a comment. In https://reviews.llvm.org/D44985#1050670, @rjmccall wrote: > What exactly are you trying to express here? Are you just trying to make > these external declarations when compiling for the device because > `__shared__` variables are actually defined on the host? That should be > handled by the frontend by setting up the AST so that these declarations are > not definitions. __shared__ vars (at least in CUDA) are weird. Local-scoped ones are implicitly static (which compiler will attempt to zero-init) but in CUDA __shared__ variables can't have static initializers and we don't know the value of such vars when we launch the kernel. https://reviews.llvm.org/D44985 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D44985: Remove initializer for CUDA shared varirable
rjmccall added a comment. What exactly are you trying to express here? Are you just trying to make these external declarations when compiling for the device because `__shared__` variables are actually defined on the host? That should be handled by the frontend by setting up the AST so that these declarations are not definitions. https://reviews.llvm.org/D44985 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D44985: Remove initializer for CUDA shared varirable
yaxunl added inline comments. Comment at: test/CodeGenCUDA/device-var-init.cu:121 __device__ void df() { + // AMDGCN: %[[ec:.*]] = addrspacecast %struct.EC addrspace(5)* %ec to %struct.EC* + // AMDGCN: %[[ed:.*]] = addrspacecast %struct.ED addrspace(5)* %ed to %struct.ED* tra wrote: > Perhaps it would make sense to capture there names for NVPTX as well and > avoid duplicating all the checks below. will do when committing. https://reviews.llvm.org/D44985 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D44985: Remove initializer for CUDA shared varirable
tra added inline comments. Comment at: test/CodeGenCUDA/device-var-init.cu:121 __device__ void df() { + // AMDGCN: %[[ec:.*]] = addrspacecast %struct.EC addrspace(5)* %ec to %struct.EC* + // AMDGCN: %[[ed:.*]] = addrspacecast %struct.ED addrspace(5)* %ed to %struct.ED* Perhaps it would make sense to capture there names for NVPTX as well and avoid duplicating all the checks below. https://reviews.llvm.org/D44985 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D44985: Remove initializer for CUDA shared varirable
yaxunl updated this revision to Diff 140110. yaxunl retitled this revision from "Disable zeroinitializer for CUDA shared varirable for amdgcn target" to "Remove initializer for CUDA shared varirable". yaxunl edited the summary of this revision. yaxunl added a reviewer: tra. yaxunl added a comment. Revised by Artem's comments. https://reviews.llvm.org/D44985 Files: lib/CodeGen/CGDecl.cpp test/CodeGenCUDA/address-spaces.cu test/CodeGenCUDA/device-var-init.cu Index: test/CodeGenCUDA/device-var-init.cu === --- test/CodeGenCUDA/device-var-init.cu +++ test/CodeGenCUDA/device-var-init.cu @@ -1,10 +1,14 @@ // REQUIRES: nvptx-registered-target +// REQUIRES: amdgpu-registered-target // Make sure we don't allow dynamic initialization for device // variables, but accept empty constructors allowed by CUDA. // RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fcuda-is-device -std=c++11 \ -// RUN: -fno-threadsafe-statics -emit-llvm -o - %s | FileCheck %s +// RUN: -fno-threadsafe-statics -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK,NVPTX %s + +// RUN: %clang_cc1 -triple amdgcn -fcuda-is-device -std=c++11 \ +// RUN: -fno-threadsafe-statics -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK,AMDGCN %s #ifdef __clang__ #include "Inputs/cuda.h" @@ -105,68 +109,114 @@ __constant__ EC_I_EC c_ec_i_ec; // CHECK: @c_ec_i_ec = addrspace(4) externally_initialized global %struct.EC_I_EC zeroinitializer, +// CHECK: @_ZZ2dfvE4s_ec = internal addrspace(3) global %struct.EC undef +// CHECK: @_ZZ2dfvE5s_etc = internal addrspace(3) global %struct.ETC undef + // We should not emit global initializers for device-side variables. // CHECK-NOT: @__cxx_global_var_init // Make sure that initialization restrictions do not apply to local // variables. __device__ void df() { + // AMDGCN: %[[ec:.*]] = addrspacecast %struct.EC addrspace(5)* %ec to %struct.EC* + // AMDGCN: %[[ed:.*]] = addrspacecast %struct.ED addrspace(5)* %ed to %struct.ED* + // AMDGCN: %[[ecd:.*]] = addrspacecast %struct.ECD addrspace(5)* %ecd to %struct.ECD* + // AMDGCN: %[[etc:.*]] = addrspacecast %struct.ETC addrspace(5)* %etc to %struct.ETC* + // AMDGCN: %[[uc:.*]] = addrspacecast %struct.UC addrspace(5)* %uc to %struct.UC* + // AMDGCN: %[[ud:.*]] = addrspacecast %struct.UD addrspace(5)* %ud to %struct.UD* + // AMDGCN: %[[eci:.*]] = addrspacecast %struct.ECI addrspace(5)* %eci to %struct.ECI* + // AMDGCN: %[[nec:.*]] = addrspacecast %struct.NEC addrspace(5)* %nec to %struct.NEC* + // AMDGCN: %[[ned:.*]] = addrspacecast %struct.NED addrspace(5)* %ned to %struct.NED* + // AMDGCN: %[[ncv:.*]] = addrspacecast %struct.NCV addrspace(5)* %ncv to %struct.NCV* + // AMDGCN: %[[vd:.*]] = addrspacecast %struct.VD addrspace(5)* %vd to %struct.VD* + // AMDGCN: %[[ncf:.*]] = addrspacecast %struct.NCF addrspace(5)* %ncf to %struct.NCF* + // AMDGCN: %[[ncfs:.*]] = addrspacecast %struct.NCFS addrspace(5)* %ncfs to %struct.NCFS* + // AMDGCN: %[[utc:.*]] = addrspacecast %struct.UTC addrspace(5)* %utc to %struct.UTC* + // AMDGCN: %[[netc:.*]] = addrspacecast %struct.NETC addrspace(5)* %netc to %struct.NETC* + // AMDGCN: %[[ec_i_ec:.*]] = addrspacecast %struct.EC_I_EC addrspace(5)* %ec_i_ec to %struct.EC_I_EC* + // AMDGCN: %[[ec_i_ec1:.*]] = addrspacecast %struct.EC_I_EC1 addrspace(5)* %ec_i_ec1 to %struct.EC_I_EC1* + // AMDGCN: %[[t_v_t:.*]] = addrspacecast %struct.T_V_T addrspace(5)* %t_v_t to %struct.T_V_T* + // AMDGCN: %[[t_b_nec:.*]] = addrspacecast %struct.T_B_NEC addrspace(5)* %t_b_nec to %struct.T_B_NEC* + // AMDGCN: %[[t_f_nec:.*]] = addrspacecast %struct.T_F_NEC addrspace(5)* %t_f_nec to %struct.T_F_NEC* + // AMDGCN: %[[t_fa_nec:.*]] = addrspacecast %struct.T_FA_NEC addrspace(5)* %t_fa_nec to %struct.T_FA_NEC* + // AMDGCN: %[[t_b_ned:.*]] = addrspacecast %struct.T_B_NED addrspace(5)* %t_b_ned to %struct.T_B_NED* + // AMDGCN: %[[t_f_ned:.*]] = addrspacecast %struct.T_F_NED addrspace(5)* %t_f_ned to %struct.T_F_NED* + // AMDGCN: %[[t_fa_ned:.*]] = addrspacecast %struct.T_FA_NED addrspace(5)* %t_fa_ned to %struct.T_FA_NED* + T t; // CHECK-NOT: call EC ec; - // CHECK: call void @_ZN2ECC1Ev(%struct.EC* %ec) + // NVPTX: call void @_ZN2ECC1Ev(%struct.EC* %ec) + // AMDGCN: call void @_ZN2ECC1Ev(%struct.EC* %[[ec]]) ED ed; // CHECK-NOT: call ECD ecd; - // CHECK: call void @_ZN3ECDC1Ev(%struct.ECD* %ecd) + // NVPTX: call void @_ZN3ECDC1Ev(%struct.ECD* %ecd) + // AMDGCN: call void @_ZN3ECDC1Ev(%struct.ECD* %[[ecd]]) ETC etc; - // CHECK: call void @_ZN3ETCC1IJEEEDpT_(%struct.ETC* %etc) + // NVPTX: call void @_ZN3ETCC1IJEEEDpT_(%struct.ETC* %etc) + // AMDGCN: call void @_ZN3ETCC1IJEEEDpT_(%struct.ETC* %[[etc]]) UC uc; // undefined constructor -- not allowed - // CHECK: call void @_ZN2UCC1Ev(%struct.UC* %uc) + // NVPTX: call void @_ZN2UCC1Ev(%struct.UC* %uc) + // AMDGCN: call void