Re: [PATCH] D17779: [CUDA] Emit host-side 'shadows' for device-side global variables
tra updated this revision to Diff 49561. tra marked 9 inline comments as done. tra added a comment. Addressed Justin's comments. http://reviews.llvm.org/D17779 Files: lib/CodeGen/CGCUDANV.cpp lib/CodeGen/CGCUDARuntime.h lib/CodeGen/CodeGenModule.cpp test/CodeGenCUDA/device-stub.cu test/CodeGenCUDA/filter-decl.cu Index: test/CodeGenCUDA/filter-decl.cu === --- test/CodeGenCUDA/filter-decl.cu +++ test/CodeGenCUDA/filter-decl.cu @@ -9,15 +9,15 @@ // CHECK-DEVICE-NOT: module asm "file scope asm is host only" __asm__("file scope asm is host only"); -// CHECK-HOST-NOT: constantdata = externally_initialized global +// CHECK-HOST: constantdata = internal global // CHECK-DEVICE: constantdata = externally_initialized global __constant__ char constantdata[256]; -// CHECK-HOST-NOT: devicedata = externally_initialized global +// CHECK-HOST: devicedata = internal global // CHECK-DEVICE: devicedata = externally_initialized global __device__ char devicedata[256]; -// CHECK-HOST-NOT: shareddata = global +// CHECK-HOST: shareddata = internal global // CHECK-DEVICE: shareddata = global __shared__ char shareddata[256]; Index: test/CodeGenCUDA/device-stub.cu === --- test/CodeGenCUDA/device-stub.cu +++ test/CodeGenCUDA/device-stub.cu @@ -2,6 +2,40 @@ #include "Inputs/cuda.h" +// CHECK-DAG: @device_var = internal global i32 +__device__ int device_var; + +// CHECK-DAG: @constant_var = internal global i32 +__constant__ int constant_var; + +// CHECK-DAG: @shared_var = internal global i32 +__shared__ int shared_var; + +// Make sure host globals don't get internalized... +// CHECK-DAG: @host_var = global i32 +int host_var; +// ... and that extern vars remain external. +// CHECK-DAG: @ext_host_var = external global i32 +extern int ext_host_var; + +// Shadows for external device-side variables are *definitions* of +// those variables. +// CHECK-DAG: @ext_device_var = internal global i32 +extern __device__ int ext_device_var; +// CHECK-DAG: @ext_device_var = internal global i32 +extern __constant__ int ext_constant_var; + +void use_pointers() { + int *p; + p = _var; + p = _var; + p = _var; + p = _var; + p = _device_var; + p = _constant_var; + p = _host_var; +} + // Make sure that all parts of GPU code init/cleanup are there: // * constant unnamed string with the kernel name // CHECK: private unnamed_addr constant{{.*}}kernelfunc{{.*}}\00" @@ -32,18 +66,23 @@ // CHECK: call{{.*}}kernelfunc void hostfunc(void) { kernelfunc<<<1, 1>>>(1, 1, 1); } -// Test that we've built a function to register kernels -// CHECK: define internal void @__cuda_register_kernels +// Test that we've built a function to register kernels and global vars. +// CHECK: define internal void @__cuda_register_globals // CHECK: call{{.*}}cudaRegisterFunction(i8** %0, {{.*}}kernelfunc +// CHECK-DAG: call{{.*}}cudaRegisterVar(i8** %0, {{.*}}device_var{{.*}}i32 0, i32 4, i32 0, i32 0 +// CHECK-DAG: call{{.*}}cudaRegisterVar(i8** %0, {{.*}}constant_var{{.*}}i32 0, i32 4, i32 1, i32 0 +// CHECK-DAG: call{{.*}}cudaRegisterVar(i8** %0, {{.*}}ext_device_var{{.*}}i32 1, i32 4, i32 0, i32 0 +// CHECK-DAG: call{{.*}}cudaRegisterVar(i8** %0, {{.*}}ext_constant_var{{.*}}i32 1, i32 4, i32 1, i32 0 +// CHECK: ret void // Test that we've built contructor.. // CHECK: define internal void @__cuda_module_ctor // .. that calls __cudaRegisterFatBinary(&__cuda_fatbin_wrapper) // CHECK: call{{.*}}cudaRegisterFatBinary{{.*}}__cuda_fatbin_wrapper // .. stores return value in __cuda_gpubin_handle // CHECK-NEXT: store{{.*}}__cuda_gpubin_handle -// .. and then calls __cuda_register_kernels -// CHECK-NEXT: call void @__cuda_register_kernels +// .. and then calls __cuda_register_globals +// CHECK-NEXT: call void @__cuda_register_globals // Test that we've created destructor. // CHECK: define internal void @__cuda_module_dtor Index: lib/CodeGen/CodeGenModule.cpp === --- lib/CodeGen/CodeGenModule.cpp +++ lib/CodeGen/CodeGenModule.cpp @@ -1528,11 +1528,18 @@ !Global->hasAttr()) return; } else { - if (!Global->hasAttr() && ( -Global->hasAttr() || -Global->hasAttr() || -Global->hasAttr())) + // We need to emit host-side 'shadows' for all global + // device-side variables because the CUDA runtime needs their + // size and host-side address in order to provide access to + // their device-side incarnations. + + // So device-only functions are the only things we skip. + if (isa(Global) && !Global->hasAttr() && + Global->hasAttr()) return; + + assert((isa(Global) || isa(Global)) && + "Expected Variable or Function"); } } @@ -1561,8 +1568,15 @@ } else { const auto *VD = cast(Global);
Re: [PATCH] D17779: [CUDA] Emit host-side 'shadows' for device-side global variables
jlebar added inline comments. Comment at: lib/CodeGen/CGCUDANV.cpp:168 @@ -163,1 +167,3 @@ +/// of global scope device-side variables generated in this module +/// with the CUDA runtime. /// \code This is kind of hard to parse. How about rephrasing to something like: Creates a function that sets up state on the host side for CUDA objects that have a presence on both the host and device sides. Specifically, registers the host side of kernel functions and __device__ global variables with the CUDA runtime. Comment at: lib/CodeGen/CGCUDANV.cpp:213 @@ +212,3 @@ + // void __cudaRegisterVar(void **, char *, char *, const char *, + //int, int, int, int) + std::vector RegisterVarParams = { Can we say what these args mean? Comment at: lib/CodeGen/CGCUDANV.cpp:224 @@ +223,3 @@ +llvm::Constant *VarName = makeConstantString(Var->getName()); +llvm::Value *args[] = { +, Builder.CreateBitCast(Var, VoidPtrTy), VarName, Nit: s/args/Args/? Comment at: lib/CodeGen/CGCUDANV.cpp:228 @@ +227,3 @@ +llvm::ConstantInt::get(IntTy, CGM.getDataLayout().getTypeAllocSize( + Var->getValueType())), // sizeof(var) +llvm::ConstantInt::get(IntTy, (Flags & DevVarConst) ? 1 : 0), Nit: Maybe pull this expression out as a separate var? Then the comment isn't needed (would be nice, because at the moment it's ambiguous exactly what "sizeof(var)" refers to. Comment at: lib/CodeGen/CodeGenModule.cpp:1532 @@ +1531,3 @@ + // We need to emit host-side 'shadows' for all global + // device-side variables because CUDA runtime API needs their + // size and host-side address in order to provide access to s/CUDA runtime API/the CUDA runtime/ (not really a requirement of the API, I think?) Comment at: lib/CodeGen/CodeGenModule.cpp:1575 @@ +1574,3 @@ + // definition, because we still need to define host-side shadow + // for it. +} else if (VD->isThisDeclarationADefinition() != VarDecl::Definition && Kind of an odd way of writing this control flow? Could we phrase it more idiomatically as MustEmitForCUDA = !VD->hasDefinition() && ...; if (!MustEmitForCUDA && ...) return; Comment at: lib/CodeGen/CodeGenModule.cpp:2477 @@ +2476,3 @@ + if (D->hasAttr() || D->hasAttr()) { +Linkage = llvm::GlobalValue::InternalLinkage; + Is it worth explaining why the shadows get internal linkage? Comment at: lib/CodeGen/CodeGenModule.cpp:2480 @@ +2479,3 @@ +// Shadow variables and their properties must be registered +// with CUDA runtime. +unsigned Flags = 0; with the CUDA runtime Comment at: lib/CodeGen/CodeGenModule.cpp:2483 @@ +2482,3 @@ +if (!D->hasDefinition()) + Flags |= CGCUDARuntime::DevVarExt; +if (D->hasAttr()) Now that I see them in context, I think these flags would be a lot easier to handle if they employed less abbreviation. "ExternalDeviceVar", "ConstDeviceVar"? Comment at: test/CodeGenCUDA/device-stub.cu:14 @@ +13,3 @@ + +// Make sure host globals don't get internalized.. +// CHECK-DAG: @host_var = global i32 Not sure if this is a typo or if you mean "...". Comment at: test/CodeGenCUDA/device-stub.cu:17 @@ +16,3 @@ +int host_var; +// .. and that extern vars remain external. +// CHECK-DAG: @ext_host_var = external global i32 Here you do seem to mean "..." http://reviews.llvm.org/D17779 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D17779: [CUDA] Emit host-side 'shadows' for device-side global variables
tra created this revision. tra added reviewers: jlebar, jingyue. tra added a subscriber: cfe-commits. .. and register them with CUDA runtime. This is needed for commonly used cudaMemcpy*() APIs that use address of host-side shadow to access their counterparts on device side. Fixes PR26340. http://reviews.llvm.org/D17779 Files: lib/CodeGen/CGCUDANV.cpp lib/CodeGen/CGCUDARuntime.h lib/CodeGen/CodeGenModule.cpp test/CodeGenCUDA/device-stub.cu test/CodeGenCUDA/filter-decl.cu Index: test/CodeGenCUDA/filter-decl.cu === --- test/CodeGenCUDA/filter-decl.cu +++ test/CodeGenCUDA/filter-decl.cu @@ -9,11 +9,11 @@ // CHECK-DEVICE-NOT: module asm "file scope asm is host only" __asm__("file scope asm is host only"); -// CHECK-HOST-NOT: constantdata = externally_initialized global +// CHECK-HOST: constantdata = internal global // CHECK-DEVICE: constantdata = externally_initialized global __constant__ char constantdata[256]; -// CHECK-HOST-NOT: devicedata = externally_initialized global +// CHECK-HOST: devicedata = internal global // CHECK-DEVICE: devicedata = externally_initialized global __device__ char devicedata[256]; Index: test/CodeGenCUDA/device-stub.cu === --- test/CodeGenCUDA/device-stub.cu +++ test/CodeGenCUDA/device-stub.cu @@ -2,6 +2,40 @@ #include "Inputs/cuda.h" +// CHECK-DAG: @device_var = internal global i32 +__device__ int device_var; + +// CHECK-DAG: @constant_var = internal global i32 +__constant__ int constant_var; + +// CHECK-DAG: @shared_var = internal global i32 +__shared__ int shared_var; + +// Make sure host globals don't get internalized.. +// CHECK-DAG: @host_var = global i32 +int host_var; +// .. and that extern vars remain external. +// CHECK-DAG: @ext_host_var = external global i32 +extern int ext_host_var; + +// Shadows for external device-side variables are *definitions* of +// those variables. +// CHECK-DAG: @ext_device_var = internal global i32 +extern __device__ int ext_device_var; +// CHECK-DAG: @ext_device_var = internal global i32 +extern __constant__ int ext_constant_var; + +void use_pointers() { + int *p; + p = _var; + p = _var; + p = _var; + p = _var; + p = _device_var; + p = _constant_var; + p = _host_var; +} + // Make sure that all parts of GPU code init/cleanup are there: // * constant unnamed string with the kernel name // CHECK: private unnamed_addr constant{{.*}}kernelfunc{{.*}}\00" @@ -32,18 +66,23 @@ // CHECK: call{{.*}}kernelfunc void hostfunc(void) { kernelfunc<<<1, 1>>>(1, 1, 1); } -// Test that we've built a function to register kernels -// CHECK: define internal void @__cuda_register_kernels +// Test that we've built a function to register kernels and global vars. +// CHECK: define internal void @__cuda_register_globals // CHECK: call{{.*}}cudaRegisterFunction(i8** %0, {{.*}}kernelfunc +// CHECK-DAG: call{{.*}}cudaRegisterVar(i8** %0, {{.*}}device_var{{.*}}i32 0, i32 4, i32 0, i32 0 +// CHECK-DAG: call{{.*}}cudaRegisterVar(i8** %0, {{.*}}constant_var{{.*}}i32 0, i32 4, i32 1, i32 0 +// CHECK-DAG: call{{.*}}cudaRegisterVar(i8** %0, {{.*}}ext_device_var{{.*}}i32 1, i32 4, i32 0, i32 0 +// CHECK-DAG: call{{.*}}cudaRegisterVar(i8** %0, {{.*}}ext_constant_var{{.*}}i32 1, i32 4, i32 1, i32 0 +// CHECK: ret void // Test that we've built contructor.. // CHECK: define internal void @__cuda_module_ctor // .. that calls __cudaRegisterFatBinary(&__cuda_fatbin_wrapper) // CHECK: call{{.*}}cudaRegisterFatBinary{{.*}}__cuda_fatbin_wrapper // .. stores return value in __cuda_gpubin_handle // CHECK-NEXT: store{{.*}}__cuda_gpubin_handle -// .. and then calls __cuda_register_kernels -// CHECK-NEXT: call void @__cuda_register_kernels +// .. and then calls __cuda_register_globals +// CHECK-NEXT: call void @__cuda_register_globals // Test that we've created destructor. // CHECK: define internal void @__cuda_module_dtor Index: lib/CodeGen/CodeGenModule.cpp === --- lib/CodeGen/CodeGenModule.cpp +++ lib/CodeGen/CodeGenModule.cpp @@ -1528,11 +1528,18 @@ !Global->hasAttr()) return; } else { - if (!Global->hasAttr() && ( -Global->hasAttr() || -Global->hasAttr() || -Global->hasAttr())) + // We need to emit host-side 'shadows' for all global + // device-side variables because CUDA runtime API needs their + // size and host-side address in order to provide access to + // their device-side incarnations. + + // So device-only functions are the only things we skip. + if (isa(Global) && !Global->hasAttr() && + Global->hasAttr()) return; + + assert((isa(Global) || isa(Global)) && + "Expected Variable or Function"); } } @@ -1561,9 +1568,13 @@ } else { const auto *VD = cast(Global);