yaxunl created this revision. yaxunl added reviewers: tra, rjmccall. HIP provide host API to allow C/C++ programs to launch kernel. A C/C++ program can declare a HIP kernel as an external function and pass it to the kernel launching API. When linked with object files built from HIP programs. These external functions will resolve to symbols with the same name in HIP programs so that kernels with the same name can be found and launched.
This requires clang to emit symbols with the same name as kernels in object files and use them to identify kernels, instead of using device stub functions to identify kernels, since device stub function has different names than kernels. This patch lets clang emits a void* type global variable for each kernel in host IR, which is called kernel handle. The kernel handle has the same mangled name as kernel by host ABI. It is passed to __hipRegisterFunction and kernel launching functions for identifying kernels. https://reviews.llvm.org/D77743 Files: clang/lib/CodeGen/CGCUDANV.cpp clang/test/CodeGenCUDA/Inputs/cuda.h clang/test/CodeGenCUDA/cxx-call-kernel.cpp clang/test/CodeGenCUDA/kernel-stub-name.cu clang/test/CodeGenCUDA/unnamed-types.cu
Index: clang/test/CodeGenCUDA/unnamed-types.cu =================================================================== --- clang/test/CodeGenCUDA/unnamed-types.cu +++ clang/test/CodeGenCUDA/unnamed-types.cu @@ -36,4 +36,4 @@ }(p); } // HOST: @__hip_register_globals -// HOST: __hipRegisterFunction{{.*}}@_Z17__device_stub__k0IZZ2f1PfENKUlS0_E_clES0_EUlfE_EvS0_T_{{.*}}@0 +// HOST: __hipRegisterFunction{{.*}}@_Z2k0IZZ2f1PfENKUlS0_E_clES0_EUlfE_EvS0_T_{{.*}}@0 Index: clang/test/CodeGenCUDA/kernel-stub-name.cu =================================================================== --- clang/test/CodeGenCUDA/kernel-stub-name.cu +++ clang/test/CodeGenCUDA/kernel-stub-name.cu @@ -6,6 +6,12 @@ #include "Inputs/cuda.h" +// Kernel handles + +// CHECK: @[[HCKERN:ckernel]] = constant i8* null +// CHECK: @[[HNSKERN:_ZN2ns8nskernelEv]] = constant i8* null +// CHECK: @[[HTKERN:_Z10kernelfuncIiEvv]] = linkonce_odr constant i8* null + extern "C" __global__ void ckernel() {} namespace ns { @@ -26,9 +32,9 @@ // Non-template kernel stub functions // CHECK: define{{.*}}@[[CSTUB:__device_stub__ckernel]] -// CHECK: call{{.*}}@hipLaunchByPtr{{.*}}@[[CSTUB]] +// CHECK: call{{.*}}@hipLaunchByPtr{{.*}}@[[HCKERN]] // CHECK: define{{.*}}@[[NSSTUB:_ZN2ns23__device_stub__nskernelEv]] -// CHECK: call{{.*}}@hipLaunchByPtr{{.*}}@[[NSSTUB]] +// CHECK: call{{.*}}@hipLaunchByPtr{{.*}}@[[HNSKERN]] // CHECK-LABEL: define{{.*}}@_Z8hostfuncv() // CHECK: call void @[[CSTUB]]() @@ -45,11 +51,11 @@ // Template kernel stub functions // CHECK: define{{.*}}@[[TSTUB]] -// CHECK: call{{.*}}@hipLaunchByPtr{{.*}}@[[TSTUB]] +// CHECK: call{{.*}}@hipLaunchByPtr{{.*}}@[[HTKERN]] // CHECK: declare{{.*}}@[[DSTUB]] // CHECK-LABEL: define{{.*}}@__hip_register_globals -// CHECK: call{{.*}}@__hipRegisterFunction{{.*}}@[[CSTUB]]{{.*}}@[[CKERN]] -// CHECK: call{{.*}}@__hipRegisterFunction{{.*}}@[[NSSTUB]]{{.*}}@[[NSKERN]] -// CHECK: call{{.*}}@__hipRegisterFunction{{.*}}@[[TSTUB]]{{.*}}@[[TKERN]] +// CHECK: call{{.*}}@__hipRegisterFunction{{.*}}@[[HCKERN]]{{.*}}@[[CKERN]] +// CHECK: call{{.*}}@__hipRegisterFunction{{.*}}@[[HNSKERN]]{{.*}}@[[NSKERN]] +// CHECK: call{{.*}}@__hipRegisterFunction{{.*}}@[[HTKERN]]{{.*}}@[[TKERN]] Index: clang/test/CodeGenCUDA/cxx-call-kernel.cpp =================================================================== --- /dev/null +++ clang/test/CodeGenCUDA/cxx-call-kernel.cpp @@ -0,0 +1,19 @@ +// RUN: %clang_cc1 -x hip -emit-llvm-bc %s -o %t.hip.bc +// RUN: %clang_cc1 -mlink-builtin-bitcode %t.hip.bc -DHIP_PLATFORM -emit-llvm \ +// RUN: %s -o - | FileCheck %s + +#include "Inputs/cuda.h" + +// CHECK: @_Z2g1i = internal constant i8* null +#if __HIP__ +__global__ void g1(int x) {} +#else +extern void g1(int x); + +// CHECK: call i32 @hipLaunchKernel{{.*}}@_Z2g1i +void test() { + hipLaunchKernel((void*)g1, 1, 1, nullptr, 0, 0); +} + +// CHECK: __hipRegisterFunction{{.*}}@_Z2g1i +#endif Index: clang/test/CodeGenCUDA/Inputs/cuda.h =================================================================== --- clang/test/CodeGenCUDA/Inputs/cuda.h +++ clang/test/CodeGenCUDA/Inputs/cuda.h @@ -2,19 +2,28 @@ #include <stddef.h> +#if __HIP__ || __CUDA__ #define __constant__ __attribute__((constant)) #define __device__ __attribute__((device)) #define __global__ __attribute__((global)) #define __host__ __attribute__((host)) #define __shared__ __attribute__((shared)) #define __launch_bounds__(...) __attribute__((launch_bounds(__VA_ARGS__))) +#else +#define __constant__ +#define __device__ +#define __global__ +#define __host__ +#define __shared__ +#define __launch_bounds__(...) +#endif struct dim3 { unsigned x, y, z; __host__ __device__ dim3(unsigned x, unsigned y = 1, unsigned z = 1) : x(x), y(y), z(z) {} }; -#ifdef __HIP__ +#if __HIP__ || HIP_PLATFORM typedef struct hipStream *hipStream_t; typedef enum hipError {} hipError_t; int hipConfigureCall(dim3 gridSize, dim3 blockSize, size_t sharedSize = 0, Index: clang/lib/CodeGen/CGCUDANV.cpp =================================================================== --- clang/lib/CodeGen/CGCUDANV.cpp +++ clang/lib/CodeGen/CGCUDANV.cpp @@ -41,12 +41,16 @@ llvm::LLVMContext &Context; /// Convenience reference to the current module llvm::Module &TheModule; - /// Keeps track of kernel launch stubs emitted in this module + /// Keeps track of kernel launch stubs and handles emitted in this module struct KernelInfo { - llvm::Function *Kernel; + llvm::Function *Kernel; // stub function to help launch kernel const Decl *D; }; llvm::SmallVector<KernelInfo, 16> EmittedKernels; + // Map a device stub function to a symbol for identifying kernel in host code. + // For CUDA, the symbol for identifying the kernel is the same as the device + // stub function. For HIP, they are different. + llvm::DenseMap<llvm::Function *, llvm::GlobalValue *> KernelHandles; struct VarInfo { llvm::GlobalVariable *Var; const VarDecl *D; @@ -240,6 +244,18 @@ void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) { EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl}); + llvm::GlobalValue *KernelHandle = CGF.CurFn; + if (CGF.getLangOpts().HIP) { + auto Linkage = CGF.CurFn->getLinkage(); + auto *Var = new llvm::GlobalVariable( + TheModule, VoidPtrTy, /*isConstant=*/true, Linkage, + /*Initializer=*/llvm::ConstantPointerNull::get(VoidPtrTy), + CGM.getMangledName(GlobalDecl(cast<FunctionDecl>(CGF.CurFuncDecl), + KernelReferenceKind::Kernel))); + Var->setAlignment(CGM.getPointerAlign().getAsAlign()); + KernelHandle = Var; + } + KernelHandles[CGF.CurFn] = KernelHandle; if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(), CudaFeature::CUDA_USES_NEW_LAUNCH) || CGF.getLangOpts().HIPUseNewLaunchAPI) @@ -318,7 +334,8 @@ ShmemSize.getPointer(), Stream.getPointer()}); // Emit the call to cudaLaunch - llvm::Value *Kernel = CGF.Builder.CreatePointerCast(CGF.CurFn, VoidPtrTy); + llvm::Value *Kernel = + CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn], VoidPtrTy); CallArgList LaunchKernelArgs; LaunchKernelArgs.add(RValue::get(Kernel), cudaLaunchKernelFD->getParamDecl(0)->getType()); @@ -375,7 +392,8 @@ // Emit the call to cudaLaunch llvm::FunctionCallee cudaLaunchFn = getLaunchFn(); - llvm::Value *Arg = CGF.Builder.CreatePointerCast(CGF.CurFn, CharPtrTy); + llvm::Value *Arg = + CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn], CharPtrTy); CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg); CGF.EmitBranch(EndBlock); @@ -428,7 +446,7 @@ llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy); llvm::Value *Args[] = { &GpuBinaryHandlePtr, - Builder.CreateBitCast(I.Kernel, VoidPtrTy), + Builder.CreateBitCast(KernelHandles[I.Kernel], VoidPtrTy), KernelName, KernelName, llvm::ConstantInt::get(IntTy, -1),
_______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits