[llvm-branch-commits] [openmp] 78b0630 - [libomptarget][cuda] Call v2 functions explicitly
Author: Jon Chesterfield Date: 2021-01-23T20:33:13Z New Revision: 78b0630b72a9742d62b07cef912b72f1743bfae9 URL: https://github.com/llvm/llvm-project/commit/78b0630b72a9742d62b07cef912b72f1743bfae9 DIFF: https://github.com/llvm/llvm-project/commit/78b0630b72a9742d62b07cef912b72f1743bfae9.diff LOG: [libomptarget][cuda] Call v2 functions explicitly [libomptarget][cuda] Call v2 functions explicitly rtl.cpp calls functions like cuMemFree that are replaced by a macro in cuda.h with cuMemFree_v2. This patch changes the source to use the v2 names consistently. See also D95104, D95155 for the idea. Alternatives are to use a mixture, e.g. call the macro names and explicitly dlopen the _v2 names, or to keep the current status where the symbols are replaced by macros in both files. Reviewed By: jdoerfert Differential Revision: https://reviews.llvm.org/D95274 Added: Modified: openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h openmp/libomptarget/plugins/cuda/src/rtl.cpp Removed: diff --git a/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h b/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h index 832c26965144..dd579a1f7490 100644 --- a/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h +++ b/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h @@ -48,18 +48,6 @@ typedef enum CUctx_flags_enum { CU_CTX_SCHED_MASK = 0x07, } CUctx_flags; -#define cuMemFree cuMemFree_v2 -#define cuMemAlloc cuMemAlloc_v2 -#define cuMemcpyDtoH cuMemcpyDtoH_v2 -#define cuMemcpyHtoD cuMemcpyHtoD_v2 -#define cuStreamDestroy cuStreamDestroy_v2 -#define cuModuleGetGlobal cuModuleGetGlobal_v2 -#define cuMemcpyDtoHAsync cuMemcpyDtoHAsync_v2 -#define cuMemcpyDtoDAsync cuMemcpyDtoDAsync_v2 -#define cuMemcpyHtoDAsync cuMemcpyHtoDAsync_v2 -#define cuDevicePrimaryCtxRelease cuDevicePrimaryCtxRelease_v2 -#define cuDevicePrimaryCtxSetFlags cuDevicePrimaryCtxSetFlags_v2 - CUresult cuCtxGetDevice(CUdevice *); CUresult cuDeviceGet(CUdevice *, int); CUresult cuDeviceGetAttribute(int *, CUdevice_attribute, 
CUdevice); @@ -72,26 +60,26 @@ CUresult cuLaunchKernel(CUfunction, unsigned, unsigned, unsigned, unsigned, unsigned, unsigned, unsigned, CUstream, void **, void **); -CUresult cuMemAlloc(CUdeviceptr *, size_t); -CUresult cuMemcpyDtoDAsync(CUdeviceptr, CUdeviceptr, size_t, CUstream); +CUresult cuMemAlloc_v2(CUdeviceptr *, size_t); +CUresult cuMemcpyDtoDAsync_v2(CUdeviceptr, CUdeviceptr, size_t, CUstream); -CUresult cuMemcpyDtoH(void *, CUdeviceptr, size_t); -CUresult cuMemcpyDtoHAsync(void *, CUdeviceptr, size_t, CUstream); -CUresult cuMemcpyHtoD(CUdeviceptr, const void *, size_t); -CUresult cuMemcpyHtoDAsync(CUdeviceptr, const void *, size_t, CUstream); +CUresult cuMemcpyDtoH_v2(void *, CUdeviceptr, size_t); +CUresult cuMemcpyDtoHAsync_v2(void *, CUdeviceptr, size_t, CUstream); +CUresult cuMemcpyHtoD_v2(CUdeviceptr, const void *, size_t); +CUresult cuMemcpyHtoDAsync_v2(CUdeviceptr, const void *, size_t, CUstream); -CUresult cuMemFree(CUdeviceptr); +CUresult cuMemFree_v2(CUdeviceptr); CUresult cuModuleGetFunction(CUfunction *, CUmodule, const char *); -CUresult cuModuleGetGlobal(CUdeviceptr *, size_t *, CUmodule, const char *); +CUresult cuModuleGetGlobal_v2(CUdeviceptr *, size_t *, CUmodule, const char *); CUresult cuModuleUnload(CUmodule); CUresult cuStreamCreate(CUstream *, unsigned); -CUresult cuStreamDestroy(CUstream); +CUresult cuStreamDestroy_v2(CUstream); CUresult cuStreamSynchronize(CUstream); CUresult cuCtxSetCurrent(CUcontext); -CUresult cuDevicePrimaryCtxRelease(CUdevice); +CUresult cuDevicePrimaryCtxRelease_v2(CUdevice); CUresult cuDevicePrimaryCtxGetState(CUdevice, unsigned *, int *); -CUresult cuDevicePrimaryCtxSetFlags(CUdevice, unsigned); +CUresult cuDevicePrimaryCtxSetFlags_v2(CUdevice, unsigned); CUresult cuDevicePrimaryCtxRetain(CUcontext *, CUdevice); CUresult cuModuleLoadDataEx(CUmodule *, const void *, unsigned, void *, void **); diff --git a/openmp/libomptarget/plugins/cuda/src/rtl.cpp b/openmp/libomptarget/plugins/cuda/src/rtl.cpp index 
e4ac1e0820e6..f83c9df920aa 100644 --- a/openmp/libomptarget/plugins/cuda/src/rtl.cpp +++ b/openmp/libomptarget/plugins/cuda/src/rtl.cpp @@ -110,8 +110,8 @@ bool checkResult(CUresult Err, const char *ErrMsg) { int memcpyDtoD(const void *SrcPtr, void *DstPtr, int64_t Size, CUstream Stream) { - CUresult Err = - cuMemcpyDtoDAsync((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr, Size, Stream); + CUresult Err = cuMemcpyDtoDAsync_v2((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr, + Size, Stream); if (Err != CUDA_SUCCESS) { REPORT("Error when copying data from device to device. Pointers: src " @@ -207,8 +207,8 @@ class StreamManagerTy { for (CUstream &S : StreamPool[I]) { if (S) - checkResult(
[llvm-branch-commits] [openmp] dc70c56 - [libomptarget][amdgpu][nfc] Update comments
Author: Jon Chesterfield Date: 2021-01-23T22:53:58Z New Revision: dc70c56be5922b874b1408edc1315fcda40680ba URL: https://github.com/llvm/llvm-project/commit/dc70c56be5922b874b1408edc1315fcda40680ba DIFF: https://github.com/llvm/llvm-project/commit/dc70c56be5922b874b1408edc1315fcda40680ba.diff LOG: [libomptarget][amdgpu][nfc] Update comments [libomptarget][amdgpu][nfc] Update comments Reviewed By: jdoerfert Differential Revision: https://reviews.llvm.org/D95295 Added: Modified: openmp/libomptarget/plugins/amdgpu/impl/data.cpp openmp/libomptarget/plugins/amdgpu/impl/machine.h openmp/libomptarget/plugins/amdgpu/impl/rt.h openmp/libomptarget/plugins/amdgpu/impl/system.cpp openmp/libomptarget/plugins/amdgpu/src/rtl.cpp Removed: diff --git a/openmp/libomptarget/plugins/amdgpu/impl/data.cpp b/openmp/libomptarget/plugins/amdgpu/impl/data.cpp index 39546fbae4b3..0d98d5c51ce1 100644 --- a/openmp/libomptarget/plugins/amdgpu/impl/data.cpp +++ b/openmp/libomptarget/plugins/amdgpu/impl/data.cpp @@ -51,7 +51,6 @@ hsa_amd_memory_pool_t get_memory_pool_by_mem_place(atmi_mem_place_t place) { void register_allocation(void *ptr, size_t size, atmi_mem_place_t place) { if (place.dev_type == ATMI_DEVTYPE_CPU) allow_access_to_all_gpu_agents(ptr); - // TODO(ashwinma): what if one GPU wants to access another GPU? } atmi_status_t Runtime::Malloc(void **ptr, size_t size, atmi_mem_place_t place) { diff --git a/openmp/libomptarget/plugins/amdgpu/impl/machine.h b/openmp/libomptarget/plugins/amdgpu/impl/machine.h index 93169ed4eafb..9250c3b7c663 100644 --- a/openmp/libomptarget/plugins/amdgpu/impl/machine.h +++ b/openmp/libomptarget/plugins/amdgpu/impl/machine.h @@ -22,9 +22,6 @@ class ATLProcessor { } void addMemory(const ATLMemory &p); hsa_agent_t agent() const { return agent_; } - // TODO(ashwinma): Do we need this or are we building the machine structure - // just once in the program? 
- // void removeMemory(ATLMemory &p); const std::vector &memories() const; atmi_devtype_t type() const { return type_; } @@ -86,7 +83,7 @@ template T &get_processor(atmi_place_t place) { int dev_id = place.device_id; if (dev_id == -1) { // user is asking runtime to pick a device -// TODO(ashwinma): best device of this type? pick 0 for now +// best device of this type? pick 0 for now dev_id = 0; } return g_atl_machine.processors()[dev_id]; diff --git a/openmp/libomptarget/plugins/amdgpu/impl/rt.h b/openmp/libomptarget/plugins/amdgpu/impl/rt.h index 757919eb3a45..a857861307c6 100644 --- a/openmp/libomptarget/plugins/amdgpu/impl/rt.h +++ b/openmp/libomptarget/plugins/amdgpu/impl/rt.h @@ -26,10 +26,7 @@ class Environment { void GetEnvAll(); int getMaxQueueSize() const { return max_queue_size_; } - - // TODO(ashwinma): int may change to enum if we have more debug modes int getDebugMode() const { return debug_mode_; } - // TODO(ashwinma): int may change to enum if we have more profile modes private: std::string GetEnv(const char *name) { @@ -69,10 +66,7 @@ class Runtime final { static atmi_status_t Memfree(void *); static atmi_status_t Malloc(void **, size_t, atmi_mem_place_t); - // environment variables int getMaxQueueSize() const { return env_.getMaxQueueSize(); } - - // TODO(ashwinma): int may change to enum if we have more debug modes int getDebugMode() const { return env_.getDebugMode(); } protected: diff --git a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp b/openmp/libomptarget/plugins/amdgpu/impl/system.cpp index 913dc91b298d..1a126a186ff2 100644 --- a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp +++ b/openmp/libomptarget/plugins/amdgpu/impl/system.cpp @@ -133,7 +133,7 @@ static const std::map ArgValueKind = { {"hidden_hostcall_buffer", KernelArgMD::ValueKind::HiddenHostcallBuffer}, }; -// public variables -- TODO(ashwinma) move these to a runtime object? +// global variables. 
TODO: Get rid of these atmi_machine_t g_atmi_machine; ATLMachine g_atl_machine; @@ -210,8 +210,6 @@ atmi_status_t Runtime::Initialize() { } atmi_status_t Runtime::Finalize() { - // TODO(ashwinma): Finalize all processors, queues, signals, kernarg memory - // regions hsa_status_t err; for (uint32_t i = 0; i < g_executables.size(); i++) { @@ -874,8 +872,6 @@ static hsa_status_t get_code_object_custom_metadata(void *binary, msgpackErrorCheck(iterate args map in kernel args metadata, msgpack_errors); -// TODO(ashwinma): should the below population actions be done only for -// non-implicit args? // populate info with sizes and offsets info.arg_sizes.push_back(lcArg.size_); // v3 has offset field and not align field diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp ind
[llvm-branch-commits] [openmp] c3074d4 - [libomptarget][nvptx] Replace cuda atomic primitives with clang intrinsics
Author: Jon Chesterfield Date: 2021-01-24T10:59:15Z New Revision: c3074d48d38cc1207da893b6f3545b5777db4c27 URL: https://github.com/llvm/llvm-project/commit/c3074d48d38cc1207da893b6f3545b5777db4c27 DIFF: https://github.com/llvm/llvm-project/commit/c3074d48d38cc1207da893b6f3545b5777db4c27.diff LOG: [libomptarget][nvptx] Replace cuda atomic primitives with clang intrinsics [libomptarget][nvptx] Replace cuda atomic primitives with clang intrinsics Tested by diff of IR generated for target_impl.cu before and after. NFC. Part of removing deviceRTL build time dependency on cuda SDK. Reviewed By: tianshilei1992 Differential Revision: https://reviews.llvm.org/D95294 Added: Modified: openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu Removed: diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu index 2bf19523ef6f..1e3ba7d664af 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu @@ -28,9 +28,6 @@ int __shfl(int val, int src_line, int width = WARPSIZE); int __shfl_down(int var, unsigned detla, int width); int __shfl_down_sync(unsigned mask, int var, unsigned detla, int width); void __syncwarp(int mask); -void __threadfence(); -void __threadfence_block(); -void __threadfence_system(); } DEVICE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) { @@ -126,9 +123,9 @@ DEVICE void __kmpc_impl_named_sync(uint32_t num_threads) { : "memory"); } -DEVICE void __kmpc_impl_threadfence() { __threadfence(); } -DEVICE void __kmpc_impl_threadfence_block() { __threadfence_block(); } -DEVICE void __kmpc_impl_threadfence_system() { __threadfence_system(); } +DEVICE void __kmpc_impl_threadfence() { __nvvm_membar_gl(); } +DEVICE void __kmpc_impl_threadfence_block() { __nvvm_membar_cta(); } +DEVICE void __kmpc_impl_threadfence_system() { __nvvm_membar_sys(); } // Calls to the NVPTX layer (assuming 1D layout) DEVICE int 
GetThreadIdInBlock() { return __nvvm_read_ptx_sreg_tid_x(); } @@ -140,39 +137,41 @@ DEVICE int GetNumberOfThreadsInBlock() { return __nvvm_read_ptx_sreg_ntid_x(); } DEVICE unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; } DEVICE unsigned GetLaneId() { return GetThreadIdInBlock() & (WARPSIZE - 1); } -// Forward declaration of atomics. Although they're template functions, we -// already have definitions for different types in CUDA internal headers with -// the right mangled names. -template <typename T> DEVICE T atomicAdd(T *address, T val); -template <typename T> DEVICE T atomicInc(T *address, T val); -template <typename T> DEVICE T atomicMax(T *address, T val); -template <typename T> DEVICE T atomicExch(T *address, T val); -template <typename T> DEVICE T atomicCAS(T *address, T compare, T val); - +// Atomics DEVICE uint32_t __kmpc_atomic_add(uint32_t *Address, uint32_t Val) { - return atomicAdd(Address, Val); + return __atomic_fetch_add(Address, Val, __ATOMIC_SEQ_CST); } DEVICE uint32_t __kmpc_atomic_inc(uint32_t *Address, uint32_t Val) { - return atomicInc(Address, Val); + return __nvvm_atom_inc_gen_ui(Address, Val); } + DEVICE uint32_t __kmpc_atomic_max(uint32_t *Address, uint32_t Val) { - return atomicMax(Address, Val); + return __atomic_fetch_max(Address, Val, __ATOMIC_SEQ_CST); } + DEVICE uint32_t __kmpc_atomic_exchange(uint32_t *Address, uint32_t Val) { - return atomicExch(Address, Val); + uint32_t R; + __atomic_exchange(Address, &Val, &R, __ATOMIC_SEQ_CST); + return R; } + DEVICE uint32_t __kmpc_atomic_cas(uint32_t *Address, uint32_t Compare, uint32_t Val) { - return atomicCAS(Address, Compare, Val); + (void)__atomic_compare_exchange(Address, &Compare, &Val, false, + __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST); + return Compare; } DEVICE unsigned long long __kmpc_atomic_exchange(unsigned long long *Address, unsigned long long Val) { - return atomicExch(Address, Val); + unsigned long long R; + __atomic_exchange(Address, &Val, &R, __ATOMIC_SEQ_CST); + return R; } + DEVICE unsigned long long 
__kmpc_atomic_add(unsigned long long *Address, unsigned long long Val) { - return atomicAdd(Address, Val); + return __atomic_fetch_add(Address, Val, __ATOMIC_SEQ_CST); } #define __OMP_SPIN 1000 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [openmp] e5e448a - [libomptarget][cuda] Fix build, change missed from D95274
Author: Jon Chesterfield Date: 2021-01-24T18:30:04Z New Revision: e5e448aafa7699c17f78aaffb001b665b607e5ae URL: https://github.com/llvm/llvm-project/commit/e5e448aafa7699c17f78aaffb001b665b607e5ae DIFF: https://github.com/llvm/llvm-project/commit/e5e448aafa7699c17f78aaffb001b665b607e5ae.diff LOG: [libomptarget][cuda] Fix build, change missed from D95274 Added: Modified: openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.cpp Removed: diff --git a/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.cpp b/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.cpp index cc7bc42412f6..ad67fe95c77e 100644 --- a/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.cpp +++ b/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.cpp @@ -28,26 +28,26 @@ DLWRAP(cuFuncGetAttribute, 3); DLWRAP(cuGetErrorString, 2); DLWRAP(cuLaunchKernel, 11); -DLWRAP(cuMemAlloc, 2); -DLWRAP(cuMemcpyDtoDAsync, 4); +DLWRAP(cuMemAlloc_v2, 2); +DLWRAP(cuMemcpyDtoDAsync_v2, 4); -DLWRAP(cuMemcpyDtoH, 3); -DLWRAP(cuMemcpyDtoHAsync, 4); -DLWRAP(cuMemcpyHtoD, 3); -DLWRAP(cuMemcpyHtoDAsync, 4); +DLWRAP(cuMemcpyDtoH_v2, 3); +DLWRAP(cuMemcpyDtoHAsync_v2, 4); +DLWRAP(cuMemcpyHtoD_v2, 3); +DLWRAP(cuMemcpyHtoDAsync_v2, 4); -DLWRAP(cuMemFree, 1); +DLWRAP(cuMemFree_v2, 1); DLWRAP(cuModuleGetFunction, 3); -DLWRAP(cuModuleGetGlobal, 4); +DLWRAP(cuModuleGetGlobal_v2, 4); DLWRAP(cuModuleUnload, 1); DLWRAP(cuStreamCreate, 2); -DLWRAP(cuStreamDestroy, 1); +DLWRAP(cuStreamDestroy_v2, 1); DLWRAP(cuStreamSynchronize, 1); DLWRAP(cuCtxSetCurrent, 1); -DLWRAP(cuDevicePrimaryCtxRelease, 1); +DLWRAP(cuDevicePrimaryCtxRelease_v2, 1); DLWRAP(cuDevicePrimaryCtxGetState, 3); -DLWRAP(cuDevicePrimaryCtxSetFlags, 2); +DLWRAP(cuDevicePrimaryCtxSetFlags_v2, 2); DLWRAP(cuDevicePrimaryCtxRetain, 2); DLWRAP(cuModuleLoadDataEx, 5); ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [openmp] 95f0d1e - [libomptarget] Compile with older cuda, revert D95274
Author: Jon Chesterfield Date: 2021-01-25T16:12:56Z New Revision: 95f0d1edafe3e52a4057768f8cde5d55faf39d16 URL: https://github.com/llvm/llvm-project/commit/95f0d1edafe3e52a4057768f8cde5d55faf39d16 DIFF: https://github.com/llvm/llvm-project/commit/95f0d1edafe3e52a4057768f8cde5d55faf39d16.diff LOG: [libomptarget] Compile with older cuda, revert D95274 [libomptarget] Compile with older cuda, revert D95274 Fixes regression reported in comments of D95274. Reviewed By: jdoerfert Differential Revision: https://reviews.llvm.org/D95367 Added: Modified: openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.cpp openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h openmp/libomptarget/plugins/cuda/src/rtl.cpp Removed: diff --git a/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.cpp b/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.cpp index ad67fe95c77e..cc7bc42412f6 100644 --- a/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.cpp +++ b/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.cpp @@ -28,26 +28,26 @@ DLWRAP(cuFuncGetAttribute, 3); DLWRAP(cuGetErrorString, 2); DLWRAP(cuLaunchKernel, 11); -DLWRAP(cuMemAlloc_v2, 2); -DLWRAP(cuMemcpyDtoDAsync_v2, 4); +DLWRAP(cuMemAlloc, 2); +DLWRAP(cuMemcpyDtoDAsync, 4); -DLWRAP(cuMemcpyDtoH_v2, 3); -DLWRAP(cuMemcpyDtoHAsync_v2, 4); -DLWRAP(cuMemcpyHtoD_v2, 3); -DLWRAP(cuMemcpyHtoDAsync_v2, 4); +DLWRAP(cuMemcpyDtoH, 3); +DLWRAP(cuMemcpyDtoHAsync, 4); +DLWRAP(cuMemcpyHtoD, 3); +DLWRAP(cuMemcpyHtoDAsync, 4); -DLWRAP(cuMemFree_v2, 1); +DLWRAP(cuMemFree, 1); DLWRAP(cuModuleGetFunction, 3); -DLWRAP(cuModuleGetGlobal_v2, 4); +DLWRAP(cuModuleGetGlobal, 4); DLWRAP(cuModuleUnload, 1); DLWRAP(cuStreamCreate, 2); -DLWRAP(cuStreamDestroy_v2, 1); +DLWRAP(cuStreamDestroy, 1); DLWRAP(cuStreamSynchronize, 1); DLWRAP(cuCtxSetCurrent, 1); -DLWRAP(cuDevicePrimaryCtxRelease_v2, 1); +DLWRAP(cuDevicePrimaryCtxRelease, 1); DLWRAP(cuDevicePrimaryCtxGetState, 3); -DLWRAP(cuDevicePrimaryCtxSetFlags_v2, 2); +DLWRAP(cuDevicePrimaryCtxSetFlags, 2); 
DLWRAP(cuDevicePrimaryCtxRetain, 2); DLWRAP(cuModuleLoadDataEx, 5); diff --git a/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h b/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h index dd579a1f7490..832c26965144 100644 --- a/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h +++ b/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h @@ -48,6 +48,18 @@ typedef enum CUctx_flags_enum { CU_CTX_SCHED_MASK = 0x07, } CUctx_flags; +#define cuMemFree cuMemFree_v2 +#define cuMemAlloc cuMemAlloc_v2 +#define cuMemcpyDtoH cuMemcpyDtoH_v2 +#define cuMemcpyHtoD cuMemcpyHtoD_v2 +#define cuStreamDestroy cuStreamDestroy_v2 +#define cuModuleGetGlobal cuModuleGetGlobal_v2 +#define cuMemcpyDtoHAsync cuMemcpyDtoHAsync_v2 +#define cuMemcpyDtoDAsync cuMemcpyDtoDAsync_v2 +#define cuMemcpyHtoDAsync cuMemcpyHtoDAsync_v2 +#define cuDevicePrimaryCtxRelease cuDevicePrimaryCtxRelease_v2 +#define cuDevicePrimaryCtxSetFlags cuDevicePrimaryCtxSetFlags_v2 + CUresult cuCtxGetDevice(CUdevice *); CUresult cuDeviceGet(CUdevice *, int); CUresult cuDeviceGetAttribute(int *, CUdevice_attribute, CUdevice); @@ -60,26 +72,26 @@ CUresult cuLaunchKernel(CUfunction, unsigned, unsigned, unsigned, unsigned, unsigned, unsigned, unsigned, CUstream, void **, void **); -CUresult cuMemAlloc_v2(CUdeviceptr *, size_t); -CUresult cuMemcpyDtoDAsync_v2(CUdeviceptr, CUdeviceptr, size_t, CUstream); +CUresult cuMemAlloc(CUdeviceptr *, size_t); +CUresult cuMemcpyDtoDAsync(CUdeviceptr, CUdeviceptr, size_t, CUstream); -CUresult cuMemcpyDtoH_v2(void *, CUdeviceptr, size_t); -CUresult cuMemcpyDtoHAsync_v2(void *, CUdeviceptr, size_t, CUstream); -CUresult cuMemcpyHtoD_v2(CUdeviceptr, const void *, size_t); -CUresult cuMemcpyHtoDAsync_v2(CUdeviceptr, const void *, size_t, CUstream); +CUresult cuMemcpyDtoH(void *, CUdeviceptr, size_t); +CUresult cuMemcpyDtoHAsync(void *, CUdeviceptr, size_t, CUstream); +CUresult cuMemcpyHtoD(CUdeviceptr, const void *, size_t); +CUresult cuMemcpyHtoDAsync(CUdeviceptr, const void *, size_t, 
CUstream); -CUresult cuMemFree_v2(CUdeviceptr); +CUresult cuMemFree(CUdeviceptr); CUresult cuModuleGetFunction(CUfunction *, CUmodule, const char *); -CUresult cuModuleGetGlobal_v2(CUdeviceptr *, size_t *, CUmodule, const char *); +CUresult cuModuleGetGlobal(CUdeviceptr *, size_t *, CUmodule, const char *); CUresult cuModuleUnload(CUmodule); CUresult cuStreamCreate(CUstream *, unsigned); -CUresult cuStreamDestroy_v2(CUstream); +CUresult cuStreamDestroy(CUstream); CUresult cuStreamSynchronize(CUstream); CUresult cuCtxSetCurrent(CUcontext); -CUresult cuDevicePrimaryCtxRelease_v2(CUdevice); +CUresult cuDevicePrimaryCtxRelease(CUdevice); CUresult cuDevicePrimaryCtxGetState(CUdevice, unsigned *, int *); -CUresult cuDevicePrimaryCtxSetFlags_v2(CUdevice, unsigned); +CUresult cuDevicePrimary
[llvm-branch-commits] [openmp] 33e2494 - [libomptarget][amdgpu][nfc] Fix build on centos
Author: Jon Chesterfield Date: 2021-01-12T19:40:03Z New Revision: 33e2494bea653a845cb0502cc6d3cecdf2b47750 URL: https://github.com/llvm/llvm-project/commit/33e2494bea653a845cb0502cc6d3cecdf2b47750 DIFF: https://github.com/llvm/llvm-project/commit/33e2494bea653a845cb0502cc6d3cecdf2b47750.diff LOG: [libomptarget][amdgpu][nfc] Fix build on centos [libomptarget][amdgpu][nfc] Fix build on centos rtl.cpp replaced 224 with a #define from elf.h, but that doesn't work on a centos 7 build machine with an old elf.h Reviewed By: ronlieb Differential Revision: https://reviews.llvm.org/D94528 Added: Modified: openmp/libomptarget/plugins/amdgpu/src/rtl.cpp Removed: diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp index 437846f8b15b8..bd450f9898faf 100644 --- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp @@ -638,7 +638,7 @@ void finiAsyncInfoPtr(__tgt_async_info *async_info_ptr) { } bool elf_machine_id_is_amdgcn(__tgt_device_image *image) { - const uint16_t amdgcnMachineID = EM_AMDGPU; + const uint16_t amdgcnMachineID = 224; // EM_AMDGPU may not be in system elf.h int32_t r = elf_check_machine(image, amdgcnMachineID); if (!r) { DP("Supported machine ID not found\n"); ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [openmp] 84e0b14 - [libomptarget][nvptx] Include omp_data.cu in bitcode deviceRTL
Author: Jon Chesterfield Date: 2021-01-13T03:51:11Z New Revision: 84e0b14a0a419f26d0a2f7389e06aa8e36569808 URL: https://github.com/llvm/llvm-project/commit/84e0b14a0a419f26d0a2f7389e06aa8e36569808 DIFF: https://github.com/llvm/llvm-project/commit/84e0b14a0a419f26d0a2f7389e06aa8e36569808.diff LOG: [libomptarget][nvptx] Include omp_data.cu in bitcode deviceRTL [libomptarget][nvptx] Include omp_data.cu in bitcode deviceRTL Reviewed By: tianshilei1992 Differential Revision: https://reviews.llvm.org/D94565 Added: Modified: openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt Removed: diff --git a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt index 425c674fb11e..ea11c8114166 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt +++ b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt @@ -56,6 +56,7 @@ if(LIBOMPTARGET_DEP_CUDA_FOUND) ${devicertl_common_directory}/src/data_sharing.cu ${devicertl_common_directory}/src/libcall.cu ${devicertl_common_directory}/src/loop.cu + ${devicertl_common_directory}/src/omp_data.cu ${devicertl_common_directory}/src/omptarget.cu ${devicertl_common_directory}/src/parallel.cu ${devicertl_common_directory}/src/reduction.cu @@ -65,8 +66,6 @@ if(LIBOMPTARGET_DEP_CUDA_FOUND) src/target_impl.cu ) - set(omp_data_objects ${devicertl_common_directory}/src/omp_data.cu) - # Build library support for the highest compute capability the system supports # and always build support for sm_35 by default if (${LIBOMPTARGET_DEP_CUDA_ARCH} EQUAL 35) @@ -105,7 +104,7 @@ if(LIBOMPTARGET_DEP_CUDA_FOUND) set(CUDA_SEPARABLE_COMPILATION ON) list(APPEND CUDA_NVCC_FLAGS -I${devicertl_base_directory} -I${devicertl_nvptx_directory}/src) - cuda_add_library(omptarget-nvptx STATIC ${cuda_src_files} ${omp_data_objects} + cuda_add_library(omptarget-nvptx STATIC ${cuda_src_files} OPTIONS ${CUDA_ARCH} ${CUDA_DEBUG} ${MAX_SM_DEFINITION}) # Install device RTL under the lib destination folder. 
___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [openmp] 5d165f0 - [libomptarget][amdgpu] Fix kernel launch tracing to match previous behavior
Author: Jon Chesterfield Date: 2021-01-14T18:13:22Z New Revision: 5d165f0b893d4fc5fb5caeb2b05c566dd26e4d89 URL: https://github.com/llvm/llvm-project/commit/5d165f0b893d4fc5fb5caeb2b05c566dd26e4d89 DIFF: https://github.com/llvm/llvm-project/commit/5d165f0b893d4fc5fb5caeb2b05c566dd26e4d89.diff LOG: [libomptarget][amdgpu] Fix kernel launch tracing to match previous behavior Restore control of kernel launch tracing to be >= 1 as it was before export LIBOMPTARGET_KERNEL_TRACE=1 Reviewed By: JonChesterfield Differential Revision: https://reviews.llvm.org/D94695 Added: Modified: openmp/libomptarget/plugins/amdgpu/src/rtl.cpp Removed: diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp index bd450f9898fa..9453171e1378 100644 --- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp @@ -1762,7 +1762,7 @@ int32_t __tgt_rtl_run_target_team_region_locked( loop_tripcount, // From run_region arg KernelInfo->device_id); - if (print_kernel_trace == 4) + if (print_kernel_trace >= 1) // enum modes are SPMD, GENERIC, NONE 0,1,2 fprintf(stderr, "DEVID:%2d SGN:%1d ConstWGSize:%-4d args:%2d teamsXthrds:(%4dX%4d) " ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [openmp] 6e7094c - [libomptarget][nvptx][nfc] Move target_impl functions out of header
Author: Jon Chesterfield Date: 2021-01-15T00:19:48Z New Revision: 6e7094c14b22a202c15959316033c164d7a84122 URL: https://github.com/llvm/llvm-project/commit/6e7094c14b22a202c15959316033c164d7a84122 DIFF: https://github.com/llvm/llvm-project/commit/6e7094c14b22a202c15959316033c164d7a84122.diff LOG: [libomptarget][nvptx][nfc] Move target_impl functions out of header [libomptarget][nvptx][nfc] Move target_impl functions out of header This removes most of the differences between the two target_impl.h. Also change name mangling from C to C++ for __kmpc_impl_*_lock. Reviewed By: tianshilei1992 Differential Revision: https://reviews.llvm.org/D94728 Added: Modified: openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h Removed: diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu index 50867bc4010a..7e81aba4152d 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu @@ -14,19 +14,135 @@ #include "common/debug.h" #include "common/target_atomic.h" +#include + +DEVICE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) { + asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val)); +} + +DEVICE uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi) { + uint64_t val; + asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo), "r"(hi)); + return val; +} + +DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt() { + __kmpc_impl_lanemask_t res; + asm("mov.u32 %0, %%lanemask_lt;" : "=r"(res)); + return res; +} + +DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt() { + __kmpc_impl_lanemask_t res; + asm("mov.u32 %0, %%lanemask_gt;" : "=r"(res)); + return res; +} + +DEVICE uint32_t __kmpc_impl_smid() { + uint32_t id; + asm("mov.u32 %0, %%smid;" : "=r"(id)); + return id; +} + +DEVICE double __kmpc_impl_get_wtick() { + // Timer precision is 1ns + return 
((double)1E-9); +} + +DEVICE double __kmpc_impl_get_wtime() { + unsigned long long nsecs; + asm("mov.u64 %0, %%globaltimer;" : "=l"(nsecs)); + return (double)nsecs * __kmpc_impl_get_wtick(); +} + +// In Cuda 9.0, __ballot(1) from Cuda 8.0 is replaced with __activemask(). + +DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask() { +#if CUDA_VERSION >= 9000 + return __activemask(); +#else + return __ballot(1); +#endif +} + +// In Cuda 9.0, the *_sync() version takes an extra argument 'mask'. + +DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var, + int32_t SrcLane) { +#if CUDA_VERSION >= 9000 + return __shfl_sync(Mask, Var, SrcLane); +#else + return __shfl(Var, SrcLane); +#endif // CUDA_VERSION +} + +DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask, + int32_t Var, uint32_t Delta, + int32_t Width) { +#if CUDA_VERSION >= 9000 + return __shfl_down_sync(Mask, Var, Delta, Width); +#else + return __shfl_down(Var, Delta, Width); +#endif // CUDA_VERSION +} + +DEVICE void __kmpc_impl_syncthreads() { + // Use original __syncthreads if compiled by nvcc or clang >= 9.0. +#if !defined(__clang__) || __clang_major__ >= 9 + __syncthreads(); +#else + asm volatile("bar.sync %0;" : : "r"(0) : "memory"); +#endif // __clang__ +} + +DEVICE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) { +#if CUDA_VERSION >= 9000 + __syncwarp(Mask); +#else + // In Cuda < 9.0 no need to sync threads in warps. +#endif // CUDA_VERSION +} + +// NVPTX specific kernel initialization +DEVICE void __kmpc_impl_target_init() { /* nvptx needs no extra setup */ +} + +// Barrier until num_threads arrive. +DEVICE void __kmpc_impl_named_sync(uint32_t num_threads) { + // The named barrier for active parallel threads of a team in an L1 parallel + // region to synchronize with each other. 
+ int barrier = 1; + asm volatile("bar.sync %0, %1;" + : + : "r"(barrier), "r"(num_threads) + : "memory"); +} + +DEVICE void __kmpc_impl_threadfence() { __threadfence(); } +DEVICE void __kmpc_impl_threadfence_block() { __threadfence_block(); } +DEVICE void __kmpc_impl_threadfence_system() { __threadfence_system(); } + +// Calls to the NVPTX layer (assuming 1D layout) +DEVICE int GetThreadIdInBlock() { return threadIdx.x; } +DEVICE int GetBlockIdInKernel() { return blockIdx.x; } +DEVICE int GetNumberOfBlocksInKernel() { return gridDim.x; } +DEVICE int GetNumberOfThreadsInBlock() { return blockDim.x; } +DEVICE unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; } +DEVICE unsigned GetLaneId() { return GetThreadIdInBlock() & (WARPSIZE - 1); } + #define __OMP_SPIN 1000 #define UNSET 0u #define SET 1u -EXTERN void __kmpc_impl_init_lock(omp_lock_t *
[llvm-branch-commits] [openmp] 214387c - [libomptarget][nvptx] Reduce calls to cuda header
Author: Jon Chesterfield Date: 2021-01-15T02:16:33Z New Revision: 214387c2c694c92fec713f7ad224f10c1aebc1cf URL: https://github.com/llvm/llvm-project/commit/214387c2c694c92fec713f7ad224f10c1aebc1cf DIFF: https://github.com/llvm/llvm-project/commit/214387c2c694c92fec713f7ad224f10c1aebc1cf.diff LOG: [libomptarget][nvptx] Reduce calls to cuda header [libomptarget][nvptx] Reduce calls to cuda header Remove use of clock_t in favour of a builtin. Drop a preprocessor branch. Reviewed By: jdoerfert Differential Revision: https://reviews.llvm.org/D94731 Added: Modified: openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu Removed: diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu index 7e81aba4152d..b68d3265a758 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu @@ -56,7 +56,6 @@ DEVICE double __kmpc_impl_get_wtime() { } // In Cuda 9.0, __ballot(1) from Cuda 8.0 is replaced with __activemask(). - DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask() { #if CUDA_VERSION >= 9000 return __activemask(); @@ -66,7 +65,6 @@ DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask() { } // In Cuda 9.0, the *_sync() version takes an extra argument 'mask'. - DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var, int32_t SrcLane) { #if CUDA_VERSION >= 9000 @@ -86,14 +84,7 @@ DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask, #endif // CUDA_VERSION } -DEVICE void __kmpc_impl_syncthreads() { - // Use original __syncthreads if compiled by nvcc or clang >= 9.0. 
-#if !defined(__clang__) || __clang_major__ >= 9 - __syncthreads(); -#else - asm volatile("bar.sync %0;" : : "r"(0) : "memory"); -#endif // __clang__ -} +DEVICE void __kmpc_impl_syncthreads() { __syncthreads(); } DEVICE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) { #if CUDA_VERSION >= 9000 @@ -145,11 +136,11 @@ DEVICE void __kmpc_impl_destroy_lock(omp_lock_t *lock) { DEVICE void __kmpc_impl_set_lock(omp_lock_t *lock) { // TODO: not sure spinning is a good idea here.. while (__kmpc_atomic_cas(lock, UNSET, SET) != UNSET) { -clock_t start = clock(); -clock_t now; +int32_t start = __nvvm_read_ptx_sreg_clock(); +int32_t now; for (;;) { - now = clock(); - clock_t cycles = now > start ? now - start : now + (0xffffffff - start); + now = __nvvm_read_ptx_sreg_clock(); + int32_t cycles = now > start ? now - start : now + (0xffffffff - start); if (cycles >= __OMP_SPIN * GetBlockIdInKernel()) { break; } ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [openmp] e069662 - [libomptarget][devicertl] Wrap source in declare target pragmas
Author: Jon Chesterfield Date: 2021-01-20T15:50:41Z New Revision: e069662deb1fa167b3e5fdce4c9949e663df8082 URL: https://github.com/llvm/llvm-project/commit/e069662deb1fa167b3e5fdce4c9949e663df8082 DIFF: https://github.com/llvm/llvm-project/commit/e069662deb1fa167b3e5fdce4c9949e663df8082.diff LOG: [libomptarget][devicertl] Wrap source in declare target pragmas [libomptarget][devicertl] Wrap source in declare target pragmas Factored out of D93135 / D94745. C++ and cuda ignore unknown pragmas so this is a NFC for the current implementation language. Removes noise from patches for building deviceRTL as openmp. Reviewed By: tianshilei1992 Differential Revision: https://reviews.llvm.org/D95048 Added: Modified: openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_locks.hip openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_smid.hip openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip openmp/libomptarget/deviceRTLs/common/src/cancel.cu openmp/libomptarget/deviceRTLs/common/src/critical.cu openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu openmp/libomptarget/deviceRTLs/common/src/libcall.cu openmp/libomptarget/deviceRTLs/common/src/loop.cu openmp/libomptarget/deviceRTLs/common/src/omp_data.cu openmp/libomptarget/deviceRTLs/common/src/omptarget.cu openmp/libomptarget/deviceRTLs/common/src/parallel.cu openmp/libomptarget/deviceRTLs/common/src/reduction.cu openmp/libomptarget/deviceRTLs/common/src/support.cu openmp/libomptarget/deviceRTLs/common/src/sync.cu openmp/libomptarget/deviceRTLs/common/src/task.cu openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu Removed: diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_locks.hip b/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_locks.hip index 4163a14f50bf..f537fb28318c 100644 --- a/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_locks.hip +++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_locks.hip @@ -14,6 +14,7 @@ // a SIMD => wavefront mapping once that is implemented. 
// //===--===// +#pragma omp declare target #include "common/debug.h" @@ -26,3 +27,5 @@ DEVICE void __kmpc_impl_destroy_lock(omp_lock_t *) { warn(); } DEVICE void __kmpc_impl_set_lock(omp_lock_t *) { warn(); } DEVICE void __kmpc_impl_unset_lock(omp_lock_t *) { warn(); } DEVICE int __kmpc_impl_test_lock(omp_lock_t *lock) { warn(); } + +#pragma omp end declare target diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_smid.hip b/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_smid.hip index 74d0d167137f..c85045570de2 100644 --- a/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_smid.hip +++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_smid.hip @@ -5,6 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===--===// +#pragma omp declare target #include "target_impl.h" @@ -59,3 +60,5 @@ DEVICE uint32_t __kmpc_impl_smid() { ENCODE_HWREG(HW_ID_SE_ID_SIZE, HW_ID_SE_ID_OFFSET, HW_ID)); return (se_id << HW_ID_CU_ID_SIZE) + cu_id; } + +#pragma omp end declare target diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip index 3e70beb85d5b..7388a29215cc 100644 --- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip +++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip @@ -9,6 +9,7 @@ // Definitions of target specific functions // //===--===// +#pragma omp declare target #include "target_impl.h" @@ -151,3 +152,5 @@ EXTERN uint32_t __kmpc_amdgcn_gpu_num_threads() { // Stub implementations DEVICE void *__kmpc_impl_malloc(size_t) { return nullptr; } DEVICE void __kmpc_impl_free(void *) {} + +#pragma omp end declare target diff --git a/openmp/libomptarget/deviceRTLs/common/src/cancel.cu b/openmp/libomptarget/deviceRTLs/common/src/cancel.cu index 9540f5647699..0e5cd2b60554 100644 --- a/openmp/libomptarget/deviceRTLs/common/src/cancel.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/cancel.cu @@ -9,6 +9,7 @@ // Interface to be used in the implementation 
of OpenMP cancel. // //===--===// +#pragma omp declare target #include "interface.h" #include "common/debug.h" @@ -26,3 +27,5 @@ EXTERN int32_t __kmpc_cancel(kmp_Ident *loc, int32_t global_tid, // disabled return 0; } + +#pragma omp end declare target diff --git a/openmp/libomptarget/deviceRTLs/common/src/critical.cu b/openmp/libomptarget/deviceRTLs/common/src/critical.cu index ee4b056ddad9..3fd89c50aa46 100644 --- a/openmp/libomptarget/deviceRTLs/common/src/critical.cu +++ b/openmp/libomptarget/deviceRTLs/
[llvm-branch-commits] [openmp] ea616f9 - [libomptarget][devicertl][nfc] Remove some cuda intrinsics, simplify
Author: Jon Chesterfield Date: 2021-01-20T19:45:05Z New Revision: ea616f9026dc6bd9c67ebe2d3226ac91122a7945 URL: https://github.com/llvm/llvm-project/commit/ea616f9026dc6bd9c67ebe2d3226ac91122a7945 DIFF: https://github.com/llvm/llvm-project/commit/ea616f9026dc6bd9c67ebe2d3226ac91122a7945.diff LOG: [libomptarget][devicertl][nfc] Remove some cuda intrinsics, simplify [libomptarget][devicertl][nfc] Remove some cuda intrinsics, simplify Replace __popc, __ffs with clang intrinsics. Move kmpc_impl_min to only file that uses it and replace template with explicitly typed. Reviewed By: jdoerfert Differential Revision: https://reviews.llvm.org/D95060 Added: Modified: openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h openmp/libomptarget/deviceRTLs/common/src/reduction.cu openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h Removed: diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h index d25ea8559c05..b1e9a1a9403a 100644 --- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h +++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h @@ -65,6 +65,10 @@ enum DATA_SHARING_SIZES { DS_Max_Warp_Number = 16, }; +enum : __kmpc_impl_lanemask_t { + __kmpc_impl_all_lanes = ~(__kmpc_impl_lanemask_t)0 +}; + INLINE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) { lo = (uint32_t)(val & UINT64_C(0x00000000FFFFFFFF)); hi = (uint32_t)((val & UINT64_C(0xFFFFFFFF00000000)) >> 32); @@ -74,28 +78,15 @@ INLINE uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi) { return (((uint64_t)hi) << 32) | (uint64_t)lo; } -enum : __kmpc_impl_lanemask_t { - __kmpc_impl_all_lanes = ~(__kmpc_impl_lanemask_t)0 -}; - DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt(); - DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt(); - DEVICE uint32_t __kmpc_impl_smid(); - DEVICE double __kmpc_impl_get_wtick(); - DEVICE double __kmpc_impl_get_wtime(); INLINE uint64_t __kmpc_impl_ffs(uint64_t x) { return __builtin_ffsl(x); } - INLINE
uint64_t __kmpc_impl_popc(uint64_t x) { return __builtin_popcountl(x); } -template INLINE T __kmpc_impl_min(T x, T y) { - return x < y ? x : y; -} - DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask(); DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t, int32_t Var, diff --git a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu b/openmp/libomptarget/deviceRTLs/common/src/reduction.cu index 92b34d77bd8a..3a3c44503f34 100644 --- a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/reduction.cu @@ -184,6 +184,8 @@ INLINE static uint32_t roundToWarpsize(uint32_t s) { return (s & ~(unsigned)(WARPSIZE - 1)); } +INLINE static uint32_t kmpcMin(uint32_t x, uint32_t y) { return x < y ? x : y; } + DEVICE static volatile uint32_t IterCnt = 0; DEVICE static volatile uint32_t Cnt = 0; EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2( @@ -261,14 +263,14 @@ EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2( // by returning 1 in the thread holding the reduction result. // Check if this is the very last team. - unsigned NumRecs = __kmpc_impl_min(NumTeams, uint32_t(num_of_records)); + unsigned NumRecs = kmpcMin(NumTeams, uint32_t(num_of_records)); if (ChunkTeamCount == NumTeams - Bound - 1) { // // Last team processing. // if (ThreadId >= NumRecs) return 0; -NumThreads = roundToWarpsize(__kmpc_impl_min(NumThreads, NumRecs)); +NumThreads = roundToWarpsize(kmpcMin(NumThreads, NumRecs)); if (ThreadId >= NumThreads) return 0; @@ -283,7 +285,7 @@ EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2( // When we have more than [warpsize] number of threads // a block reduction is performed here. 
- uint32_t ActiveThreads = __kmpc_impl_min(NumRecs, NumThreads); + uint32_t ActiveThreads = kmpcMin(NumRecs, NumThreads); if (ActiveThreads > WARPSIZE) { uint32_t WarpsNeeded = (ActiveThreads + WARPSIZE - 1) / WARPSIZE; // Gather all the reduced values from each warp diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h index 8382cd6aaf47..ab9fd1697f14 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h @@ -93,13 +93,8 @@ DEVICE uint32_t __kmpc_impl_smid(); DEVICE double __kmpc_impl_get_wtick(); DEVICE double __kmpc_impl_get_wtime(); -INLINE uint32_t __kmpc_impl_ffs(uint32_t x) { return __ffs(x); } - -INLINE uint32_t __kmpc_impl_popc(uint32_t x) { return __popc(x); } - -template INLINE T __kmpc_impl_min(T x, T y) { - return min(x, y); -} +INLINE uint32_t __kmpc_impl_ffs(uint32_t x) { return __builtin_ffs(x); } +INLINE uint32_t __kmpc_impl_popc(ui
[llvm-branch-commits] [openmp] fbc1dcb - [libomptarget][devicertl][nfc] Simplify target_atomic abstraction
Author: Jon Chesterfield Date: 2021-01-20T19:50:50Z New Revision: fbc1dcb946553a3dc923a63288d9275eea86f918 URL: https://github.com/llvm/llvm-project/commit/fbc1dcb946553a3dc923a63288d9275eea86f918 DIFF: https://github.com/llvm/llvm-project/commit/fbc1dcb946553a3dc923a63288d9275eea86f918.diff LOG: [libomptarget][devicertl][nfc] Simplify target_atomic abstraction [libomptarget][devicertl][nfc] Simplify target_atomic abstraction Atomic functions were implemented as a shim around cuda's atomics, with amdgcn implementing those symbols as a shim around gcc style intrinsics. This patch folds target_atomic.h into target_impl.h and folds amdgcn. Further work is likely to be useful here, either changing to openmp's atomic interface or instantiating the templates on the few used types in order to move them into a cuda/c++ implementation file. This change is mostly to group the remaining uses of the cuda api under nvptx' target_impl abstraction. Reviewed By: jdoerfert Differential Revision: https://reviews.llvm.org/D95062 Added: Modified: openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h openmp/libomptarget/deviceRTLs/common/omptargeti.h openmp/libomptarget/deviceRTLs/common/src/libcall.cu openmp/libomptarget/deviceRTLs/common/src/loop.cu openmp/libomptarget/deviceRTLs/common/src/reduction.cu openmp/libomptarget/deviceRTLs/common/state-queuei.h openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h Removed: openmp/libomptarget/deviceRTLs/amdgcn/src/hip_atomics.h openmp/libomptarget/deviceRTLs/common/target_atomic.h diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt index 8bb395f1126f..8d9abe5d0bbd 100644 --- a/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt +++ b/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt @@ -73,14 +73,12 @@ set(cuda_sources set(h_files 
${CMAKE_CURRENT_SOURCE_DIR}/src/amdgcn_interface.h - ${CMAKE_CURRENT_SOURCE_DIR}/src/hip_atomics.h ${CMAKE_CURRENT_SOURCE_DIR}/src/target_impl.h ${devicertl_base_directory}/common/debug.h ${devicertl_base_directory}/common/device_environment.h ${devicertl_base_directory}/common/omptarget.h ${devicertl_base_directory}/common/omptargeti.h ${devicertl_base_directory}/common/state-queue.h - ${devicertl_base_directory}/common/target_atomic.h ${devicertl_base_directory}/common/state-queuei.h ${devicertl_base_directory}/common/support.h) diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/hip_atomics.h b/openmp/libomptarget/deviceRTLs/amdgcn/src/hip_atomics.h deleted file mode 100644 index 04e80b945070.. --- a/openmp/libomptarget/deviceRTLs/amdgcn/src/hip_atomics.h +++ /dev/null @@ -1,41 +0,0 @@ -//=== hip_atomics.h - Declarations of hip atomic functions C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===--===// - -#ifndef OMPTARGET_AMDGCN_HIP_ATOMICS_H -#define OMPTARGET_AMDGCN_HIP_ATOMICS_H - -#include "target_impl.h" - -namespace { - -template DEVICE T atomicAdd(T *address, T val) { - return __atomic_fetch_add(address, val, __ATOMIC_SEQ_CST); -} - -template DEVICE T atomicMax(T *address, T val) { - return __atomic_fetch_max(address, val, __ATOMIC_SEQ_CST); -} - -template DEVICE T atomicExch(T *address, T val) { - T r; - __atomic_exchange(address, &val, &r, __ATOMIC_SEQ_CST); - return r; -} - -template DEVICE T atomicCAS(T *address, T compare, T val) { - (void)__atomic_compare_exchange(address, &compare, &val, false, - __ATOMIC_SEQ_CST, __ATOMIC_RELAXED); - return compare; -} - -INLINE uint32_t atomicInc(uint32_t *address, uint32_t max) { - return __builtin_amdgcn_atomic_inc32(address, max, __ATOMIC_SEQ_CST, ""); -} - -} // namespace -#endif diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h index b1e9a1a9403a..6e8a651bd886 100644 --- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h +++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h @@ -29,8 +29,6 @@ #define SHARED __attribute__((shared)) #define ALIGN(N) __attribute__((aligned(N))) -#include "hip_atomics.h" - // Kernel options @@ -127,6 +125,31 @@ DEVICE int GetNumberOfThreadsInBlock(); DEVICE unsigned GetWarpId(); DEVICE unsigned GetLaneId(); +// Atomics +temp
[llvm-branch-commits] [openmp] 9b19ecb - [libomptarget][devicertl] Drop templated atomic functions
Author: Jon Chesterfield Date: 2021-01-22T14:48:22Z New Revision: 9b19ecb8f1ec7acbcfd6f0e4f3cbd6902570105d URL: https://github.com/llvm/llvm-project/commit/9b19ecb8f1ec7acbcfd6f0e4f3cbd6902570105d DIFF: https://github.com/llvm/llvm-project/commit/9b19ecb8f1ec7acbcfd6f0e4f3cbd6902570105d.diff LOG: [libomptarget][devicertl] Drop templated atomic functions [libomptarget][devicertl] Drop templated atomic functions The five __kmpc_atomic templates are instantiated a total of seven times. This change replaces the template with explicitly typed functions, which have the same prototype for amdgcn and nvptx, and implements them with the same code presently in use. Rolls in the accepted but not yet landed D95085. The unsigned long long type can be replaced with uint64_t when replacing the cuda function. Until then, clang warns on casting a pointer to one to a pointer to the other. Reviewed By: tianshilei1992 Differential Revision: https://reviews.llvm.org/D95093 Added: Modified: openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h Removed: diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h index 6e8a651bd886..228d3f6e556d 100644 --- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h +++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h @@ -126,29 +126,17 @@ DEVICE unsigned GetWarpId(); DEVICE unsigned GetLaneId(); // Atomics -template INLINE T __kmpc_atomic_add(T *address, T val) { - return __atomic_fetch_add(address, val, __ATOMIC_SEQ_CST); -} - -INLINE uint32_t __kmpc_atomic_inc(uint32_t *address, uint32_t max) { - return __builtin_amdgcn_atomic_inc32(address, max, __ATOMIC_SEQ_CST, ""); -} - -template INLINE T __kmpc_atomic_max(T *address, T val) { - return __atomic_fetch_max(address, val, __ATOMIC_SEQ_CST); -} -
-template INLINE T __kmpc_atomic_exchange(T *address, T val) { - T r; - __atomic_exchange(address, &val, &r, __ATOMIC_SEQ_CST); - return r; -} - -template INLINE T __kmpc_atomic_cas(T *address, T compare, T val) { - (void)__atomic_compare_exchange(address, &compare, &val, false, - __ATOMIC_SEQ_CST, __ATOMIC_RELAXED); - return compare; -} +DEVICE uint32_t __kmpc_atomic_add(uint32_t *, uint32_t); +DEVICE uint32_t __kmpc_atomic_inc(uint32_t *, uint32_t); +DEVICE uint32_t __kmpc_atomic_max(uint32_t *, uint32_t); +DEVICE uint32_t __kmpc_atomic_exchange(uint32_t *, uint32_t); +DEVICE uint32_t __kmpc_atomic_cas(uint32_t *, uint32_t, uint32_t); + +static_assert(sizeof(unsigned long long) == sizeof(uint64_t), ""); +DEVICE unsigned long long __kmpc_atomic_exchange(unsigned long long *, + unsigned long long); +DEVICE unsigned long long __kmpc_atomic_add(unsigned long long *, +unsigned long long); // Locks DEVICE void __kmpc_impl_init_lock(omp_lock_t *lock); diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip index 7388a29215cc..35828cda0e06 100644 --- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip +++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip @@ -132,11 +132,13 @@ DEVICE uint32_t get_workgroup_dim(uint32_t group_id, uint32_t grid_size, } // namespace DEVICE int GetNumberOfBlocksInKernel() { - return get_grid_dim(__builtin_amdgcn_grid_size_x(), __builtin_amdgcn_workgroup_size_x()); + return get_grid_dim(__builtin_amdgcn_grid_size_x(), + __builtin_amdgcn_workgroup_size_x()); } DEVICE int GetNumberOfThreadsInBlock() { - return get_workgroup_dim(__builtin_amdgcn_workgroup_id_x(), __builtin_amdgcn_grid_size_x(), + return get_workgroup_dim(__builtin_amdgcn_workgroup_id_x(), + __builtin_amdgcn_grid_size_x(), __builtin_amdgcn_workgroup_size_x()); } @@ -149,6 +151,40 @@ EXTERN uint32_t __kmpc_amdgcn_gpu_num_threads() { return GetNumberOfThreadsInBlock(); } +// Atomics +DEVICE 
uint32_t __kmpc_atomic_add(uint32_t *Address, uint32_t Val) { + return __atomic_fetch_add(Address, Val, __ATOMIC_SEQ_CST); +} +DEVICE uint32_t __kmpc_atomic_inc(uint32_t *Address, uint32_t Val) { + return __builtin_amdgcn_atomic_inc32(Address, max, __ATOMIC_SEQ_CST, ""); +} +DEVICE uint32_t __kmpc_atomic_max(uint32_t *Address, uint32_t Val) { + return __atomic_fetch_max(Address, Val, __ATOMIC_SEQ_CST); +} + +DEVICE uint32_t __kmpc_atomic_exchange(uint32_t *Address, uint32_t Val) { + uint32_t R; + __atomic_exchange(Address, &Val, &R, __ATOMIC_SEQ_CST); + return R; +} +DEVICE uint32_t __kmpc_atomic_cas(uint32_t *Ad
[llvm-branch-commits] [openmp] 47e95e8 - [libomptarget] Build cuda plugin without cuda installed locally
Author: Jon Chesterfield Date: 2021-01-23T00:15:04Z New Revision: 47e95e87a3e4f738635ff965616d4e2d96bf838a URL: https://github.com/llvm/llvm-project/commit/47e95e87a3e4f738635ff965616d4e2d96bf838a DIFF: https://github.com/llvm/llvm-project/commit/47e95e87a3e4f738635ff965616d4e2d96bf838a.diff LOG: [libomptarget] Build cuda plugin without cuda installed locally [libomptarget] Build cuda plugin without cuda installed locally Compiles a new file, `plugins/cuda/dynamic_cuda/cuda.cpp`, to an object file that exposes the same symbols that the plugin presently uses from libcuda. The object file contains dlopen of libcuda and cached dlsym calls. Also provides a cuda.h containing the subset that is used. This lets the cmake file choose between the system cuda and a dlopen shim, with no changes to rtl.cpp. The corresponding change to amdgpu is postponed until after a refactor of the plugin to reduce the size of the hsa.h stub required Reviewed By: jdoerfert Differential Revision: https://reviews.llvm.org/D95155 Added: openmp/libomptarget/include/dlwrap.h openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.cpp openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h Modified: openmp/libomptarget/plugins/cuda/CMakeLists.txt Removed: diff --git a/openmp/libomptarget/include/dlwrap.h b/openmp/libomptarget/include/dlwrap.h new file mode 100644 index ..9e078b34ca57 --- /dev/null +++ b/openmp/libomptarget/include/dlwrap.h @@ -0,0 +1,277 @@ +//===--- dlwrap.h - Convenience wrapper around dlopen/dlsym -- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===--===// +// +// The openmp plugins depend on extern libraries. 
These can be used via: +// - bitcode file statically linked +// - (relocatable) object file statically linked +// - static library +// - dynamic library, linked at build time +// - dynamic library, loaded at application run time by dlopen +// +// This file factors out most boilerplate for using a dlopened library. +// - Function symbols are generated that are statically linked against +// - The dlopen can be done implicitly when initializing the library +// - dlsym lookups are done once and cached +// - The abstraction is very thin to permit varied uses of the library +// +// Given int foo(char, double, void*);, writing DLWRAP(foo, 3) will expand to: +// int foo(char x0, double x1, void* x2) { +// constexpr size_t index = id(); +// void * dlsymResult = pointer(index); +// return ((int (*)(char, double, void*))dlsymResult)(x0, x1, x2); +// } +// +// Multiple calls to DLWRAP(symbol_name, arity) with bespoke +// initialization code that can use the thin abstraction: +// namespace dlwrap { +// static size_t size(); +// static const char *symbol(size_t); +// static void **pointer(size_t); +// } +// will compile to an object file that only exposes the symbols that the +// dynamic library would do, with the right function types. 
+// +//===--===// + +#ifndef DLWRAP_H_INCLUDED +#define DLWRAP_H_INCLUDED + +#include +#include +#include +#include + +// Where symbol is a function, these expand to some book keeping and an +// implementation of that function +#define DLWRAP(SYMBOL, ARITY) DLWRAP_IMPL(SYMBOL, ARITY) +#define DLWRAP_INTERNAL(SYMBOL, ARITY) DLWRAP_INTERNAL_IMPL(SYMBOL, ARITY) + +// For example, given a prototype: +// int foo(char, double); +// +// DLWRAP(foo, 2) expands to: +// +// namespace dlwrap { +// struct foo_Trait : public dlwrap::trait { +// using T = dlwrap::trait; +// static T::FunctionType get() { +// constexpr size_t Index = getIndex(); +// void *P = *dlwrap::pointer(Index); +// return reinterpret_cast(P); +// } +// }; +// } +// int foo(char x0, double x1) { return dlwrap::foo_Trait::get()(x0, x1); } +// +// DLWRAP_INTERNAL is similar, except the function it expands to is: +// static int dlwrap_foo(char x0, double x1) { ... } +// so that the function pointer call can be wrapped in library-specific code + +// DLWRAP_FINALIZE() expands to definitions of: +#define DLWRAP_FINALIZE() DLWRAP_FINALIZE_IMPL() +namespace dlwrap { +static size_t size(); +static const char *symbol(size_t); // get symbol name in [0, size()) +static void **pointer(size_t); // get pointer to function pointer in [0, size()) +} // namespace dlwrap + +// Implementation details follow. + +namespace dlwrap { + +// Extract return / argument types from address of function symbol +template struct trait; +template struct trait { + constexpr static const size_t nargs = sizeof...(Ts); + typedef R ReturnType; + template struct arg { +typedef type
[llvm-branch-commits] [openmp] 76bfbb7 - [libomptarget][amdgpu] Call into deviceRTL instead of ockl
Author: Jon Chesterfield Date: 2021-01-04T16:48:47Z New Revision: 76bfbb74d38b611f150e8e1a4becc11be95703da URL: https://github.com/llvm/llvm-project/commit/76bfbb74d38b611f150e8e1a4becc11be95703da DIFF: https://github.com/llvm/llvm-project/commit/76bfbb74d38b611f150e8e1a4becc11be95703da.diff LOG: [libomptarget][amdgpu] Call into deviceRTL instead of ockl [libomptarget][amdgpu] Call into deviceRTL instead of ockl Amdgpu codegen presently emits a call into ockl. The same functionality is already present in the deviceRTL. Adds an amdgpu specific entry point to avoid the dependency. This lets simple openmp code (specifically, that which doesn't use libm) run without rocm device libraries installed. Reviewed By: ronlieb Differential Revision: https://reviews.llvm.org/D93356 Added: Modified: clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp clang/test/OpenMP/amdgcn_target_codegen.cpp openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_interface.h openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip Removed: diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp index ccffdf43549f..33d4ab838af1 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp @@ -49,13 +49,12 @@ llvm::Value *CGOpenMPRuntimeAMDGCN::getGPUThreadID(CodeGenFunction &CGF) { llvm::Value *CGOpenMPRuntimeAMDGCN::getGPUNumThreads(CodeGenFunction &CGF) { CGBuilderTy &Bld = CGF.Builder; llvm::Module *M = &CGF.CGM.getModule(); - const char *LocSize = "__ockl_get_local_size"; + const char *LocSize = "__kmpc_amdgcn_gpu_num_threads"; llvm::Function *F = M->getFunction(LocSize); if (!F) { F = llvm::Function::Create( -llvm::FunctionType::get(CGF.Int64Ty, {CGF.Int32Ty}, false), +llvm::FunctionType::get(CGF.Int32Ty, llvm::None, false), llvm::GlobalVariable::ExternalLinkage, LocSize, &CGF.CGM.getModule()); } - return Bld.CreateTrunc( - Bld.CreateCall(F, {Bld.getInt32(0)}, "nvptx_num_threads"), CGF.Int32Ty); + return 
Bld.CreateCall(F, llvm::None, "nvptx_num_threads"); } diff --git a/clang/test/OpenMP/amdgcn_target_codegen.cpp b/clang/test/OpenMP/amdgcn_target_codegen.cpp index 85ef69942a50..416ed06083b0 100644 --- a/clang/test/OpenMP/amdgcn_target_codegen.cpp +++ b/clang/test/OpenMP/amdgcn_target_codegen.cpp @@ -13,9 +13,8 @@ int test_amdgcn_target_tid_threads() { int arr[N]; -// CHECK: [[NUM_THREADS:%.+]] = call i64 @__ockl_get_local_size(i32 0) -// CHECK-NEXT: [[VAR:%.+]] = trunc i64 [[NUM_THREADS]] to i32 -// CHECK-NEXT: sub nuw i32 [[VAR]], 64 +// CHECK: [[NUM_THREADS:%.+]] = call i32 @__kmpc_amdgcn_gpu_num_threads() +// CHECK: sub nuw i32 [[NUM_THREADS]], 64 // CHECK: call i32 @llvm.amdgcn.workitem.id.x() #pragma omp target for (int i = 0; i < N; i++) { @@ -30,9 +29,8 @@ int test_amdgcn_target_tid_threads_simd() { int arr[N]; -// CHECK: [[NUM_THREADS:%.+]] = call i64 @__ockl_get_local_size(i32 0) -// CHECK-NEXT: [[VAR:%.+]] = trunc i64 [[NUM_THREADS]] to i32 -// CHECK-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[VAR]], i16 0) +// CHECK: [[NUM_THREADS:%.+]] = call i32 @__kmpc_amdgcn_gpu_num_threads() +// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[NUM_THREADS]], i16 0) #pragma omp target simd for (int i = 0; i < N; i++) { arr[i] = 1; diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_interface.h b/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_interface.h index f7c75c09362a..80409d611f6f 100644 --- a/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_interface.h +++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_interface.h @@ -15,4 +15,6 @@ typedef uint64_t __kmpc_impl_lanemask_t; typedef uint32_t omp_lock_t; /* arbitrary type of the right length */ +EXTERN uint32_t __kmpc_amdgcn_gpu_num_threads(); + #endif diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip index 9fbdc67b56ab..3e70beb85d5b 100644 --- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip +++ 
b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip @@ -144,6 +144,10 @@ DEVICE unsigned GetLaneId() { return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u)); } +EXTERN uint32_t __kmpc_amdgcn_gpu_num_threads() { + return GetNumberOfThreadsInBlock(); +} + // Stub implementations -DEVICE void *__kmpc_impl_malloc(size_t ) { return nullptr } +DEVICE void *__kmpc_impl_malloc(size_t) { return nullptr; } DEVICE void __kmpc_impl_free(void *) {} ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] 4b2e7d0 - [amdgpu] Default to code object v3
Author: Jon Chesterfield Date: 2020-12-15T01:11:09Z New Revision: 4b2e7d0215021d0d1df1a6319884b21d33936265 URL: https://github.com/llvm/llvm-project/commit/4b2e7d0215021d0d1df1a6319884b21d33936265 DIFF: https://github.com/llvm/llvm-project/commit/4b2e7d0215021d0d1df1a6319884b21d33936265.diff LOG: [amdgpu] Default to code object v3 [amdgpu] Default to code object v3 v4 is not yet readily available, and doesn't appear to be implemented in the back end Reviewed By: t-tye Differential Revision: https://reviews.llvm.org/D93258 Added: Modified: clang/include/clang/Driver/Options.td clang/lib/Driver/ToolChains/CommonArgs.cpp llvm/docs/AMDGPUUsage.rst Removed: diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 67d41c3711f5..87c786065fa9 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -2811,7 +2811,7 @@ def mexec_model_EQ : Joined<["-"], "mexec-model=">, Group; def mcode_object_version_EQ : Joined<["-"], "mcode-object-version=">, Group, - HelpText<"Specify code object ABI version. Defaults to 4. (AMDGPU only)">, + HelpText<"Specify code object ABI version. Defaults to 3. (AMDGPU only)">, MetaVarName<"">, Values<"2,3,4">; def mcode_object_v3_legacy : Flag<["-"], "mcode-object-v3">, Group, diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index 72bedc16846d..04d0e0771f70 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -1549,7 +1549,7 @@ unsigned tools::getOrCheckAMDGPUCodeObjectVersion( const Driver &D, const llvm::opt::ArgList &Args, bool Diagnose) { const unsigned MinCodeObjVer = 2; const unsigned MaxCodeObjVer = 4; - unsigned CodeObjVer = 4; + unsigned CodeObjVer = 3; // Emit warnings for legacy options even if they are overridden. 
if (Diagnose) { diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index e5d081a37500..95fb164310cc 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -911,12 +911,12 @@ The AMDGPU backend uses the following ELF header: * ``ELFABIVERSION_AMDGPU_HSA_V3`` is used to specify the version of AMD HSA runtime ABI for code object V3. Specify using the Clang option -``-mcode-object-version=3``. +``-mcode-object-version=3``. This is the default code object +version if not specified. * ``ELFABIVERSION_AMDGPU_HSA_V4`` is used to specify the version of AMD HSA runtime ABI for code object V4. Specify using the Clang option -``-mcode-object-version=4``. This is the default code object -version if not specified. +``-mcode-object-version=4``. * ``ELFABIVERSION_AMDGPU_PAL`` is used to specify the version of AMD PAL runtime ABI. @@ -2871,10 +2871,6 @@ non-AMD key names should be prefixed by "*vendor-name*.". Code Object V3 Metadata +++ -.. warning:: - Code object V3 is not the default code object version emitted by this version - of LLVM. - Code object V3 to V4 metadata is specified by the ``NT_AMDGPU_METADATA`` note record (see :ref:`amdgpu-note-records-v3-v4`). @@ -3279,6 +3275,10 @@ same *vendor-name*. Code Object V4 Metadata +++ +.. warning:: + Code object V4 is not the default code object version emitted by this version + of LLVM. + Code object V4 metadata is the same as :ref:`amdgpu-amdhsa-code-object-metadata-v3` with the changes and additions defined in table :ref:`amdgpu-amdhsa-code-object-metadata-map-table-v3`. ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [openmp] b607837 - [libomptarget][nfc] Replace static const with enum
Author: Jon Chesterfield Date: 2020-12-16T16:40:37Z New Revision: b607837c75d04cc007dcf855983dfa3b69f63d73 URL: https://github.com/llvm/llvm-project/commit/b607837c75d04cc007dcf855983dfa3b69f63d73 DIFF: https://github.com/llvm/llvm-project/commit/b607837c75d04cc007dcf855983dfa3b69f63d73.diff LOG: [libomptarget][nfc] Replace static const with enum [libomptarget][nfc] Replace static const with enum Semantically identical. Replaces 0xff... with ~0 to spare counting the f. Has the advantage that the compiler doesn't need to prove the 4/8 byte value dead before discarding it, and sidesteps the compilation question associated with what static means for a single source language. Reviewed By: jdoerfert Differential Revision: https://reviews.llvm.org/D93328 Added: Modified: openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h Removed: diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h index 34794587e0fe..d25ea8559c05 100644 --- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h +++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h @@ -74,8 +74,9 @@ INLINE uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi) { return (((uint64_t)hi) << 32) | (uint64_t)lo; } -static const __kmpc_impl_lanemask_t __kmpc_impl_all_lanes = -UINT64_C(0xffffffffffffffff); +enum : __kmpc_impl_lanemask_t { + __kmpc_impl_all_lanes = ~(__kmpc_impl_lanemask_t)0 +}; DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt(); diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h index 46ce751c44c4..411e1676b7c7 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h @@ -91,8 +91,9 @@ INLINE uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi) { return val; } -static const __kmpc_impl_lanemask_t __kmpc_impl_all_lanes = -UINT32_C(0xffffffff); +enum
: __kmpc_impl_lanemask_t { + __kmpc_impl_all_lanes = ~(__kmpc_impl_lanemask_t)0 +}; INLINE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt() { __kmpc_impl_lanemask_t res; ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] c0619d3 - [NFC] Use regex for code object version in hip tests
Author: Jon Chesterfield Date: 2020-12-16T17:00:19Z New Revision: c0619d3b21cd420b9faf15f14db0816787c44ded URL: https://github.com/llvm/llvm-project/commit/c0619d3b21cd420b9faf15f14db0816787c44ded DIFF: https://github.com/llvm/llvm-project/commit/c0619d3b21cd420b9faf15f14db0816787c44ded.diff LOG: [NFC] Use regex for code object version in hip tests [NFC] Use regex for code object version in hip tests Extracted from D93258. Makes tests robust to changes in default code object version. Reviewed By: t-tye Differential Revision: https://reviews.llvm.org/D93398 Added: Modified: clang/test/Driver/hip-autolink.hip clang/test/Driver/hip-code-object-version.hip clang/test/Driver/hip-device-compile.hip clang/test/Driver/hip-host-cpu-features.hip clang/test/Driver/hip-rdc-device-only.hip clang/test/Driver/hip-target-id.hip clang/test/Driver/hip-toolchain-mllvm.hip clang/test/Driver/hip-toolchain-no-rdc.hip clang/test/Driver/hip-toolchain-opt.hip clang/test/Driver/hip-toolchain-rdc-separate.hip clang/test/Driver/hip-toolchain-rdc-static-lib.hip clang/test/Driver/hip-toolchain-rdc.hip Removed: diff --git a/clang/test/Driver/hip-autolink.hip b/clang/test/Driver/hip-autolink.hip index 073c6c4d244a..5f9311d7ba73 100644 --- a/clang/test/Driver/hip-autolink.hip +++ b/clang/test/Driver/hip-autolink.hip @@ -7,7 +7,7 @@ // RUN: %clang --target=i386-pc-windows-msvc --cuda-gpu-arch=gfx906 -nogpulib \ // RUN: --cuda-host-only %s -### 2>&1 | FileCheck --check-prefix=HOST %s -// DEV: "-cc1" "-mllvm" "--amdhsa-code-object-version=4" "-triple" "amdgcn-amd-amdhsa" +// DEV: "-cc1" "-mllvm" "--amdhsa-code-object-version={{[0-9]+}}" "-triple" "amdgcn-amd-amdhsa" // DEV-SAME: "-fno-autolink" // HOST: "-cc1" "-triple" "i386-pc-windows-msvc{{.*}}" diff --git a/clang/test/Driver/hip-code-object-version.hip b/clang/test/Driver/hip-code-object-version.hip index 26ad6f8710cc..51d9004b0cbf 100644 --- a/clang/test/Driver/hip-code-object-version.hip +++ b/clang/test/Driver/hip-code-object-version.hip @@ 
-44,12 +44,17 @@ // RUN: --offload-arch=gfx906 -nogpulib \ // RUN: %s 2>&1 | FileCheck -check-prefix=V4 %s +// V4: "-mllvm" "--amdhsa-code-object-version=4" +// V4: "-targets=host-x86_64-unknown-linux,hip-amdgcn-amd-amdhsa--gfx906" + +// Check bundle ID for code object version default + // RUN: %clang -### -target x86_64-linux-gnu \ // RUN: --offload-arch=gfx906 -nogpulib \ -// RUN: %s 2>&1 | FileCheck -check-prefix=V4 %s +// RUN: %s 2>&1 | FileCheck -check-prefix=VD %s -// V4: "-mllvm" "--amdhsa-code-object-version=4" -// V4: "-targets=host-x86_64-unknown-linux,hip-amdgcn-amd-amdhsa--gfx906" +// VD: "-mllvm" "--amdhsa-code-object-version=4" +// VD: "-targets=host-x86_64-unknown-linux,hip-amdgcn-amd-amdhsa--gfx906" // Check invalid code object version option. diff --git a/clang/test/Driver/hip-device-compile.hip b/clang/test/Driver/hip-device-compile.hip index 5fbcbc97bd80..c460ff7e8c67 100644 --- a/clang/test/Driver/hip-device-compile.hip +++ b/clang/test/Driver/hip-device-compile.hip @@ -26,7 +26,7 @@ // RUN: %S/Inputs/hip_multiple_inputs/a.cu \ // RUN: 2>&1 | FileCheck -check-prefixes=CHECK,ASM %s -// CHECK: {{".*clang.*"}} "-cc1" "-mllvm" "--amdhsa-code-object-version=4" "-triple" "amdgcn-amd-amdhsa" +// CHECK: {{".*clang.*"}} "-cc1" "-mllvm" "--amdhsa-code-object-version={{[0-9]+}}" "-triple" "amdgcn-amd-amdhsa" // CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu" // BC-SAME: "-emit-llvm-bc" // LL-SAME: "-emit-llvm" diff --git a/clang/test/Driver/hip-host-cpu-features.hip b/clang/test/Driver/hip-host-cpu-features.hip index 235f0f1f22c2..8addfb11dc0b 100644 --- a/clang/test/Driver/hip-host-cpu-features.hip +++ b/clang/test/Driver/hip-host-cpu-features.hip @@ -6,14 +6,14 @@ // RUN: %clang -### -c -target x86_64-linux-gnu -msse3 --cuda-gpu-arch=gfx803 -nogpulib %s 2>&1 | FileCheck %s -check-prefix=HOSTSSE3 // RUN: %clang -### -c -target x86_64-linux-gnu --gpu-use-aux-triple-only -march=znver2 --cuda-gpu-arch=gfx803 -nogpulib %s 2>&1 | FileCheck %s 
-check-prefix=NOHOSTCPU -// HOSTCPU: "-cc1" "-mllvm" "--amdhsa-code-object-version=4" "-triple" "amdgcn-amd-amdhsa" +// HOSTCPU: "-cc1" "-mllvm" "--amdhsa-code-object-version={{[0-9]+}}" "-triple" "amdgcn-amd-amdhsa" // HOSTCPU-SAME: "-aux-triple" "x86_64-unknown-linux-gnu" // HOSTCPU-SAME: "-aux-target-cpu" "znver2" -// HOSTSSE3: "-cc1" "-mllvm" "--amdhsa-code-object-version=4" "-triple" "amdgcn-amd-amdhsa" +// HOSTSSE3: "-cc1" "-mllvm" "--amdhsa-code-object-version={{[0-9]+}}" "-triple" "amdgcn-amd-amdhsa" // HOSTSSE3-SAME: "-aux-triple" "x86_64-unknown-linux-gnu" // HOSTSSE3-SAME: "-aux-target-feature" "+sse3" -// NOHOSTCPU: "-cc1" "-mllvm" "--amdhsa-code-object-version=4" "-triple" "amdgcn-amd-amdhsa" +// NOHOSTCPU: "-cc1" "-mllvm" "--amdhsa-code-object-version={{[0-9
[llvm-branch-commits] [clang] daf39e3 - [amdgpu] Default to code object v3
Author: Jon Chesterfield Date: 2020-12-17T16:09:33Z New Revision: daf39e3f2dba18bd39cd89a1c91bae126a31d4fe URL: https://github.com/llvm/llvm-project/commit/daf39e3f2dba18bd39cd89a1c91bae126a31d4fe DIFF: https://github.com/llvm/llvm-project/commit/daf39e3f2dba18bd39cd89a1c91bae126a31d4fe.diff LOG: [amdgpu] Default to code object v3 [amdgpu] Default to code object v3 v4 is not yet readily available, and doesn't appear to be implemented in the back end Reviewed By: t-tye, yaxunl Differential Revision: https://reviews.llvm.org/D93258 Added: Modified: clang/include/clang/Driver/Options.td clang/lib/Driver/ToolChains/CommonArgs.cpp clang/test/Driver/hip-code-object-version.hip llvm/docs/AMDGPUUsage.rst Removed: diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index f384e0d993c2..07f15add28ec 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -2909,7 +2909,7 @@ def mexec_model_EQ : Joined<["-"], "mexec-model=">, Group; def mcode_object_version_EQ : Joined<["-"], "mcode-object-version=">, Group, - HelpText<"Specify code object ABI version. Defaults to 4. (AMDGPU only)">, + HelpText<"Specify code object ABI version. Defaults to 3. (AMDGPU only)">, MetaVarName<"">, Values<"2,3,4">; def mcode_object_v3_legacy : Flag<["-"], "mcode-object-v3">, Group, diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index 72bedc16846d..04d0e0771f70 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -1549,7 +1549,7 @@ unsigned tools::getOrCheckAMDGPUCodeObjectVersion( const Driver &D, const llvm::opt::ArgList &Args, bool Diagnose) { const unsigned MinCodeObjVer = 2; const unsigned MaxCodeObjVer = 4; - unsigned CodeObjVer = 4; + unsigned CodeObjVer = 3; // Emit warnings for legacy options even if they are overridden. 
if (Diagnose) { diff --git a/clang/test/Driver/hip-code-object-version.hip b/clang/test/Driver/hip-code-object-version.hip index 51d9004b0cbf..6e4e96688593 100644 --- a/clang/test/Driver/hip-code-object-version.hip +++ b/clang/test/Driver/hip-code-object-version.hip @@ -53,7 +53,7 @@ // RUN: --offload-arch=gfx906 -nogpulib \ // RUN: %s 2>&1 | FileCheck -check-prefix=VD %s -// VD: "-mllvm" "--amdhsa-code-object-version=4" +// VD: "-mllvm" "--amdhsa-code-object-version=3" // VD: "-targets=host-x86_64-unknown-linux,hip-amdgcn-amd-amdhsa--gfx906" // Check invalid code object version option. diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index 6d3fa7021a7a..c8dda47352ab 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -911,12 +911,12 @@ The AMDGPU backend uses the following ELF header: * ``ELFABIVERSION_AMDGPU_HSA_V3`` is used to specify the version of AMD HSA runtime ABI for code object V3. Specify using the Clang option -``-mcode-object-version=3``. +``-mcode-object-version=3``. This is the default code object +version if not specified. * ``ELFABIVERSION_AMDGPU_HSA_V4`` is used to specify the version of AMD HSA runtime ABI for code object V4. Specify using the Clang option -``-mcode-object-version=4``. This is the default code object -version if not specified. +``-mcode-object-version=4``. * ``ELFABIVERSION_AMDGPU_PAL`` is used to specify the version of AMD PAL runtime ABI. @@ -2871,10 +2871,6 @@ non-AMD key names should be prefixed by "*vendor-name*.". Code Object V3 Metadata +++ -.. warning:: - Code object V3 is not the default code object version emitted by this version - of LLVM. - Code object V3 to V4 metadata is specified by the ``NT_AMDGPU_METADATA`` note record (see :ref:`amdgpu-note-records-v3-v4`). @@ -3279,6 +3275,10 @@ same *vendor-name*. Code Object V4 Metadata +++ +.. warning:: + Code object V4 is not the default code object version emitted by this version + of LLVM. 
+ Code object V4 metadata is the same as :ref:`amdgpu-amdhsa-code-object-metadata-v3` with the changes and additions defined in table :ref:`amdgpu-amdhsa-code-object-metadata-map-table-v3`. ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [openmp] 89a0f48 - [libomptarget][cuda] Detect missing symbols in plugin at build time
Author: Jon Chesterfield Date: 2020-11-27T15:39:41Z New Revision: 89a0f48c58f82262c7ce2b9ca51ffad0ffc559ea URL: https://github.com/llvm/llvm-project/commit/89a0f48c58f82262c7ce2b9ca51ffad0ffc559ea DIFF: https://github.com/llvm/llvm-project/commit/89a0f48c58f82262c7ce2b9ca51ffad0ffc559ea.diff LOG: [libomptarget][cuda] Detect missing symbols in plugin at build time [libomptarget][cuda] Detect missing symbols in plugin at build time Passes -z,defs to the linker. Error on unresolved symbol references. Otherwise, those unresolved symbols present as target code running on the host as the plugin fails to load. This is significantly harder to debug than a link time error. Flag matches that passed by amdgcn and ve plugins. Reviewed By: tianshilei1992 Differential Revision: https://reviews.llvm.org/D92143 Added: Modified: openmp/libomptarget/plugins/cuda/CMakeLists.txt Removed: diff --git a/openmp/libomptarget/plugins/cuda/CMakeLists.txt b/openmp/libomptarget/plugins/cuda/CMakeLists.txt index 54bcdf26e9e6..e0299b1f3270 100644 --- a/openmp/libomptarget/plugins/cuda/CMakeLists.txt +++ b/openmp/libomptarget/plugins/cuda/CMakeLists.txt @@ -39,7 +39,8 @@ install(TARGETS omptarget.rtl.cuda LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR} target_link_libraries(omptarget.rtl.cuda ${LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES} ${LIBOMPTARGET_DEP_LIBELF_LIBRARIES} - "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports") + "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports" + "-Wl,-z,defs") # Report to the parent scope that we are building a plugin for CUDA. set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS} nvptx64-nvidia-cuda" PARENT_SCOPE) ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [openmp] ae9d96a - [libomptarget][amdgpu] Address compiler warnings, drive by fixes
Author: Jon Chesterfield Date: 2020-12-03T11:09:12Z New Revision: ae9d96a656a17fa782ccaa9ba10d4570f497e855 URL: https://github.com/llvm/llvm-project/commit/ae9d96a656a17fa782ccaa9ba10d4570f497e855 DIFF: https://github.com/llvm/llvm-project/commit/ae9d96a656a17fa782ccaa9ba10d4570f497e855.diff LOG: [libomptarget][amdgpu] Address compiler warnings, drive by fixes [libomptarget][amdgpu] Address compiler warnings, drive by fixes Initialize some variables, remove unused ones. Changes the debug printing condition to align with the aomp test suite. Differential Revision: https://reviews.llvm.org/D92559 Added: Modified: openmp/libomptarget/plugins/amdgpu/src/rtl.cpp Removed: diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp index 477439d19b50..f22b4697f30b 100644 --- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp @@ -184,7 +184,7 @@ struct KernelTy { int8_t ExecutionMode; int16_t ConstWGSize; int32_t device_id; - void *CallStackAddr; + void *CallStackAddr = nullptr; const char *Name; KernelTy(int8_t _ExecutionMode, int16_t _ConstWGSize, int32_t _device_id, @@ -322,7 +322,8 @@ class RTLDeviceInfoTy { std::vector, uint64_t>> deviceStateStore; - static const int HardTeamLimit = 1 << 20; // 1 Meg + static const unsigned HardTeamLimit = + (1 << 16) - 1; // 64K needed to fit in uint16 static const int DefaultNumTeams = 128; static const int Max_Teams = llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Max_Teams]; @@ -648,7 +649,7 @@ int32_t __tgt_rtl_init_device(int device_id) { DeviceInfo.ComputeUnits[device_id] = compute_units; DP("Using %d compute unis per grid\n", DeviceInfo.ComputeUnits[device_id]); } - if (print_kernel_trace > 1) + if (print_kernel_trace == 4) fprintf(stderr, "Device#%-2d CU's: %2d\n", device_id, DeviceInfo.ComputeUnits[device_id]); @@ -926,6 +927,27 @@ __tgt_target_table *__tgt_rtl_load_binary(int32_t device_id, __tgt_target_table 
*__tgt_rtl_load_binary_locked(int32_t device_id, __tgt_device_image *image) { + // This function loads the device image onto gpu[device_id] and does other + // per-image initialization work. Specifically: + // + // - Initialize an omptarget_device_environmentTy instance embedded in the + // image at the symbol "omptarget_device_environment" + // Fields debug_level, device_num, num_devices. Used by the deviceRTL. + // + // - Allocate a large array per-gpu (could be moved to init_device) + // - Read a uint64_t at symbol omptarget_nvptx_device_State_size + // - Allocate at least that many bytes of gpu memory + // - Zero initialize it + // - Write the pointer to the symbol omptarget_nvptx_device_State + // + // - Pulls some per-kernel information together from various sources and + // records it in the KernelsList for quicker access later + // + // The initialization can be done before or after loading the image onto the + // gpu. This function presently does a mixture. Using the hsa api to get/set + // the information is simpler to implement, in exchange for more complicated + // runtime behaviour. E.g. launching a kernel or using dma to get eight bytes + // back from the gpu vs a hashtable lookup on the host. 
const size_t img_size = (char *)image->ImageEnd - (char *)image->ImageStart; @@ -962,7 +984,7 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id, if (si.size != sizeof(host_device_env)) { return ATMI_STATUS_ERROR; } -DP("Setting global device environment %lu bytes\n", si.size); +DP("Setting global device environment %u bytes\n", si.size); uint64_t offset = (char *)si.addr - (char *)image->ImageStart; void *pos = (char *)data + offset; memcpy(pos, &host_device_env, sizeof(host_device_env)); @@ -1145,7 +1167,6 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id, uint16_t TSize; uint16_t WG_Size; uint8_t Mode; - uint8_t HostServices; }; struct KernDescValType KernDescVal; std::string KernDescNameStr(e->name); @@ -1154,7 +1175,7 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id, void *KernDescPtr; uint32_t KernDescSize; -void *CallStackAddr; +void *CallStackAddr = nullptr; err = interop_get_symbol_info((char *)image->ImageStart, img_size, KernDescName, &KernDescPtr, &KernDescSize); @@ -1176,7 +1197,6 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id, DP("KernDesc: TSize: %d\n", KernDescVal.TSize); DP("KernDesc: WG_Size: %d\n", KernDescVal.WG_Size); DP("KernDesc: Mode: %d\n", KernDescVal.Mode); - DP("KernDesc: HostServices:
[llvm-branch-commits] [openmp] f628eef - [libomptarget][amdgpu] Fix latent race in load binary
Author: Jon Chesterfield Date: 2020-12-04T16:29:09Z New Revision: f628eef98acd24f8eb6a52d67ee887bb18f04bca URL: https://github.com/llvm/llvm-project/commit/f628eef98acd24f8eb6a52d67ee887bb18f04bca DIFF: https://github.com/llvm/llvm-project/commit/f628eef98acd24f8eb6a52d67ee887bb18f04bca.diff LOG: [libomptarget][amdgpu] Fix latent race in load binary Added: Modified: openmp/libomptarget/plugins/amdgpu/src/rtl.cpp Removed: diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp index f22b4697f30b..ea8770e4543a 100644 --- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp @@ -925,6 +925,26 @@ __tgt_target_table *__tgt_rtl_load_binary(int32_t device_id, return res; } +static atmi_status_t atmi_calloc(void **ret_ptr, size_t size, + atmi_mem_place_t place) { + uint64_t rounded = 4 * ((size + 3) / 4); + void *ptr; + atmi_status_t err = atmi_malloc(&ptr, rounded, place); + if (err != ATMI_STATUS_SUCCESS) { +return err; + } + + hsa_status_t rc = hsa_amd_memory_fill(ptr, 0, rounded / 4); + if (rc != HSA_STATUS_SUCCESS) { +fprintf(stderr, "zero fill device_state failed with %u\n", rc); +atmi_free(ptr); +return ATMI_STATUS_ERROR; + } + + *ret_ptr = ptr; + return ATMI_STATUS_SUCCESS; +} + __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id, __tgt_device_image *image) { // This function loads the device image onto gpu[device_id] and does other @@ -1024,7 +1044,7 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id, assert(dss.second == 0); void *ptr = NULL; atmi_status_t err = - atmi_malloc(&ptr, device_State_bytes, get_gpu_mem_place(device_id)); + atmi_calloc(&ptr, device_State_bytes, get_gpu_mem_place(device_id)); if (err != ATMI_STATUS_SUCCESS) { fprintf(stderr, "Failed to allocate device_state array\n"); return NULL; @@ -1062,13 +1082,6 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id, fprintf(stderr, "memcpy install of 
state_ptr failed\n"); return NULL; } - -assert((device_State_bytes & 0x3) == 0); // known >= 4 byte aligned -hsa_status_t rc = hsa_amd_memory_fill(ptr, 0, device_State_bytes / 4); -if (rc != HSA_STATUS_SUCCESS) { - fprintf(stderr, "zero fill device_state failed with %u\n", rc); - return NULL; -} } // TODO: Check with Guansong to understand the below comment more thoroughly. ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [openmp] e1b8e8a - [libomptarget][amdgpu] Skip device_State allocation when using bss global
Author: Jon Chesterfield Date: 2020-12-06T12:13:56Z New Revision: e1b8e8a1f4c35c8596956d56ffc9f1d91b64f780 URL: https://github.com/llvm/llvm-project/commit/e1b8e8a1f4c35c8596956d56ffc9f1d91b64f780 DIFF: https://github.com/llvm/llvm-project/commit/e1b8e8a1f4c35c8596956d56ffc9f1d91b64f780.diff LOG: [libomptarget][amdgpu] Skip device_State allocation when using bss global Added: Modified: openmp/libomptarget/plugins/amdgpu/src/rtl.cpp Removed: diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp index ea8770e4543a..e688ef7f41ec 100644 --- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp @@ -1033,54 +1033,64 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id, DP("ATMI module successfully loaded!\n"); - // Zero the pseudo-bss variable by calling into hsa - // Do this post-load to handle got - uint64_t device_State_bytes = - get_device_State_bytes((char *)image->ImageStart, img_size); - auto &dss = DeviceInfo.deviceStateStore[device_id]; - if (device_State_bytes != 0) { - -if (dss.first.get() == nullptr) { - assert(dss.second == 0); - void *ptr = NULL; - atmi_status_t err = - atmi_calloc(&ptr, device_State_bytes, get_gpu_mem_place(device_id)); - if (err != ATMI_STATUS_SUCCESS) { -fprintf(stderr, "Failed to allocate device_state array\n"); -return NULL; - } - dss = {std::unique_ptr{ptr}, - device_State_bytes}; -} - -void *ptr = dss.first.get(); -if (device_State_bytes != dss.second) { - fprintf(stderr, "Inconsistent sizes of device_State unsupported\n"); - exit(1); -} + { +// the device_State array is either large value in bss or a void* that +// needs to be assigned to a pointer to an array of size device_state_bytes void *state_ptr; uint32_t state_ptr_size; -err = atmi_interop_hsa_get_symbol_info(get_gpu_mem_place(device_id), - "omptarget_nvptx_device_State", - &state_ptr, &state_ptr_size); +atmi_status_t err = atmi_interop_hsa_get_symbol_info( 
+get_gpu_mem_place(device_id), "omptarget_nvptx_device_State", +&state_ptr, &state_ptr_size); if (err != ATMI_STATUS_SUCCESS) { - fprintf(stderr, "failed to find device_state ptr\n"); + fprintf(stderr, "failed to find device_state symbol\n"); return NULL; } -if (state_ptr_size != sizeof(void *)) { + +if (state_ptr_size < sizeof(void *)) { fprintf(stderr, "unexpected size of state_ptr %u != %zu\n", state_ptr_size, sizeof(void *)); return NULL; } -// write ptr to device memory so it can be used by later kernels -err = DeviceInfo.freesignalpool_memcpy_h2d(state_ptr, &ptr, sizeof(void *), - device_id); -if (err != ATMI_STATUS_SUCCESS) { - fprintf(stderr, "memcpy install of state_ptr failed\n"); - return NULL; +// if it's larger than a void*, assume it's a bss array and no further +// initialization is required. Only try to set up a pointer for +// sizeof(void*) +if (state_ptr_size == sizeof(void *)) { + uint64_t device_State_bytes = + get_device_State_bytes((char *)image->ImageStart, img_size); + if (device_State_bytes == 0) { +return NULL; + } + + auto &dss = DeviceInfo.deviceStateStore[device_id]; + if (dss.first.get() == nullptr) { +assert(dss.second == 0); +void *ptr = NULL; +atmi_status_t err = +atmi_calloc(&ptr, device_State_bytes, get_gpu_mem_place(device_id)); +if (err != ATMI_STATUS_SUCCESS) { + fprintf(stderr, "Failed to allocate device_state array\n"); + return NULL; +} +dss = {std::unique_ptr{ptr}, + device_State_bytes}; + } + + void *ptr = dss.first.get(); + if (device_State_bytes != dss.second) { +fprintf(stderr, "Inconsistent sizes of device_State unsupported\n"); +exit(1); + } + + // write ptr to device memory so it can be used by later kernels + err = DeviceInfo.freesignalpool_memcpy_h2d(state_ptr, &ptr, + sizeof(void *), device_id); + if (err != ATMI_STATUS_SUCCESS) { +fprintf(stderr, "memcpy install of state_ptr failed\n"); +return NULL; + } } } ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org 
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [openmp] 71f4693 - [libomptarget][amdgpu] Add plumbing to call into hostrpc lib, if linked
Author: Jon Chesterfield Date: 2020-12-07T15:24:01Z New Revision: 71f469302079baeb552b29c6959ac884da101102 URL: https://github.com/llvm/llvm-project/commit/71f469302079baeb552b29c6959ac884da101102 DIFF: https://github.com/llvm/llvm-project/commit/71f469302079baeb552b29c6959ac884da101102.diff LOG: [libomptarget][amdgpu] Add plumbing to call into hostrpc lib, if linked Added: Modified: openmp/libomptarget/plugins/amdgpu/src/rtl.cpp Removed: diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp index e688ef7f41ec..252abca08944 100644 --- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp @@ -45,6 +45,29 @@ #endif #define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL" +// hostrpc interface, FIXME: consider moving to its own include these are +// statically linked into amdgpu/plugin if present from hostrpc_services.a, +// linked as --whole-archive to override the weak symbols that are used to +// implement a fallback for toolchains that do not yet have a hostrpc library. 
+extern "C" { +unsigned long hostrpc_assign_buffer(hsa_agent_t agent, hsa_queue_t *this_Q, +uint32_t device_id); +hsa_status_t hostrpc_init(); +hsa_status_t hostrpc_terminate(); + +__attribute__((weak)) hsa_status_t hostrpc_init() { return HSA_STATUS_SUCCESS; } +__attribute__((weak)) hsa_status_t hostrpc_terminate() { + return HSA_STATUS_SUCCESS; +} +__attribute__((weak)) unsigned long +hostrpc_assign_buffer(hsa_agent_t, hsa_queue_t *, uint32_t device_id) { + DP("Warning: Attempting to assign hostrpc to device %u, but hostrpc library " + "missing\n", + device_id); + return 0; +} +} + int print_kernel_trace; // Size of the target call stack struture @@ -431,6 +454,8 @@ class RTLDeviceInfoTy { DP("Error when initializing HSA-ATMI\n"); return; } +// Init hostcall soon after initializing ATMI +hostrpc_init(); HSAAgents = find_gpu_agents(); NumberOfDevices = (int)HSAAgents.size(); @@ -520,6 +545,8 @@ class RTLDeviceInfoTy { // atmi_finalize removes access to it deviceStateStore.clear(); KernelArgPoolMap.clear(); +// Terminate hostrpc before finalizing ATMI +hostrpc_terminate(); atmi_finalize(); } }; @@ -1540,6 +1567,8 @@ static uint64_t acquire_available_packet_id(hsa_queue_t *queue) { return packet_id; } +extern bool g_atmi_hostcall_required; // declared without header by atmi + static int32_t __tgt_rtl_run_target_team_region_locked( int32_t device_id, void *tgt_entry_ptr, void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t num_teams, @@ -1683,6 +1712,22 @@ int32_t __tgt_rtl_run_target_team_region_locked( impl_args->offset_y = 0; impl_args->offset_z = 0; + // assign a hostcall buffer for the selected Q + if (g_atmi_hostcall_required) { +// hostrpc_assign_buffer is not thread safe, and this function is +// under a multiple reader lock, not a writer lock.
+static pthread_mutex_t hostcall_init_lock = PTHREAD_MUTEX_INITIALIZER; +pthread_mutex_lock(&hostcall_init_lock); +impl_args->hostcall_ptr = hostrpc_assign_buffer( +DeviceInfo.HSAAgents[device_id], queue, device_id); +pthread_mutex_unlock(&hostcall_init_lock); +if (!impl_args->hostcall_ptr) { + DP("hostrpc_assign_buffer failed, gpu would dereference null and " + "error\n"); + return OFFLOAD_FAIL; +} + } + packet->kernarg_address = kernarg; } ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [openmp] cab9f69 - [libomptarget][amdgpu] Improve diagnostics on arch mismatch
Author: Jon Chesterfield Date: 2020-12-09T18:55:53Z New Revision: cab9f6923522475e0d2137c66622c3fa70b01d3b URL: https://github.com/llvm/llvm-project/commit/cab9f6923522475e0d2137c66622c3fa70b01d3b DIFF: https://github.com/llvm/llvm-project/commit/cab9f6923522475e0d2137c66622c3fa70b01d3b.diff LOG: [libomptarget][amdgpu] Improve diagnostics on arch mismatch Added: openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.cpp openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.h Modified: openmp/libomptarget/plugins/amdgpu/CMakeLists.txt openmp/libomptarget/plugins/amdgpu/src/rtl.cpp Removed: diff --git a/openmp/libomptarget/plugins/amdgpu/CMakeLists.txt b/openmp/libomptarget/plugins/amdgpu/CMakeLists.txt index 0c50ffdf2fa6..38f0afabf3ad 100644 --- a/openmp/libomptarget/plugins/amdgpu/CMakeLists.txt +++ b/openmp/libomptarget/plugins/amdgpu/CMakeLists.txt @@ -57,6 +57,7 @@ add_library(omptarget.rtl.amdgpu SHARED impl/atmi.cpp impl/atmi_interop_hsa.cpp impl/data.cpp + impl/get_elf_mach_gfx_name.cpp impl/machine.cpp impl/system.cpp impl/utils.cpp diff --git a/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.cpp b/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.cpp new file mode 100644 index 000000000000..45af34684117 --- /dev/null +++ b/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.cpp @@ -0,0 +1,53 @@ +#include "get_elf_mach_gfx_name.h" + +// This header conflicts with the system elf.h (macros vs enums of the same +// identifier) and contains more up to date values for the enum checked here. +// rtl.cpp uses the system elf.h.
+#include "llvm/BinaryFormat/ELF.h" + +const char *get_elf_mach_gfx_name(uint32_t EFlags) { + using namespace llvm::ELF; + uint32_t Gfx = (EFlags & EF_AMDGPU_MACH); + switch (Gfx) { + case EF_AMDGPU_MACH_AMDGCN_GFX801: +return "gfx801"; + case EF_AMDGPU_MACH_AMDGCN_GFX802: +return "gfx802"; + case EF_AMDGPU_MACH_AMDGCN_GFX803: +return "gfx803"; + case EF_AMDGPU_MACH_AMDGCN_GFX805: +return "gfx805"; + case EF_AMDGPU_MACH_AMDGCN_GFX810: +return "gfx810"; + case EF_AMDGPU_MACH_AMDGCN_GFX900: +return "gfx900"; + case EF_AMDGPU_MACH_AMDGCN_GFX902: +return "gfx902"; + case EF_AMDGPU_MACH_AMDGCN_GFX904: +return "gfx904"; + case EF_AMDGPU_MACH_AMDGCN_GFX906: +return "gfx906"; + case EF_AMDGPU_MACH_AMDGCN_GFX908: +return "gfx908"; + case EF_AMDGPU_MACH_AMDGCN_GFX909: +return "gfx909"; + case EF_AMDGPU_MACH_AMDGCN_GFX90C: +return "gfx90c"; + case EF_AMDGPU_MACH_AMDGCN_GFX1010: +return "gfx1010"; + case EF_AMDGPU_MACH_AMDGCN_GFX1011: +return "gfx1011"; + case EF_AMDGPU_MACH_AMDGCN_GFX1012: +return "gfx1012"; + case EF_AMDGPU_MACH_AMDGCN_GFX1030: +return "gfx1030"; + case EF_AMDGPU_MACH_AMDGCN_GFX1031: +return "gfx1031"; + case EF_AMDGPU_MACH_AMDGCN_GFX1032: +return "gfx1032"; + case EF_AMDGPU_MACH_AMDGCN_GFX1033: +return "gfx1033"; + default: +return "--unknown gfx"; + } +} diff --git a/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.h b/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.h new file mode 100644 index 000000000000..b1be90dc29d5 --- /dev/null +++ b/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.h @@ -0,0 +1,8 @@ +#ifndef GET_ELF_MACH_GFX_NAME_H_INCLUDED +#define GET_ELF_MACH_GFX_NAME_H_INCLUDED + +#include <stdint.h> + +const char *get_elf_mach_gfx_name(uint32_t EFlags); + +#endif diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp index 252abca08944..60040d1c0da4 100644 --- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp @@ -36,6 +36,7
@@ #include "internal.h" #include "Debug.h" +#include "get_elf_mach_gfx_name.h" #include "omptargetplugin.h" #include "llvm/Frontend/OpenMP/OMPGridValues.h" @@ -92,14 +93,6 @@ uint32_t TgtStackItemSize = 0; #include "../../common/elf_common.c" -static bool elf_machine_id_is_amdgcn(__tgt_device_image *image) { - const uint16_t amdgcnMachineID = 224; - int32_t r = elf_check_machine(image, amdgcnMachineID); - if (!r) { -DP("Supported machine ID not found\n"); - } - return r; -} /// Keep entries table per device struct FuncOrGblEntryTy { @@ -319,6 +312,7 @@ class RTLDeviceInfoTy { std::vector GroupsPerDevice; std::vector ThreadsPerGroup; std::vector WarpSize; + std::vector GPUName; // OpenMP properties std::vector NumTeams; @@ -472,6 +466,7 @@ class RTLDeviceInfoTy { FuncGblEntries.resize(NumberOfDevices); ThreadsPerGroup.resize(NumberOfDevices); ComputeUnits.resize(NumberOfDevices); +GPUName.resize(NumberOfDevices); GroupsPerDevice.resize(NumberOfDevices); WarpSize.resize(NumberOfDevices); NumTeams.resize(NumberOfD
[llvm-branch-commits] [openmp] e191d31 - [libomptarget][amdgpu] Robust handling of device_environment symbol
Author: Jon Chesterfield Date: 2020-12-09T19:21:51Z New Revision: e191d3115921d9b5b6602747bff72a1f2cf565c4 URL: https://github.com/llvm/llvm-project/commit/e191d3115921d9b5b6602747bff72a1f2cf565c4 DIFF: https://github.com/llvm/llvm-project/commit/e191d3115921d9b5b6602747bff72a1f2cf565c4.diff LOG: [libomptarget][amdgpu] Robust handling of device_environment symbol Added: Modified: openmp/libomptarget/plugins/amdgpu/src/rtl.cpp Removed: diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp index 60040d1c0da4d..e13d769a16aad 100644 --- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp @@ -891,6 +891,7 @@ const Elf64_Sym *elf_lookup(Elf *elf, char *base, Elf64_Shdr *section_hash, typedef struct { void *addr = nullptr; uint32_t size = UINT32_MAX; + uint32_t sh_type = SHT_NULL; } symbol_info; int get_symbol_info_without_loading(Elf *elf, char *base, const char *symname, @@ -913,8 +914,23 @@ int get_symbol_info_without_loading(Elf *elf, char *base, const char *symname, return 1; } - res->size = static_cast(sym->st_size); + if (sym->st_shndx == SHN_UNDEF) { +return 1; + } + + Elf_Scn *section = elf_getscn(elf, sym->st_shndx); + if (!section) { +return 1; + } + + Elf64_Shdr *header = elf64_getshdr(section); + if (!header) { +return 1; + } + res->addr = sym->st_value + base; + res->size = static_cast(sym->st_size); + res->sh_type = header->sh_type; return 0; } @@ -992,6 +1008,99 @@ __tgt_target_table *__tgt_rtl_load_binary(int32_t device_id, return res; } +struct device_environment { + // initialise an omptarget_device_environmentTy in the deviceRTL + // patches around differences in the deviceRTL between trunk, aomp, + // rocmcc. Over time these differences will tend to zero and this class + // simplified. 
+ // Symbol may be in .data or .bss, and may be missing fields: + // - aomp has debug_level, num_devices, device_num + // - trunk has debug_level + // - under review in trunk is debug_level, device_num + // - rocmcc matches aomp, patch to swap num_devices and device_num + + // If the symbol is in .data (aomp, rocm) it can be written directly. + // If it is in .bss, we must wait for it to be allocated space on the + // gpu (trunk) and initialize after loading. + const char *sym() { return "omptarget_device_environment"; } + + omptarget_device_environmentTy host_device_env; + symbol_info si; + bool valid = false; + + __tgt_device_image *image; + const size_t img_size; + + device_environment(int device_id, int number_devices, + __tgt_device_image *image, const size_t img_size) + : image(image), img_size(img_size) { + +host_device_env.num_devices = number_devices; +host_device_env.device_num = device_id; +host_device_env.debug_level = 0; +#ifdef OMPTARGET_DEBUG +if (char *envStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) { + host_device_env.debug_level = std::stoi(envStr); +} +#endif + +int rc = get_symbol_info_without_loading((char *)image->ImageStart, + img_size, sym(), &si); +if (rc != 0) { + DP("Finding global device environment '%s' - symbol missing.\n", sym()); + return; +} + +if (si.size > sizeof(host_device_env)) { + DP("Symbol '%s' has size %u, expected at most %zu.\n", sym(), si.size, + sizeof(host_device_env)); + return; +} + +valid = true; + } + + bool in_image() { return si.sh_type != SHT_NOBITS; } + + atmi_status_t before_loading(void *data, size_t size) { +assert(valid); +if (in_image()) { + DP("Setting global device environment before load (%u bytes)\n", si.size); + uint64_t offset = (char *)si.addr - (char *)image->ImageStart; + void *pos = (char *)data + offset; + memcpy(pos, &host_device_env, si.size); +} +return ATMI_STATUS_SUCCESS; + } + + atmi_status_t after_loading() { +assert(valid); +if (!in_image()) { + DP("Setting global device environment 
after load (%u bytes)\n", si.size); + int device_id = host_device_env.device_num; + + void *state_ptr; + uint32_t state_ptr_size; + atmi_status_t err = atmi_interop_hsa_get_symbol_info( + get_gpu_mem_place(device_id), sym(), &state_ptr, &state_ptr_size); + if (err != ATMI_STATUS_SUCCESS) { +DP("failed to find %s in loaded image\n", sym()); +return err; + } + + if (state_ptr_size != si.size) { +DP("Symbol had size %u before loading, %u after\n", state_ptr_size, + si.size); +return ATMI_STATUS_ERROR; + } + + return DeviceInfo.freesignalpool_memcpy_h2d(state_ptr, &host_device_env, + state_ptr_size, device_id); +}
[llvm-branch-commits] [openmp] c9bc414 - [libomptarget][amdgpu] Let default number of teams equal number of CUs
Author: Jon Chesterfield Date: 2020-12-09T19:35:34Z New Revision: c9bc414840a41aff3e83a1529ba6dd98e13ce39d URL: https://github.com/llvm/llvm-project/commit/c9bc414840a41aff3e83a1529ba6dd98e13ce39d DIFF: https://github.com/llvm/llvm-project/commit/c9bc414840a41aff3e83a1529ba6dd98e13ce39d.diff LOG: [libomptarget][amdgpu] Let default number of teams equal number of CUs Added: Modified: openmp/libomptarget/plugins/amdgpu/src/rtl.cpp Removed: diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp index e13d769a16aa..18bf67f7fc8a 100644 --- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp @@ -789,9 +789,17 @@ int32_t __tgt_rtl_init_device(int device_id) { DP("Default number of teams set according to environment %d\n", DeviceInfo.EnvNumTeams); } else { -DeviceInfo.NumTeams[device_id] = RTLDeviceInfoTy::DefaultNumTeams; -DP("Default number of teams set according to library's default %d\n", - RTLDeviceInfoTy::DefaultNumTeams); +char *TeamsPerCUEnvStr = getenv("OMP_TARGET_TEAMS_PER_PROC"); +int TeamsPerCU = 1; // default number of teams per CU is 1 +if (TeamsPerCUEnvStr) { + TeamsPerCU = std::stoi(TeamsPerCUEnvStr); +} + +DeviceInfo.NumTeams[device_id] = + TeamsPerCU * DeviceInfo.ComputeUnits[device_id]; +DP("Default number of teams = %d * number of compute units %d\n", + TeamsPerCU, + DeviceInfo.ComputeUnits[device_id]); } if (DeviceInfo.NumTeams[device_id] > DeviceInfo.GroupsPerDevice[device_id]) { @@ -1548,11 +1556,12 @@ int32_t __tgt_rtl_data_delete(int device_id, void *tgt_ptr) { // loop_tripcount. void getLaunchVals(int &threadsPerGroup, int &num_groups, int ConstWGSize, int ExecutionMode, int EnvTeamLimit, int EnvNumTeams, - int num_teams, int thread_limit, uint64_t loop_tripcount) { + int num_teams, int thread_limit, uint64_t loop_tripcount, + int32_t device_id) { int Max_Teams = DeviceInfo.EnvMaxTeamsDefault > 0 ? 
DeviceInfo.EnvMaxTeamsDefault - : DeviceInfo.Max_Teams; + : DeviceInfo.NumTeams[device_id]; if (Max_Teams > DeviceInfo.HardTeamLimit) Max_Teams = DeviceInfo.HardTeamLimit; @@ -1752,7 +1761,8 @@ int32_t __tgt_rtl_run_target_team_region_locked( DeviceInfo.EnvNumTeams, num_teams, // From run_region arg thread_limit, // From run_region arg -loop_tripcount // From run_region arg +loop_tripcount, // From run_region arg +KernelInfo->device_id ); if (print_kernel_trace == 4) ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [openmp] 7c59614 - [libomptarget][amdgpu] clang-format src/rtl.cpp
Author: Jon Chesterfield Date: 2020-12-09T19:45:51Z New Revision: 7c5961439485e59b8f463b17bf37dab8d8aa7c3a URL: https://github.com/llvm/llvm-project/commit/7c5961439485e59b8f463b17bf37dab8d8aa7c3a DIFF: https://github.com/llvm/llvm-project/commit/7c5961439485e59b8f463b17bf37dab8d8aa7c3a.diff LOG: [libomptarget][amdgpu] clang-format src/rtl.cpp Added: Modified: openmp/libomptarget/plugins/amdgpu/src/rtl.cpp Removed: diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp index 18bf67f7fc8a..5ec5f5e45e36 100644 --- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp @@ -93,7 +93,6 @@ uint32_t TgtStackItemSize = 0; #include "../../common/elf_common.c" - /// Keep entries table per device struct FuncOrGblEntryTy { __tgt_target_table Table; @@ -708,7 +707,7 @@ int32_t __tgt_rtl_init_device(int device_id) { char GetInfoName[64]; // 64 max size returned by get info err = hsa_agent_get_info(agent, (hsa_agent_info_t)HSA_AGENT_INFO_NAME, - (void *) GetInfoName); + (void *)GetInfoName); if (err) DeviceInfo.GPUName[device_id] = "--unknown gpu--"; else { @@ -718,7 +717,7 @@ int32_t __tgt_rtl_init_device(int device_id) { if (print_kernel_trace == 4) fprintf(stderr, "Device#%-2d CU's: %2d %s\n", device_id, DeviceInfo.ComputeUnits[device_id], - DeviceInfo.GPUName[device_id].c_str()); +DeviceInfo.GPUName[device_id].c_str()); // Query attributes to determine number of threads/block and blocks/grid. 
uint16_t workgroup_max_dim[3]; @@ -794,12 +793,11 @@ int32_t __tgt_rtl_init_device(int device_id) { if (TeamsPerCUEnvStr) { TeamsPerCU = std::stoi(TeamsPerCUEnvStr); } - + DeviceInfo.NumTeams[device_id] = - TeamsPerCU * DeviceInfo.ComputeUnits[device_id]; +TeamsPerCU * DeviceInfo.ComputeUnits[device_id]; DP("Default number of teams = %d * number of compute units %d\n", - TeamsPerCU, - DeviceInfo.ComputeUnits[device_id]); + TeamsPerCU, DeviceInfo.ComputeUnits[device_id]); } if (DeviceInfo.NumTeams[device_id] > DeviceInfo.GroupsPerDevice[device_id]) { @@ -1183,7 +1181,7 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id, "Possible gpu arch mismatch: device:%s, image:%s please check" " compiler flag: -march=\n", DeviceInfo.GPUName[device_id].c_str(), - get_elf_mach_gfx_name(elf_e_flags(image))); + get_elf_mach_gfx_name(elf_e_flags(image))); return NULL; } @@ -1759,11 +1757,10 @@ int32_t __tgt_rtl_run_target_team_region_locked( getLaunchVals(threadsPerGroup, num_groups, KernelInfo->ConstWGSize, KernelInfo->ExecutionMode, DeviceInfo.EnvTeamLimit, DeviceInfo.EnvNumTeams, -num_teams, // From run_region arg -thread_limit, // From run_region arg +num_teams, // From run_region arg +thread_limit, // From run_region arg loop_tripcount, // From run_region arg -KernelInfo->device_id - ); +KernelInfo->device_id); if (print_kernel_trace == 4) // enum modes are SPMD, GENERIC, NONE 0,1,2 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [openmp] ce93de3 - [libomptarget][nfc] Remove data_sharing type aliasing
Author: Jon Chesterfield Date: 2020-12-11T02:13:34Z New Revision: ce93de3bb284c944676c7b81890156d9d80b1db9 URL: https://github.com/llvm/llvm-project/commit/ce93de3bb284c944676c7b81890156d9d80b1db9 DIFF: https://github.com/llvm/llvm-project/commit/ce93de3bb284c944676c7b81890156d9d80b1db9.diff LOG: [libomptarget][nfc] Remove data_sharing type aliasing [libomptarget][nfc] Remove data_sharing type aliasing Libomptarget previously used __kmpc_data_sharing_slot to access values of type __kmpc_data_sharing_{worker,master}_slot_static. This aliasing violation was benign in practice. The master type has since been removed, so a single type can be used instead. This is particularly helpful for the transition to an openmp deviceRTL, as the c++/openmp compiler for amdgcn currently rejects the flexible array member for being an incomplete type. Serves the same purpose as abandoned D86324. Reviewed By: jdoerfert Differential Revision: https://reviews.llvm.org/D93075 Added: Modified: openmp/libomptarget/deviceRTLs/common/omptarget.h openmp/libomptarget/deviceRTLs/interface.h Removed: diff --git a/openmp/libomptarget/deviceRTLs/common/omptarget.h b/openmp/libomptarget/deviceRTLs/common/omptarget.h index 0ccd71c3b55f..fc4eb6bfbcfa 100644 --- a/openmp/libomptarget/deviceRTLs/common/omptarget.h +++ b/openmp/libomptarget/deviceRTLs/common/omptarget.h @@ -74,6 +74,16 @@ class omptarget_nvptx_SharedArgs { extern DEVICE SHARED omptarget_nvptx_SharedArgs omptarget_nvptx_globalArgs; +// Worker slot type which is initialized with the default worker slot +// size of 4*32 bytes. +struct __kmpc_data_sharing_slot { + __kmpc_data_sharing_slot *Next; + __kmpc_data_sharing_slot *Prev; + void *PrevSlotStackPtr; + void *DataEnd; + char Data[DS_Worker_Warp_Slot_Size]; +}; + // Data structure to keep in shared memory that traces the current slot, stack, // and frame pointer as well as the active threads that didn't exit the current // environment. 
@@ -83,15 +93,6 @@ struct DataSharingStateTy { void * volatile FramePtr[DS_Max_Warp_Number]; __kmpc_impl_lanemask_t ActiveThreads[DS_Max_Warp_Number]; }; -// Additional worker slot type which is initialized with the default worker slot -// size of 4*32 bytes. -struct __kmpc_data_sharing_worker_slot_static { - __kmpc_data_sharing_slot *Next; - __kmpc_data_sharing_slot *Prev; - void *PrevSlotStackPtr; - void *DataEnd; - char Data[DS_Worker_Warp_Slot_Size]; -}; extern DEVICE SHARED DataSharingStateTy DataSharingState; @@ -213,7 +214,7 @@ class omptarget_nvptx_TeamDescr { workDescrForActiveParallel; // one, ONLY for the active par ALIGN(16) - __kmpc_data_sharing_worker_slot_static worker_rootS[DS_Max_Warp_Number]; + __kmpc_data_sharing_slot worker_rootS[DS_Max_Warp_Number]; }; diff --git a/openmp/libomptarget/deviceRTLs/interface.h b/openmp/libomptarget/deviceRTLs/interface.h index 330880556293..5f539bc3fd66 100644 --- a/openmp/libomptarget/deviceRTLs/interface.h +++ b/openmp/libomptarget/deviceRTLs/interface.h @@ -438,17 +438,6 @@ EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs); EXTERN void __kmpc_end_sharing_variables(); EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs); -// The slot used for data sharing by the master and worker threads. We use a -// complete (default size version and an incomplete one so that we allow sizes -// greater than the default). -struct __kmpc_data_sharing_slot { - __kmpc_data_sharing_slot *Next; - __kmpc_data_sharing_slot *Prev; - void *PrevSlotStackPtr; - void *DataEnd; - char Data[]; -}; - // SPMD execution mode interrogation function. EXTERN int8_t __kmpc_is_spmd_exec_mode(); ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits