from:"Jon Chesterfield via llvm\-branch\-commits"

[llvm-branch-commits] [openmp] 78b0630 - [libomptarget][cuda] Call v2 functions explicitly

2021-01-23 Thread Jon Chesterfield via llvm-branch-commits


Author: Jon Chesterfield
Date: 2021-01-23T20:33:13Z
New Revision: 78b0630b72a9742d62b07cef912b72f1743bfae9

URL: 
https://github.com/llvm/llvm-project/commit/78b0630b72a9742d62b07cef912b72f1743bfae9
DIFF: 
https://github.com/llvm/llvm-project/commit/78b0630b72a9742d62b07cef912b72f1743bfae9.diff

LOG: [libomptarget][cuda] Call v2 functions explicitly

[libomptarget][cuda] Call v2 functions explicitly

rtl.cpp calls functions like cuMemFree that are replaced by a macro
in cuda.h with cuMemFree_v2. This patch changes the source to use
the v2 names consistently.

See also D95104, D95155 for the idea. Alternatives are to use a mixture,
e.g. call the macro names and explictly dlopen the _v2 names, or to keep
the current status where the symbols are replaced by macros in both files

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D95274

Added: 


Modified: 
openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h
openmp/libomptarget/plugins/cuda/src/rtl.cpp

Removed: 




diff  --git a/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h 
b/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h
index 832c26965144..dd579a1f7490 100644
--- a/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h
+++ b/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h
@@ -48,18 +48,6 @@ typedef enum CUctx_flags_enum {
   CU_CTX_SCHED_MASK = 0x07,
 } CUctx_flags;
 
-#define cuMemFree cuMemFree_v2
-#define cuMemAlloc cuMemAlloc_v2
-#define cuMemcpyDtoH cuMemcpyDtoH_v2
-#define cuMemcpyHtoD cuMemcpyHtoD_v2
-#define cuStreamDestroy cuStreamDestroy_v2
-#define cuModuleGetGlobal cuModuleGetGlobal_v2
-#define cuMemcpyDtoHAsync cuMemcpyDtoHAsync_v2
-#define cuMemcpyDtoDAsync cuMemcpyDtoDAsync_v2
-#define cuMemcpyHtoDAsync cuMemcpyHtoDAsync_v2
-#define cuDevicePrimaryCtxRelease cuDevicePrimaryCtxRelease_v2
-#define cuDevicePrimaryCtxSetFlags cuDevicePrimaryCtxSetFlags_v2
-
 CUresult cuCtxGetDevice(CUdevice *);
 CUresult cuDeviceGet(CUdevice *, int);
 CUresult cuDeviceGetAttribute(int *, CUdevice_attribute, CUdevice);
@@ -72,26 +60,26 @@ CUresult cuLaunchKernel(CUfunction, unsigned, unsigned, 
unsigned, unsigned,
 unsigned, unsigned, unsigned, CUstream, void **,
 void **);
 
-CUresult cuMemAlloc(CUdeviceptr *, size_t);
-CUresult cuMemcpyDtoDAsync(CUdeviceptr, CUdeviceptr, size_t, CUstream);
+CUresult cuMemAlloc_v2(CUdeviceptr *, size_t);
+CUresult cuMemcpyDtoDAsync_v2(CUdeviceptr, CUdeviceptr, size_t, CUstream);
 
-CUresult cuMemcpyDtoH(void *, CUdeviceptr, size_t);
-CUresult cuMemcpyDtoHAsync(void *, CUdeviceptr, size_t, CUstream);
-CUresult cuMemcpyHtoD(CUdeviceptr, const void *, size_t);
-CUresult cuMemcpyHtoDAsync(CUdeviceptr, const void *, size_t, CUstream);
+CUresult cuMemcpyDtoH_v2(void *, CUdeviceptr, size_t);
+CUresult cuMemcpyDtoHAsync_v2(void *, CUdeviceptr, size_t, CUstream);
+CUresult cuMemcpyHtoD_v2(CUdeviceptr, const void *, size_t);
+CUresult cuMemcpyHtoDAsync_v2(CUdeviceptr, const void *, size_t, CUstream);
 
-CUresult cuMemFree(CUdeviceptr);
+CUresult cuMemFree_v2(CUdeviceptr);
 CUresult cuModuleGetFunction(CUfunction *, CUmodule, const char *);
-CUresult cuModuleGetGlobal(CUdeviceptr *, size_t *, CUmodule, const char *);
+CUresult cuModuleGetGlobal_v2(CUdeviceptr *, size_t *, CUmodule, const char *);
 
 CUresult cuModuleUnload(CUmodule);
 CUresult cuStreamCreate(CUstream *, unsigned);
-CUresult cuStreamDestroy(CUstream);
+CUresult cuStreamDestroy_v2(CUstream);
 CUresult cuStreamSynchronize(CUstream);
 CUresult cuCtxSetCurrent(CUcontext);
-CUresult cuDevicePrimaryCtxRelease(CUdevice);
+CUresult cuDevicePrimaryCtxRelease_v2(CUdevice);
 CUresult cuDevicePrimaryCtxGetState(CUdevice, unsigned *, int *);
-CUresult cuDevicePrimaryCtxSetFlags(CUdevice, unsigned);
+CUresult cuDevicePrimaryCtxSetFlags_v2(CUdevice, unsigned);
 CUresult cuDevicePrimaryCtxRetain(CUcontext *, CUdevice);
 CUresult cuModuleLoadDataEx(CUmodule *, const void *, unsigned, void *,
 void **);

diff  --git a/openmp/libomptarget/plugins/cuda/src/rtl.cpp 
b/openmp/libomptarget/plugins/cuda/src/rtl.cpp
index e4ac1e0820e6..f83c9df920aa 100644
--- a/openmp/libomptarget/plugins/cuda/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/cuda/src/rtl.cpp
@@ -110,8 +110,8 @@ bool checkResult(CUresult Err, const char *ErrMsg) {
 
 int memcpyDtoD(const void *SrcPtr, void *DstPtr, int64_t Size,
CUstream Stream) {
-  CUresult Err =
-  cuMemcpyDtoDAsync((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr, Size, 
Stream);
+  CUresult Err = cuMemcpyDtoDAsync_v2((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr,
+  Size, Stream);
 
   if (Err != CUDA_SUCCESS) {
 REPORT("Error when copying data from device to device. Pointers: src "
@@ -207,8 +207,8 @@ class StreamManagerTy {
 
   for (CUstream &S : StreamPool[I]) {
 if (S)
-  checkResult(

[llvm-branch-commits] [openmp] dc70c56 - [libomptarget][amdgpu][nfc] Update comments

2021-01-23 Thread Jon Chesterfield via llvm-branch-commits


Author: Jon Chesterfield
Date: 2021-01-23T22:53:58Z
New Revision: dc70c56be5922b874b1408edc1315fcda40680ba

URL: 
https://github.com/llvm/llvm-project/commit/dc70c56be5922b874b1408edc1315fcda40680ba
DIFF: 
https://github.com/llvm/llvm-project/commit/dc70c56be5922b874b1408edc1315fcda40680ba.diff

LOG: [libomptarget][amdgpu][nfc] Update comments

[libomptarget][amdgpu][nfc] Update comments

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D95295

Added: 


Modified: 
openmp/libomptarget/plugins/amdgpu/impl/data.cpp
openmp/libomptarget/plugins/amdgpu/impl/machine.h
openmp/libomptarget/plugins/amdgpu/impl/rt.h
openmp/libomptarget/plugins/amdgpu/impl/system.cpp
openmp/libomptarget/plugins/amdgpu/src/rtl.cpp

Removed: 




diff  --git a/openmp/libomptarget/plugins/amdgpu/impl/data.cpp 
b/openmp/libomptarget/plugins/amdgpu/impl/data.cpp
index 39546fbae4b3..0d98d5c51ce1 100644
--- a/openmp/libomptarget/plugins/amdgpu/impl/data.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/impl/data.cpp
@@ -51,7 +51,6 @@ hsa_amd_memory_pool_t 
get_memory_pool_by_mem_place(atmi_mem_place_t place) {
 void register_allocation(void *ptr, size_t size, atmi_mem_place_t place) {
   if (place.dev_type == ATMI_DEVTYPE_CPU)
 allow_access_to_all_gpu_agents(ptr);
-  // TODO(ashwinma): what if one GPU wants to access another GPU?
 }
 
 atmi_status_t Runtime::Malloc(void **ptr, size_t size, atmi_mem_place_t place) 
{

diff  --git a/openmp/libomptarget/plugins/amdgpu/impl/machine.h 
b/openmp/libomptarget/plugins/amdgpu/impl/machine.h
index 93169ed4eafb..9250c3b7c663 100644
--- a/openmp/libomptarget/plugins/amdgpu/impl/machine.h
+++ b/openmp/libomptarget/plugins/amdgpu/impl/machine.h
@@ -22,9 +22,6 @@ class ATLProcessor {
   }
   void addMemory(const ATLMemory &p);
   hsa_agent_t agent() const { return agent_; }
-  // TODO(ashwinma): Do we need this or are we building the machine structure
-  // just once in the program?
-  // void removeMemory(ATLMemory &p);
   const std::vector &memories() const;
   atmi_devtype_t type() const { return type_; }
 
@@ -86,7 +83,7 @@ template  T &get_processor(atmi_place_t place) {
   int dev_id = place.device_id;
   if (dev_id == -1) {
 // user is asking runtime to pick a device
-// TODO(ashwinma): best device of this type? pick 0 for now
+// best device of this type? pick 0 for now
 dev_id = 0;
   }
   return g_atl_machine.processors()[dev_id];

diff  --git a/openmp/libomptarget/plugins/amdgpu/impl/rt.h 
b/openmp/libomptarget/plugins/amdgpu/impl/rt.h
index 757919eb3a45..a857861307c6 100644
--- a/openmp/libomptarget/plugins/amdgpu/impl/rt.h
+++ b/openmp/libomptarget/plugins/amdgpu/impl/rt.h
@@ -26,10 +26,7 @@ class Environment {
   void GetEnvAll();
 
   int getMaxQueueSize() const { return max_queue_size_; }
-
-  // TODO(ashwinma): int may change to enum if we have more debug modes
   int getDebugMode() const { return debug_mode_; }
-  // TODO(ashwinma): int may change to enum if we have more profile modes
 
 private:
   std::string GetEnv(const char *name) {
@@ -69,10 +66,7 @@ class Runtime final {
   static atmi_status_t Memfree(void *);
   static atmi_status_t Malloc(void **, size_t, atmi_mem_place_t);
 
-  // environment variables
   int getMaxQueueSize() const { return env_.getMaxQueueSize(); }
-
-  // TODO(ashwinma): int may change to enum if we have more debug modes
   int getDebugMode() const { return env_.getDebugMode(); }
 
 protected:

diff  --git a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp 
b/openmp/libomptarget/plugins/amdgpu/impl/system.cpp
index 913dc91b298d..1a126a186ff2 100644
--- a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/impl/system.cpp
@@ -133,7 +133,7 @@ static const std::map 
ArgValueKind = {
 {"hidden_hostcall_buffer", KernelArgMD::ValueKind::HiddenHostcallBuffer},
 };
 
-// public variables -- TODO(ashwinma) move these to a runtime object?
+// global variables. TODO: Get rid of these
 atmi_machine_t g_atmi_machine;
 ATLMachine g_atl_machine;
 
@@ -210,8 +210,6 @@ atmi_status_t Runtime::Initialize() {
 }
 
 atmi_status_t Runtime::Finalize() {
-  // TODO(ashwinma): Finalize all processors, queues, signals, kernarg memory
-  // regions
   hsa_status_t err;
 
   for (uint32_t i = 0; i < g_executables.size(); i++) {
@@ -874,8 +872,6 @@ static hsa_status_t get_code_object_custom_metadata(void 
*binary,
 msgpackErrorCheck(iterate args map in kernel args metadata,
   msgpack_errors);
 
-// TODO(ashwinma): should the below population actions be done only for
-// non-implicit args?
 // populate info with sizes and offsets
 info.arg_sizes.push_back(lcArg.size_);
 // v3 has offset field and not align field

diff  --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp 
b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
ind

[llvm-branch-commits] [openmp] c3074d4 - [libomptarget][nvptx] Replace cuda atomic primitives with clang intrinsics

2021-01-24 Thread Jon Chesterfield via llvm-branch-commits


Author: Jon Chesterfield
Date: 2021-01-24T10:59:15Z
New Revision: c3074d48d38cc1207da893b6f3545b5777db4c27

URL: 
https://github.com/llvm/llvm-project/commit/c3074d48d38cc1207da893b6f3545b5777db4c27
DIFF: 
https://github.com/llvm/llvm-project/commit/c3074d48d38cc1207da893b6f3545b5777db4c27.diff

LOG: [libomptarget][nvptx] Replace cuda atomic primitives with clang intrinsics

[libomptarget][nvptx] Replace cuda atomic primitives with clang intrinsics

Tested by diff of IR generated for target_impl.cu before and after. NFC. Part
of removing deviceRTL build time dependency on cuda SDK.

Reviewed By: tianshilei1992

Differential Revision: https://reviews.llvm.org/D95294

Added: 


Modified: 
openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu

Removed: 




diff  --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu 
b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
index 2bf19523ef6f..1e3ba7d664af 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
@@ -28,9 +28,6 @@ int __shfl(int val, int src_line, int width = WARPSIZE);
 int __shfl_down(int var, unsigned detla, int width);
 int __shfl_down_sync(unsigned mask, int var, unsigned detla, int width);
 void __syncwarp(int mask);
-void __threadfence();
-void __threadfence_block();
-void __threadfence_system();
 }
 
 DEVICE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) {
@@ -126,9 +123,9 @@ DEVICE void __kmpc_impl_named_sync(uint32_t num_threads) {
: "memory");
 }
 
-DEVICE void __kmpc_impl_threadfence() { __threadfence(); }
-DEVICE void __kmpc_impl_threadfence_block() { __threadfence_block(); }
-DEVICE void __kmpc_impl_threadfence_system() { __threadfence_system(); }
+DEVICE void __kmpc_impl_threadfence() { __nvvm_membar_gl(); }
+DEVICE void __kmpc_impl_threadfence_block() { __nvvm_membar_cta(); }
+DEVICE void __kmpc_impl_threadfence_system() { __nvvm_membar_sys(); }
 
 // Calls to the NVPTX layer (assuming 1D layout)
 DEVICE int GetThreadIdInBlock() { return __nvvm_read_ptx_sreg_tid_x(); }
@@ -140,39 +137,41 @@ DEVICE int GetNumberOfThreadsInBlock() { return 
__nvvm_read_ptx_sreg_ntid_x(); }
 DEVICE unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; }
 DEVICE unsigned GetLaneId() { return GetThreadIdInBlock() & (WARPSIZE - 1); }
 
-// Forward declaration of atomics. Although they're template functions, we
-// already have definitions for 
diff erent types in CUDA internal headers with
-// the right mangled names.
-template  DEVICE T atomicAdd(T *address, T val);
-template  DEVICE T atomicInc(T *address, T val);
-template  DEVICE T atomicMax(T *address, T val);
-template  DEVICE T atomicExch(T *address, T val);
-template  DEVICE T atomicCAS(T *address, T compare, T val);
-
+// Atomics
 DEVICE uint32_t __kmpc_atomic_add(uint32_t *Address, uint32_t Val) {
-  return atomicAdd(Address, Val);
+  return __atomic_fetch_add(Address, Val, __ATOMIC_SEQ_CST);
 }
 DEVICE uint32_t __kmpc_atomic_inc(uint32_t *Address, uint32_t Val) {
-  return atomicInc(Address, Val);
+  return __nvvm_atom_inc_gen_ui(Address, Val);
 }
+
 DEVICE uint32_t __kmpc_atomic_max(uint32_t *Address, uint32_t Val) {
-  return atomicMax(Address, Val);
+  return __atomic_fetch_max(Address, Val, __ATOMIC_SEQ_CST);
 }
+
 DEVICE uint32_t __kmpc_atomic_exchange(uint32_t *Address, uint32_t Val) {
-  return atomicExch(Address, Val);
+  uint32_t R;
+  __atomic_exchange(Address, &Val, &R, __ATOMIC_SEQ_CST);
+  return R;
 }
+
 DEVICE uint32_t __kmpc_atomic_cas(uint32_t *Address, uint32_t Compare,
   uint32_t Val) {
-  return atomicCAS(Address, Compare, Val);
+  (void)__atomic_compare_exchange(Address, &Compare, &Val, false,
+  __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
+  return Compare;
 }
 
 DEVICE unsigned long long __kmpc_atomic_exchange(unsigned long long *Address,
  unsigned long long Val) {
-  return atomicExch(Address, Val);
+  unsigned long long R;
+  __atomic_exchange(Address, &Val, &R, __ATOMIC_SEQ_CST);
+  return R;
 }
+
 DEVICE unsigned long long __kmpc_atomic_add(unsigned long long *Address,
 unsigned long long Val) {
-  return atomicAdd(Address, Val);
+  return __atomic_fetch_add(Address, Val, __ATOMIC_SEQ_CST);
 }
 
 #define __OMP_SPIN 1000



___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits

[llvm-branch-commits] [openmp] e5e448a - [libomptarget][cuda] Fix build, change missed from D95274

2021-01-24 Thread Jon Chesterfield via llvm-branch-commits


Author: Jon Chesterfield
Date: 2021-01-24T18:30:04Z
New Revision: e5e448aafa7699c17f78aaffb001b665b607e5ae

URL: 
https://github.com/llvm/llvm-project/commit/e5e448aafa7699c17f78aaffb001b665b607e5ae
DIFF: 
https://github.com/llvm/llvm-project/commit/e5e448aafa7699c17f78aaffb001b665b607e5ae.diff

LOG: [libomptarget][cuda] Fix build, change missed from D95274

Added: 


Modified: 
openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.cpp

Removed: 




diff  --git a/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.cpp 
b/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.cpp
index cc7bc42412f6..ad67fe95c77e 100644
--- a/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.cpp
+++ b/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.cpp
@@ -28,26 +28,26 @@ DLWRAP(cuFuncGetAttribute, 3);
 DLWRAP(cuGetErrorString, 2);
 DLWRAP(cuLaunchKernel, 11);
 
-DLWRAP(cuMemAlloc, 2);
-DLWRAP(cuMemcpyDtoDAsync, 4);
+DLWRAP(cuMemAlloc_v2, 2);
+DLWRAP(cuMemcpyDtoDAsync_v2, 4);
 
-DLWRAP(cuMemcpyDtoH, 3);
-DLWRAP(cuMemcpyDtoHAsync, 4);
-DLWRAP(cuMemcpyHtoD, 3);
-DLWRAP(cuMemcpyHtoDAsync, 4);
+DLWRAP(cuMemcpyDtoH_v2, 3);
+DLWRAP(cuMemcpyDtoHAsync_v2, 4);
+DLWRAP(cuMemcpyHtoD_v2, 3);
+DLWRAP(cuMemcpyHtoDAsync_v2, 4);
 
-DLWRAP(cuMemFree, 1);
+DLWRAP(cuMemFree_v2, 1);
 DLWRAP(cuModuleGetFunction, 3);
-DLWRAP(cuModuleGetGlobal, 4);
+DLWRAP(cuModuleGetGlobal_v2, 4);
 
 DLWRAP(cuModuleUnload, 1);
 DLWRAP(cuStreamCreate, 2);
-DLWRAP(cuStreamDestroy, 1);
+DLWRAP(cuStreamDestroy_v2, 1);
 DLWRAP(cuStreamSynchronize, 1);
 DLWRAP(cuCtxSetCurrent, 1);
-DLWRAP(cuDevicePrimaryCtxRelease, 1);
+DLWRAP(cuDevicePrimaryCtxRelease_v2, 1);
 DLWRAP(cuDevicePrimaryCtxGetState, 3);
-DLWRAP(cuDevicePrimaryCtxSetFlags, 2);
+DLWRAP(cuDevicePrimaryCtxSetFlags_v2, 2);
 DLWRAP(cuDevicePrimaryCtxRetain, 2);
 DLWRAP(cuModuleLoadDataEx, 5);
 



___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits

[llvm-branch-commits] [openmp] 95f0d1e - [libomptarget] Compile with older cuda, revert D95274

2021-01-25 Thread Jon Chesterfield via llvm-branch-commits


Author: Jon Chesterfield
Date: 2021-01-25T16:12:56Z
New Revision: 95f0d1edafe3e52a4057768f8cde5d55faf39d16

URL: 
https://github.com/llvm/llvm-project/commit/95f0d1edafe3e52a4057768f8cde5d55faf39d16
DIFF: 
https://github.com/llvm/llvm-project/commit/95f0d1edafe3e52a4057768f8cde5d55faf39d16.diff

LOG: [libomptarget] Compile with older cuda, revert D95274

[libomptarget] Compile with older cuda, revert D95274

Fixes regression reported in comments of D95274.

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D95367

Added: 


Modified: 
openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.cpp
openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h
openmp/libomptarget/plugins/cuda/src/rtl.cpp

Removed: 




diff  --git a/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.cpp 
b/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.cpp
index ad67fe95c77e..cc7bc42412f6 100644
--- a/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.cpp
+++ b/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.cpp
@@ -28,26 +28,26 @@ DLWRAP(cuFuncGetAttribute, 3);
 DLWRAP(cuGetErrorString, 2);
 DLWRAP(cuLaunchKernel, 11);
 
-DLWRAP(cuMemAlloc_v2, 2);
-DLWRAP(cuMemcpyDtoDAsync_v2, 4);
+DLWRAP(cuMemAlloc, 2);
+DLWRAP(cuMemcpyDtoDAsync, 4);
 
-DLWRAP(cuMemcpyDtoH_v2, 3);
-DLWRAP(cuMemcpyDtoHAsync_v2, 4);
-DLWRAP(cuMemcpyHtoD_v2, 3);
-DLWRAP(cuMemcpyHtoDAsync_v2, 4);
+DLWRAP(cuMemcpyDtoH, 3);
+DLWRAP(cuMemcpyDtoHAsync, 4);
+DLWRAP(cuMemcpyHtoD, 3);
+DLWRAP(cuMemcpyHtoDAsync, 4);
 
-DLWRAP(cuMemFree_v2, 1);
+DLWRAP(cuMemFree, 1);
 DLWRAP(cuModuleGetFunction, 3);
-DLWRAP(cuModuleGetGlobal_v2, 4);
+DLWRAP(cuModuleGetGlobal, 4);
 
 DLWRAP(cuModuleUnload, 1);
 DLWRAP(cuStreamCreate, 2);
-DLWRAP(cuStreamDestroy_v2, 1);
+DLWRAP(cuStreamDestroy, 1);
 DLWRAP(cuStreamSynchronize, 1);
 DLWRAP(cuCtxSetCurrent, 1);
-DLWRAP(cuDevicePrimaryCtxRelease_v2, 1);
+DLWRAP(cuDevicePrimaryCtxRelease, 1);
 DLWRAP(cuDevicePrimaryCtxGetState, 3);
-DLWRAP(cuDevicePrimaryCtxSetFlags_v2, 2);
+DLWRAP(cuDevicePrimaryCtxSetFlags, 2);
 DLWRAP(cuDevicePrimaryCtxRetain, 2);
 DLWRAP(cuModuleLoadDataEx, 5);
 

diff  --git a/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h 
b/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h
index dd579a1f7490..832c26965144 100644
--- a/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h
+++ b/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h
@@ -48,6 +48,18 @@ typedef enum CUctx_flags_enum {
   CU_CTX_SCHED_MASK = 0x07,
 } CUctx_flags;
 
+#define cuMemFree cuMemFree_v2
+#define cuMemAlloc cuMemAlloc_v2
+#define cuMemcpyDtoH cuMemcpyDtoH_v2
+#define cuMemcpyHtoD cuMemcpyHtoD_v2
+#define cuStreamDestroy cuStreamDestroy_v2
+#define cuModuleGetGlobal cuModuleGetGlobal_v2
+#define cuMemcpyDtoHAsync cuMemcpyDtoHAsync_v2
+#define cuMemcpyDtoDAsync cuMemcpyDtoDAsync_v2
+#define cuMemcpyHtoDAsync cuMemcpyHtoDAsync_v2
+#define cuDevicePrimaryCtxRelease cuDevicePrimaryCtxRelease_v2
+#define cuDevicePrimaryCtxSetFlags cuDevicePrimaryCtxSetFlags_v2
+
 CUresult cuCtxGetDevice(CUdevice *);
 CUresult cuDeviceGet(CUdevice *, int);
 CUresult cuDeviceGetAttribute(int *, CUdevice_attribute, CUdevice);
@@ -60,26 +72,26 @@ CUresult cuLaunchKernel(CUfunction, unsigned, unsigned, 
unsigned, unsigned,
 unsigned, unsigned, unsigned, CUstream, void **,
 void **);
 
-CUresult cuMemAlloc_v2(CUdeviceptr *, size_t);
-CUresult cuMemcpyDtoDAsync_v2(CUdeviceptr, CUdeviceptr, size_t, CUstream);
+CUresult cuMemAlloc(CUdeviceptr *, size_t);
+CUresult cuMemcpyDtoDAsync(CUdeviceptr, CUdeviceptr, size_t, CUstream);
 
-CUresult cuMemcpyDtoH_v2(void *, CUdeviceptr, size_t);
-CUresult cuMemcpyDtoHAsync_v2(void *, CUdeviceptr, size_t, CUstream);
-CUresult cuMemcpyHtoD_v2(CUdeviceptr, const void *, size_t);
-CUresult cuMemcpyHtoDAsync_v2(CUdeviceptr, const void *, size_t, CUstream);
+CUresult cuMemcpyDtoH(void *, CUdeviceptr, size_t);
+CUresult cuMemcpyDtoHAsync(void *, CUdeviceptr, size_t, CUstream);
+CUresult cuMemcpyHtoD(CUdeviceptr, const void *, size_t);
+CUresult cuMemcpyHtoDAsync(CUdeviceptr, const void *, size_t, CUstream);
 
-CUresult cuMemFree_v2(CUdeviceptr);
+CUresult cuMemFree(CUdeviceptr);
 CUresult cuModuleGetFunction(CUfunction *, CUmodule, const char *);
-CUresult cuModuleGetGlobal_v2(CUdeviceptr *, size_t *, CUmodule, const char *);
+CUresult cuModuleGetGlobal(CUdeviceptr *, size_t *, CUmodule, const char *);
 
 CUresult cuModuleUnload(CUmodule);
 CUresult cuStreamCreate(CUstream *, unsigned);
-CUresult cuStreamDestroy_v2(CUstream);
+CUresult cuStreamDestroy(CUstream);
 CUresult cuStreamSynchronize(CUstream);
 CUresult cuCtxSetCurrent(CUcontext);
-CUresult cuDevicePrimaryCtxRelease_v2(CUdevice);
+CUresult cuDevicePrimaryCtxRelease(CUdevice);
 CUresult cuDevicePrimaryCtxGetState(CUdevice, unsigned *, int *);
-CUresult cuDevicePrimaryCtxSetFlags_v2(CUdevice, unsigned);
+CUresult cuDevicePrimary

[llvm-branch-commits] [openmp] 33e2494 - [libomptarget][amdgpu][nfc] Fix build on centos

2021-01-12 Thread Jon Chesterfield via llvm-branch-commits


Author: Jon Chesterfield
Date: 2021-01-12T19:40:03Z
New Revision: 33e2494bea653a845cb0502cc6d3cecdf2b47750

URL: 
https://github.com/llvm/llvm-project/commit/33e2494bea653a845cb0502cc6d3cecdf2b47750
DIFF: 
https://github.com/llvm/llvm-project/commit/33e2494bea653a845cb0502cc6d3cecdf2b47750.diff

LOG: [libomptarget][amdgpu][nfc] Fix build on centos

[libomptarget][amdgpu][nfc] Fix build on centos

rtl.cpp replaced 224 with a #define from elf.h, but that
doesn't work on a centos 7 build machine with an old elf.h

Reviewed By: ronlieb

Differential Revision: https://reviews.llvm.org/D94528

Added: 


Modified: 
openmp/libomptarget/plugins/amdgpu/src/rtl.cpp

Removed: 




diff  --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp 
b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
index 437846f8b15b8..bd450f9898faf 100644
--- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
@@ -638,7 +638,7 @@ void finiAsyncInfoPtr(__tgt_async_info *async_info_ptr) {
 }
 
 bool elf_machine_id_is_amdgcn(__tgt_device_image *image) {
-  const uint16_t amdgcnMachineID = EM_AMDGPU;
+  const uint16_t amdgcnMachineID = 224; // EM_AMDGPU may not be in system elf.h
   int32_t r = elf_check_machine(image, amdgcnMachineID);
   if (!r) {
 DP("Supported machine ID not found\n");



___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits

[llvm-branch-commits] [openmp] 84e0b14 - [libomptarget][nvptx] Include omp_data.cu in bitcode deviceRTL

2021-01-12 Thread Jon Chesterfield via llvm-branch-commits


Author: Jon Chesterfield
Date: 2021-01-13T03:51:11Z
New Revision: 84e0b14a0a419f26d0a2f7389e06aa8e36569808

URL: 
https://github.com/llvm/llvm-project/commit/84e0b14a0a419f26d0a2f7389e06aa8e36569808
DIFF: 
https://github.com/llvm/llvm-project/commit/84e0b14a0a419f26d0a2f7389e06aa8e36569808.diff

LOG: [libomptarget][nvptx] Include omp_data.cu in bitcode deviceRTL

[libomptarget][nvptx] Include omp_data.cu in bitcode deviceRTL

Reviewed By: tianshilei1992

Differential Revision: https://reviews.llvm.org/D94565

Added: 


Modified: 
openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt

Removed: 




diff  --git a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt 
b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
index 425c674fb11e..ea11c8114166 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
+++ b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
@@ -56,6 +56,7 @@ if(LIBOMPTARGET_DEP_CUDA_FOUND)
   ${devicertl_common_directory}/src/data_sharing.cu
   ${devicertl_common_directory}/src/libcall.cu
   ${devicertl_common_directory}/src/loop.cu
+  ${devicertl_common_directory}/src/omp_data.cu
   ${devicertl_common_directory}/src/omptarget.cu
   ${devicertl_common_directory}/src/parallel.cu
   ${devicertl_common_directory}/src/reduction.cu
@@ -65,8 +66,6 @@ if(LIBOMPTARGET_DEP_CUDA_FOUND)
   src/target_impl.cu
   )
 
-  set(omp_data_objects ${devicertl_common_directory}/src/omp_data.cu)
-
   # Build library support for the highest compute capability the system 
supports
   # and always build support for sm_35 by default
   if (${LIBOMPTARGET_DEP_CUDA_ARCH} EQUAL 35)
@@ -105,7 +104,7 @@ if(LIBOMPTARGET_DEP_CUDA_FOUND)
   set(CUDA_SEPARABLE_COMPILATION ON)
   list(APPEND CUDA_NVCC_FLAGS -I${devicertl_base_directory}
   -I${devicertl_nvptx_directory}/src)
-  cuda_add_library(omptarget-nvptx STATIC ${cuda_src_files} ${omp_data_objects}
+  cuda_add_library(omptarget-nvptx STATIC ${cuda_src_files}
   OPTIONS ${CUDA_ARCH} ${CUDA_DEBUG} ${MAX_SM_DEFINITION})
 
   # Install device RTL under the lib destination folder.



___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits

[llvm-branch-commits] [openmp] 5d165f0 - [libomptarget][amdgpu] Fix kernel launch tracing to match previous behavior

2021-01-14 Thread Jon Chesterfield via llvm-branch-commits


Author: Jon Chesterfield
Date: 2021-01-14T18:13:22Z
New Revision: 5d165f0b893d4fc5fb5caeb2b05c566dd26e4d89

URL: 
https://github.com/llvm/llvm-project/commit/5d165f0b893d4fc5fb5caeb2b05c566dd26e4d89
DIFF: 
https://github.com/llvm/llvm-project/commit/5d165f0b893d4fc5fb5caeb2b05c566dd26e4d89.diff

LOG: [libomptarget][amdgpu] Fix kernel launch tracing to match previous behavior

Restore control of kernel launch tracing to be >= 1 as it was before

export LIBOMPTARGET_KERNEL_TRACE=1

Reviewed By: JonChesterfield

Differential Revision: https://reviews.llvm.org/D94695

Added: 


Modified: 
openmp/libomptarget/plugins/amdgpu/src/rtl.cpp

Removed: 




diff  --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp 
b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
index bd450f9898fa..9453171e1378 100644
--- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
@@ -1762,7 +1762,7 @@ int32_t __tgt_rtl_run_target_team_region_locked(
 loop_tripcount, // From run_region arg
 KernelInfo->device_id);
 
-  if (print_kernel_trace == 4)
+  if (print_kernel_trace >= 1)
 // enum modes are SPMD, GENERIC, NONE 0,1,2
 fprintf(stderr,
 "DEVID:%2d SGN:%1d ConstWGSize:%-4d args:%2d teamsXthrds:(%4dX%4d) 
"



___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits

[llvm-branch-commits] [openmp] 6e7094c - [libomptarget][nvptx][nfc] Move target_impl functions out of header

2021-01-14 Thread Jon Chesterfield via llvm-branch-commits


Author: Jon Chesterfield
Date: 2021-01-15T00:19:48Z
New Revision: 6e7094c14b22a202c15959316033c164d7a84122

URL: 
https://github.com/llvm/llvm-project/commit/6e7094c14b22a202c15959316033c164d7a84122
DIFF: 
https://github.com/llvm/llvm-project/commit/6e7094c14b22a202c15959316033c164d7a84122.diff

LOG: [libomptarget][nvptx][nfc] Move target_impl functions out of header

[libomptarget][nvptx][nfc] Move target_impl functions out of header

This removes most of the differences between the two target_impl.h.

Also change name mangling from C to C++ for __kmpc_impl_*_lock.

Reviewed By: tianshilei1992

Differential Revision: https://reviews.llvm.org/D94728

Added: 


Modified: 
openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h

Removed: 




diff  --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu 
b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
index 50867bc4010a..7e81aba4152d 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
@@ -14,19 +14,135 @@
 #include "common/debug.h"
 #include "common/target_atomic.h"
 
+#include 
+
+DEVICE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) {
+  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val));
+}
+
+DEVICE uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi) {
+  uint64_t val;
+  asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo), "r"(hi));
+  return val;
+}
+
+DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt() {
+  __kmpc_impl_lanemask_t res;
+  asm("mov.u32 %0, %%lanemask_lt;" : "=r"(res));
+  return res;
+}
+
+DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt() {
+  __kmpc_impl_lanemask_t res;
+  asm("mov.u32 %0, %%lanemask_gt;" : "=r"(res));
+  return res;
+}
+
+DEVICE uint32_t __kmpc_impl_smid() {
+  uint32_t id;
+  asm("mov.u32 %0, %%smid;" : "=r"(id));
+  return id;
+}
+
+DEVICE double __kmpc_impl_get_wtick() {
+  // Timer precision is 1ns
+  return ((double)1E-9);
+}
+
+DEVICE double __kmpc_impl_get_wtime() {
+  unsigned long long nsecs;
+  asm("mov.u64  %0, %%globaltimer;" : "=l"(nsecs));
+  return (double)nsecs * __kmpc_impl_get_wtick();
+}
+
+// In Cuda 9.0, __ballot(1) from Cuda 8.0 is replaced with __activemask().
+
+DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
+#if CUDA_VERSION >= 9000
+  return __activemask();
+#else
+  return __ballot(1);
+#endif
+}
+
+// In Cuda 9.0, the *_sync() version takes an extra argument 'mask'.
+
+DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var,
+ int32_t SrcLane) {
+#if CUDA_VERSION >= 9000
+  return __shfl_sync(Mask, Var, SrcLane);
+#else
+  return __shfl(Var, SrcLane);
+#endif // CUDA_VERSION
+}
+
+DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask,
+  int32_t Var, uint32_t Delta,
+  int32_t Width) {
+#if CUDA_VERSION >= 9000
+  return __shfl_down_sync(Mask, Var, Delta, Width);
+#else
+  return __shfl_down(Var, Delta, Width);
+#endif // CUDA_VERSION
+}
+
+DEVICE void __kmpc_impl_syncthreads() {
+  // Use original __syncthreads if compiled by nvcc or clang >= 9.0.
+#if !defined(__clang__) || __clang_major__ >= 9
+  __syncthreads();
+#else
+  asm volatile("bar.sync %0;" : : "r"(0) : "memory");
+#endif // __clang__
+}
+
+DEVICE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) {
+#if CUDA_VERSION >= 9000
+  __syncwarp(Mask);
+#else
+  // In Cuda < 9.0 no need to sync threads in warps.
+#endif // CUDA_VERSION
+}
+
+// NVPTX specific kernel initialization
+DEVICE void __kmpc_impl_target_init() { /* nvptx needs no extra setup */
+}
+
+// Barrier until num_threads arrive.
+DEVICE void __kmpc_impl_named_sync(uint32_t num_threads) {
+  // The named barrier for active parallel threads of a team in an L1 parallel
+  // region to synchronize with each other.
+  int barrier = 1;
+  asm volatile("bar.sync %0, %1;"
+   :
+   : "r"(barrier), "r"(num_threads)
+   : "memory");
+}
+
+DEVICE void __kmpc_impl_threadfence() { __threadfence(); }
+DEVICE void __kmpc_impl_threadfence_block() { __threadfence_block(); }
+DEVICE void __kmpc_impl_threadfence_system() { __threadfence_system(); }
+
+// Calls to the NVPTX layer (assuming 1D layout)
+DEVICE int GetThreadIdInBlock() { return threadIdx.x; }
+DEVICE int GetBlockIdInKernel() { return blockIdx.x; }
+DEVICE int GetNumberOfBlocksInKernel() { return gridDim.x; }
+DEVICE int GetNumberOfThreadsInBlock() { return blockDim.x; }
+DEVICE unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; }
+DEVICE unsigned GetLaneId() { return GetThreadIdInBlock() & (WARPSIZE - 1); }
+
 #define __OMP_SPIN 1000
 #define UNSET 0u
 #define SET 1u
 
-EXTERN void __kmpc_impl_init_lock(omp_lock_t *

[llvm-branch-commits] [openmp] 214387c - [libomptarget][nvptx] Reduce calls to cuda header

2021-01-14 Thread Jon Chesterfield via llvm-branch-commits


Author: Jon Chesterfield
Date: 2021-01-15T02:16:33Z
New Revision: 214387c2c694c92fec713f7ad224f10c1aebc1cf

URL: 
https://github.com/llvm/llvm-project/commit/214387c2c694c92fec713f7ad224f10c1aebc1cf
DIFF: 
https://github.com/llvm/llvm-project/commit/214387c2c694c92fec713f7ad224f10c1aebc1cf.diff

LOG: [libomptarget][nvptx] Reduce calls to cuda header

[libomptarget][nvptx] Reduce calls to cuda header

Remove use of clock_t in favour of a builtin. Drop a preprocessor branch.

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D94731

Added: 


Modified: 
openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu

Removed: 




diff  --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu 
b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
index 7e81aba4152d..b68d3265a758 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
@@ -56,7 +56,6 @@ DEVICE double __kmpc_impl_get_wtime() {
 }
 
 // In Cuda 9.0, __ballot(1) from Cuda 8.0 is replaced with __activemask().
-
 DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
 #if CUDA_VERSION >= 9000
   return __activemask();
@@ -66,7 +65,6 @@ DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
 }
 
 // In Cuda 9.0, the *_sync() version takes an extra argument 'mask'.
-
 DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var,
  int32_t SrcLane) {
 #if CUDA_VERSION >= 9000
@@ -86,14 +84,7 @@ DEVICE int32_t 
__kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask,
 #endif // CUDA_VERSION
 }
 
-DEVICE void __kmpc_impl_syncthreads() {
-  // Use original __syncthreads if compiled by nvcc or clang >= 9.0.
-#if !defined(__clang__) || __clang_major__ >= 9
-  __syncthreads();
-#else
-  asm volatile("bar.sync %0;" : : "r"(0) : "memory");
-#endif // __clang__
-}
+DEVICE void __kmpc_impl_syncthreads() { __syncthreads(); }
 
 DEVICE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) {
 #if CUDA_VERSION >= 9000
@@ -145,11 +136,11 @@ DEVICE void __kmpc_impl_destroy_lock(omp_lock_t *lock) {
 DEVICE void __kmpc_impl_set_lock(omp_lock_t *lock) {
   // TODO: not sure spinning is a good idea here..
   while (__kmpc_atomic_cas(lock, UNSET, SET) != UNSET) {
-clock_t start = clock();
-clock_t now;
+int32_t start = __nvvm_read_ptx_sreg_clock();
+int32_t now;
 for (;;) {
-  now = clock();
-  clock_t cycles = now > start ? now - start : now + (0x - start);
+  now = __nvvm_read_ptx_sreg_clock();
+  int32_t cycles = now > start ? now - start : now + (0x - start);
   if (cycles >= __OMP_SPIN * GetBlockIdInKernel()) {
 break;
   }



___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits

[llvm-branch-commits] [openmp] e069662 - [libomptarget][devicertl] Wrap source in declare target pragmas

2021-01-20 Thread Jon Chesterfield via llvm-branch-commits


Author: Jon Chesterfield
Date: 2021-01-20T15:50:41Z
New Revision: e069662deb1fa167b3e5fdce4c9949e663df8082

URL: 
https://github.com/llvm/llvm-project/commit/e069662deb1fa167b3e5fdce4c9949e663df8082
DIFF: 
https://github.com/llvm/llvm-project/commit/e069662deb1fa167b3e5fdce4c9949e663df8082.diff

LOG: [libomptarget][devicertl] Wrap source in declare target pragmas

[libomptarget][devicertl] Wrap source in declare target pragmas

Factored out of D93135 / D94745. C++ and cuda ignore unknown pragmas
so this is a NFC for the current implementation language. Removes noise
from patches for building deviceRTL as openmp.

Reviewed By: tianshilei1992

Differential Revision: https://reviews.llvm.org/D95048

Added: 


Modified: 
openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_locks.hip
openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_smid.hip
openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
openmp/libomptarget/deviceRTLs/common/src/cancel.cu
openmp/libomptarget/deviceRTLs/common/src/critical.cu
openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu
openmp/libomptarget/deviceRTLs/common/src/libcall.cu
openmp/libomptarget/deviceRTLs/common/src/loop.cu
openmp/libomptarget/deviceRTLs/common/src/omp_data.cu
openmp/libomptarget/deviceRTLs/common/src/omptarget.cu
openmp/libomptarget/deviceRTLs/common/src/parallel.cu
openmp/libomptarget/deviceRTLs/common/src/reduction.cu
openmp/libomptarget/deviceRTLs/common/src/support.cu
openmp/libomptarget/deviceRTLs/common/src/sync.cu
openmp/libomptarget/deviceRTLs/common/src/task.cu
openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu

Removed: 




diff  --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_locks.hip 
b/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_locks.hip
index 4163a14f50bf..f537fb28318c 100644
--- a/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_locks.hip
+++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_locks.hip
@@ -14,6 +14,7 @@
 // a SIMD => wavefront mapping once that is implemented.
 //
 
//===--===//
+#pragma omp declare target
 
 #include "common/debug.h"
 
@@ -26,3 +27,5 @@ DEVICE void __kmpc_impl_destroy_lock(omp_lock_t *) { warn(); }
 DEVICE void __kmpc_impl_set_lock(omp_lock_t *) { warn(); }
 DEVICE void __kmpc_impl_unset_lock(omp_lock_t *) { warn(); }
 DEVICE int __kmpc_impl_test_lock(omp_lock_t *lock) { warn(); }
+
+#pragma omp end declare target

diff  --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_smid.hip 
b/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_smid.hip
index 74d0d167137f..c85045570de2 100644
--- a/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_smid.hip
+++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_smid.hip
@@ -5,6 +5,7 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 
//===--===//
+#pragma omp declare target
 
 #include "target_impl.h"
 
@@ -59,3 +60,5 @@ DEVICE uint32_t __kmpc_impl_smid() {
   ENCODE_HWREG(HW_ID_SE_ID_SIZE, HW_ID_SE_ID_OFFSET, HW_ID));
   return (se_id << HW_ID_CU_ID_SIZE) + cu_id;
 }
+
+#pragma omp end declare target

diff  --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip 
b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
index 3e70beb85d5b..7388a29215cc 100644
--- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
+++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
@@ -9,6 +9,7 @@
 // Definitions of target specific functions
 //
 
//===--===//
+#pragma omp declare target
 
 #include "target_impl.h"
 
@@ -151,3 +152,5 @@ EXTERN uint32_t __kmpc_amdgcn_gpu_num_threads() {
 // Stub implementations
 DEVICE void *__kmpc_impl_malloc(size_t) { return nullptr; }
 DEVICE void __kmpc_impl_free(void *) {}
+
+#pragma omp end declare target

diff  --git a/openmp/libomptarget/deviceRTLs/common/src/cancel.cu 
b/openmp/libomptarget/deviceRTLs/common/src/cancel.cu
index 9540f5647699..0e5cd2b60554 100644
--- a/openmp/libomptarget/deviceRTLs/common/src/cancel.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/cancel.cu
@@ -9,6 +9,7 @@
 // Interface to be used in the implementation of OpenMP cancel.
 //
 
//===--===//
+#pragma omp declare target
 
 #include "interface.h"
 #include "common/debug.h"
@@ -26,3 +27,5 @@ EXTERN int32_t __kmpc_cancel(kmp_Ident *loc, int32_t 
global_tid,
   // disabled
   return 0;
 }
+
+#pragma omp end declare target

diff  --git a/openmp/libomptarget/deviceRTLs/common/src/critical.cu 
b/openmp/libomptarget/deviceRTLs/common/src/critical.cu
index ee4b056ddad9..3fd89c50aa46 100644
--- a/openmp/libomptarget/deviceRTLs/common/src/critical.cu
+++ b/openmp/libomptarget/deviceRTLs/

[llvm-branch-commits] [openmp] ea616f9 - [libomptarget][devicertl][nfc] Remove some cuda intrinsics, simplify

2021-01-20 Thread Jon Chesterfield via llvm-branch-commits


Author: Jon Chesterfield
Date: 2021-01-20T19:45:05Z
New Revision: ea616f9026dc6bd9c67ebe2d3226ac91122a7945

URL: 
https://github.com/llvm/llvm-project/commit/ea616f9026dc6bd9c67ebe2d3226ac91122a7945
DIFF: 
https://github.com/llvm/llvm-project/commit/ea616f9026dc6bd9c67ebe2d3226ac91122a7945.diff

LOG: [libomptarget][devicertl][nfc] Remove some cuda intrinsics, simplify

[libomptarget][devicertl][nfc] Remove some cuda intrinsics, simplify

Replace __popc, __ffs with clang intrinsics. Move kmpc_impl_min to only file
that uses it and replace template with explictly typed.

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D95060

Added: 


Modified: 
openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
openmp/libomptarget/deviceRTLs/common/src/reduction.cu
openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h

Removed: 




diff  --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h 
b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
index d25ea8559c05..b1e9a1a9403a 100644
--- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
+++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
@@ -65,6 +65,10 @@ enum DATA_SHARING_SIZES {
   DS_Max_Warp_Number = 16,
 };
 
+enum : __kmpc_impl_lanemask_t {
+  __kmpc_impl_all_lanes = ~(__kmpc_impl_lanemask_t)0
+};
+
 INLINE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) {
   lo = (uint32_t)(val & UINT64_C(0x));
   hi = (uint32_t)((val & UINT64_C(0x)) >> 32);
@@ -74,28 +78,15 @@ INLINE uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi) {
   return (((uint64_t)hi) << 32) | (uint64_t)lo;
 }
 
-enum : __kmpc_impl_lanemask_t {
-  __kmpc_impl_all_lanes = ~(__kmpc_impl_lanemask_t)0
-};
-
 DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt();
-
 DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt();
-
 DEVICE uint32_t __kmpc_impl_smid();
-
 DEVICE double __kmpc_impl_get_wtick();
-
 DEVICE double __kmpc_impl_get_wtime();
 
 INLINE uint64_t __kmpc_impl_ffs(uint64_t x) { return __builtin_ffsl(x); }
-
 INLINE uint64_t __kmpc_impl_popc(uint64_t x) { return __builtin_popcountl(x); }
 
-template  INLINE T __kmpc_impl_min(T x, T y) {
-  return x < y ? x : y;
-}
-
 DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask();
 
 DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t, int32_t Var,

diff  --git a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu 
b/openmp/libomptarget/deviceRTLs/common/src/reduction.cu
index 92b34d77bd8a..3a3c44503f34 100644
--- a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/reduction.cu
@@ -184,6 +184,8 @@ INLINE static uint32_t roundToWarpsize(uint32_t s) {
   return (s & ~(unsigned)(WARPSIZE - 1));
 }
 
+INLINE static uint32_t kmpcMin(uint32_t x, uint32_t y) { return x < y ? x : y; 
}
+
 DEVICE static volatile uint32_t IterCnt = 0;
 DEVICE static volatile uint32_t Cnt = 0;
 EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
@@ -261,14 +263,14 @@ EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
   // by returning 1 in the thread holding the reduction result.
 
   // Check if this is the very last team.
-  unsigned NumRecs = __kmpc_impl_min(NumTeams, uint32_t(num_of_records));
+  unsigned NumRecs = kmpcMin(NumTeams, uint32_t(num_of_records));
   if (ChunkTeamCount == NumTeams - Bound - 1) {
 //
 // Last team processing.
 //
 if (ThreadId >= NumRecs)
   return 0;
-NumThreads = roundToWarpsize(__kmpc_impl_min(NumThreads, NumRecs));
+NumThreads = roundToWarpsize(kmpcMin(NumThreads, NumRecs));
 if (ThreadId >= NumThreads)
   return 0;
 
@@ -283,7 +285,7 @@ EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
 
   // When we have more than [warpsize] number of threads
   // a block reduction is performed here.
-  uint32_t ActiveThreads = __kmpc_impl_min(NumRecs, NumThreads);
+  uint32_t ActiveThreads = kmpcMin(NumRecs, NumThreads);
   if (ActiveThreads > WARPSIZE) {
 uint32_t WarpsNeeded = (ActiveThreads + WARPSIZE - 1) / WARPSIZE;
 // Gather all the reduced values from each warp

diff  --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h 
b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
index 8382cd6aaf47..ab9fd1697f14 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
@@ -93,13 +93,8 @@ DEVICE uint32_t __kmpc_impl_smid();
 DEVICE double __kmpc_impl_get_wtick();
 DEVICE double __kmpc_impl_get_wtime();
 
-INLINE uint32_t __kmpc_impl_ffs(uint32_t x) { return __ffs(x); }
-
-INLINE uint32_t __kmpc_impl_popc(uint32_t x) { return __popc(x); }
-
-template  INLINE T __kmpc_impl_min(T x, T y) {
-  return min(x, y);
-}
+INLINE uint32_t __kmpc_impl_ffs(uint32_t x) { return __builtin_ffs(x); }
+INLINE uint32_t __kmpc_impl_popc(ui

[llvm-branch-commits] [openmp] fbc1dcb - [libomptarget][devicertl][nfc] Simplify target_atomic abstraction

2021-01-20 Thread Jon Chesterfield via llvm-branch-commits


Author: Jon Chesterfield
Date: 2021-01-20T19:50:50Z
New Revision: fbc1dcb946553a3dc923a63288d9275eea86f918

URL: 
https://github.com/llvm/llvm-project/commit/fbc1dcb946553a3dc923a63288d9275eea86f918
DIFF: 
https://github.com/llvm/llvm-project/commit/fbc1dcb946553a3dc923a63288d9275eea86f918.diff

LOG: [libomptarget][devicertl][nfc] Simplify target_atomic abstraction

[libomptarget][devicertl][nfc] Simplify target_atomic abstraction

Atomic functions were implemented as a shim around cuda's atomics, with
amdgcn implementing those symbols as a shim around gcc style intrinsics.

This patch folds target_atomic.h into target_impl.h and folds amdgcn.

Further work is likely to be useful here, either changing to openmp's atomic
interface or instantiating the templates on the few used types in order to
move them into a cuda/c++ implementation file. This change is mostly to
group the remaining uses of the cuda api under nvptx' target_impl abstraction.

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D95062

Added: 


Modified: 
openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt
openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
openmp/libomptarget/deviceRTLs/common/omptargeti.h
openmp/libomptarget/deviceRTLs/common/src/libcall.cu
openmp/libomptarget/deviceRTLs/common/src/loop.cu
openmp/libomptarget/deviceRTLs/common/src/reduction.cu
openmp/libomptarget/deviceRTLs/common/state-queuei.h
openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h

Removed: 
openmp/libomptarget/deviceRTLs/amdgcn/src/hip_atomics.h
openmp/libomptarget/deviceRTLs/common/target_atomic.h



diff  --git a/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt 
b/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt
index 8bb395f1126f..8d9abe5d0bbd 100644
--- a/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt
+++ b/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt
@@ -73,14 +73,12 @@ set(cuda_sources
 
 set(h_files
   ${CMAKE_CURRENT_SOURCE_DIR}/src/amdgcn_interface.h
-  ${CMAKE_CURRENT_SOURCE_DIR}/src/hip_atomics.h
   ${CMAKE_CURRENT_SOURCE_DIR}/src/target_impl.h
   ${devicertl_base_directory}/common/debug.h
   ${devicertl_base_directory}/common/device_environment.h
   ${devicertl_base_directory}/common/omptarget.h
   ${devicertl_base_directory}/common/omptargeti.h
   ${devicertl_base_directory}/common/state-queue.h
-  ${devicertl_base_directory}/common/target_atomic.h
   ${devicertl_base_directory}/common/state-queuei.h
   ${devicertl_base_directory}/common/support.h)
 

diff  --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/hip_atomics.h 
b/openmp/libomptarget/deviceRTLs/amdgcn/src/hip_atomics.h
deleted file mode 100644
index 04e80b945070..
--- a/openmp/libomptarget/deviceRTLs/amdgcn/src/hip_atomics.h
+++ /dev/null
@@ -1,41 +0,0 @@
-//=== hip_atomics.h - Declarations of hip atomic functions  C++ 
-*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===--===//
-
-#ifndef OMPTARGET_AMDGCN_HIP_ATOMICS_H
-#define OMPTARGET_AMDGCN_HIP_ATOMICS_H
-
-#include "target_impl.h"
-
-namespace {
-
-template  DEVICE T atomicAdd(T *address, T val) {
-  return __atomic_fetch_add(address, val, __ATOMIC_SEQ_CST);
-}
-
-template  DEVICE T atomicMax(T *address, T val) {
-  return __atomic_fetch_max(address, val, __ATOMIC_SEQ_CST);
-}
-
-template  DEVICE T atomicExch(T *address, T val) {
-  T r;
-  __atomic_exchange(address, &val, &r, __ATOMIC_SEQ_CST);
-  return r;
-}
-
-template  DEVICE T atomicCAS(T *address, T compare, T val) {
-  (void)__atomic_compare_exchange(address, &compare, &val, false,
-  __ATOMIC_SEQ_CST, __ATOMIC_RELAXED);
-  return compare;
-}
-
-INLINE uint32_t atomicInc(uint32_t *address, uint32_t max) {
-  return __builtin_amdgcn_atomic_inc32(address, max, __ATOMIC_SEQ_CST, "");
-}
-
-} // namespace
-#endif

diff  --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h 
b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
index b1e9a1a9403a..6e8a651bd886 100644
--- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
+++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
@@ -29,8 +29,6 @@
 #define SHARED __attribute__((shared))
 #define ALIGN(N) __attribute__((aligned(N)))
 
-#include "hip_atomics.h"
-
 

 // Kernel options
 

@@ -127,6 +125,31 @@ DEVICE int GetNumberOfThreadsInBlock();
 DEVICE unsigned GetWarpId();
 DEVICE unsigned GetLaneId();
 
+// Atomics
+temp

[llvm-branch-commits] [openmp] 9b19ecb - [libomptarget][devicertl] Drop templated atomic functions

2021-01-22 Thread Jon Chesterfield via llvm-branch-commits


Author: Jon Chesterfield
Date: 2021-01-22T14:48:22Z
New Revision: 9b19ecb8f1ec7acbcfd6f0e4f3cbd6902570105d

URL: 
https://github.com/llvm/llvm-project/commit/9b19ecb8f1ec7acbcfd6f0e4f3cbd6902570105d
DIFF: 
https://github.com/llvm/llvm-project/commit/9b19ecb8f1ec7acbcfd6f0e4f3cbd6902570105d.diff

LOG: [libomptarget][devicertl] Drop templated atomic functions

[libomptarget][devicertl] Drop templated atomic functions

The five __kmpc_atomic templates are instantiated a total of seven times.
This change replaces the template with explictly typed functions, which
have the same prototype for amdgcn and nvptx, and implements them with
the same code presently in use.

Rolls in the accepted but not yet landed D95085.

The unsigned long long type can be replaced with uint64_t when replacing
the cuda function. Until then, clang warns on casting a pointer to one to
a pointer to the other.

Reviewed By: tianshilei1992

Differential Revision: https://reviews.llvm.org/D95093

Added: 


Modified: 
openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h

Removed: 




diff  --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h 
b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
index 6e8a651bd886..228d3f6e556d 100644
--- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
+++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
@@ -126,29 +126,17 @@ DEVICE unsigned GetWarpId();
 DEVICE unsigned GetLaneId();
 
 // Atomics
-template  INLINE T __kmpc_atomic_add(T *address, T val) {
-  return __atomic_fetch_add(address, val, __ATOMIC_SEQ_CST);
-}
-
-INLINE uint32_t __kmpc_atomic_inc(uint32_t *address, uint32_t max) {
-  return __builtin_amdgcn_atomic_inc32(address, max, __ATOMIC_SEQ_CST, "");
-}
-
-template  INLINE T __kmpc_atomic_max(T *address, T val) {
-  return __atomic_fetch_max(address, val, __ATOMIC_SEQ_CST);
-}
-
-template  INLINE T __kmpc_atomic_exchange(T *address, T val) {
-  T r;
-  __atomic_exchange(address, &val, &r, __ATOMIC_SEQ_CST);
-  return r;
-}
-
-template  INLINE T __kmpc_atomic_cas(T *address, T compare, T val) 
{
-  (void)__atomic_compare_exchange(address, &compare, &val, false,
-  __ATOMIC_SEQ_CST, __ATOMIC_RELAXED);
-  return compare;
-}
+DEVICE uint32_t __kmpc_atomic_add(uint32_t *, uint32_t);
+DEVICE uint32_t __kmpc_atomic_inc(uint32_t *, uint32_t);
+DEVICE uint32_t __kmpc_atomic_max(uint32_t *, uint32_t);
+DEVICE uint32_t __kmpc_atomic_exchange(uint32_t *, uint32_t);
+DEVICE uint32_t __kmpc_atomic_cas(uint32_t *, uint32_t, uint32_t);
+
+static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
+DEVICE unsigned long long __kmpc_atomic_exchange(unsigned long long *,
+ unsigned long long);
+DEVICE unsigned long long __kmpc_atomic_add(unsigned long long *,
+unsigned long long);
 
 // Locks
 DEVICE void __kmpc_impl_init_lock(omp_lock_t *lock);

diff  --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip 
b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
index 7388a29215cc..35828cda0e06 100644
--- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
+++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
@@ -132,11 +132,13 @@ DEVICE uint32_t get_workgroup_dim(uint32_t group_id, 
uint32_t grid_size,
 } // namespace
 
 DEVICE int GetNumberOfBlocksInKernel() {
-  return get_grid_dim(__builtin_amdgcn_grid_size_x(), 
__builtin_amdgcn_workgroup_size_x());
+  return get_grid_dim(__builtin_amdgcn_grid_size_x(),
+  __builtin_amdgcn_workgroup_size_x());
 }
 
 DEVICE int GetNumberOfThreadsInBlock() {
-  return get_workgroup_dim(__builtin_amdgcn_workgroup_id_x(), 
__builtin_amdgcn_grid_size_x(),
+  return get_workgroup_dim(__builtin_amdgcn_workgroup_id_x(),
+   __builtin_amdgcn_grid_size_x(),
__builtin_amdgcn_workgroup_size_x());
 }
 
@@ -149,6 +151,40 @@ EXTERN uint32_t __kmpc_amdgcn_gpu_num_threads() {
   return GetNumberOfThreadsInBlock();
 }
 
+// Atomics
+DEVICE uint32_t __kmpc_atomic_add(uint32_t *Address, uint32_t Val) {
+  return __atomic_fetch_add(Address, Val, __ATOMIC_SEQ_CST);
+}
+DEVICE uint32_t __kmpc_atomic_inc(uint32_t *Address, uint32_t Val) {
+  return __builtin_amdgcn_atomic_inc32(Address, max, __ATOMIC_SEQ_CST, "");
+}
+DEVICE uint32_t __kmpc_atomic_max(uint32_t *Address, uint32_t Val) {
+  return __atomic_fetch_max(Address, Val, __ATOMIC_SEQ_CST);
+}
+
+DEVICE uint32_t __kmpc_atomic_exchange(uint32_t *Address, uint32_t Val) {
+  uint32_t R;
+  __atomic_exchange(Address, &Val, &R, __ATOMIC_SEQ_CST);
+  return R;
+}
+DEVICE uint32_t __kmpc_atomic_cas(uint32_t *Ad

[llvm-branch-commits] [openmp] 47e95e8 - [libomptarget] Build cuda plugin without cuda installed locally

2021-01-22 Thread Jon Chesterfield via llvm-branch-commits


Author: Jon Chesterfield
Date: 2021-01-23T00:15:04Z
New Revision: 47e95e87a3e4f738635ff965616d4e2d96bf838a

URL: 
https://github.com/llvm/llvm-project/commit/47e95e87a3e4f738635ff965616d4e2d96bf838a
DIFF: 
https://github.com/llvm/llvm-project/commit/47e95e87a3e4f738635ff965616d4e2d96bf838a.diff

LOG: [libomptarget] Build cuda plugin without cuda installed locally

[libomptarget] Build cuda plugin without cuda installed locally

Compiles a new file, `plugins/cuda/dynamic_cuda/cuda.cpp`, to an object file 
that exposes the same symbols that the plugin presently uses from libcuda. The 
object file contains dlopen of libcuda and cached dlsym calls. Also provides a 
cuda.h containing the subset that is used.

This lets the cmake file choose between the system cuda and a dlopen shim, with 
no changes to rtl.cpp.

The corresponding change to amdgpu is postponed until after a refactor of the 
plugin to reduce the size of the hsa.h stub required

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D95155

Added: 
openmp/libomptarget/include/dlwrap.h
openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.cpp
openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h

Modified: 
openmp/libomptarget/plugins/cuda/CMakeLists.txt

Removed: 




diff  --git a/openmp/libomptarget/include/dlwrap.h 
b/openmp/libomptarget/include/dlwrap.h
new file mode 100644
index ..9e078b34ca57
--- /dev/null
+++ b/openmp/libomptarget/include/dlwrap.h
@@ -0,0 +1,277 @@
+//===--- dlwrap.h - Convenience wrapper around dlopen/dlsym  -- C++ 
-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+//
+// The openmp plugins depend on extern libraries. These can be used via:
+//  - bitcode file statically linked
+//  - (relocatable) object file statically linked
+//  - static library
+//  - dynamic library, linked at build time
+//  - dynamic library, loaded at application run time by dlopen
+//
+// This file factors out most boilerplate for using a dlopened library.
+// - Function symbols are generated that are statically linked against
+// - The dlopen can be done implicitly when initializing the library
+// - dlsym lookups are done once and cached
+// - The abstraction is very thin to permit varied uses of the library
+//
+// Given int foo(char, double, void*);, writing DLWRAP(foo, 3) will expand to:
+// int foo(char x0, double x1, void* x2) {
+//   constexpr size_t index = id();
+//   void * dlsymResult = pointer(index);
+//   return ((int (*)(char, double, void*))dlsymResult)(x0, x1, x2);
+// }
+//
+// Multiple calls to DLWRAP(symbol_name, arity) with bespoke
+// initialization code that can use the thin abstraction:
+// namespace dlwrap {
+//   static size_t size();
+//   static const char *symbol(size_t);
+//   static void **pointer(size_t);
+// }
+// will compile to an object file that only exposes the symbols that the
+// dynamic library would do, with the right function types.
+//
+//===--===//
+
+#ifndef DLWRAP_H_INCLUDED
+#define DLWRAP_H_INCLUDED
+
+#include 
+#include 
+#include 
+#include 
+
+// Where symbol is a function, these expand to some book keeping and an
+// implementation of that function
+#define DLWRAP(SYMBOL, ARITY) DLWRAP_IMPL(SYMBOL, ARITY)
+#define DLWRAP_INTERNAL(SYMBOL, ARITY) DLWRAP_INTERNAL_IMPL(SYMBOL, ARITY)
+
+// For example, given a prototype:
+// int foo(char, double);
+//
+// DLWRAP(foo, 2) expands to:
+//
+// namespace dlwrap {
+// struct foo_Trait : public dlwrap::trait {
+//   using T = dlwrap::trait;
+//   static T::FunctionType get() {
+// constexpr size_t Index = getIndex();
+// void *P = *dlwrap::pointer(Index);
+// return reinterpret_cast(P);
+//   }
+// };
+// }
+// int foo(char x0, double x1) { return dlwrap::foo_Trait::get()(x0, x1); }
+//
+// DLWRAP_INTERNAL is similar, except the function it expands to is:
+// static int dlwrap_foo(char x0, double x1) { ... }
+// so that the function pointer call can be wrapped in library-specific code
+
+// DLWRAP_FINALIZE() expands to definitions of:
+#define DLWRAP_FINALIZE() DLWRAP_FINALIZE_IMPL()
+namespace dlwrap {
+static size_t size();
+static const char *symbol(size_t); // get symbol name in [0, size())
+static void **pointer(size_t); // get pointer to function pointer in [0, 
size())
+} // namespace dlwrap
+
+// Implementation details follow.
+
+namespace dlwrap {
+
+// Extract return / argument types from address of function symbol
+template  struct trait;
+template  struct trait {
+  constexpr static const size_t nargs = sizeof...(Ts);
+  typedef R ReturnType;
+  template  struct arg {
+typedef type

[llvm-branch-commits] [openmp] 76bfbb7 - [libomptarget][amdgpu] Call into deviceRTL instead of ockl

2021-01-04 Thread Jon Chesterfield via llvm-branch-commits


Author: Jon Chesterfield
Date: 2021-01-04T16:48:47Z
New Revision: 76bfbb74d38b611f150e8e1a4becc11be95703da

URL: 
https://github.com/llvm/llvm-project/commit/76bfbb74d38b611f150e8e1a4becc11be95703da
DIFF: 
https://github.com/llvm/llvm-project/commit/76bfbb74d38b611f150e8e1a4becc11be95703da.diff

LOG: [libomptarget][amdgpu] Call into deviceRTL instead of ockl

[libomptarget][amdgpu] Call into deviceRTL instead of ockl

Amdgpu codegen presently emits a call into ockl. The same functionality
is already present in the deviceRTL. Adds an amdgpu specific entry point
to avoid the dependency. This lets simple openmp code (specifically, that
which doesn't use libm) run without rocm device libraries installed.

Reviewed By: ronlieb

Differential Revision: https://reviews.llvm.org/D93356

Added: 


Modified: 
clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp
clang/test/OpenMP/amdgcn_target_codegen.cpp
openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_interface.h
openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip

Removed: 




diff  --git a/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp 
b/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp
index ccffdf43549f..33d4ab838af1 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp
@@ -49,13 +49,12 @@ llvm::Value 
*CGOpenMPRuntimeAMDGCN::getGPUThreadID(CodeGenFunction &CGF) {
 llvm::Value *CGOpenMPRuntimeAMDGCN::getGPUNumThreads(CodeGenFunction &CGF) {
   CGBuilderTy &Bld = CGF.Builder;
   llvm::Module *M = &CGF.CGM.getModule();
-  const char *LocSize = "__ockl_get_local_size";
+  const char *LocSize = "__kmpc_amdgcn_gpu_num_threads";
   llvm::Function *F = M->getFunction(LocSize);
   if (!F) {
 F = llvm::Function::Create(
-llvm::FunctionType::get(CGF.Int64Ty, {CGF.Int32Ty}, false),
+llvm::FunctionType::get(CGF.Int32Ty, llvm::None, false),
 llvm::GlobalVariable::ExternalLinkage, LocSize, &CGF.CGM.getModule());
   }
-  return Bld.CreateTrunc(
-  Bld.CreateCall(F, {Bld.getInt32(0)}, "nvptx_num_threads"), CGF.Int32Ty);
+  return Bld.CreateCall(F, llvm::None, "nvptx_num_threads");
 }

diff  --git a/clang/test/OpenMP/amdgcn_target_codegen.cpp 
b/clang/test/OpenMP/amdgcn_target_codegen.cpp
index 85ef69942a50..416ed06083b0 100644
--- a/clang/test/OpenMP/amdgcn_target_codegen.cpp
+++ b/clang/test/OpenMP/amdgcn_target_codegen.cpp
@@ -13,9 +13,8 @@ int test_amdgcn_target_tid_threads() {
 
   int arr[N];
 
-// CHECK: [[NUM_THREADS:%.+]] = call i64 @__ockl_get_local_size(i32 0)
-// CHECK-NEXT: [[VAR:%.+]] = trunc i64 [[NUM_THREADS]] to i32
-// CHECK-NEXT: sub nuw i32 [[VAR]], 64
+// CHECK: [[NUM_THREADS:%.+]] = call i32 @__kmpc_amdgcn_gpu_num_threads()
+// CHECK: sub nuw i32 [[NUM_THREADS]], 64
 // CHECK: call i32 @llvm.amdgcn.workitem.id.x()
 #pragma omp target
   for (int i = 0; i < N; i++) {
@@ -30,9 +29,8 @@ int test_amdgcn_target_tid_threads_simd() {
 
   int arr[N];
 
-// CHECK: [[NUM_THREADS:%.+]] = call i64 @__ockl_get_local_size(i32 0)
-// CHECK-NEXT: [[VAR:%.+]] = trunc i64 [[NUM_THREADS]] to i32
-// CHECK-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[VAR]], i16 0)
+// CHECK: [[NUM_THREADS:%.+]] = call i32 @__kmpc_amdgcn_gpu_num_threads()
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[NUM_THREADS]], i16 0)
 #pragma omp target simd
   for (int i = 0; i < N; i++) {
 arr[i] = 1;

diff  --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_interface.h 
b/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_interface.h
index f7c75c09362a..80409d611f6f 100644
--- a/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_interface.h
+++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_interface.h
@@ -15,4 +15,6 @@
 typedef uint64_t __kmpc_impl_lanemask_t;
 typedef uint32_t omp_lock_t; /* arbitrary type of the right length */
 
+EXTERN uint32_t __kmpc_amdgcn_gpu_num_threads();
+
 #endif

diff  --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip 
b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
index 9fbdc67b56ab..3e70beb85d5b 100644
--- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
+++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
@@ -144,6 +144,10 @@ DEVICE unsigned GetLaneId() {
   return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
 }
 
+EXTERN uint32_t __kmpc_amdgcn_gpu_num_threads() {
+  return GetNumberOfThreadsInBlock();
+}
+
 // Stub implementations
-DEVICE void *__kmpc_impl_malloc(size_t ) { return nullptr }
+DEVICE void *__kmpc_impl_malloc(size_t) { return nullptr; }
 DEVICE void __kmpc_impl_free(void *) {}



___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits

[llvm-branch-commits] [clang] 4b2e7d0 - [amdgpu] Default to code object v3

2020-12-14 Thread Jon Chesterfield via llvm-branch-commits


Author: Jon Chesterfield
Date: 2020-12-15T01:11:09Z
New Revision: 4b2e7d0215021d0d1df1a6319884b21d33936265

URL: 
https://github.com/llvm/llvm-project/commit/4b2e7d0215021d0d1df1a6319884b21d33936265
DIFF: 
https://github.com/llvm/llvm-project/commit/4b2e7d0215021d0d1df1a6319884b21d33936265.diff

LOG: [amdgpu] Default to code object v3

[amdgpu] Default to code object v3
v4 is not yet readily available, and doesn't appear
to be implemented in the back end

Reviewed By: t-tye

Differential Revision: https://reviews.llvm.org/D93258

Added: 


Modified: 
clang/include/clang/Driver/Options.td
clang/lib/Driver/ToolChains/CommonArgs.cpp
llvm/docs/AMDGPUUsage.rst

Removed: 




diff  --git a/clang/include/clang/Driver/Options.td 
b/clang/include/clang/Driver/Options.td
index 67d41c3711f5..87c786065fa9 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -2811,7 +2811,7 @@ def mexec_model_EQ : Joined<["-"], "mexec-model=">, 
Group;
 
 def mcode_object_version_EQ : Joined<["-"], "mcode-object-version=">, 
Group,
-  HelpText<"Specify code object ABI version. Defaults to 4. (AMDGPU only)">,
+  HelpText<"Specify code object ABI version. Defaults to 3. (AMDGPU only)">,
   MetaVarName<"">, Values<"2,3,4">;
 
 def mcode_object_v3_legacy : Flag<["-"], "mcode-object-v3">, Group,

diff  --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp 
b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index 72bedc16846d..04d0e0771f70 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -1549,7 +1549,7 @@ unsigned tools::getOrCheckAMDGPUCodeObjectVersion(
 const Driver &D, const llvm::opt::ArgList &Args, bool Diagnose) {
   const unsigned MinCodeObjVer = 2;
   const unsigned MaxCodeObjVer = 4;
-  unsigned CodeObjVer = 4;
+  unsigned CodeObjVer = 3;
 
   // Emit warnings for legacy options even if they are overridden.
   if (Diagnose) {

diff  --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index e5d081a37500..95fb164310cc 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -911,12 +911,12 @@ The AMDGPU backend uses the following ELF header:
 
   * ``ELFABIVERSION_AMDGPU_HSA_V3`` is used to specify the version of AMD HSA
 runtime ABI for code object V3. Specify using the Clang option
-``-mcode-object-version=3``.
+``-mcode-object-version=3``. This is the default code object
+version if not specified.
 
   * ``ELFABIVERSION_AMDGPU_HSA_V4`` is used to specify the version of AMD HSA
 runtime ABI for code object V4. Specify using the Clang option
-``-mcode-object-version=4``. This is the default code object
-version if not specified.
+``-mcode-object-version=4``.
 
   * ``ELFABIVERSION_AMDGPU_PAL`` is used to specify the version of AMD PAL
 runtime ABI.
@@ -2871,10 +2871,6 @@ non-AMD key names should be prefixed by "*vendor-name*.".
 Code Object V3 Metadata
 +++
 
-.. warning::
-  Code object V3 is not the default code object version emitted by this version
-  of LLVM.
-
 Code object V3 to V4 metadata is specified by the ``NT_AMDGPU_METADATA`` note
 record (see :ref:`amdgpu-note-records-v3-v4`).
 
@@ -3279,6 +3275,10 @@ same *vendor-name*.
 Code Object V4 Metadata
 +++
 
+.. warning::
+  Code object V4 is not the default code object version emitted by this version
+  of LLVM.
+
 Code object V4 metadata is the same as
 :ref:`amdgpu-amdhsa-code-object-metadata-v3` with the changes and additions
 defined in table :ref:`amdgpu-amdhsa-code-object-metadata-map-table-v3`.



___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits

[llvm-branch-commits] [openmp] b607837 - [libomptarget][nfc] Replace static const with enum

2020-12-16 Thread Jon Chesterfield via llvm-branch-commits


Author: Jon Chesterfield
Date: 2020-12-16T16:40:37Z
New Revision: b607837c75d04cc007dcf855983dfa3b69f63d73

URL: 
https://github.com/llvm/llvm-project/commit/b607837c75d04cc007dcf855983dfa3b69f63d73
DIFF: 
https://github.com/llvm/llvm-project/commit/b607837c75d04cc007dcf855983dfa3b69f63d73.diff

LOG: [libomptarget][nfc] Replace static const with enum

[libomptarget][nfc] Replace static const with enum

Semantically identical. Replaces 0xff... with ~0 to spare counting the f.
Has the advantage that the compiler doesn't need to prove the 4/8 byte
value dead before discarding it, and sidesteps the compilation question
associated with what static means for a single source language.

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D93328

Added: 


Modified: 
openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h

Removed: 




diff  --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h 
b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
index 34794587e0fe..d25ea8559c05 100644
--- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
+++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
@@ -74,8 +74,9 @@ INLINE uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi) {
   return (((uint64_t)hi) << 32) | (uint64_t)lo;
 }
 
-static const __kmpc_impl_lanemask_t __kmpc_impl_all_lanes =
-UINT64_C(0x);
+enum : __kmpc_impl_lanemask_t {
+  __kmpc_impl_all_lanes = ~(__kmpc_impl_lanemask_t)0
+};
 
 DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt();
 

diff  --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h 
b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
index 46ce751c44c4..411e1676b7c7 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
@@ -91,8 +91,9 @@ INLINE uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi) {
   return val;
 }
 
-static const __kmpc_impl_lanemask_t __kmpc_impl_all_lanes =
-UINT32_C(0x);
+enum : __kmpc_impl_lanemask_t {
+  __kmpc_impl_all_lanes = ~(__kmpc_impl_lanemask_t)0
+};
 
 INLINE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt() {
   __kmpc_impl_lanemask_t res;



___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits

[llvm-branch-commits] [clang] c0619d3 - [NFC] Use regex for code object version in hip tests

2020-12-16 Thread Jon Chesterfield via llvm-branch-commits


Author: Jon Chesterfield
Date: 2020-12-16T17:00:19Z
New Revision: c0619d3b21cd420b9faf15f14db0816787c44ded

URL: 
https://github.com/llvm/llvm-project/commit/c0619d3b21cd420b9faf15f14db0816787c44ded
DIFF: 
https://github.com/llvm/llvm-project/commit/c0619d3b21cd420b9faf15f14db0816787c44ded.diff

LOG: [NFC] Use regex for code object version in hip tests

[NFC] Use regex for code object version in hip tests

Extracted from D93258. Makes tests robust to changes in default
code object version.

Reviewed By: t-tye

Differential Revision: https://reviews.llvm.org/D93398

Added: 


Modified: 
clang/test/Driver/hip-autolink.hip
clang/test/Driver/hip-code-object-version.hip
clang/test/Driver/hip-device-compile.hip
clang/test/Driver/hip-host-cpu-features.hip
clang/test/Driver/hip-rdc-device-only.hip
clang/test/Driver/hip-target-id.hip
clang/test/Driver/hip-toolchain-mllvm.hip
clang/test/Driver/hip-toolchain-no-rdc.hip
clang/test/Driver/hip-toolchain-opt.hip
clang/test/Driver/hip-toolchain-rdc-separate.hip
clang/test/Driver/hip-toolchain-rdc-static-lib.hip
clang/test/Driver/hip-toolchain-rdc.hip

Removed: 




diff  --git a/clang/test/Driver/hip-autolink.hip 
b/clang/test/Driver/hip-autolink.hip
index 073c6c4d244a..5f9311d7ba73 100644
--- a/clang/test/Driver/hip-autolink.hip
+++ b/clang/test/Driver/hip-autolink.hip
@@ -7,7 +7,7 @@
 // RUN: %clang --target=i386-pc-windows-msvc --cuda-gpu-arch=gfx906 -nogpulib \
 // RUN:   --cuda-host-only %s -### 2>&1 | FileCheck --check-prefix=HOST %s
 
-// DEV: "-cc1" "-mllvm" "--amdhsa-code-object-version=4" "-triple" 
"amdgcn-amd-amdhsa"
+// DEV: "-cc1" "-mllvm" "--amdhsa-code-object-version={{[0-9]+}}" "-triple" 
"amdgcn-amd-amdhsa"
 // DEV-SAME: "-fno-autolink"
 
 // HOST: "-cc1" "-triple" "i386-pc-windows-msvc{{.*}}"

diff  --git a/clang/test/Driver/hip-code-object-version.hip 
b/clang/test/Driver/hip-code-object-version.hip
index 26ad6f8710cc..51d9004b0cbf 100644
--- a/clang/test/Driver/hip-code-object-version.hip
+++ b/clang/test/Driver/hip-code-object-version.hip
@@ -44,12 +44,17 @@
 // RUN:   --offload-arch=gfx906 -nogpulib \
 // RUN:   %s 2>&1 | FileCheck -check-prefix=V4 %s
 
+// V4: "-mllvm" "--amdhsa-code-object-version=4"
+// V4: "-targets=host-x86_64-unknown-linux,hip-amdgcn-amd-amdhsa--gfx906"
+
+// Check bundle ID for code object version default
+
 // RUN: %clang -### -target x86_64-linux-gnu \
 // RUN:   --offload-arch=gfx906 -nogpulib \
-// RUN:   %s 2>&1 | FileCheck -check-prefix=V4 %s
+// RUN:   %s 2>&1 | FileCheck -check-prefix=VD %s
 
-// V4: "-mllvm" "--amdhsa-code-object-version=4"
-// V4: "-targets=host-x86_64-unknown-linux,hip-amdgcn-amd-amdhsa--gfx906"
+// VD: "-mllvm" "--amdhsa-code-object-version=4"
+// VD: "-targets=host-x86_64-unknown-linux,hip-amdgcn-amd-amdhsa--gfx906"
 
 // Check invalid code object version option.
 

diff  --git a/clang/test/Driver/hip-device-compile.hip 
b/clang/test/Driver/hip-device-compile.hip
index 5fbcbc97bd80..c460ff7e8c67 100644
--- a/clang/test/Driver/hip-device-compile.hip
+++ b/clang/test/Driver/hip-device-compile.hip
@@ -26,7 +26,7 @@
 // RUN:   %S/Inputs/hip_multiple_inputs/a.cu \
 // RUN: 2>&1 | FileCheck -check-prefixes=CHECK,ASM %s
 
-// CHECK: {{".*clang.*"}} "-cc1" "-mllvm" "--amdhsa-code-object-version=4" 
"-triple" "amdgcn-amd-amdhsa"
+// CHECK: {{".*clang.*"}} "-cc1" "-mllvm" 
"--amdhsa-code-object-version={{[0-9]+}}" "-triple" "amdgcn-amd-amdhsa"
 // CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu"
 // BC-SAME: "-emit-llvm-bc"
 // LL-SAME: "-emit-llvm"

diff  --git a/clang/test/Driver/hip-host-cpu-features.hip 
b/clang/test/Driver/hip-host-cpu-features.hip
index 235f0f1f22c2..8addfb11dc0b 100644
--- a/clang/test/Driver/hip-host-cpu-features.hip
+++ b/clang/test/Driver/hip-host-cpu-features.hip
@@ -6,14 +6,14 @@
 // RUN: %clang -### -c -target x86_64-linux-gnu -msse3 --cuda-gpu-arch=gfx803 
-nogpulib %s 2>&1 | FileCheck %s -check-prefix=HOSTSSE3
 // RUN: %clang -### -c -target x86_64-linux-gnu --gpu-use-aux-triple-only 
-march=znver2 --cuda-gpu-arch=gfx803 -nogpulib %s 2>&1 | FileCheck %s 
-check-prefix=NOHOSTCPU
 
-// HOSTCPU: "-cc1" "-mllvm" "--amdhsa-code-object-version=4" "-triple" 
"amdgcn-amd-amdhsa"
+// HOSTCPU: "-cc1" "-mllvm" "--amdhsa-code-object-version={{[0-9]+}}" 
"-triple" "amdgcn-amd-amdhsa"
 // HOSTCPU-SAME: "-aux-triple" "x86_64-unknown-linux-gnu"
 // HOSTCPU-SAME: "-aux-target-cpu" "znver2"
 
-// HOSTSSE3: "-cc1" "-mllvm" "--amdhsa-code-object-version=4" "-triple" 
"amdgcn-amd-amdhsa"
+// HOSTSSE3: "-cc1" "-mllvm" "--amdhsa-code-object-version={{[0-9]+}}" 
"-triple" "amdgcn-amd-amdhsa"
 // HOSTSSE3-SAME: "-aux-triple" "x86_64-unknown-linux-gnu"
 // HOSTSSE3-SAME: "-aux-target-feature" "+sse3"
 
-// NOHOSTCPU: "-cc1" "-mllvm" "--amdhsa-code-object-version=4" "-triple" 
"amdgcn-amd-amdhsa"
+// NOHOSTCPU: "-cc1" "-mllvm" "--amdhsa-code-object-version={{[0-9

[llvm-branch-commits] [clang] daf39e3 - [amdgpu] Default to code object v3

2020-12-17 Thread Jon Chesterfield via llvm-branch-commits


Author: Jon Chesterfield
Date: 2020-12-17T16:09:33Z
New Revision: daf39e3f2dba18bd39cd89a1c91bae126a31d4fe

URL: 
https://github.com/llvm/llvm-project/commit/daf39e3f2dba18bd39cd89a1c91bae126a31d4fe
DIFF: 
https://github.com/llvm/llvm-project/commit/daf39e3f2dba18bd39cd89a1c91bae126a31d4fe.diff

LOG: [amdgpu] Default to code object v3

[amdgpu] Default to code object v3
v4 is not yet readily available, and doesn't appear
to be implemented in the back end

Reviewed By: t-tye, yaxunl

Differential Revision: https://reviews.llvm.org/D93258

Added: 


Modified: 
clang/include/clang/Driver/Options.td
clang/lib/Driver/ToolChains/CommonArgs.cpp
clang/test/Driver/hip-code-object-version.hip
llvm/docs/AMDGPUUsage.rst

Removed: 




diff  --git a/clang/include/clang/Driver/Options.td 
b/clang/include/clang/Driver/Options.td
index f384e0d993c2..07f15add28ec 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -2909,7 +2909,7 @@ def mexec_model_EQ : Joined<["-"], "mexec-model=">, 
Group;
 
 def mcode_object_version_EQ : Joined<["-"], "mcode-object-version=">, 
Group,
-  HelpText<"Specify code object ABI version. Defaults to 4. (AMDGPU only)">,
+  HelpText<"Specify code object ABI version. Defaults to 3. (AMDGPU only)">,
   MetaVarName<"">, Values<"2,3,4">;
 
 def mcode_object_v3_legacy : Flag<["-"], "mcode-object-v3">, Group,

diff  --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp 
b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index 72bedc16846d..04d0e0771f70 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -1549,7 +1549,7 @@ unsigned tools::getOrCheckAMDGPUCodeObjectVersion(
 const Driver &D, const llvm::opt::ArgList &Args, bool Diagnose) {
   const unsigned MinCodeObjVer = 2;
   const unsigned MaxCodeObjVer = 4;
-  unsigned CodeObjVer = 4;
+  unsigned CodeObjVer = 3;
 
   // Emit warnings for legacy options even if they are overridden.
   if (Diagnose) {

diff  --git a/clang/test/Driver/hip-code-object-version.hip 
b/clang/test/Driver/hip-code-object-version.hip
index 51d9004b0cbf..6e4e96688593 100644
--- a/clang/test/Driver/hip-code-object-version.hip
+++ b/clang/test/Driver/hip-code-object-version.hip
@@ -53,7 +53,7 @@
 // RUN:   --offload-arch=gfx906 -nogpulib \
 // RUN:   %s 2>&1 | FileCheck -check-prefix=VD %s
 
-// VD: "-mllvm" "--amdhsa-code-object-version=4"
+// VD: "-mllvm" "--amdhsa-code-object-version=3"
 // VD: "-targets=host-x86_64-unknown-linux,hip-amdgcn-amd-amdhsa--gfx906"
 
 // Check invalid code object version option.

diff  --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 6d3fa7021a7a..c8dda47352ab 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -911,12 +911,12 @@ The AMDGPU backend uses the following ELF header:
 
   * ``ELFABIVERSION_AMDGPU_HSA_V3`` is used to specify the version of AMD HSA
 runtime ABI for code object V3. Specify using the Clang option
-``-mcode-object-version=3``.
+``-mcode-object-version=3``. This is the default code object
+version if not specified.
 
   * ``ELFABIVERSION_AMDGPU_HSA_V4`` is used to specify the version of AMD HSA
 runtime ABI for code object V4. Specify using the Clang option
-``-mcode-object-version=4``. This is the default code object
-version if not specified.
+``-mcode-object-version=4``.
 
   * ``ELFABIVERSION_AMDGPU_PAL`` is used to specify the version of AMD PAL
 runtime ABI.
@@ -2871,10 +2871,6 @@ non-AMD key names should be prefixed by "*vendor-name*.".
 Code Object V3 Metadata
 +++
 
-.. warning::
-  Code object V3 is not the default code object version emitted by this version
-  of LLVM.
-
 Code object V3 to V4 metadata is specified by the ``NT_AMDGPU_METADATA`` note
 record (see :ref:`amdgpu-note-records-v3-v4`).
 
@@ -3279,6 +3275,10 @@ same *vendor-name*.
 Code Object V4 Metadata
 +++
 
+.. warning::
+  Code object V4 is not the default code object version emitted by this version
+  of LLVM.
+
 Code object V4 metadata is the same as
 :ref:`amdgpu-amdhsa-code-object-metadata-v3` with the changes and additions
 defined in table :ref:`amdgpu-amdhsa-code-object-metadata-map-table-v3`.



___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits

[llvm-branch-commits] [openmp] 89a0f48 - [libomptarget][cuda] Detect missing symbols in plugin at build time

2020-11-27 Thread Jon Chesterfield via llvm-branch-commits


Author: Jon Chesterfield
Date: 2020-11-27T15:39:41Z
New Revision: 89a0f48c58f82262c7ce2b9ca51ffad0ffc559ea

URL: 
https://github.com/llvm/llvm-project/commit/89a0f48c58f82262c7ce2b9ca51ffad0ffc559ea
DIFF: 
https://github.com/llvm/llvm-project/commit/89a0f48c58f82262c7ce2b9ca51ffad0ffc559ea.diff

LOG: [libomptarget][cuda] Detect missing symbols in plugin at build time

[libomptarget][cuda] Detect missing symbols in plugin at build time

Passes -z,defs to the linker. Error on unresolved symbol references.

Otherwise, those unresolved symbols present as target code running on the host
as the plugin fails to load. This is significantly harder to debug than a link
time error. Flag matches that passed by amdgcn and ve plugins.

Reviewed By: tianshilei1992

Differential Revision: https://reviews.llvm.org/D92143

Added: 


Modified: 
openmp/libomptarget/plugins/cuda/CMakeLists.txt

Removed: 




diff  --git a/openmp/libomptarget/plugins/cuda/CMakeLists.txt 
b/openmp/libomptarget/plugins/cuda/CMakeLists.txt
index 54bcdf26e9e6..e0299b1f3270 100644
--- a/openmp/libomptarget/plugins/cuda/CMakeLists.txt
+++ b/openmp/libomptarget/plugins/cuda/CMakeLists.txt
@@ -39,7 +39,8 @@ install(TARGETS omptarget.rtl.cuda LIBRARY DESTINATION 
"${OPENMP_INSTALL_LIBDIR}
 target_link_libraries(omptarget.rtl.cuda
   ${LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES}
   ${LIBOMPTARGET_DEP_LIBELF_LIBRARIES}
-  "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports")
+  "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports"
+  "-Wl,-z,defs")
 
 # Report to the parent scope that we are building a plugin for CUDA.
 set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS} 
nvptx64-nvidia-cuda" PARENT_SCOPE)



___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits

[llvm-branch-commits] [openmp] ae9d96a - [libomptarget][amdgpu] Address compiler warnings, drive by fixes

2020-12-03 Thread Jon Chesterfield via llvm-branch-commits


Author: Jon Chesterfield
Date: 2020-12-03T11:09:12Z
New Revision: ae9d96a656a17fa782ccaa9ba10d4570f497e855

URL: 
https://github.com/llvm/llvm-project/commit/ae9d96a656a17fa782ccaa9ba10d4570f497e855
DIFF: 
https://github.com/llvm/llvm-project/commit/ae9d96a656a17fa782ccaa9ba10d4570f497e855.diff

LOG: [libomptarget][amdgpu] Address compiler warnings, drive by fixes

[libomptarget][amdgpu] Address compiler warnings, drive by fixes

Initialize some variables, remove unused ones.
Changes the debug printing condition to align with the aomp test suite.

Differential Revision: https://reviews.llvm.org/D92559

Added: 


Modified: 
openmp/libomptarget/plugins/amdgpu/src/rtl.cpp

Removed: 




diff  --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp 
b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
index 477439d19b50..f22b4697f30b 100644
--- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
@@ -184,7 +184,7 @@ struct KernelTy {
   int8_t ExecutionMode;
   int16_t ConstWGSize;
   int32_t device_id;
-  void *CallStackAddr;
+  void *CallStackAddr = nullptr;
   const char *Name;
 
   KernelTy(int8_t _ExecutionMode, int16_t _ConstWGSize, int32_t _device_id,
@@ -322,7 +322,8 @@ class RTLDeviceInfoTy {
   std::vector, uint64_t>>
   deviceStateStore;
 
-  static const int HardTeamLimit = 1 << 20; // 1 Meg
+  static const unsigned HardTeamLimit =
+  (1 << 16) - 1; // 64K needed to fit in uint16
   static const int DefaultNumTeams = 128;
   static const int Max_Teams =
   llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Max_Teams];
@@ -648,7 +649,7 @@ int32_t __tgt_rtl_init_device(int device_id) {
 DeviceInfo.ComputeUnits[device_id] = compute_units;
 DP("Using %d compute unis per grid\n", DeviceInfo.ComputeUnits[device_id]);
   }
-  if (print_kernel_trace > 1)
+  if (print_kernel_trace == 4)
 fprintf(stderr, "Device#%-2d CU's: %2d\n", device_id,
 DeviceInfo.ComputeUnits[device_id]);
 
@@ -926,6 +927,27 @@ __tgt_target_table *__tgt_rtl_load_binary(int32_t 
device_id,
 
 __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id,
  __tgt_device_image *image) {
+  // This function loads the device image onto gpu[device_id] and does other
+  // per-image initialization work. Specifically:
+  //
+  // - Initialize an omptarget_device_environmentTy instance embedded in the
+  //   image at the symbol "omptarget_device_environment"
+  //   Fields debug_level, device_num, num_devices. Used by the deviceRTL.
+  //
+  // - Allocate a large array per-gpu (could be moved to init_device)
+  //   - Read a uint64_t at symbol omptarget_nvptx_device_State_size
+  //   - Allocate at least that many bytes of gpu memory
+  //   - Zero initialize it
+  //   - Write the pointer to the symbol omptarget_nvptx_device_State
+  //
+  // - Pulls some per-kernel information together from various sources and
+  //   records it in the KernelsList for quicker access later
+  //
+  // The initialization can be done before or after loading the image onto the
+  // gpu. This function presently does a mixture. Using the hsa api to get/set
+  // the information is simpler to implement, in exchange for more complicated
+  // runtime behaviour. E.g. launching a kernel or using dma to get eight bytes
+  // back from the gpu vs a hashtable lookup on the host.
 
   const size_t img_size = (char *)image->ImageEnd - (char *)image->ImageStart;
 
@@ -962,7 +984,7 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t 
device_id,
 if (si.size != sizeof(host_device_env)) {
   return ATMI_STATUS_ERROR;
 }
-DP("Setting global device environment %lu bytes\n", si.size);
+DP("Setting global device environment %u bytes\n", si.size);
 uint64_t offset = (char *)si.addr - (char *)image->ImageStart;
 void *pos = (char *)data + offset;
 memcpy(pos, &host_device_env, sizeof(host_device_env));
@@ -1145,7 +1167,6 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t 
device_id,
   uint16_t TSize;
   uint16_t WG_Size;
   uint8_t Mode;
-  uint8_t HostServices;
 };
 struct KernDescValType KernDescVal;
 std::string KernDescNameStr(e->name);
@@ -1154,7 +1175,7 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t 
device_id,
 
 void *KernDescPtr;
 uint32_t KernDescSize;
-void *CallStackAddr;
+void *CallStackAddr = nullptr;
 err = interop_get_symbol_info((char *)image->ImageStart, img_size,
   KernDescName, &KernDescPtr, &KernDescSize);
 
@@ -1176,7 +1197,6 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t 
device_id,
   DP("KernDesc: TSize: %d\n", KernDescVal.TSize);
   DP("KernDesc: WG_Size: %d\n", KernDescVal.WG_Size);
   DP("KernDesc: Mode: %d\n", KernDescVal.Mode);
-  DP("KernDesc: HostServices:

[llvm-branch-commits] [openmp] f628eef - [libomptarget][amdgpu] Fix latent race in load binary

2020-12-04 Thread Jon Chesterfield via llvm-branch-commits


Author: Jon Chesterfield
Date: 2020-12-04T16:29:09Z
New Revision: f628eef98acd24f8eb6a52d67ee887bb18f04bca

URL: 
https://github.com/llvm/llvm-project/commit/f628eef98acd24f8eb6a52d67ee887bb18f04bca
DIFF: 
https://github.com/llvm/llvm-project/commit/f628eef98acd24f8eb6a52d67ee887bb18f04bca.diff

LOG: [libomptarget][amdgpu] Fix latent race in load binary

Added: 


Modified: 
openmp/libomptarget/plugins/amdgpu/src/rtl.cpp

Removed: 




diff  --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp 
b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
index f22b4697f30b..ea8770e4543a 100644
--- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
@@ -925,6 +925,26 @@ __tgt_target_table *__tgt_rtl_load_binary(int32_t 
device_id,
   return res;
 }
 
+static atmi_status_t atmi_calloc(void **ret_ptr, size_t size,
+ atmi_mem_place_t place) {
+  uint64_t rounded = 4 * ((size + 3) / 4);
+  void *ptr;
+  atmi_status_t err = atmi_malloc(&ptr, rounded, place);
+  if (err != ATMI_STATUS_SUCCESS) {
+return err;
+  }
+
+  hsa_status_t rc = hsa_amd_memory_fill(ptr, 0, rounded / 4);
+  if (rc != HSA_STATUS_SUCCESS) {
+fprintf(stderr, "zero fill device_state failed with %u\n", rc);
+atmi_free(ptr);
+return ATMI_STATUS_ERROR;
+  }
+
+  *ret_ptr = ptr;
+  return ATMI_STATUS_SUCCESS;
+}
+
 __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id,
  __tgt_device_image *image) {
   // This function loads the device image onto gpu[device_id] and does other
@@ -1024,7 +1044,7 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t 
device_id,
   assert(dss.second == 0);
   void *ptr = NULL;
   atmi_status_t err =
-  atmi_malloc(&ptr, device_State_bytes, get_gpu_mem_place(device_id));
+  atmi_calloc(&ptr, device_State_bytes, get_gpu_mem_place(device_id));
   if (err != ATMI_STATUS_SUCCESS) {
 fprintf(stderr, "Failed to allocate device_state array\n");
 return NULL;
@@ -1062,13 +1082,6 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t 
device_id,
   fprintf(stderr, "memcpy install of state_ptr failed\n");
   return NULL;
 }
-
-assert((device_State_bytes & 0x3) == 0); // known >= 4 byte aligned
-hsa_status_t rc = hsa_amd_memory_fill(ptr, 0, device_State_bytes / 4);
-if (rc != HSA_STATUS_SUCCESS) {
-  fprintf(stderr, "zero fill device_state failed with %u\n", rc);
-  return NULL;
-}
   }
 
   // TODO: Check with Guansong to understand the below comment more thoroughly.



___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits

[llvm-branch-commits] [openmp] e1b8e8a - [libomptarget][amdgpu] Skip device_State allocation when using bss global

2020-12-06 Thread Jon Chesterfield via llvm-branch-commits


Author: Jon Chesterfield
Date: 2020-12-06T12:13:56Z
New Revision: e1b8e8a1f4c35c8596956d56ffc9f1d91b64f780

URL: 
https://github.com/llvm/llvm-project/commit/e1b8e8a1f4c35c8596956d56ffc9f1d91b64f780
DIFF: 
https://github.com/llvm/llvm-project/commit/e1b8e8a1f4c35c8596956d56ffc9f1d91b64f780.diff

LOG: [libomptarget][amdgpu] Skip device_State allocation when using bss global

Added: 


Modified: 
openmp/libomptarget/plugins/amdgpu/src/rtl.cpp

Removed: 




diff  --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp 
b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
index ea8770e4543a..e688ef7f41ec 100644
--- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
@@ -1033,54 +1033,64 @@ __tgt_target_table 
*__tgt_rtl_load_binary_locked(int32_t device_id,
 
   DP("ATMI module successfully loaded!\n");
 
-  // Zero the pseudo-bss variable by calling into hsa
-  // Do this post-load to handle got
-  uint64_t device_State_bytes =
-  get_device_State_bytes((char *)image->ImageStart, img_size);
-  auto &dss = DeviceInfo.deviceStateStore[device_id];
-  if (device_State_bytes != 0) {
-
-if (dss.first.get() == nullptr) {
-  assert(dss.second == 0);
-  void *ptr = NULL;
-  atmi_status_t err =
-  atmi_calloc(&ptr, device_State_bytes, get_gpu_mem_place(device_id));
-  if (err != ATMI_STATUS_SUCCESS) {
-fprintf(stderr, "Failed to allocate device_state array\n");
-return NULL;
-  }
-  dss = {std::unique_ptr{ptr},
- device_State_bytes};
-}
-
-void *ptr = dss.first.get();
-if (device_State_bytes != dss.second) {
-  fprintf(stderr, "Inconsistent sizes of device_State unsupported\n");
-  exit(1);
-}
+  {
+// the device_State array is either large value in bss or a void* that
+// needs to be assigned to a pointer to an array of size device_state_bytes
 
 void *state_ptr;
 uint32_t state_ptr_size;
-err = atmi_interop_hsa_get_symbol_info(get_gpu_mem_place(device_id),
-   "omptarget_nvptx_device_State",
-   &state_ptr, &state_ptr_size);
+atmi_status_t err = atmi_interop_hsa_get_symbol_info(
+get_gpu_mem_place(device_id), "omptarget_nvptx_device_State",
+&state_ptr, &state_ptr_size);
 
 if (err != ATMI_STATUS_SUCCESS) {
-  fprintf(stderr, "failed to find device_state ptr\n");
+  fprintf(stderr, "failed to find device_state symbol\n");
   return NULL;
 }
-if (state_ptr_size != sizeof(void *)) {
+
+if (state_ptr_size < sizeof(void *)) {
   fprintf(stderr, "unexpected size of state_ptr %u != %zu\n",
   state_ptr_size, sizeof(void *));
   return NULL;
 }
 
-// write ptr to device memory so it can be used by later kernels
-err = DeviceInfo.freesignalpool_memcpy_h2d(state_ptr, &ptr, sizeof(void *),
-   device_id);
-if (err != ATMI_STATUS_SUCCESS) {
-  fprintf(stderr, "memcpy install of state_ptr failed\n");
-  return NULL;
+// if it's larger than a void*, assume it's a bss array and no further
+// initialization is required. Only try to set up a pointer for
+// sizeof(void*)
+if (state_ptr_size == sizeof(void *)) {
+  uint64_t device_State_bytes =
+  get_device_State_bytes((char *)image->ImageStart, img_size);
+  if (device_State_bytes == 0) {
+return NULL;
+  }
+
+  auto &dss = DeviceInfo.deviceStateStore[device_id];
+  if (dss.first.get() == nullptr) {
+assert(dss.second == 0);
+void *ptr = NULL;
+atmi_status_t err =
+atmi_calloc(&ptr, device_State_bytes, 
get_gpu_mem_place(device_id));
+if (err != ATMI_STATUS_SUCCESS) {
+  fprintf(stderr, "Failed to allocate device_state array\n");
+  return NULL;
+}
+dss = {std::unique_ptr{ptr},
+   device_State_bytes};
+  }
+
+  void *ptr = dss.first.get();
+  if (device_State_bytes != dss.second) {
+fprintf(stderr, "Inconsistent sizes of device_State unsupported\n");
+exit(1);
+  }
+
+  // write ptr to device memory so it can be used by later kernels
+  err = DeviceInfo.freesignalpool_memcpy_h2d(state_ptr, &ptr,
+ sizeof(void *), device_id);
+  if (err != ATMI_STATUS_SUCCESS) {
+fprintf(stderr, "memcpy install of state_ptr failed\n");
+return NULL;
+  }
 }
   }
 



___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits

[llvm-branch-commits] [openmp] 71f4693 - [libomptarget][amdgpu] Add plumbing to call into hostrpc lib, if linked

2020-12-07 Thread Jon Chesterfield via llvm-branch-commits


Author: Jon Chesterfield
Date: 2020-12-07T15:24:01Z
New Revision: 71f469302079baeb552b29c6959ac884da101102

URL: 
https://github.com/llvm/llvm-project/commit/71f469302079baeb552b29c6959ac884da101102
DIFF: 
https://github.com/llvm/llvm-project/commit/71f469302079baeb552b29c6959ac884da101102.diff

LOG: [libomptarget][amdgpu] Add plumbing to call into hostrpc lib, if linked

Added: 


Modified: 
openmp/libomptarget/plugins/amdgpu/src/rtl.cpp

Removed: 




diff  --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp 
b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
index e688ef7f41ec..252abca08944 100644
--- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
@@ -45,6 +45,29 @@
 #endif
 #define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"
 
+// hostrpc interface, FIXME: consider moving to its own include these are
+// statically linked into amdgpu/plugin if present from hostrpc_services.a,
+// linked as --whole-archive to override the weak symbols that are used to
+// implement a fallback for toolchains that do not yet have a hostrpc library.
+extern "C" {
+unsigned long hostrpc_assign_buffer(hsa_agent_t agent, hsa_queue_t *this_Q,
+uint32_t device_id);
+hsa_status_t hostrpc_init();
+hsa_status_t hostrpc_terminate();
+
+__attribute__((weak)) hsa_status_t hostrpc_init() { return HSA_STATUS_SUCCESS; 
}
+__attribute__((weak)) hsa_status_t hostrpc_terminate() {
+  return HSA_STATUS_SUCCESS;
+}
+__attribute__((weak)) unsigned long
+hostrpc_assign_buffer(hsa_agent_t, hsa_queue_t *, uint32_t device_id) {
+  DP("Warning: Attempting to assign hostrpc to device %u, but hostrpc library "
+ "missing\n",
+ device_id);
+  return 0;
+}
+}
+
 int print_kernel_trace;
 
 // Size of the target call stack struture
@@ -431,6 +454,8 @@ class RTLDeviceInfoTy {
   DP("Error when initializing HSA-ATMI\n");
   return;
 }
+// Init hostcall soon after initializing ATMI
+hostrpc_init();
 
 HSAAgents = find_gpu_agents();
 NumberOfDevices = (int)HSAAgents.size();
@@ -520,6 +545,8 @@ class RTLDeviceInfoTy {
 // atmi_finalize removes access to it
 deviceStateStore.clear();
 KernelArgPoolMap.clear();
+// Terminate hostrpc before finalizing ATMI
+hostrpc_terminate();
 atmi_finalize();
   }
 };
@@ -1540,6 +1567,8 @@ static uint64_t acquire_available_packet_id(hsa_queue_t 
*queue) {
   return packet_id;
 }
 
+extern bool g_atmi_hostcall_required; // declared without header by atmi
+
 static int32_t __tgt_rtl_run_target_team_region_locked(
 int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
 ptr
diff _t *tgt_offsets, int32_t arg_num, int32_t num_teams,
@@ -1683,6 +1712,22 @@ int32_t __tgt_rtl_run_target_team_region_locked(
   impl_args->offset_y = 0;
   impl_args->offset_z = 0;
 
+  // assign a hostcall buffer for the selected Q
+  if (g_atmi_hostcall_required) {
+// hostrpc_assign_buffer is not thread safe, and this function is
+// under a multiple reader lock, not a writer lock.
+static pthread_mutex_t hostcall_init_lock = PTHREAD_MUTEX_INITIALIZER;
+pthread_mutex_lock(&hostcall_init_lock);
+impl_args->hostcall_ptr = hostrpc_assign_buffer(
+DeviceInfo.HSAAgents[device_id], queue, device_id);
+pthread_mutex_unlock(&hostcall_init_lock);
+if (!impl_args->hostcall_ptr) {
+  DP("hostrpc_assign_buffer failed, gpu would dereference null and "
+ "error\n");
+  return OFFLOAD_FAIL;
+}
+  }
+
   packet->kernarg_address = kernarg;
 }
 



___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits

[llvm-branch-commits] [openmp] cab9f69 - [libomptarget][amdgpu] Improve diagnostics on arch mismatch

2020-12-09 Thread Jon Chesterfield via llvm-branch-commits


Author: Jon Chesterfield
Date: 2020-12-09T18:55:53Z
New Revision: cab9f6923522475e0d2137c66622c3fa70b01d3b

URL: 
https://github.com/llvm/llvm-project/commit/cab9f6923522475e0d2137c66622c3fa70b01d3b
DIFF: 
https://github.com/llvm/llvm-project/commit/cab9f6923522475e0d2137c66622c3fa70b01d3b.diff

LOG: [libomptarget][amdgpu] Improve diagnostics on arch mismatch

Added: 
openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.cpp
openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.h

Modified: 
openmp/libomptarget/plugins/amdgpu/CMakeLists.txt
openmp/libomptarget/plugins/amdgpu/src/rtl.cpp

Removed: 




diff  --git a/openmp/libomptarget/plugins/amdgpu/CMakeLists.txt 
b/openmp/libomptarget/plugins/amdgpu/CMakeLists.txt
index 0c50ffdf2fa6..38f0afabf3ad 100644
--- a/openmp/libomptarget/plugins/amdgpu/CMakeLists.txt
+++ b/openmp/libomptarget/plugins/amdgpu/CMakeLists.txt
@@ -57,6 +57,7 @@ add_library(omptarget.rtl.amdgpu SHARED
   impl/atmi.cpp
   impl/atmi_interop_hsa.cpp
   impl/data.cpp
+  impl/get_elf_mach_gfx_name.cpp
   impl/machine.cpp
   impl/system.cpp
   impl/utils.cpp

diff  --git a/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.cpp 
b/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.cpp
new file mode 100644
index ..45af34684117
--- /dev/null
+++ b/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.cpp
@@ -0,0 +1,53 @@
+#include "get_elf_mach_gfx_name.h"
+
+// This header conflicts with the system elf.h (macros vs enums of the same
+// identifier) and contains more up to date values for the enum checked here.
+// rtl.cpp uses the system elf.h.
+#include "llvm/BinaryFormat/ELF.h"
+
+const char *get_elf_mach_gfx_name(uint32_t EFlags) {
+  using namespace llvm::ELF;
+  uint32_t Gfx = (EFlags & EF_AMDGPU_MACH);
+  switch (Gfx) {
+  case EF_AMDGPU_MACH_AMDGCN_GFX801:
+return "gfx801";
+  case EF_AMDGPU_MACH_AMDGCN_GFX802:
+return "gfx802";
+  case EF_AMDGPU_MACH_AMDGCN_GFX803:
+return "gfx803";
+  case EF_AMDGPU_MACH_AMDGCN_GFX805:
+return "gfx805";
+  case EF_AMDGPU_MACH_AMDGCN_GFX810:
+return "gfx810";
+  case EF_AMDGPU_MACH_AMDGCN_GFX900:
+return "gfx900";
+  case EF_AMDGPU_MACH_AMDGCN_GFX902:
+return "gfx902";
+  case EF_AMDGPU_MACH_AMDGCN_GFX904:
+return "gfx904";
+  case EF_AMDGPU_MACH_AMDGCN_GFX906:
+return "gfx906";
+  case EF_AMDGPU_MACH_AMDGCN_GFX908:
+return "gfx908";
+  case EF_AMDGPU_MACH_AMDGCN_GFX909:
+return "gfx909";
+  case EF_AMDGPU_MACH_AMDGCN_GFX90C:
+return "gfx90c";
+  case EF_AMDGPU_MACH_AMDGCN_GFX1010:
+return "gfx1010";
+  case EF_AMDGPU_MACH_AMDGCN_GFX1011:
+return "gfx1011";
+  case EF_AMDGPU_MACH_AMDGCN_GFX1012:
+return "gfx1012";
+  case EF_AMDGPU_MACH_AMDGCN_GFX1030:
+return "gfx1030";
+  case EF_AMDGPU_MACH_AMDGCN_GFX1031:
+return "gfx1031";
+  case EF_AMDGPU_MACH_AMDGCN_GFX1032:
+return "gfx1032";
+  case EF_AMDGPU_MACH_AMDGCN_GFX1033:
+return "gfx1033";
+  default:
+return "--unknown gfx";
+  }
+}

diff  --git a/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.h 
b/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.h
new file mode 100644
index ..b1be90dc29d5
--- /dev/null
+++ b/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.h
@@ -0,0 +1,8 @@
+#ifndef GET_ELF_MACH_GFX_NAME_H_INCLUDED
+#define GET_ELF_MACH_GFX_NAME_H_INCLUDED
+
+#include 
+
+const char *get_elf_mach_gfx_name(uint32_t EFlags);
+
+#endif

diff  --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp 
b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
index 252abca08944..60040d1c0da4 100644
--- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
@@ -36,6 +36,7 @@
 #include "internal.h"
 
 #include "Debug.h"
+#include "get_elf_mach_gfx_name.h"
 #include "omptargetplugin.h"
 
 #include "llvm/Frontend/OpenMP/OMPGridValues.h"
@@ -92,14 +93,6 @@ uint32_t TgtStackItemSize = 0;
 
 #include "../../common/elf_common.c"
 
-static bool elf_machine_id_is_amdgcn(__tgt_device_image *image) {
-  const uint16_t amdgcnMachineID = 224;
-  int32_t r = elf_check_machine(image, amdgcnMachineID);
-  if (!r) {
-DP("Supported machine ID not found\n");
-  }
-  return r;
-}
 
 /// Keep entries table per device
 struct FuncOrGblEntryTy {
@@ -319,6 +312,7 @@ class RTLDeviceInfoTy {
   std::vector GroupsPerDevice;
   std::vector ThreadsPerGroup;
   std::vector WarpSize;
+  std::vector GPUName;
 
   // OpenMP properties
   std::vector NumTeams;
@@ -472,6 +466,7 @@ class RTLDeviceInfoTy {
 FuncGblEntries.resize(NumberOfDevices);
 ThreadsPerGroup.resize(NumberOfDevices);
 ComputeUnits.resize(NumberOfDevices);
+GPUName.resize(NumberOfDevices);
 GroupsPerDevice.resize(NumberOfDevices);
 WarpSize.resize(NumberOfDevices);
 NumTeams.resize(NumberOfD

[llvm-branch-commits] [openmp] e191d31 - [libomptarget][amdgpu] Robust handling of device_environment symbol

2020-12-09 Thread Jon Chesterfield via llvm-branch-commits


Author: Jon Chesterfield
Date: 2020-12-09T19:21:51Z
New Revision: e191d3115921d9b5b6602747bff72a1f2cf565c4

URL: 
https://github.com/llvm/llvm-project/commit/e191d3115921d9b5b6602747bff72a1f2cf565c4
DIFF: 
https://github.com/llvm/llvm-project/commit/e191d3115921d9b5b6602747bff72a1f2cf565c4.diff

LOG: [libomptarget][amdgpu] Robust handling of device_environment symbol

Added: 


Modified: 
openmp/libomptarget/plugins/amdgpu/src/rtl.cpp

Removed: 




diff  --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp 
b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
index 60040d1c0da4d..e13d769a16aad 100644
--- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
@@ -891,6 +891,7 @@ const Elf64_Sym *elf_lookup(Elf *elf, char *base, 
Elf64_Shdr *section_hash,
 typedef struct {
   void *addr = nullptr;
   uint32_t size = UINT32_MAX;
+  uint32_t sh_type = SHT_NULL;
 } symbol_info;
 
 int get_symbol_info_without_loading(Elf *elf, char *base, const char *symname,
@@ -913,8 +914,23 @@ int get_symbol_info_without_loading(Elf *elf, char *base, 
const char *symname,
 return 1;
   }
 
-  res->size = static_cast(sym->st_size);
+  if (sym->st_shndx == SHN_UNDEF) {
+return 1;
+  }
+
+  Elf_Scn *section = elf_getscn(elf, sym->st_shndx);
+  if (!section) {
+return 1;
+  }
+
+  Elf64_Shdr *header = elf64_getshdr(section);
+  if (!header) {
+return 1;
+  }
+
   res->addr = sym->st_value + base;
+  res->size = static_cast(sym->st_size);
+  res->sh_type = header->sh_type;
   return 0;
 }
 
@@ -992,6 +1008,99 @@ __tgt_target_table *__tgt_rtl_load_binary(int32_t 
device_id,
   return res;
 }
 
+struct device_environment {
+  // initialise an omptarget_device_environmentTy in the deviceRTL
+  // patches around 
diff erences in the deviceRTL between trunk, aomp,
+  // rocmcc. Over time these 
diff erences will tend to zero and this class
+  // simplified.
+  // Symbol may be in .data or .bss, and may be missing fields:
+  //  - aomp has debug_level, num_devices, device_num
+  //  - trunk has debug_level
+  //  - under review in trunk is debug_level, device_num
+  //  - rocmcc matches aomp, patch to swap num_devices and device_num
+
+  // If the symbol is in .data (aomp, rocm) it can be written directly.
+  // If it is in .bss, we must wait for it to be allocated space on the
+  // gpu (trunk) and initialize after loading.
+  const char *sym() { return "omptarget_device_environment"; }
+
+  omptarget_device_environmentTy host_device_env;
+  symbol_info si;
+  bool valid = false;
+
+  __tgt_device_image *image;
+  const size_t img_size;
+
+  device_environment(int device_id, int number_devices,
+ __tgt_device_image *image, const size_t img_size)
+  : image(image), img_size(img_size) {
+
+host_device_env.num_devices = number_devices;
+host_device_env.device_num = device_id;
+host_device_env.debug_level = 0;
+#ifdef OMPTARGET_DEBUG
+if (char *envStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) {
+  host_device_env.debug_level = std::stoi(envStr);
+}
+#endif
+
+int rc = get_symbol_info_without_loading((char *)image->ImageStart,
+ img_size, sym(), &si);
+if (rc != 0) {
+  DP("Finding global device environment '%s' - symbol missing.\n", sym());
+  return;
+}
+
+if (si.size > sizeof(host_device_env)) {
+  DP("Symbol '%s' has size %u, expected at most %zu.\n", sym(), si.size,
+ sizeof(host_device_env));
+  return;
+}
+
+valid = true;
+  }
+
+  bool in_image() { return si.sh_type != SHT_NOBITS; }
+
+  atmi_status_t before_loading(void *data, size_t size) {
+assert(valid);
+if (in_image()) {
+  DP("Setting global device environment before load (%u bytes)\n", 
si.size);
+  uint64_t offset = (char *)si.addr - (char *)image->ImageStart;
+  void *pos = (char *)data + offset;
+  memcpy(pos, &host_device_env, si.size);
+}
+return ATMI_STATUS_SUCCESS;
+  }
+
+  atmi_status_t after_loading() {
+assert(valid);
+if (!in_image()) {
+  DP("Setting global device environment after load (%u bytes)\n", si.size);
+  int device_id = host_device_env.device_num;
+
+  void *state_ptr;
+  uint32_t state_ptr_size;
+  atmi_status_t err = atmi_interop_hsa_get_symbol_info(
+  get_gpu_mem_place(device_id), sym(), &state_ptr, &state_ptr_size);
+  if (err != ATMI_STATUS_SUCCESS) {
+DP("failed to find %s in loaded image\n", sym());
+return err;
+  }
+
+  if (state_ptr_size != si.size) {
+DP("Symbol had size %u before loading, %u after\n", state_ptr_size,
+   si.size);
+return ATMI_STATUS_ERROR;
+  }
+
+  return DeviceInfo.freesignalpool_memcpy_h2d(state_ptr, &host_device_env,
+  state_ptr_size, device_id);
+}

[llvm-branch-commits] [openmp] c9bc414 - [libomptarget][amdgpu] Let default number of teams equal number of CUs

2020-12-09 Thread Jon Chesterfield via llvm-branch-commits


Author: Jon Chesterfield
Date: 2020-12-09T19:35:34Z
New Revision: c9bc414840a41aff3e83a1529ba6dd98e13ce39d

URL: 
https://github.com/llvm/llvm-project/commit/c9bc414840a41aff3e83a1529ba6dd98e13ce39d
DIFF: 
https://github.com/llvm/llvm-project/commit/c9bc414840a41aff3e83a1529ba6dd98e13ce39d.diff

LOG: [libomptarget][amdgpu] Let default number of teams equal number of CUs

Added: 


Modified: 
openmp/libomptarget/plugins/amdgpu/src/rtl.cpp

Removed: 




diff  --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp 
b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
index e13d769a16aa..18bf67f7fc8a 100644
--- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
@@ -789,9 +789,17 @@ int32_t __tgt_rtl_init_device(int device_id) {
 DP("Default number of teams set according to environment %d\n",
DeviceInfo.EnvNumTeams);
   } else {
-DeviceInfo.NumTeams[device_id] = RTLDeviceInfoTy::DefaultNumTeams;
-DP("Default number of teams set according to library's default %d\n",
-   RTLDeviceInfoTy::DefaultNumTeams);
+char *TeamsPerCUEnvStr = getenv("OMP_TARGET_TEAMS_PER_PROC");
+int TeamsPerCU = 1; // default number of teams per CU is 1
+if (TeamsPerCUEnvStr) {
+  TeamsPerCU = std::stoi(TeamsPerCUEnvStr);
+}
+   
+DeviceInfo.NumTeams[device_id] =
+  TeamsPerCU * DeviceInfo.ComputeUnits[device_id];
+DP("Default number of teams = %d * number of compute units %d\n",
+   TeamsPerCU,
+   DeviceInfo.ComputeUnits[device_id]);
   }
 
   if (DeviceInfo.NumTeams[device_id] > DeviceInfo.GroupsPerDevice[device_id]) {
@@ -1548,11 +1556,12 @@ int32_t __tgt_rtl_data_delete(int device_id, void 
*tgt_ptr) {
 // loop_tripcount.
 void getLaunchVals(int &threadsPerGroup, int &num_groups, int ConstWGSize,
int ExecutionMode, int EnvTeamLimit, int EnvNumTeams,
-   int num_teams, int thread_limit, uint64_t loop_tripcount) {
+   int num_teams, int thread_limit, uint64_t loop_tripcount,
+   int32_t device_id) {
 
   int Max_Teams = DeviceInfo.EnvMaxTeamsDefault > 0
   ? DeviceInfo.EnvMaxTeamsDefault
-  : DeviceInfo.Max_Teams;
+  : DeviceInfo.NumTeams[device_id];
   if (Max_Teams > DeviceInfo.HardTeamLimit)
 Max_Teams = DeviceInfo.HardTeamLimit;
 
@@ -1752,7 +1761,8 @@ int32_t __tgt_rtl_run_target_team_region_locked(
 DeviceInfo.EnvNumTeams,
 num_teams, // From run_region arg
 thread_limit,  // From run_region arg
-loop_tripcount // From run_region arg
+loop_tripcount, // From run_region arg
+KernelInfo->device_id
   );
 
   if (print_kernel_trace == 4)



___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits

[llvm-branch-commits] [openmp] 7c59614 - [libomptarget][amdgpu] clang-format src/rtl.cpp

2020-12-09 Thread Jon Chesterfield via llvm-branch-commits


Author: Jon Chesterfield
Date: 2020-12-09T19:45:51Z
New Revision: 7c5961439485e59b8f463b17bf37dab8d8aa7c3a

URL: 
https://github.com/llvm/llvm-project/commit/7c5961439485e59b8f463b17bf37dab8d8aa7c3a
DIFF: 
https://github.com/llvm/llvm-project/commit/7c5961439485e59b8f463b17bf37dab8d8aa7c3a.diff

LOG: [libomptarget][amdgpu] clang-format src/rtl.cpp

Added: 


Modified: 
openmp/libomptarget/plugins/amdgpu/src/rtl.cpp

Removed: 




diff  --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp 
b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
index 18bf67f7fc8a..5ec5f5e45e36 100644
--- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
@@ -93,7 +93,6 @@ uint32_t TgtStackItemSize = 0;
 
 #include "../../common/elf_common.c"
 
-
 /// Keep entries table per device
 struct FuncOrGblEntryTy {
   __tgt_target_table Table;
@@ -708,7 +707,7 @@ int32_t __tgt_rtl_init_device(int device_id) {
 
   char GetInfoName[64]; // 64 max size returned by get info
   err = hsa_agent_get_info(agent, (hsa_agent_info_t)HSA_AGENT_INFO_NAME,
-  (void *) GetInfoName);
+   (void *)GetInfoName);
   if (err)
 DeviceInfo.GPUName[device_id] = "--unknown gpu--";
   else {
@@ -718,7 +717,7 @@ int32_t __tgt_rtl_init_device(int device_id) {
   if (print_kernel_trace == 4)
 fprintf(stderr, "Device#%-2d CU's: %2d %s\n", device_id,
 DeviceInfo.ComputeUnits[device_id],
-   DeviceInfo.GPUName[device_id].c_str());
+DeviceInfo.GPUName[device_id].c_str());
 
   // Query attributes to determine number of threads/block and blocks/grid.
   uint16_t workgroup_max_dim[3];
@@ -794,12 +793,11 @@ int32_t __tgt_rtl_init_device(int device_id) {
 if (TeamsPerCUEnvStr) {
   TeamsPerCU = std::stoi(TeamsPerCUEnvStr);
 }
-   
+
 DeviceInfo.NumTeams[device_id] =
-  TeamsPerCU * DeviceInfo.ComputeUnits[device_id];
+TeamsPerCU * DeviceInfo.ComputeUnits[device_id];
 DP("Default number of teams = %d * number of compute units %d\n",
-   TeamsPerCU,
-   DeviceInfo.ComputeUnits[device_id]);
+   TeamsPerCU, DeviceInfo.ComputeUnits[device_id]);
   }
 
   if (DeviceInfo.NumTeams[device_id] > DeviceInfo.GroupsPerDevice[device_id]) {
@@ -1183,7 +1181,7 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t 
device_id,
   "Possible gpu arch mismatch: device:%s, image:%s please check"
   " compiler flag: -march=\n",
   DeviceInfo.GPUName[device_id].c_str(),
- get_elf_mach_gfx_name(elf_e_flags(image)));
+  get_elf_mach_gfx_name(elf_e_flags(image)));
   return NULL;
 }
 
@@ -1759,11 +1757,10 @@ int32_t __tgt_rtl_run_target_team_region_locked(
   getLaunchVals(threadsPerGroup, num_groups, KernelInfo->ConstWGSize,
 KernelInfo->ExecutionMode, DeviceInfo.EnvTeamLimit,
 DeviceInfo.EnvNumTeams,
-num_teams, // From run_region arg
-thread_limit,  // From run_region arg
+num_teams,  // From run_region arg
+thread_limit,   // From run_region arg
 loop_tripcount, // From run_region arg
-KernelInfo->device_id
-  );
+KernelInfo->device_id);
 
   if (print_kernel_trace == 4)
 // enum modes are SPMD, GENERIC, NONE 0,1,2



___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits

[llvm-branch-commits] [openmp] ce93de3 - [libomptarget][nfc] Remove data_sharing type aliasing

2020-12-10 Thread Jon Chesterfield via llvm-branch-commits


Author: Jon Chesterfield
Date: 2020-12-11T02:13:34Z
New Revision: ce93de3bb284c944676c7b81890156d9d80b1db9

URL: 
https://github.com/llvm/llvm-project/commit/ce93de3bb284c944676c7b81890156d9d80b1db9
DIFF: 
https://github.com/llvm/llvm-project/commit/ce93de3bb284c944676c7b81890156d9d80b1db9.diff

LOG: [libomptarget][nfc] Remove data_sharing type aliasing

[libomptarget][nfc] Remove data_sharing type aliasing

Libomptarget previous used __kmpc_data_sharing_slot to access values of type
__kmpc_data_sharing_{worker,master}_slot_static. This aliasing violation was
benign in practice. The master type has since been removed, so a single type
can be used instead.

This is particularly helpful for the transition to an openmp deviceRTL, as the
c++/openmp compiler for amdgcn currently rejects the flexible array member for
being an incomplete type. Serves the same purpose as abandoned D86324.

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D93075

Added: 


Modified: 
openmp/libomptarget/deviceRTLs/common/omptarget.h
openmp/libomptarget/deviceRTLs/interface.h

Removed: 




diff  --git a/openmp/libomptarget/deviceRTLs/common/omptarget.h 
b/openmp/libomptarget/deviceRTLs/common/omptarget.h
index 0ccd71c3b55f..fc4eb6bfbcfa 100644
--- a/openmp/libomptarget/deviceRTLs/common/omptarget.h
+++ b/openmp/libomptarget/deviceRTLs/common/omptarget.h
@@ -74,6 +74,16 @@ class omptarget_nvptx_SharedArgs {
 extern DEVICE SHARED omptarget_nvptx_SharedArgs
 omptarget_nvptx_globalArgs;
 
+// Worker slot type which is initialized with the default worker slot
+// size of 4*32 bytes.
+struct __kmpc_data_sharing_slot {
+  __kmpc_data_sharing_slot *Next;
+  __kmpc_data_sharing_slot *Prev;
+  void *PrevSlotStackPtr;
+  void *DataEnd;
+  char Data[DS_Worker_Warp_Slot_Size];
+};
+
 // Data structure to keep in shared memory that traces the current slot, stack,
 // and frame pointer as well as the active threads that didn't exit the current
 // environment.
@@ -83,15 +93,6 @@ struct DataSharingStateTy {
   void * volatile FramePtr[DS_Max_Warp_Number];
   __kmpc_impl_lanemask_t ActiveThreads[DS_Max_Warp_Number];
 };
-// Additional worker slot type which is initialized with the default worker 
slot
-// size of 4*32 bytes.
-struct __kmpc_data_sharing_worker_slot_static {
-  __kmpc_data_sharing_slot *Next;
-  __kmpc_data_sharing_slot *Prev;
-  void *PrevSlotStackPtr;
-  void *DataEnd;
-  char Data[DS_Worker_Warp_Slot_Size];
-};
 
 extern DEVICE SHARED DataSharingStateTy DataSharingState;
 
@@ -213,7 +214,7 @@ class omptarget_nvptx_TeamDescr {
   workDescrForActiveParallel; // one, ONLY for the active par
 
   ALIGN(16)
-  __kmpc_data_sharing_worker_slot_static worker_rootS[DS_Max_Warp_Number];
+  __kmpc_data_sharing_slot worker_rootS[DS_Max_Warp_Number];
 };
 
 


diff  --git a/openmp/libomptarget/deviceRTLs/interface.h 
b/openmp/libomptarget/deviceRTLs/interface.h
index 330880556293..5f539bc3fd66 100644
--- a/openmp/libomptarget/deviceRTLs/interface.h
+++ b/openmp/libomptarget/deviceRTLs/interface.h
@@ -438,17 +438,6 @@ EXTERN void __kmpc_begin_sharing_variables(void 
***GlobalArgs, size_t nArgs);
 EXTERN void __kmpc_end_sharing_variables();
 EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs);
 
-// The slot used for data sharing by the master and worker threads. We use a
-// complete (default size version and an incomplete one so that we allow sizes
-// greater than the default).
-struct __kmpc_data_sharing_slot {
-  __kmpc_data_sharing_slot *Next;
-  __kmpc_data_sharing_slot *Prev;
-  void *PrevSlotStackPtr;
-  void *DataEnd;
-  char Data[];
-};
-
 // SPMD execution mode interrogation function.
 EXTERN int8_t __kmpc_is_spmd_exec_mode();
 



___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits

[llvm-branch-commits] [openmp] 78b0630 - [libomptarget][cuda] Call v2 functions explicitly

[llvm-branch-commits] [openmp] dc70c56 - [libomptarget][amdgpu][nfc] Update comments

[llvm-branch-commits] [openmp] c3074d4 - [libomptarget][nvptx] Replace cuda atomic primitives with clang intrinsics

[llvm-branch-commits] [openmp] e5e448a - [libomptarget][cuda] Fix build, change missed from D95274

[llvm-branch-commits] [openmp] 95f0d1e - [libomptarget] Compile with older cuda, revert D95274

[llvm-branch-commits] [openmp] 33e2494 - [libomptarget][amdgpu][nfc] Fix build on centos

[llvm-branch-commits] [openmp] 84e0b14 - [libomptarget][nvptx] Include omp_data.cu in bitcode deviceRTL

[llvm-branch-commits] [openmp] 5d165f0 - [libomptarget][amdgpu] Fix kernel launch tracing to match previous behavior

[llvm-branch-commits] [openmp] 6e7094c - [libomptarget][nvptx][nfc] Move target_impl functions out of header

[llvm-branch-commits] [openmp] 214387c - [libomptarget][nvptx] Reduce calls to cuda header

[llvm-branch-commits] [openmp] e069662 - [libomptarget][devicertl] Wrap source in declare target pragmas

[llvm-branch-commits] [openmp] ea616f9 - [libomptarget][devicertl][nfc] Remove some cuda intrinsics, simplify

[llvm-branch-commits] [openmp] fbc1dcb - [libomptarget][devicertl][nfc] Simplify target_atomic abstraction

[llvm-branch-commits] [openmp] 9b19ecb - [libomptarget][devicertl] Drop templated atomic functions

[llvm-branch-commits] [openmp] 47e95e8 - [libomptarget] Build cuda plugin without cuda installed locally

[llvm-branch-commits] [openmp] 76bfbb7 - [libomptarget][amdgpu] Call into deviceRTL instead of ockl

[llvm-branch-commits] [clang] 4b2e7d0 - [amdgpu] Default to code object v3

[llvm-branch-commits] [openmp] b607837 - [libomptarget][nfc] Replace static const with enum

[llvm-branch-commits] [clang] c0619d3 - [NFC] Use regex for code object version in hip tests

[llvm-branch-commits] [clang] daf39e3 - [amdgpu] Default to code object v3

[llvm-branch-commits] [openmp] 89a0f48 - [libomptarget][cuda] Detect missing symbols in plugin at build time

[llvm-branch-commits] [openmp] ae9d96a - [libomptarget][amdgpu] Address compiler warnings, drive by fixes

[llvm-branch-commits] [openmp] f628eef - [libomptarget][amdgpu] Fix latent race in load binary

[llvm-branch-commits] [openmp] e1b8e8a - [libomptarget][amdgpu] Skip device_State allocation when using bss global

[llvm-branch-commits] [openmp] 71f4693 - [libomptarget][amdgpu] Add plumbing to call into hostrpc lib, if linked

[llvm-branch-commits] [openmp] cab9f69 - [libomptarget][amdgpu] Improve diagnostics on arch mismatch

[llvm-branch-commits] [openmp] e191d31 - [libomptarget][amdgpu] Robust handling of device_environment symbol

[llvm-branch-commits] [openmp] c9bc414 - [libomptarget][amdgpu] Let default number of teams equal number of CUs

[llvm-branch-commits] [openmp] 7c59614 - [libomptarget][amdgpu] clang-format src/rtl.cpp

[llvm-branch-commits] [openmp] ce93de3 - [libomptarget][nfc] Remove data_sharing type aliasing

30 matches

Site Navigation

Mail list logo

Footer information