JonChesterfield created this revision.
JonChesterfield added reviewers: jdoerfert, tianshilei1992.
Herald added subscribers: mgorny, jvesely.
JonChesterfield requested review of this revision.
Herald added subscribers: openmp-commits, cfe-commits, sstefan1.
Herald added projects: clang, OpenMP.
[WIP] Move part of nvptx devicertl under clang
Example of moving the devicertl functions that depend on the CUDA
version under clang, so that they can be injected at application
build time.
The original idea was to use the intrinsic definitions from
__clang_cuda_intrinsics, but that header needs a lot of CUDA-specific
setup to compile and includes part of the CUDA SDK.
It is therefore difficult to compile as OpenMP.
This implements the code in headers and will work for C++ with
OpenMP, but not necessarily for C, as the inline functions may not
be emitted. It will also be a problem for Fortran OpenMP.
I'm inclined to do something broadly equivalent to this, but in
the library. That would mean clang needs to link against devicertl.bc
and against a small, CUDA-version-specific devicertl_tbd.bc.
Repository:
rG LLVM Github Monorepo
https://reviews.llvm.org/D95313
Files:
clang/lib/Driver/ToolChains/Clang.cpp
clang/lib/Headers/CMakeLists.txt
clang/lib/Headers/openmp_wrappers/__clang_openmp_devicertl_cuda_ge90.h
clang/lib/Headers/openmp_wrappers/__clang_openmp_devicertl_cuda_lt90.h
openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
Index: openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
===
--- openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
+++ openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
@@ -100,17 +100,18 @@
#error CUDA_VERSION macro is undefined, something wrong with cuda.
#endif
-DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask();
+EXTERN __kmpc_impl_lanemask_t __kmpc_impl_activemask();
-DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var,
+EXTERN int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var,
int32_t SrcLane);
-DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask,
+EXTERN int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask,
int32_t Var, uint32_t Delta,
int32_t Width);
+EXTERN void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask);
+
DEVICE void __kmpc_impl_syncthreads();
-DEVICE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask);
// NVPTX specific kernel initialization
DEVICE void __kmpc_impl_target_init();
Index: openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
===
--- openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
+++ openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
@@ -18,17 +18,6 @@
// Forward declaration of CUDA primitives which will be evetually transformed
// into LLVM intrinsics.
-extern "C" {
-unsigned int __activemask();
-unsigned int __ballot(unsigned);
-// The default argument here is based on NVIDIA's website
-// https://developer.nvidia.com/blog/using-cuda-warp-level-primitives/
-int __shfl_sync(unsigned mask, int val, int src_line, int width = WARPSIZE);
-int __shfl(int val, int src_line, int width = WARPSIZE);
-int __shfl_down(int var, unsigned detla, int width);
-int __shfl_down_sync(unsigned mask, int var, unsigned detla, int width);
-void __syncwarp(int mask);
-}
DEVICE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) {
asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val));
@@ -69,45 +58,8 @@
return (double)nsecs * __kmpc_impl_get_wtick();
}
-// In Cuda 9.0, __ballot(1) from Cuda 8.0 is replaced with __activemask().
-DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
-#if CUDA_VERSION >= 9000
- return __activemask();
-#else
- return __ballot(1);
-#endif
-}
-
-// In Cuda 9.0, the *_sync() version takes an extra argument 'mask'.
-DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var,
- int32_t SrcLane) {
-#if CUDA_VERSION >= 9000
- return __shfl_sync(Mask, Var, SrcLane);
-#else
- return __shfl(Var, SrcLane);
-#endif // CUDA_VERSION
-}
-
-DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask,
- int32_t Var, uint32_t Delta,
- int32_t Width) {
-#if CUDA_VERSION >= 9000
- return __shfl_down_sync(Mask, Var, Delta, Width);
-#else
- return __shfl_down(Var, Delta, Width);
-#endif // CUDA_VERSION
-}
-
DEVICE void __kmpc_impl_syncthreads() { __syncthreads(); }
-DEVICE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t