https://github.com/fineg74 created https://github.com/llvm/llvm-project/pull/171011
This PR adds the configuration needed to build DeviceRTL with the SPIR-V backend. It is primarily intended for the Level Zero plugin for Intel GPUs.
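For context, no user-side changes are needed to pick this up: sources written against the portable <gpuintrin.h> interface get the new header automatically, because gpuintrin.h now dispatches to spirvintrin.h whenever __SPIRV__ is defined. A minimal sketch of what that enables (the kernel below is a made-up example, not part of this patch; only the __gpu_* functions and the __gpu_kernel macro come from the headers touched here):

    #include <gpuintrin.h>
    #include <stdint.h>

    // Hypothetical kernel: one thread per element, indexed with the
    // portable block/thread queries that spirvintrin.h now implements
    // on top of the __spirv_BuiltIn* interface. The same source also
    // targets nvptx64 and amdgcn through the sibling headers.
    __gpu_kernel void vector_add(uint32_t *dst, const uint32_t *src,
                                 uint32_t n) {
      uint32_t i =
          __gpu_block_id_x() * __gpu_num_threads_x() + __gpu_thread_id_x();
      if (i < n)
        dst[i] += src[i];
    }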
From 996c2e16dd7317cb1be936f9459145f3b0a7070c Mon Sep 17 00:00:00 2001
From: "Fine, Gregory" <[email protected]>
Date: Mon, 24 Nov 2025 16:30:47 -0800
Subject: [PATCH] Build DeviceRTL with spirv backend

---
 clang/lib/Headers/CMakeLists.txt      |   1 +
 clang/lib/Headers/gpuintrin.h         |   6 +-
 clang/lib/Headers/spirvintrin.h       | 207 ++++++++++++++++++++++++++
 openmp/device/CMakeLists.txt          | 105 ++++++++++---
 openmp/device/include/DeviceTypes.h   |   8 +-
 openmp/device/include/LibC.h          |   7 +
 openmp/device/include/State.h         |   2 +-
 openmp/device/src/Allocator.cpp       |   2 +-
 openmp/device/src/LibC.cpp            |   5 +-
 openmp/device/src/Parallelism.cpp     |  10 +-
 openmp/device/src/Synchronization.cpp |  96 ++++++++++++
 11 files changed, 414 insertions(+), 35 deletions(-)
 create mode 100644 clang/lib/Headers/spirvintrin.h

diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index 33fff7645df65..208f8b9be6d60 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -295,6 +295,7 @@ set(gpu_files
   gpuintrin.h
   nvptxintrin.h
   amdgpuintrin.h
+  spirvintrin.h
   )
 
 set(windows_only_files
diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index 7afc82413996b..cc8a72bcfb0a3 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -18,7 +18,7 @@
 #define __GPUINTRIN_H
 
 #if !defined(_DEFAULT_FN_ATTRS)
-#if defined(__HIP__) || defined(__CUDA__)
+#if defined(__HIP__) || defined(__CUDA__) || defined(__SPIRV__)
 #define _DEFAULT_FN_ATTRS __attribute__((device))
 #else
 #define _DEFAULT_FN_ATTRS
@@ -56,7 +56,9 @@ __gpu_match_all_u64_impl(uint64_t __lane_mask, uint64_t __x);
 _Pragma("omp end declare variant");
 _Pragma("omp end declare target");
 
-#if defined(__NVPTX__)
+#if defined(__SPIRV__)
+#include <spirvintrin.h>
+#elif defined(__NVPTX__)
 #include <nvptxintrin.h>
 #elif defined(__AMDGPU__)
 #include <amdgpuintrin.h>
diff --git a/clang/lib/Headers/spirvintrin.h b/clang/lib/Headers/spirvintrin.h
new file mode 100644
index 0000000000000..84166a455d4db
--- /dev/null
+++ b/clang/lib/Headers/spirvintrin.h
@@ -0,0 +1,207 @@
+#ifndef __SPIRVINTRIN_H
+#define __SPIRVINTRIN_H
+
+#ifndef __SPIRV__
+#error "This file is intended for SPIRV targets or offloading to SPIRV"
+#endif
+
+#ifndef __GPUINTRIN_H
+#error "Never use <spirvintrin.h> directly; include <gpuintrin.h> instead"
+#endif
+
+#include <stdint.h>
+#if !defined(__cplusplus)
+_Pragma("push_macro(\"bool\")");
+#define bool _Bool
+#define true 1
+#define false 0
+#endif
+
+_Pragma("omp begin declare target device_type(nohost)");
+_Pragma("omp begin declare variant match(device = {arch(spirv64)})");
+
+// Type aliases to the address spaces used by the SPIR-V backend.
+//
+// TODO: FIX
+#define __gpu_private
+#define __gpu_constant
+#define __gpu_local
+#define __gpu_global __attribute__((address_space(1)))
+#define __gpu_generic __attribute__((address_space(4)))
+// Attribute to declare a function as a kernel.
+#define __gpu_kernel __attribute__((spirv_kernel, visibility("protected")))
+#define __SPIRV_VAR_QUALIFIERS extern const
+// Workgroup and invocation ID functions
+uint64_t __spirv_BuiltInNumWorkgroups(int i);
+uint64_t __spirv_BuiltInWorkgroupId(int i);
+uint64_t __spirv_BuiltInWorkgroupSize(int i);
+uint64_t __spirv_BuiltInLocalInvocationId(int i);
+
+#ifdef __cplusplus
+template <typename... Args>
+int __spirv_ocl_printf(Args...);
+#endif
+
+// Subgroup functions
+__SPIRV_VAR_QUALIFIERS uint32_t __spirv_BuiltInSubgroupLocalInvocationId;
+__SPIRV_VAR_QUALIFIERS uint32_t __spirv_BuiltInSubgroupSize;
+
+// Group non-uniform operations
+uint64_t __spirv_GroupNonUniformBallot(uint32_t execution_scope, bool predicate);
+uint32_t __spirv_GroupNonUniformBroadcastFirst(uint32_t execution_scope, uint32_t value);
+uint32_t __spirv_GroupNonUniformShuffle(uint32_t execution_scope, uint32_t value, uint32_t id);
+
+// Synchronization
+void __spirv_ControlBarrier(uint32_t execution_scope, uint32_t memory_scope, uint32_t semantics);
+void __spirv_MemoryBarrier(uint32_t memory_scope, uint32_t semantics);
+
+
+// Returns the number of blocks in the 'x' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
+  return __spirv_BuiltInNumWorkgroups(0);
+}
+
+// Returns the number of blocks in the 'y' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_y(void) {
+  return __spirv_BuiltInNumWorkgroups(1);
+}
+
+// Returns the number of blocks in the 'z' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_z(void) {
+  return __spirv_BuiltInNumWorkgroups(2);
+}
+
+// Returns the 'x' dimension of the current block's id.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_x(void) {
+  return __spirv_BuiltInWorkgroupId(0);
+}
+
+// Returns the 'y' dimension of the current block's id.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_y(void) {
+  return __spirv_BuiltInWorkgroupId(1);
+}
+
+// Returns the 'z' dimension of the current block's id.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_z(void) {
+  return __spirv_BuiltInWorkgroupId(2);
+}
+
+// Returns the number of threads in the 'x' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_x(void) {
+  return __spirv_BuiltInWorkgroupSize(0);
+}
+
+// Returns the number of threads in the 'y' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_y(void) {
+  return __spirv_BuiltInWorkgroupSize(1);
+}
+
+// Returns the number of threads in the 'z' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_z(void) {
+  return __spirv_BuiltInWorkgroupSize(2);
+}
+
+// Returns the 'x' dimension id of the thread in the current block.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_x(void) {
+  return __spirv_BuiltInLocalInvocationId(0);
+}
+
+// Returns the 'y' dimension id of the thread in the current block.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_y(void) {
+  return __spirv_BuiltInLocalInvocationId(1);
+}
+
+// Returns the 'z' dimension id of the thread in the current block.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_z(void) {
+  return __spirv_BuiltInLocalInvocationId(2);
+}
+
+// Returns the size of a warp, i.e. the SPIR-V subgroup size.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_lanes(void) { return __spirv_BuiltInSubgroupSize; }
+
+// Returns the id of the thread inside of a warp executing together.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_lane_id(void) { return __spirv_BuiltInSubgroupLocalInvocationId; }
+
+// Returns the bit-mask of active threads in the current warp.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_lane_mask(void) {
+  return __spirv_GroupNonUniformBallot(3, 1);
+}
+// Copies the value from the first active thread in the warp to the rest.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t
+__gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x) {
+  return __spirv_GroupNonUniformBroadcastFirst(3, __x);
+}
+// Returns a bitmask of the threads in the current warp for which \p x is true.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask,
+                                                          bool __x) {
+  uint64_t ballot = __spirv_GroupNonUniformBallot(3, __x);
+  return __lane_mask & ballot;
+}
+// Waits for all the threads in the block to converge and issues a fence.
+_DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_threads(void) {
+  __spirv_ControlBarrier(4, 2, 0x8); // Workgroup scope, acquire/release semantics
+}
+// Waits for all threads in the warp to reconverge for independent scheduling.
+_DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_lane(uint64_t __lane_mask) {
+  __spirv_ControlBarrier(4, 3, 0x8); // Subgroup scope, acquire/release semantics
+}
+// Shuffles the lanes inside the warp according to the given index.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t
+__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x,
+                      uint32_t __width) {
+  uint32_t __lane = __idx + (__gpu_lane_id() & ~(__width - 1));
+  return __spirv_GroupNonUniformShuffle(3, __x, __lane);
+}
+
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
+  return __gpu_match_any_u32_impl(__lane_mask, __x);
+}
+
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x) {
+  return __gpu_match_any_u64_impl(__lane_mask, __x);
+}
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) {
+  return __gpu_match_all_u32_impl(__lane_mask, __x);
+}
+
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_all_u64(uint64_t __lane_mask, uint64_t __x) {
+  return __gpu_match_all_u64_impl(__lane_mask, __x);
+}
+
+// Returns true if the flat pointer points to 'shared' memory.
+_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_local(void *ptr) {
+  return false; // TODO
+  //return to_local(ptr) != 0;
+}
+// Returns true if the flat pointer points to 'private' memory.
+_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_private(void *ptr) {
+  return false;
+  //return to_private(ptr) != 0; // TODO
+}
+// Terminates execution of the calling thread.
+_DEFAULT_FN_ATTRS [[noreturn]] static __inline__ void __gpu_exit(void) {
+  __builtin_unreachable();
+}
+// Suspend the thread briefly to assist the scheduler during busy loops.
+_DEFAULT_FN_ATTRS static __inline__ void __gpu_thread_suspend(void) {
+  // SPIR-V doesn't have a direct equivalent; use a memory barrier as a hint.
+  __spirv_MemoryBarrier(1, 0x100);
+}
+
+_Pragma("omp end declare variant");
+_Pragma("omp end declare target");
+
+#if !defined(__cplusplus)
+_Pragma("pop_macro(\"bool\")");
+#endif
+#endif // __SPIRVINTRIN_H
\ No newline at end of file
diff --git a/openmp/device/CMakeLists.txt b/openmp/device/CMakeLists.txt
index 54cfdfef440a5..86f46de912584 100644
--- a/openmp/device/CMakeLists.txt
+++ b/openmp/device/CMakeLists.txt
@@ -25,14 +25,17 @@ set(src_files
   ${CMAKE_CURRENT_SOURCE_DIR}/src/Workshare.cpp
 )
 
-list(APPEND compile_options -flto)
-list(APPEND compile_options -fvisibility=hidden)
+if(NOT "${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^spirv" AND
+   NOT "${CMAKE_CXX_COMPILER_TARGET}" MATCHES "^spirv")
+  list(APPEND compile_options -flto)
+  list(APPEND compile_options -fvisibility=hidden)
+  list(APPEND compile_options -Wno-unknown-cuda-version)
+endif()
 list(APPEND compile_options -nogpulib)
 list(APPEND compile_options -nostdlibinc)
 list(APPEND compile_options -fno-rtti)
 list(APPEND compile_options -fno-exceptions)
 list(APPEND compile_options -fconvergent-functions)
-list(APPEND compile_options -Wno-unknown-cuda-version)
 if(LLVM_DEFAULT_TARGET_TRIPLE)
   list(APPEND compile_options --target=${LLVM_DEFAULT_TARGET_TRIPLE})
 endif()
@@ -52,37 +55,91 @@ elseif("${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^nvptx" OR
        "${CMAKE_CXX_COMPILER_TARGET}" MATCHES "^nvptx")
   set(target_name "nvptx")
   list(APPEND compile_options --cuda-feature=+ptx63)
+elseif("${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^spirv64" OR
+       "${CMAKE_CXX_COMPILER_TARGET}" MATCHES "^spirv64")
+  set(target_name "spirv")
+  list(APPEND compile_options -emit-llvm -c)
 endif()
 
-# Trick to combine these into a bitcode file via the linker's LTO pass.
-add_executable(libompdevice ${src_files})
-set_target_properties(libompdevice PROPERTIES
-  RUNTIME_OUTPUT_DIRECTORY "${LLVM_LIBRARY_OUTPUT_INTDIR}/${LLVM_DEFAULT_TARGET_TRIPLE}"
-  LINKER_LANGUAGE CXX
-  BUILD_RPATH ""
-  INSTALL_RPATH ""
-  RUNTIME_OUTPUT_NAME libomptarget-${target_name}.bc)
+# Check if we're building for SPIRV
+if("${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^spirv" OR
+   "${CMAKE_CXX_COMPILER_TARGET}" MATCHES "^spirv")
+  # For SPIRV targets, compile each source file to bitcode individually
+  set(bc_files "")
+  foreach(src_file ${src_files})
+    get_filename_component(basename ${src_file} NAME_WE)
+    set(bc_file "${CMAKE_CURRENT_BINARY_DIR}/${basename}.bc")
-# If the user built with the GPU C library enabled we will use that instead.
-if(TARGET libc)
-  target_compile_definitions(libompdevice PRIVATE OMPTARGET_HAS_LIBC)
-endif()
-target_compile_definitions(libompdevice PRIVATE SHARED_SCRATCHPAD_SIZE=512)
+    add_custom_command(
+      OUTPUT ${bc_file}
+      COMMAND ${CMAKE_CXX_COMPILER}
+      ARGS ${compile_options}
+        -I${CMAKE_CURRENT_SOURCE_DIR}/include
+        -I${CMAKE_CURRENT_SOURCE_DIR}/../../libc
+        -I${CMAKE_CURRENT_SOURCE_DIR}/../../offload/include
+        $<$<BOOL:${LIBOMPTARGET_GPU_LIBC_SUPPORT}>:-DOMPTARGET_HAS_LIBC>
+        -DSHARED_SCRATCHPAD_SIZE=512
+        -o ${bc_file}
+        ${src_file}
+      DEPENDS ${src_file}
+      COMMENT "Compiling ${src_file} to bitcode"
+    )
+    list(APPEND bc_files ${bc_file})
+  endforeach()
+
+  # Find llvm-link
+  find_program(LLVM_LINK llvm-link HINTS ${LLVM_TOOLS_BINARY_DIR})
+  if(NOT LLVM_LINK)
+    message(FATAL_ERROR "llvm-link not found")
+  endif()
+
+  # Use llvm-link to combine all bitcode files
+  set(output_bc "${LLVM_LIBRARY_OUTPUT_INTDIR}/${LLVM_DEFAULT_TARGET_TRIPLE}/libomptarget-${target_name}.bc")
+  add_custom_command(
+    OUTPUT ${output_bc}
+    COMMAND ${CMAKE_COMMAND} -E make_directory "${LLVM_LIBRARY_OUTPUT_INTDIR}/${LLVM_DEFAULT_TARGET_TRIPLE}"
+    COMMAND ${LLVM_LINK} ${bc_files} -o ${output_bc}
+    DEPENDS ${bc_files}
+    COMMENT "Linking bitcode files with llvm-link"
+  )
+
+  # Create a target for the linked bitcode
+  add_custom_target(libompdevice ALL DEPENDS ${output_bc})
 
-target_include_directories(libompdevice PRIVATE
+  # Install the bitcode file
+  install(FILES ${output_bc}
+    PERMISSIONS OWNER_WRITE OWNER_READ GROUP_READ WORLD_READ
+    DESTINATION ${OPENMP_INSTALL_LIBDIR})
+else()
+  # Trick to combine these into a bitcode file via the linker's LTO pass.
+  add_executable(libompdevice ${src_files})
+  set_target_properties(libompdevice PROPERTIES
+    RUNTIME_OUTPUT_DIRECTORY "${LLVM_LIBRARY_OUTPUT_INTDIR}/${LLVM_DEFAULT_TARGET_TRIPLE}"
+    LINKER_LANGUAGE CXX
+    BUILD_RPATH ""
+    INSTALL_RPATH ""
+    RUNTIME_OUTPUT_NAME libomptarget-${target_name}.bc)
+
+  # If the user built with the GPU C library enabled we will use that instead.
+  if(TARGET libc)
+    target_compile_definitions(libompdevice PRIVATE OMPTARGET_HAS_LIBC)
+  endif()
+  target_compile_definitions(libompdevice PRIVATE SHARED_SCRATCHPAD_SIZE=512)
+
+  target_include_directories(libompdevice PRIVATE
   ${CMAKE_CURRENT_SOURCE_DIR}/include
   ${CMAKE_CURRENT_SOURCE_DIR}/../../libc
   ${CMAKE_CURRENT_SOURCE_DIR}/../../offload/include)
-target_compile_options(libompdevice PRIVATE ${compile_options})
-target_link_options(libompdevice PRIVATE
+  target_compile_options(libompdevice PRIVATE ${compile_options})
+  target_link_options(libompdevice PRIVATE
   "-flto"
   "-r"
   "-nostdlib"
   "-Wl,--lto-emit-llvm")
-if(LLVM_DEFAULT_TARGET_TRIPLE)
-  target_link_options(libompdevice PRIVATE "--target=${LLVM_DEFAULT_TARGET_TRIPLE}")
-endif()
-install(TARGETS libompdevice
+  if(LLVM_DEFAULT_TARGET_TRIPLE)
+    target_link_options(libompdevice PRIVATE "--target=${LLVM_DEFAULT_TARGET_TRIPLE}")
+  endif()
+  install(TARGETS libompdevice
   PERMISSIONS OWNER_WRITE OWNER_READ GROUP_READ WORLD_READ
   DESTINATION ${OPENMP_INSTALL_LIBDIR})
-
+endif()
 add_library(ompdevice.all_objs OBJECT IMPORTED)
 set_property(TARGET ompdevice.all_objs APPEND PROPERTY IMPORTED_OBJECTS
   ${LLVM_LIBRARY_OUTPUT_INTDIR}/${LLVM_DEFAULT_TARGET_TRIPLE}/libomptarget-${target_name}.bc)
diff --git a/openmp/device/include/DeviceTypes.h b/openmp/device/include/DeviceTypes.h
index 2e5d92380f040..3a3cb46c3b7d9 100644
--- a/openmp/device/include/DeviceTypes.h
+++ b/openmp/device/include/DeviceTypes.h
@@ -128,7 +128,13 @@ struct IdentTy {
 
 using __kmpc_impl_lanemask_t = LaneMaskTy;
 
-using ParallelRegionFnTy = void *;
+#ifdef __SPIRV__
+using FnPtrTy = __attribute__((address_space(9))) void *;
+#else
+using FnPtrTy = void *;
+#endif
+
+using ParallelRegionFnTy = FnPtrTy;
 
 using CriticalNameTy = int32_t[8];
diff --git a/openmp/device/include/LibC.h b/openmp/device/include/LibC.h
index 94b5e65196067..a67323b58f381 100644
--- a/openmp/device/include/LibC.h
+++ b/openmp/device/include/LibC.h
@@ -16,7 +16,14 @@
 
 namespace ompx {
 
+#if defined(__SPIRV__)
+template <size_t N, typename... Args>
+int printf(const char (&Format)[N], Args... args) {
+  return __spirv_ocl_printf(Format, args...);
+}
+#else
 int printf(const char *Format, ...);
+#endif
 
 } // namespace ompx
diff --git a/openmp/device/include/State.h b/openmp/device/include/State.h
index cd6013780a49c..338f5a7f8d591 100644
--- a/openmp/device/include/State.h
+++ b/openmp/device/include/State.h
@@ -219,7 +219,7 @@ lookup32(ValueKind Kind, bool IsReadonly, IdentTy *Ident, bool ForceTeamState) {
   __builtin_unreachable();
 }
 
-[[gnu::always_inline, gnu::flatten]] inline void *&
+[[gnu::always_inline, gnu::flatten]] inline FnPtrTy &
 lookupPtr(ValueKind Kind, bool IsReadonly, bool ForceTeamState) {
   switch (Kind) {
   case state::VK_ParallelRegionFn:
diff --git a/openmp/device/src/Allocator.cpp b/openmp/device/src/Allocator.cpp
index 34c945c979ffb..3782478932046 100644
--- a/openmp/device/src/Allocator.cpp
+++ b/openmp/device/src/Allocator.cpp
@@ -23,7 +23,7 @@ using namespace allocator;
 
 // Provide a default implementation of malloc / free for AMDGPU platforms built
 // without 'libc' support.
extern "C" { -#if defined(__AMDGPU__) && !defined(OMPTARGET_HAS_LIBC) +#if (defined(__AMDGPU__) || defined(__SPIRV__)) && !defined(OMPTARGET_HAS_LIBC) [[gnu::weak]] void *malloc(size_t Size) { return allocator::alloc(Size); } [[gnu::weak]] void free(void *Ptr) { allocator::free(Ptr); } #else diff --git a/openmp/device/src/LibC.cpp b/openmp/device/src/LibC.cpp index 83f9233d94803..095d9944531fe 100644 --- a/openmp/device/src/LibC.cpp +++ b/openmp/device/src/LibC.cpp @@ -31,14 +31,16 @@ extern "C" { for (size_t I = 0; I < count; ++I) dstc[I] = C; } - +#if !defined(__SPIRV__) [[gnu::weak]] int printf(const char *Format, ...) { __builtin_va_list vlist; __builtin_va_start(vlist, Format); return ::vprintf(Format, vlist); } +#endif } +#if !defined(__SPIRV__) namespace ompx { [[clang::no_builtin("printf")]] int printf(const char *Format, ...) { __builtin_va_list vlist; @@ -46,3 +48,4 @@ namespace ompx { return ::vprintf(Format, vlist); } } // namespace ompx +#endif \ No newline at end of file diff --git a/openmp/device/src/Parallelism.cpp b/openmp/device/src/Parallelism.cpp index 08ce616aee1c4..1d18bddb89eea 100644 --- a/openmp/device/src/Parallelism.cpp +++ b/openmp/device/src/Parallelism.cpp @@ -68,7 +68,7 @@ uint32_t determineNumberOfThreads(int32_t NumThreadsClause) { // Invoke an outlined parallel function unwrapping arguments (up to 32). [[clang::always_inline]] void invokeMicrotask(int32_t global_tid, - int32_t bound_tid, void *fn, + int32_t bound_tid, FnPtrTy fn, void **args, int64_t nargs) { switch (nargs) { #include "generated_microtask_cases.gen" @@ -84,7 +84,7 @@ extern "C" { [[clang::always_inline]] void __kmpc_parallel_spmd(IdentTy *ident, int32_t num_threads, - void *fn, void **args, + FnPtrTy fn, void **args, const int64_t nargs) { uint32_t TId = mapping::getThreadIdInBlock(); uint32_t NumThreads = determineNumberOfThreads(num_threads); @@ -142,8 +142,8 @@ extern "C" { [[clang::always_inline]] void __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr, - int32_t num_threads, int proc_bind, void *fn, - void *wrapper_fn, void **args, int64_t nargs) { + int32_t num_threads, int proc_bind, FnPtrTy fn, + FnPtrTy wrapper_fn, void **args, int64_t nargs) { uint32_t TId = mapping::getThreadIdInBlock(); // Assert the parallelism level is zero if disabled by the user. 
@@ -260,7 +260,7 @@ __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
                                 1u, true, ident,
                                 /*ForceTeamState=*/true);
   state::ValueRAII ParallelRegionFnRAII(state::ParallelRegionFn, wrapper_fn,
-                                        (void *)nullptr, true, ident,
+                                        (FnPtrTy)nullptr, true, ident,
                                         /*ForceTeamState=*/true);
   state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, true, ident,
                                    /*ForceTeamState=*/true);
diff --git a/openmp/device/src/Synchronization.cpp b/openmp/device/src/Synchronization.cpp
index 501dc4a291ed1..385b47e9bf5dd 100644
--- a/openmp/device/src/Synchronization.cpp
+++ b/openmp/device/src/Synchronization.cpp
@@ -258,6 +258,102 @@ void setCriticalLock(omp_lock_t *Lock) { setLock(Lock); }
 #endif
 ///}
 
+#if defined(__SPIRV__)
+typedef enum {
+  CrossDevice = 0,
+  Device = 1,
+  Workgroup = 2,
+  Subgroup = 3,
+  Invocation = 4
+} Scope_t;
+typedef enum {
+  Relaxed = 0x0,
+  Acquire = 0x2,
+  Release = 0x4,
+  AcquireRelease = 0x8,
+  SequentiallyConsistent = 0x10
+} MemorySemantics_t;
+
+extern "C" uint32_t __spirv_AtomicIAdd(uint32_t *, int, int, uint32_t);
+extern "C" void __spirv_MemoryBarrier(int, int);
+extern "C" void __spirv_ControlBarrier(uint32_t, uint32_t, uint32_t);
+
+MemorySemantics_t convertOrderingType(atomic::OrderingTy Ordering) {
+  switch (Ordering) {
+  default:
+    __builtin_unreachable();
+  case atomic::relaxed:
+    return MemorySemantics_t::Relaxed;
+  case atomic::acquire:
+    return MemorySemantics_t::Acquire;
+  case atomic::release:
+    return MemorySemantics_t::Release;
+  case atomic::acq_rel:
+    return MemorySemantics_t::AcquireRelease;
+  case atomic::seq_cst:
+    return MemorySemantics_t::SequentiallyConsistent;
+  }
+}
+uint32_t atomicInc(uint32_t *Address, uint32_t Val, atomic::OrderingTy Ordering,
+                   atomic::MemScopeTy MemScope) {
+  return __spirv_AtomicIAdd(Address, (int)MemScope,
+                            convertOrderingType(Ordering), Val);
+}
+void namedBarrierInit() { __builtin_trap(); } // TODO
+void namedBarrier() { __builtin_trap(); } // TODO
+void fenceTeam(atomic::OrderingTy Ordering) {
+  return __spirv_MemoryBarrier(Scope_t::Workgroup,
+                               convertOrderingType(Ordering));
+}
+void fenceKernel(atomic::OrderingTy Ordering) {
+  return __spirv_MemoryBarrier(Scope_t::Invocation,
+                               convertOrderingType(Ordering));
+}
+void fenceSystem(atomic::OrderingTy Ordering) {
+  return __spirv_MemoryBarrier(Scope_t::Device, convertOrderingType(Ordering));
+}
+
+void syncWarp(__kmpc_impl_lanemask_t) {
+  __spirv_ControlBarrier(Scope_t::Invocation, Scope_t::Subgroup,
+                         MemorySemantics_t::Acquire);
+}
+void syncThreads(atomic::OrderingTy Ordering) {
+  __spirv_ControlBarrier(Scope_t::Invocation, Scope_t::Workgroup,
+                         MemorySemantics_t::Acquire);
+}
+void unsetLock(omp_lock_t *Lock) {
+  atomic::store((int32_t *)Lock, 0, atomic::release);
+}
+int testLock(omp_lock_t *Lock) {
+  return atomic::add((int32_t *)Lock, 0, atomic::relaxed);
+}
+void initLock(omp_lock_t *Lock) { unsetLock(Lock); }
+void destroyLock(omp_lock_t *Lock) { unsetLock(Lock); }
+void setLock(omp_lock_t *Lock) {
+  int32_t *lock_ptr = (int32_t *)Lock;
+  bool acquired = false;
+  int32_t expected;
+  while (!acquired) {
+    expected = 0;
+    if (expected == atomic::load(lock_ptr, atomic::relaxed))
+      acquired =
+          atomic::cas(lock_ptr, expected, 1, atomic::acq_rel, atomic::relaxed);
+  }
+}
+extern "C" int __attribute__((overloadable)) sub_group_scan_inclusive_min(int);
+void unsetCriticalLock(omp_lock_t *Lock) {
+  int id = mapping::getThreadIdInWarp();
+  if (id == sub_group_scan_inclusive_min(id))
+    unsetLock(Lock);
+}
+void setCriticalLock(omp_lock_t *Lock) {
+  int id = mapping::getThreadIdInWarp();
+  if (id == sub_group_scan_inclusive_min(id))
+    setLock(Lock);
+}
+void syncThreadsAligned(atomic::OrderingTy Ordering) { syncThreads(Ordering); }
+#endif
+
 } // namespace impl
 
 void synchronize::init(bool IsSPMD) {
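A note on the lane-election idiom in setCriticalLock/unsetCriticalLock above: an inclusive-minimum subgroup scan returns, in each active lane, the smallest id among itself and all lower active lanes, so `id == sub_group_scan_inclusive_min(id)` holds only in the lowest active lane and the lock is taken exactly once per subgroup. A host-side sketch of that logic in plain C++, simulating a hypothetical active-lane mask (sub_group_scan_inclusive_min itself is the OpenCL subgroup builtin declared in the patch):

    #include <cstdint>
    #include <cstdio>

    int main() {
      uint64_t active = 0b10110100; // hypothetical mask of active lanes
      int scan = 64;                // running inclusive-min over active lanes
      for (int id = 0; id < 64; ++id) {
        if (!((active >> id) & 1))
          continue;
        scan = id < scan ? id : scan;
        // Only the lowest active lane sees its own id come back from the
        // scan, so it alone would acquire the per-warp critical lock.
        if (id == scan)
          std::printf("lane %d takes the critical lock\n", id);
      }
      return 0;
    }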
