Author: Wenju He Date: 2025-08-06T09:49:28+08:00 New Revision: af16fc2e2a50c1cbac49726ea70739ad6e193729
URL: https://github.com/llvm/llvm-project/commit/af16fc2e2a50c1cbac49726ea70739ad6e193729 DIFF: https://github.com/llvm/llvm-project/commit/af16fc2e2a50c1cbac49726ea70739ad6e193729.diff LOG: [libclc] Move mem_fence and barrier to clc library (#151446) The __clc_mem_fence and __clc_work_group_barrier functions each take two parameters, memory_scope and memory_order. The design allows the clc functions to implement SPIR-V ControlBarrier and MemoryBarrier functions in the future. The default memory ordering in clc is set to __ATOMIC_SEQ_CST, which is also the default and strongest ordering in OpenCL and C++. The OpenCL cl_mem_fence_flags parameter is converted to a combination of __MEMORY_SCOPE_DEVICE and __MEMORY_SCOPE_WRKGRP, which is passed to clc. llvm-diff shows no change to nvptx64--nvidiacl.bc. llvm-diff shows a small change to amdgcn--amdhsa.bc, and the number of LLVM IR instructions is reduced by 1: https://alive2.llvm.org/ce/z/_Uhqvt Added: libclc/clc/include/clc/mem_fence/clc_mem_fence.h libclc/clc/include/clc/synchronization/clc_work_group_barrier.h libclc/clc/lib/amdgcn/mem_fence/clc_mem_fence.cl libclc/clc/lib/amdgcn/synchronization/clc_work_group_barrier.cl libclc/clc/lib/ptx-nvidiacl/mem_fence/clc_mem_fence.cl libclc/clc/lib/ptx-nvidiacl/synchronization/clc_work_group_barrier.cl libclc/opencl/include/clc/opencl/synchronization/utils.h Modified: libclc/clc/lib/amdgcn/SOURCES libclc/clc/lib/ptx-nvidiacl/SOURCES libclc/opencl/include/clc/opencl/synchronization/cl_mem_fence_flags.h libclc/opencl/lib/amdgcn/mem_fence/fence.cl libclc/opencl/lib/amdgcn/synchronization/barrier.cl libclc/opencl/lib/ptx-nvidiacl/mem_fence/fence.cl libclc/opencl/lib/ptx-nvidiacl/synchronization/barrier.cl Removed: ################################################################################ diff --git a/libclc/clc/include/clc/mem_fence/clc_mem_fence.h b/libclc/clc/include/clc/mem_fence/clc_mem_fence.h new file mode 100644 index 0000000000000..2321634c76842 --- /dev/null +++ 
b/libclc/clc/include/clc/mem_fence/clc_mem_fence.h @@ -0,0 +1,17 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_MEM_FENCE_CLC_MEM_FENCE_H__ +#define __CLC_MEM_FENCE_CLC_MEM_FENCE_H__ + +#include <clc/internal/clc.h> + +_CLC_OVERLOAD _CLC_DECL void __clc_mem_fence(int memory_scope, + int memory_order); + +#endif // __CLC_MEM_FENCE_CLC_MEM_FENCE_H__ diff --git a/libclc/clc/include/clc/synchronization/clc_work_group_barrier.h b/libclc/clc/include/clc/synchronization/clc_work_group_barrier.h new file mode 100644 index 0000000000000..5f864e1057b8b --- /dev/null +++ b/libclc/clc/include/clc/synchronization/clc_work_group_barrier.h @@ -0,0 +1,17 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_SYNCHRONIZATION_CLC_WORK_GROUP_BARRIER_H__ +#define __CLC_SYNCHRONIZATION_CLC_WORK_GROUP_BARRIER_H__ + +#include <clc/internal/clc.h> + +_CLC_OVERLOAD _CLC_DECL void __clc_work_group_barrier(int memory_scope, + int memory_order); + +#endif // __CLC_SYNCHRONIZATION_CLC_WORK_GROUP_BARRIER_H__ diff --git a/libclc/clc/lib/amdgcn/SOURCES b/libclc/clc/lib/amdgcn/SOURCES index d91f08533e149..76c3266e3af7b 100644 --- a/libclc/clc/lib/amdgcn/SOURCES +++ b/libclc/clc/lib/amdgcn/SOURCES @@ -1,4 +1,6 @@ math/clc_ldexp_override.cl +mem_fence/clc_mem_fence.cl +synchronization/clc_work_group_barrier.cl workitem/clc_get_global_offset.cl workitem/clc_get_global_size.cl workitem/clc_get_group_id.cl diff --git a/libclc/clc/lib/amdgcn/mem_fence/clc_mem_fence.cl b/libclc/clc/lib/amdgcn/mem_fence/clc_mem_fence.cl new file mode 100644 index 0000000000000..9e6460313718e --- /dev/null +++ b/libclc/clc/lib/amdgcn/mem_fence/clc_mem_fence.cl @@ -0,0 +1,37 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include <clc/mem_fence/clc_mem_fence.h> + +void __clc_amdgcn_s_waitcnt(unsigned flags); + +// s_waitcnt takes 16bit argument with a combined number of maximum allowed +// pending operations: +// [12:8] LGKM -- LDS, GDS, Konstant (SMRD), Messages +// [7] -- undefined +// [6:4] -- exports, GDS, and mem write +// [3:0] -- vector memory operations + +// Newer clang supports __builtin_amdgcn_s_waitcnt +#if __clang_major__ >= 5 +#define __waitcnt(x) __builtin_amdgcn_s_waitcnt(x) +#else +#define __waitcnt(x) __clc_amdgcn_s_waitcnt(x) +_CLC_DEF void __clc_amdgcn_s_waitcnt(unsigned) __asm("llvm.amdgcn.s.waitcnt"); +#endif + +_CLC_OVERLOAD _CLC_DEF void __clc_mem_fence(int memory_scope, + int memory_order) { + if (memory_scope & __MEMORY_SCOPE_DEVICE) { + // scalar loads are counted with LGKM but we don't know whether + // the compiler turned any loads to scalar + __waitcnt(0); + } else if (memory_scope & __MEMORY_SCOPE_WRKGRP) + __waitcnt(0xff); // LGKM is [12:8] +} +#undef __waitcnt diff --git a/libclc/clc/lib/amdgcn/synchronization/clc_work_group_barrier.cl b/libclc/clc/lib/amdgcn/synchronization/clc_work_group_barrier.cl new file mode 100644 index 0000000000000..ff3628fa7c339 --- /dev/null +++ b/libclc/clc/lib/amdgcn/synchronization/clc_work_group_barrier.cl @@ -0,0 +1,16 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include <clc/mem_fence/clc_mem_fence.h> +#include <clc/synchronization/clc_work_group_barrier.h> + +_CLC_OVERLOAD _CLC_DEF void __clc_work_group_barrier(int memory_scope, + int memory_order) { + __clc_mem_fence(memory_scope, memory_order); + __builtin_amdgcn_s_barrier(); +} diff --git a/libclc/clc/lib/ptx-nvidiacl/SOURCES b/libclc/clc/lib/ptx-nvidiacl/SOURCES index 05368c5e4d4e3..b6f50654f89c5 100644 --- a/libclc/clc/lib/ptx-nvidiacl/SOURCES +++ b/libclc/clc/lib/ptx-nvidiacl/SOURCES @@ -1,3 +1,5 @@ +mem_fence/clc_mem_fence.cl +synchronization/clc_work_group_barrier.cl workitem/clc_get_global_id.cl workitem/clc_get_group_id.cl workitem/clc_get_local_id.cl diff --git a/libclc/clc/lib/ptx-nvidiacl/mem_fence/clc_mem_fence.cl b/libclc/clc/lib/ptx-nvidiacl/mem_fence/clc_mem_fence.cl new file mode 100644 index 0000000000000..b3e2375e755a2 --- /dev/null +++ b/libclc/clc/lib/ptx-nvidiacl/mem_fence/clc_mem_fence.cl @@ -0,0 +1,15 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include <clc/mem_fence/clc_mem_fence.h> + +_CLC_OVERLOAD _CLC_DEF void __clc_mem_fence(int memory_scope, + int memory_order) { + if (memory_scope & (__MEMORY_SCOPE_DEVICE | __MEMORY_SCOPE_WRKGRP)) + __nvvm_membar_cta(); +} diff --git a/libclc/clc/lib/ptx-nvidiacl/synchronization/clc_work_group_barrier.cl b/libclc/clc/lib/ptx-nvidiacl/synchronization/clc_work_group_barrier.cl new file mode 100644 index 0000000000000..6cb37a38f06ac --- /dev/null +++ b/libclc/clc/lib/ptx-nvidiacl/synchronization/clc_work_group_barrier.cl @@ -0,0 +1,14 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include <clc/synchronization/clc_work_group_barrier.h> + +_CLC_OVERLOAD _CLC_DEF void __clc_work_group_barrier(int memory_scope, + int memory_order) { + __syncthreads(); +} diff --git a/libclc/opencl/include/clc/opencl/synchronization/cl_mem_fence_flags.h b/libclc/opencl/include/clc/opencl/synchronization/cl_mem_fence_flags.h index 6636515fca47d..7b2f701c1ff99 100644 --- a/libclc/opencl/include/clc/opencl/synchronization/cl_mem_fence_flags.h +++ b/libclc/opencl/include/clc/opencl/synchronization/cl_mem_fence_flags.h @@ -13,5 +13,6 @@ typedef uint cl_mem_fence_flags; #define CLK_LOCAL_MEM_FENCE 1 #define CLK_GLOBAL_MEM_FENCE 2 +#define CLK_IMAGE_MEM_FENCE 4 #endif // __CLC_OPENCL_SYNCHRONIZATION_CL_MEM_FENCE_FLAGS_H__ diff --git a/libclc/opencl/include/clc/opencl/synchronization/utils.h b/libclc/opencl/include/clc/opencl/synchronization/utils.h new file mode 100644 index 0000000000000..cf3baf28cb5f1 --- /dev/null 
+++ b/libclc/opencl/include/clc/opencl/synchronization/utils.h @@ -0,0 +1,24 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_OPENCL_SYNCHRONIZATION_UTILS_H__ +#define __CLC_OPENCL_SYNCHRONIZATION_UTILS_H__ + +#include <clc/internal/clc.h> +#include <clc/opencl/synchronization/cl_mem_fence_flags.h> + +_CLC_INLINE int getCLCMemoryScope(cl_mem_fence_flags flag) { + int memory_scope = 0; + if (flag & CLK_GLOBAL_MEM_FENCE) + memory_scope |= __MEMORY_SCOPE_DEVICE; + if (flag & CLK_LOCAL_MEM_FENCE) + memory_scope |= __MEMORY_SCOPE_WRKGRP; + return memory_scope; +} + +#endif // __CLC_OPENCL_SYNCHRONIZATION_UTILS_H__ diff --git a/libclc/opencl/lib/amdgcn/mem_fence/fence.cl b/libclc/opencl/lib/amdgcn/mem_fence/fence.cl index 88b953005aae6..81216d6a26cf2 100644 --- a/libclc/opencl/lib/amdgcn/mem_fence/fence.cl +++ b/libclc/opencl/lib/amdgcn/mem_fence/fence.cl @@ -6,34 +6,15 @@ // //===----------------------------------------------------------------------===// +#include <clc/mem_fence/clc_mem_fence.h> #include <clc/opencl/explicit_fence/explicit_memory_fence.h> - -void __clc_amdgcn_s_waitcnt(unsigned flags); - -// s_waitcnt takes 16bit argument with a combined number of maximum allowed -// pending operations: -// [12:8] LGKM -- LDS, GDS, Konstant (SMRD), Messages -// [7] -- undefined -// [6:4] -- exports, GDS, and mem write -// [3:0] -- vector memory operations - -// Newer clang supports __builtin_amdgcn_s_waitcnt -#if __clang_major__ >= 5 -#define __waitcnt(x) __builtin_amdgcn_s_waitcnt(x) -#else -#define __waitcnt(x) __clc_amdgcn_s_waitcnt(x) -_CLC_DEF void __clc_amdgcn_s_waitcnt(unsigned) __asm("llvm.amdgcn.s.waitcnt"); -#endif 
+#include <clc/opencl/synchronization/utils.h> _CLC_DEF _CLC_OVERLOAD void mem_fence(cl_mem_fence_flags flags) { - if (flags & CLK_GLOBAL_MEM_FENCE) { - // scalar loads are counted with LGKM but we don't know whether - // the compiler turned any loads to scalar - __waitcnt(0); - } else if (flags & CLK_LOCAL_MEM_FENCE) - __waitcnt(0xff); // LGKM is [12:8] + int memory_scope = getCLCMemoryScope(flags); + int memory_order = __ATOMIC_SEQ_CST; + __clc_mem_fence(memory_scope, memory_order); } -#undef __waitcnt // We don't have separate mechanism for read and write fences _CLC_DEF _CLC_OVERLOAD void read_mem_fence(cl_mem_fence_flags flags) { diff --git a/libclc/opencl/lib/amdgcn/synchronization/barrier.cl b/libclc/opencl/lib/amdgcn/synchronization/barrier.cl index 5203db72f484c..c8322e602302c 100644 --- a/libclc/opencl/lib/amdgcn/synchronization/barrier.cl +++ b/libclc/opencl/lib/amdgcn/synchronization/barrier.cl @@ -6,10 +6,12 @@ // //===----------------------------------------------------------------------===// -#include <clc/opencl/explicit_fence/explicit_memory_fence.h> #include <clc/opencl/synchronization/barrier.h> +#include <clc/opencl/synchronization/utils.h> +#include <clc/synchronization/clc_work_group_barrier.h> _CLC_DEF _CLC_OVERLOAD void barrier(cl_mem_fence_flags flags) { - mem_fence(flags); - __builtin_amdgcn_s_barrier(); + int memory_scope = getCLCMemoryScope(flags); + int memory_order = __ATOMIC_SEQ_CST; + __clc_work_group_barrier(memory_scope, memory_order); } diff --git a/libclc/opencl/lib/ptx-nvidiacl/mem_fence/fence.cl b/libclc/opencl/lib/ptx-nvidiacl/mem_fence/fence.cl index d24569ecda1bc..e22ed870a7e6b 100644 --- a/libclc/opencl/lib/ptx-nvidiacl/mem_fence/fence.cl +++ b/libclc/opencl/lib/ptx-nvidiacl/mem_fence/fence.cl @@ -6,11 +6,14 @@ // //===----------------------------------------------------------------------===// +#include <clc/mem_fence/clc_mem_fence.h> #include <clc/opencl/explicit_fence/explicit_memory_fence.h> +#include 
<clc/opencl/synchronization/utils.h> _CLC_DEF _CLC_OVERLOAD void mem_fence(cl_mem_fence_flags flags) { - if (flags & (CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE)) - __nvvm_membar_cta(); + int memory_scope = getCLCMemoryScope(flags); + int memory_order = __ATOMIC_SEQ_CST; + __clc_mem_fence(memory_scope, memory_order); } // We do not have separate mechanism for read and write fences. diff --git a/libclc/opencl/lib/ptx-nvidiacl/synchronization/barrier.cl b/libclc/opencl/lib/ptx-nvidiacl/synchronization/barrier.cl index 7c57478795dda..c8322e602302c 100644 --- a/libclc/opencl/lib/ptx-nvidiacl/synchronization/barrier.cl +++ b/libclc/opencl/lib/ptx-nvidiacl/synchronization/barrier.cl @@ -7,7 +7,11 @@ //===----------------------------------------------------------------------===// #include <clc/opencl/synchronization/barrier.h> +#include <clc/opencl/synchronization/utils.h> +#include <clc/synchronization/clc_work_group_barrier.h> _CLC_DEF _CLC_OVERLOAD void barrier(cl_mem_fence_flags flags) { - __syncthreads(); + int memory_scope = getCLCMemoryScope(flags); + int memory_order = __ATOMIC_SEQ_CST; + __clc_work_group_barrier(memory_scope, memory_order); } _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits