https://github.com/wenju-he created https://github.com/llvm/llvm-project/pull/152275
It is necessary to add a MemorySemantic argument, which specifies the memory or address space to which the memory ordering is applied. The MemorySemantic argument is also necessary for implementing the SPIR-V MemoryBarrier instruction. Additionally, the implementation of __clc_mem_fence on Intel GPUs requires the MemorySemantic argument. Using __builtin_amdgcn_fence for AMDGPU is a follow-up to https://github.com/llvm/llvm-project/pull/151446#discussion_r2254006508 llvm-diff shows no change to nvptx64--nvidiacl.bc. >From c48a94749e7e4ee261895826f2df2e2c48f040ef Mon Sep 17 00:00:00 2001 From: Wenju He <wenju...@intel.com> Date: Wed, 6 Aug 2025 11:07:15 +0200 Subject: [PATCH] [libclc] update __clc_mem_fence: add MemorySemantic arg and use __builtin_amdgcn_fence for AMDGPU It is necessary to add a MemorySemantic argument, which specifies the memory or address space to which the memory ordering is applied. The MemorySemantic argument is also necessary for implementing the SPIR-V MemoryBarrier instruction. Additionally, the implementation of __clc_mem_fence on Intel GPUs requires the MemorySemantic argument. Using __builtin_amdgcn_fence for AMDGPU is a follow-up to https://github.com/llvm/llvm-project/pull/151446#discussion_r2254006508 llvm-diff shows no change to nvptx64--nvidiacl.bc. 
--- .../clc/include/clc/mem_fence/clc_mem_fence.h | 5 +- .../include/clc/mem_fence/clc_mem_semantic.h | 21 ++++++ .../synchronization/clc_work_group_barrier.h | 6 +- .../clc/lib/amdgcn/mem_fence/clc_mem_fence.cl | 67 ++++++++++++------- .../synchronization/clc_work_group_barrier.cl | 7 +- .../ptx-nvidiacl/mem_fence/clc_mem_fence.cl | 4 +- .../synchronization/clc_work_group_barrier.cl | 5 +- .../clc/opencl/synchronization/utils.h | 11 +++ libclc/opencl/lib/amdgcn/mem_fence/fence.cl | 3 +- .../lib/amdgcn/synchronization/barrier.cl | 3 +- .../lib/ptx-nvidiacl/mem_fence/fence.cl | 3 +- .../ptx-nvidiacl/synchronization/barrier.cl | 3 +- 12 files changed, 99 insertions(+), 39 deletions(-) create mode 100644 libclc/clc/include/clc/mem_fence/clc_mem_semantic.h diff --git a/libclc/clc/include/clc/mem_fence/clc_mem_fence.h b/libclc/clc/include/clc/mem_fence/clc_mem_fence.h index 2321634c76842..92502270c802f 100644 --- a/libclc/clc/include/clc/mem_fence/clc_mem_fence.h +++ b/libclc/clc/include/clc/mem_fence/clc_mem_fence.h @@ -10,8 +10,9 @@ #define __CLC_MEM_FENCE_CLC_MEM_FENCE_H__ #include <clc/internal/clc.h> +#include <clc/mem_fence/clc_mem_semantic.h> -_CLC_OVERLOAD _CLC_DECL void __clc_mem_fence(int memory_scope, - int memory_order); +_CLC_OVERLOAD _CLC_DECL void __clc_mem_fence(int memory_scope, int memory_order, + MemorySemantic memory_semantic); #endif // __CLC_MEM_FENCE_CLC_MEM_FENCE_H__ diff --git a/libclc/clc/include/clc/mem_fence/clc_mem_semantic.h b/libclc/clc/include/clc/mem_fence/clc_mem_semantic.h new file mode 100644 index 0000000000000..662ef9ce087e9 --- /dev/null +++ b/libclc/clc/include/clc/mem_fence/clc_mem_semantic.h @@ -0,0 +1,21 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_MEM_FENCE_CLC_MEM_SEMANTIC_H__ +#define __CLC_MEM_FENCE_CLC_MEM_SEMANTIC_H__ + +// The memory or address space to which the memory ordering is applied. +typedef enum MemorySemantic { + MEMORY_PRIVATE = 0x1, + MEMORY_LOCAL = 0x2, + MEMORY_GLOBAL = 0x4, + MEMORY_CONSTANT = 0x8, + MEMORY_GENERIC = 0x10 +} MemorySemantic; + +#endif // __CLC_MEM_FENCE_CLC_MEM_SEMANTIC_H__ diff --git a/libclc/clc/include/clc/synchronization/clc_work_group_barrier.h b/libclc/clc/include/clc/synchronization/clc_work_group_barrier.h index 5f864e1057b8b..bc8627782d3c0 100644 --- a/libclc/clc/include/clc/synchronization/clc_work_group_barrier.h +++ b/libclc/clc/include/clc/synchronization/clc_work_group_barrier.h @@ -10,8 +10,10 @@ #define __CLC_SYNCHRONIZATION_CLC_WORK_GROUP_BARRIER_H__ #include <clc/internal/clc.h> +#include <clc/mem_fence/clc_mem_semantic.h> -_CLC_OVERLOAD _CLC_DECL void __clc_work_group_barrier(int memory_scope, - int memory_order); +_CLC_OVERLOAD _CLC_DECL void +__clc_work_group_barrier(int memory_scope, int memory_order, + MemorySemantic memory_semantic); #endif // __CLC_SYNCHRONIZATION_CLC_WORK_GROUP_BARRIER_H__ diff --git a/libclc/clc/lib/amdgcn/mem_fence/clc_mem_fence.cl b/libclc/clc/lib/amdgcn/mem_fence/clc_mem_fence.cl index 9e6460313718e..880db3c2f29a3 100644 --- a/libclc/clc/lib/amdgcn/mem_fence/clc_mem_fence.cl +++ b/libclc/clc/lib/amdgcn/mem_fence/clc_mem_fence.cl @@ -8,30 +8,49 @@ #include <clc/mem_fence/clc_mem_fence.h> -void __clc_amdgcn_s_waitcnt(unsigned flags); +#define BUILTIN_FENCE_ORDER(memory_order, ...) 
\ + switch (memory_order) { \ + case __ATOMIC_ACQUIRE: \ + __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, __VA_ARGS__); \ + break; \ + case __ATOMIC_RELEASE: \ + __builtin_amdgcn_fence(__ATOMIC_RELEASE, __VA_ARGS__); \ + break; \ + case __ATOMIC_ACQ_REL: \ + __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, __VA_ARGS__); \ + break; \ + case __ATOMIC_SEQ_CST: \ + __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, __VA_ARGS__); \ + break; \ + default: \ + __builtin_unreachable(); \ + } \ + break; -// s_waitcnt takes 16bit argument with a combined number of maximum allowed -// pending operations: -// [12:8] LGKM -- LDS, GDS, Konstant (SMRD), Messages -// [7] -- undefined -// [6:4] -- exports, GDS, and mem write -// [3:0] -- vector memory operations +#define BUILTIN_FENCE(memory_scope, memory_order, ...) \ + switch (memory_scope) { \ + case __MEMORY_SCOPE_DEVICE: \ + BUILTIN_FENCE_ORDER(memory_order, "agent", ##__VA_ARGS__) \ + case __MEMORY_SCOPE_WRKGRP: \ + BUILTIN_FENCE_ORDER(memory_order, "workgroup", ##__VA_ARGS__) \ + case __MEMORY_SCOPE_WVFRNT: \ + BUILTIN_FENCE_ORDER(memory_order, "wavefront", ##__VA_ARGS__) \ + case __MEMORY_SCOPE_SINGLE: \ + BUILTIN_FENCE_ORDER(memory_order, "singlethread", ##__VA_ARGS__) \ + case __MEMORY_SCOPE_SYSTEM: \ + default: \ + BUILTIN_FENCE_ORDER(memory_order, "", ##__VA_ARGS__) \ + } -// Newer clang supports __builtin_amdgcn_s_waitcnt -#if __clang_major__ >= 5 -#define __waitcnt(x) __builtin_amdgcn_s_waitcnt(x) -#else -#define __waitcnt(x) __clc_amdgcn_s_waitcnt(x) -_CLC_DEF void __clc_amdgcn_s_waitcnt(unsigned) __asm("llvm.amdgcn.s.waitcnt"); -#endif - -_CLC_OVERLOAD _CLC_DEF void __clc_mem_fence(int memory_scope, - int memory_order) { - if (memory_scope & __MEMORY_SCOPE_DEVICE) { - // scalar loads are counted with LGKM but we don't know whether - // the compiler turned any loads to scalar - __waitcnt(0); - } else if (memory_scope & __MEMORY_SCOPE_WRKGRP) - __waitcnt(0xff); // LGKM is [12:8] +_CLC_OVERLOAD _CLC_DEF void __clc_mem_fence(int memory_scope, 
int memory_order, + MemorySemantic memory_semantic) { + if (memory_semantic == MEMORY_LOCAL) { + BUILTIN_FENCE(memory_scope, memory_order, "local") + } else if (memory_semantic == MEMORY_GLOBAL) { + BUILTIN_FENCE(memory_scope, memory_order, "global") + } else if (memory_semantic == (MEMORY_LOCAL | MEMORY_GLOBAL)) { + BUILTIN_FENCE(memory_scope, memory_order, "local", "global") + } else { + BUILTIN_FENCE(memory_scope, memory_order) + } } -#undef __waitcnt diff --git a/libclc/clc/lib/amdgcn/synchronization/clc_work_group_barrier.cl b/libclc/clc/lib/amdgcn/synchronization/clc_work_group_barrier.cl index ff3628fa7c339..a034ce1855cc1 100644 --- a/libclc/clc/lib/amdgcn/synchronization/clc_work_group_barrier.cl +++ b/libclc/clc/lib/amdgcn/synchronization/clc_work_group_barrier.cl @@ -9,8 +9,9 @@ #include <clc/mem_fence/clc_mem_fence.h> #include <clc/synchronization/clc_work_group_barrier.h> -_CLC_OVERLOAD _CLC_DEF void __clc_work_group_barrier(int memory_scope, - int memory_order) { - __clc_mem_fence(memory_scope, memory_order); +_CLC_OVERLOAD _CLC_DEF void +__clc_work_group_barrier(int memory_scope, int memory_order, + MemorySemantic memory_semantic) { + __clc_mem_fence(memory_scope, memory_order, memory_semantic); __builtin_amdgcn_s_barrier(); } diff --git a/libclc/clc/lib/ptx-nvidiacl/mem_fence/clc_mem_fence.cl b/libclc/clc/lib/ptx-nvidiacl/mem_fence/clc_mem_fence.cl index b3e2375e755a2..fb8da1690dfed 100644 --- a/libclc/clc/lib/ptx-nvidiacl/mem_fence/clc_mem_fence.cl +++ b/libclc/clc/lib/ptx-nvidiacl/mem_fence/clc_mem_fence.cl @@ -8,8 +8,8 @@ #include <clc/mem_fence/clc_mem_fence.h> -_CLC_OVERLOAD _CLC_DEF void __clc_mem_fence(int memory_scope, - int memory_order) { +_CLC_OVERLOAD _CLC_DEF void __clc_mem_fence(int memory_scope, int memory_order, + MemorySemantic memory_semantic) { if (memory_scope & (__MEMORY_SCOPE_DEVICE | __MEMORY_SCOPE_WRKGRP)) __nvvm_membar_cta(); } diff --git a/libclc/clc/lib/ptx-nvidiacl/synchronization/clc_work_group_barrier.cl 
b/libclc/clc/lib/ptx-nvidiacl/synchronization/clc_work_group_barrier.cl index 6cb37a38f06ac..d3c39c844752f 100644 --- a/libclc/clc/lib/ptx-nvidiacl/synchronization/clc_work_group_barrier.cl +++ b/libclc/clc/lib/ptx-nvidiacl/synchronization/clc_work_group_barrier.cl @@ -8,7 +8,8 @@ #include <clc/synchronization/clc_work_group_barrier.h> -_CLC_OVERLOAD _CLC_DEF void __clc_work_group_barrier(int memory_scope, - int memory_order) { +_CLC_OVERLOAD _CLC_DEF void +__clc_work_group_barrier(int memory_scope, int memory_order, + MemorySemantic memory_semantic) { __syncthreads(); } diff --git a/libclc/opencl/include/clc/opencl/synchronization/utils.h b/libclc/opencl/include/clc/opencl/synchronization/utils.h index cf3baf28cb5f1..a0b63ae2a45d9 100644 --- a/libclc/opencl/include/clc/opencl/synchronization/utils.h +++ b/libclc/opencl/include/clc/opencl/synchronization/utils.h @@ -10,6 +10,7 @@ #define __CLC_OPENCL_SYNCHRONIZATION_UTILS_H__ #include <clc/internal/clc.h> +#include <clc/mem_fence/clc_mem_semantic.h> #include <clc/opencl/synchronization/cl_mem_fence_flags.h> _CLC_INLINE int getCLCMemoryScope(cl_mem_fence_flags flag) { @@ -21,4 +22,14 @@ _CLC_INLINE int getCLCMemoryScope(cl_mem_fence_flags flag) { return memory_scope; } +_CLC_INLINE MemorySemantic getCLCMemorySemantic(cl_mem_fence_flags flag) { + if ((flag & CLK_LOCAL_MEM_FENCE) && (flag & CLK_GLOBAL_MEM_FENCE)) + return MEMORY_LOCAL | MEMORY_GLOBAL; + if (flag & CLK_LOCAL_MEM_FENCE) + return MEMORY_LOCAL; + if (flag & CLK_GLOBAL_MEM_FENCE) + return MEMORY_GLOBAL; + __builtin_unreachable(); +} + #endif // __CLC_OPENCL_SYNCHRONIZATION_UTILS_H__ diff --git a/libclc/opencl/lib/amdgcn/mem_fence/fence.cl b/libclc/opencl/lib/amdgcn/mem_fence/fence.cl index 81216d6a26cf2..ccec510d95141 100644 --- a/libclc/opencl/lib/amdgcn/mem_fence/fence.cl +++ b/libclc/opencl/lib/amdgcn/mem_fence/fence.cl @@ -13,7 +13,8 @@ _CLC_DEF _CLC_OVERLOAD void mem_fence(cl_mem_fence_flags flags) { int memory_scope = getCLCMemoryScope(flags); int 
memory_order = __ATOMIC_SEQ_CST; - __clc_mem_fence(memory_scope, memory_order); + MemorySemantic memory_semantic = getCLCMemorySemantic(flags); + __clc_mem_fence(memory_scope, memory_order, memory_semantic); } // We don't have separate mechanism for read and write fences diff --git a/libclc/opencl/lib/amdgcn/synchronization/barrier.cl b/libclc/opencl/lib/amdgcn/synchronization/barrier.cl index c8322e602302c..13427d2f2102b 100644 --- a/libclc/opencl/lib/amdgcn/synchronization/barrier.cl +++ b/libclc/opencl/lib/amdgcn/synchronization/barrier.cl @@ -13,5 +13,6 @@ _CLC_DEF _CLC_OVERLOAD void barrier(cl_mem_fence_flags flags) { int memory_scope = getCLCMemoryScope(flags); int memory_order = __ATOMIC_SEQ_CST; - __clc_work_group_barrier(memory_scope, memory_order); + MemorySemantic memory_semantic = getCLCMemorySemantic(flags); + __clc_work_group_barrier(memory_scope, memory_order, memory_semantic); } diff --git a/libclc/opencl/lib/ptx-nvidiacl/mem_fence/fence.cl b/libclc/opencl/lib/ptx-nvidiacl/mem_fence/fence.cl index e22ed870a7e6b..6291fbc5b41ad 100644 --- a/libclc/opencl/lib/ptx-nvidiacl/mem_fence/fence.cl +++ b/libclc/opencl/lib/ptx-nvidiacl/mem_fence/fence.cl @@ -13,7 +13,8 @@ _CLC_DEF _CLC_OVERLOAD void mem_fence(cl_mem_fence_flags flags) { int memory_scope = getCLCMemoryScope(flags); int memory_order = __ATOMIC_SEQ_CST; - __clc_mem_fence(memory_scope, memory_order); + MemorySemantic memory_semantic = getCLCMemorySemantic(flags); + __clc_mem_fence(memory_scope, memory_order, memory_semantic); } // We do not have separate mechanism for read and write fences. 
diff --git a/libclc/opencl/lib/ptx-nvidiacl/synchronization/barrier.cl b/libclc/opencl/lib/ptx-nvidiacl/synchronization/barrier.cl index c8322e602302c..13427d2f2102b 100644 --- a/libclc/opencl/lib/ptx-nvidiacl/synchronization/barrier.cl +++ b/libclc/opencl/lib/ptx-nvidiacl/synchronization/barrier.cl @@ -13,5 +13,6 @@ _CLC_DEF _CLC_OVERLOAD void barrier(cl_mem_fence_flags flags) { int memory_scope = getCLCMemoryScope(flags); int memory_order = __ATOMIC_SEQ_CST; - __clc_work_group_barrier(memory_scope, memory_order); + MemorySemantic memory_semantic = getCLCMemorySemantic(flags); + __clc_work_group_barrier(memory_scope, memory_order, memory_semantic); } _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits