llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) <details> <summary>Changes</summary> For AMDGPU these are identical to the uniform case. Stub out the missing cases with traps to avoid test failures from undefined symbols while keeping the structure consistent. --- Patch is 28.42 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/188929.diff 10 Files Affected: - (added) libclc/clc/include/clc/subgroup/clc_sub_group_non_uniform_reduce.h (+29) - (added) libclc/clc/include/clc/subgroup/clc_sub_group_non_uniform_reduce.inc (+33) - (modified) libclc/clc/lib/amdgpu/CMakeLists.txt (+1) - (added) libclc/clc/lib/amdgpu/subgroup/clc_sub_group_non_uniform_reduce.cl (+384) - (added) libclc/clc/lib/amdgpu/subgroup/clc_sub_group_non_uniform_reduce.inc (+87) - (modified) libclc/clc/lib/amdgpu/subgroup/clc_sub_group_reduce.cl (+8-134) - (added) libclc/clc/lib/amdgpu/subgroup/clc_sub_group_reduce.inc (+26) - (modified) libclc/opencl/lib/generic/CMakeLists.txt (+1) - (added) libclc/opencl/lib/generic/subgroup/sub_group_non_uniform_reduce.cl (+30) - (added) libclc/opencl/lib/generic/subgroup/sub_group_non_uniform_reduce.inc (+48) ``````````diff diff --git a/libclc/clc/include/clc/subgroup/clc_sub_group_non_uniform_reduce.h b/libclc/clc/include/clc/subgroup/clc_sub_group_non_uniform_reduce.h new file mode 100644 index 0000000000000..cc73ea1f78bf1 --- /dev/null +++ b/libclc/clc/include/clc/subgroup/clc_sub_group_non_uniform_reduce.h @@ -0,0 +1,29 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_SUBGROUP_CLC_SUB_GROUP_NON_UNIFORM_REDUCE_H__ +#define __CLC_SUBGROUP_CLC_SUB_GROUP_NON_UNIFORM_REDUCE_H__ + +#include "clc/internal/clc.h" + +#define __CLC_BODY "clc/subgroup/clc_sub_group_non_uniform_reduce.inc" +#include "clc/integer/gentype.inc" + +#define __CLC_BODY "clc/subgroup/clc_sub_group_non_uniform_reduce.inc" +#include "clc/math/gentype.inc" + +_CLC_DECL _CLC_OVERLOAD int +__clc_sub_group_non_uniform_reduce_logical_and(int x); + +_CLC_DECL _CLC_OVERLOAD int +__clc_sub_group_non_uniform_reduce_logical_or(int x); + +_CLC_DECL _CLC_OVERLOAD int +__clc_sub_group_non_uniform_reduce_logical_xor(int x); + +#endif // __CLC_SUBGROUP_CLC_SUB_GROUP_NON_UNIFORM_REDUCE_H__ diff --git a/libclc/clc/include/clc/subgroup/clc_sub_group_non_uniform_reduce.inc b/libclc/clc/include/clc/subgroup/clc_sub_group_non_uniform_reduce.inc new file mode 100644 index 0000000000000..f7b82d3119ed9 --- /dev/null +++ b/libclc/clc/include/clc/subgroup/clc_sub_group_non_uniform_reduce.inc @@ -0,0 +1,33 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifdef __CLC_SCALAR +_CLC_DECL _CLC_OVERLOAD __CLC_GENTYPE +__clc_sub_group_non_uniform_reduce_add(__CLC_GENTYPE x); + +_CLC_DECL _CLC_OVERLOAD __CLC_GENTYPE +__clc_sub_group_non_uniform_reduce_mul(__CLC_GENTYPE x); + +_CLC_DECL _CLC_OVERLOAD __CLC_GENTYPE +__clc_sub_group_non_uniform_reduce_min(__CLC_GENTYPE x); + +_CLC_DECL _CLC_OVERLOAD __CLC_GENTYPE +__clc_sub_group_non_uniform_reduce_max(__CLC_GENTYPE x); + +#ifndef __CLC_FPSIZE +_CLC_DECL _CLC_OVERLOAD __CLC_GENTYPE +__clc_sub_group_non_uniform_reduce_and(__CLC_GENTYPE x); + +_CLC_DECL _CLC_OVERLOAD __CLC_GENTYPE +__clc_sub_group_non_uniform_reduce_or(__CLC_GENTYPE x); + +_CLC_DECL _CLC_OVERLOAD __CLC_GENTYPE +__clc_sub_group_non_uniform_reduce_xor(__CLC_GENTYPE x); +#endif // __CLC_FPSIZE + +#endif // __CLC_SCALAR diff --git a/libclc/clc/lib/amdgpu/CMakeLists.txt b/libclc/clc/lib/amdgpu/CMakeLists.txt index 186b6ab0b85b1..a5cd47fab4462 100644 --- a/libclc/clc/lib/amdgpu/CMakeLists.txt +++ b/libclc/clc/lib/amdgpu/CMakeLists.txt @@ -29,6 +29,7 @@ libclc_configure_source_list(CLC_AMDGPU_SOURCES mem_fence/clc_mem_fence.cl subgroup/clc_subgroup.cl subgroup/clc_sub_group_broadcast.cl + subgroup/clc_sub_group_non_uniform_reduce.cl subgroup/clc_sub_group_reduce.cl subgroup/clc_sub_group_scan.cl synchronization/clc_sub_group_barrier.cl diff --git a/libclc/clc/lib/amdgpu/subgroup/clc_sub_group_non_uniform_reduce.cl b/libclc/clc/lib/amdgpu/subgroup/clc_sub_group_non_uniform_reduce.cl new file mode 100644 index 0000000000000..00a485e9405a3 --- /dev/null +++ b/libclc/clc/lib/amdgpu/subgroup/clc_sub_group_non_uniform_reduce.cl @@ -0,0 +1,384 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clc/subgroup/clc_sub_group_non_uniform_reduce.h" + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST uint +__clc_sub_group_non_uniform_reduce_add(uint x) { + return __builtin_amdgcn_wave_reduce_add_u32(x, 0); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST int +__clc_sub_group_non_uniform_reduce_add(int x) { + return (int)__clc_sub_group_non_uniform_reduce_add((uint)x); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST ulong +__clc_sub_group_non_uniform_reduce_add(ulong x) { + return __builtin_amdgcn_wave_reduce_add_u64(x, 0); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST long +__clc_sub_group_non_uniform_reduce_add(long x) { + return (long)__clc_sub_group_non_uniform_reduce_add((ulong)x); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST uint +__clc_sub_group_non_uniform_reduce_min(uint x) { + return __builtin_amdgcn_wave_reduce_min_u32(x, 0); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST int +__clc_sub_group_non_uniform_reduce_min(int x) { + return __builtin_amdgcn_wave_reduce_min_i32(x, 0); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST ulong +__clc_sub_group_non_uniform_reduce_min(ulong x) { + return __builtin_amdgcn_wave_reduce_min_u64(x, 0); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST long +__clc_sub_group_non_uniform_reduce_min(long x) { + return __builtin_amdgcn_wave_reduce_min_i64(x, 0); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST uint +__clc_sub_group_non_uniform_reduce_max(uint x) { + return __builtin_amdgcn_wave_reduce_max_u32(x, 0); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST int +__clc_sub_group_non_uniform_reduce_max(int x) { + return __builtin_amdgcn_wave_reduce_max_i32(x, 0); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST ulong +__clc_sub_group_non_uniform_reduce_max(ulong x) { + return __builtin_amdgcn_wave_reduce_max_u64(x, 0); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST long +__clc_sub_group_non_uniform_reduce_max(long x) { + return __builtin_amdgcn_wave_reduce_max_i64(x, 
0); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST float +__clc_sub_group_non_uniform_reduce_add(float x) { + return __builtin_amdgcn_wave_reduce_fadd_f32(x, 0); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST double +__clc_sub_group_non_uniform_reduce_add(double x) { + return __builtin_amdgcn_wave_reduce_fadd_f64(x, 0); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST float +__clc_sub_group_non_uniform_reduce_min(float x) { + return __builtin_amdgcn_wave_reduce_fmin_f32(x, 0); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST double +__clc_sub_group_non_uniform_reduce_min(double x) { + return __builtin_amdgcn_wave_reduce_fmin_f64(x, 0); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST float +__clc_sub_group_non_uniform_reduce_max(float x) { + return __builtin_amdgcn_wave_reduce_fmax_f32(x, 0); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST double +__clc_sub_group_non_uniform_reduce_max(double x) { + return __builtin_amdgcn_wave_reduce_fmax_f64(x, 0); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST half +__clc_sub_group_non_uniform_reduce_add(half x) { + // FIXME: There should be a direct half builtin available. + return (float)__clc_sub_group_non_uniform_reduce_add((float)x); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST half +__clc_sub_group_non_uniform_reduce_min(half x) { + // FIXME: There should be a direct half builtin available. + return (float)__clc_sub_group_non_uniform_reduce_min((float)x); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST half +__clc_sub_group_non_uniform_reduce_max(half x) { + // FIXME: There should be a direct half builtin available. + return (float)__clc_sub_group_non_uniform_reduce_max((float)x); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST uchar +__clc_sub_group_non_uniform_reduce_add(uchar x) { + return (uchar)__clc_sub_group_non_uniform_reduce_add((uint)x); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST char +__clc_sub_group_non_uniform_reduce_add(char x) { + return (char)__clc_sub_group_non_uniform_reduce_add((int)x); +} + +// FIXME: There should be a direct short builtin available. 
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST ushort +__clc_sub_group_non_uniform_reduce_add(ushort x) { + return (ushort)__clc_sub_group_non_uniform_reduce_add((uint)x); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST short +__clc_sub_group_non_uniform_reduce_add(short x) { + return (int)__clc_sub_group_non_uniform_reduce_add((int)x); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST uchar +__clc_sub_group_non_uniform_reduce_min(uchar x) { + return (uchar)__clc_sub_group_non_uniform_reduce_min((uint)x); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST char +__clc_sub_group_non_uniform_reduce_min(char x) { + return (char)__clc_sub_group_non_uniform_reduce_min((int)x); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST ushort +__clc_sub_group_non_uniform_reduce_min(ushort x) { + return (ushort)__clc_sub_group_non_uniform_reduce_min((uint)x); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST short +__clc_sub_group_non_uniform_reduce_min(short x) { + return (int)__clc_sub_group_non_uniform_reduce_min((int)x); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST uchar +__clc_sub_group_non_uniform_reduce_max(uchar x) { + return (uchar)__clc_sub_group_non_uniform_reduce_max((uint)x); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST char +__clc_sub_group_non_uniform_reduce_max(char x) { + return (char)__clc_sub_group_non_uniform_reduce_max((int)x); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST ushort +__clc_sub_group_non_uniform_reduce_max(ushort x) { + return (ushort)__clc_sub_group_non_uniform_reduce_max((uint)x); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST short +__clc_sub_group_non_uniform_reduce_max(short x) { + return (int)__clc_sub_group_non_uniform_reduce_max((int)x); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST uint +__clc_sub_group_non_uniform_reduce_and(uint x) { + return __builtin_amdgcn_wave_reduce_and_b32(x, 0); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST int +__clc_sub_group_non_uniform_reduce_and(int x) { + return (int)__clc_sub_group_non_uniform_reduce_and((uint)x); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST ulong 
+__clc_sub_group_non_uniform_reduce_and(ulong x) { + return __builtin_amdgcn_wave_reduce_and_b64(x, 0); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST long +__clc_sub_group_non_uniform_reduce_and(long x) { + return (long)__clc_sub_group_non_uniform_reduce_and((ulong)x); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST uint +__clc_sub_group_non_uniform_reduce_or(uint x) { + return __builtin_amdgcn_wave_reduce_or_b32(x, 0); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST int +__clc_sub_group_non_uniform_reduce_or(int x) { + return (int)__clc_sub_group_non_uniform_reduce_or((uint)x); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST ulong +__clc_sub_group_non_uniform_reduce_or(ulong x) { + return __builtin_amdgcn_wave_reduce_or_b64(x, 0); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST long +__clc_sub_group_non_uniform_reduce_or(long x) { + return (long)__clc_sub_group_non_uniform_reduce_or((ulong)x); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST uint +__clc_sub_group_non_uniform_reduce_xor(uint x) { + return __builtin_amdgcn_wave_reduce_xor_b32(x, 0); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST int +__clc_sub_group_non_uniform_reduce_xor(int x) { + return (int)__clc_sub_group_non_uniform_reduce_xor((uint)x); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST ulong +__clc_sub_group_non_uniform_reduce_xor(ulong x) { + return __builtin_amdgcn_wave_reduce_xor_b64(x, 0); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST long +__clc_sub_group_non_uniform_reduce_xor(long x) { + return (long)__clc_sub_group_non_uniform_reduce_xor((ulong)x); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST uchar +__clc_sub_group_non_uniform_reduce_and(uchar x) { + return (uchar)__clc_sub_group_non_uniform_reduce_and((uint)x); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST char +__clc_sub_group_non_uniform_reduce_and(char x) { + return (char)__clc_sub_group_non_uniform_reduce_and((int)x); +} + +// FIXME: There should be a direct short builtin available. 
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST ushort +__clc_sub_group_non_uniform_reduce_and(ushort x) { + return (ushort)__clc_sub_group_non_uniform_reduce_and((uint)x); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST short +__clc_sub_group_non_uniform_reduce_and(short x) { + return (int)__clc_sub_group_non_uniform_reduce_and((int)x); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST uchar +__clc_sub_group_non_uniform_reduce_or(uchar x) { + return (uchar)__clc_sub_group_non_uniform_reduce_or((uint)x); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST char +__clc_sub_group_non_uniform_reduce_or(char x) { + return (char)__clc_sub_group_non_uniform_reduce_or((int)x); +} + +// FIXME: There should be a direct short builtin available. +_CLC_DEF _CLC_OVERLOAD _CLC_CONST ushort +__clc_sub_group_non_uniform_reduce_or(ushort x) { + return (ushort)__clc_sub_group_non_uniform_reduce_or((uint)x); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST short +__clc_sub_group_non_uniform_reduce_or(short x) { + return (int)__clc_sub_group_non_uniform_reduce_or((int)x); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST uchar +__clc_sub_group_non_uniform_reduce_xor(uchar x) { + return (uchar)__clc_sub_group_non_uniform_reduce_xor((uint)x); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST char +__clc_sub_group_non_uniform_reduce_xor(char x) { + return (char)__clc_sub_group_non_uniform_reduce_xor((int)x); +} + +// FIXME: There should be a direct short builtin available. 
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST ushort +__clc_sub_group_non_uniform_reduce_xor(ushort x) { + return (ushort)__clc_sub_group_non_uniform_reduce_xor((uint)x); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST short +__clc_sub_group_non_uniform_reduce_xor(short x) { + return (int)__clc_sub_group_non_uniform_reduce_xor((int)x); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST uint +__clc_sub_group_non_uniform_reduce_mul(uint x) { + (void)x; + // TODO: + __builtin_trap(); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST int +__clc_sub_group_non_uniform_reduce_mul(int x) { + return (int)__clc_sub_group_non_uniform_reduce_mul((uint)x); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST ulong +__clc_sub_group_non_uniform_reduce_mul(ulong x) { + (void)x; + // TODO: + __builtin_trap(); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST long +__clc_sub_group_non_uniform_reduce_mul(long x) { + return (long)__clc_sub_group_non_uniform_reduce_mul((ulong)x); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST char +__clc_sub_group_non_uniform_reduce_mul(char x) { + return (char)__clc_sub_group_non_uniform_reduce_mul((int)x); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST uchar +__clc_sub_group_non_uniform_reduce_mul(uchar x) { + return (uchar)__clc_sub_group_non_uniform_reduce_mul((uint)x); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST short +__clc_sub_group_non_uniform_reduce_mul(short x) { + return (short)__clc_sub_group_non_uniform_reduce_mul((int)x); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST ushort +__clc_sub_group_non_uniform_reduce_mul(ushort x) { + return (ushort)__clc_sub_group_non_uniform_reduce_mul((uint)x); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST int +__clc_sub_group_non_uniform_reduce_logical_and(int predicate) { + // TODO: + (void)predicate; + __builtin_trap(); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST int +__clc_sub_group_non_uniform_reduce_logical_or(int predicate) { + // TODO: + (void)predicate; + __builtin_trap(); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST int +__clc_sub_group_non_uniform_reduce_logical_xor(int predicate) { + // TODO: + 
(void)predicate; + __builtin_trap(); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST float +__clc_sub_group_non_uniform_reduce_mul(float x) { + (void)x; + __builtin_trap(); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST double +__clc_sub_group_non_uniform_reduce_mul(double x) { + (void)x; + __builtin_trap(); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST half +__clc_sub_group_non_uniform_reduce_mul(half x) { + (void)x; + __builtin_trap(); +} diff --git a/libclc/clc/lib/amdgpu/subgroup/clc_sub_group_non_uniform_reduce.inc b/libclc/clc/lib/amdgpu/subgroup/clc_sub_group_non_uniform_reduce.inc new file mode 100644 index 0000000000000..ac331c5268aa9 --- /dev/null +++ b/libclc/clc/lib/amdgpu/subgroup/clc_sub_group_non_uniform_reduce.inc @@ -0,0 +1,87 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST int +__clc_sub_group_non_uniform_reduce_add(int x) { + return __clc_sub_group_reduce_add(x); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST uint +__clc_sub_group_non_uniform_reduce_add(uint x) { + return __clc_sub_group_reduce_add(x); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST int +__clc_sub_group_non_uniform_reduce_mul(int x) { + return __clc_sub_group_reduce_mul(x); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST uint +__clc_sub_group_non_uniform_reduce_mul(uint x) { + return __clc_sub_group_reduce_mul(x); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST int +__clc_sub_group_non_uniform_reduce_min(int x) { + return __clc_sub_group_reduce_min(x); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST uint +__clc_sub_group_non_uniform_reduce_min(uint x) { + return __clc_sub_group_reduce_min(x); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST int 
+__clc_sub_group_non_uniform_reduce_max(int x) { + return __clc_sub_group_reduce_max(x); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST uint +__clc_sub_group_non_uniform_reduce_max(uint x) { + return __clc_sub_group_reduce_max(x); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST long +__clc_sub_group_non_uniform_reduce_add(long x) { + return __clc_sub_group_reduce_add(x); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST ulong +__clc_sub_group_non_uniform_reduce_add(ulong x) { + return __clc_sub_group_reduce_add(x); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST long +__clc_sub_group_non_uniform_reduce_mul(long x) { + return __clc_sub_group_reduce_mul(x); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST ulong +__clc_sub_group_non_uniform_reduce_mul(ulong x) { + return __clc_sub_group_reduce_mul(x); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST long +__clc_sub_group_non_uniform_reduce_min(long x) { + return __clc_sub_group_reduce_min(x); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST ulong +__clc_sub_group_non_uniform_reduce_min(ulong x) { + return __clc_sub_group_reduce_min(x); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST long +__clc_sub_group_non_uniform_reduce_max(long x) { + return __clc_sub_group_reduce_max(x); +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONST ulong +__clc_sub_group_non_uniform_reduce_max(ulong x) { + return __clc_sub_group_reduce_max(x); +} diff --git a/libclc/clc/lib/amdgpu/subgroup/clc_sub_group_reduce.cl b/libclc/clc/lib/amdgpu/subgroup/clc_sub_group_reduce.cl index 66d0130839d38..c74fe96459800 100644 --- a/libclc/clc/lib/amdgpu/subgroup/clc_sub_group_reduce.cl +++ b/libclc/clc/lib/amdgpu/subgroup/clc_sub_group_reduce.cl @@ -6,140 +6,14 @@ // //===----------------------------------------------------------------------===// -#include "clc/subgroup/clc_sub_group_broadcast.h" +#include "clc/subgroup/clc_sub_group_non_uniform_reduce.h" +#include "clc/subgroup/clc_sub_group_reduce.h" -_CLC_DEF _CLC_OVERLOAD _CLC_CONST uint __clc_sub_group_reduce_add(uint x) { - return __builtin_amdgcn_wave_reduce_add_u32(x, 0); -} 
+// The implementation is the same as the nonuniform case, so just call the +// nonuniform versions of every function. -_CLC_DEF _CLC_OVERLOAD _CLC_CONST int __clc_sub_group_reduce_add(int x) { - return (int)__clc_sub_group_reduce_add((uint)x); -} +#define __CLC_BODY "clc_sub_group_reduce.inc" +#include "clc/integer/gentype.inc" -_CLC_DEF _CLC_OVERLOAD _CLC_CONST ulong __clc_sub_group_reduce_add(ulong x) { - return __builtin_amdgcn_wave_reduce_add_u64(x, 0); -} - -_CLC_DEF _CLC_OVERLOAD _CLC_CONST long __clc_sub_group_reduce_add(long x) { - return (long)__clc_sub_group_reduce_add((ulong)x); -} - -_CLC_DEF _CLC_OVERLOAD _CLC_CONST uint __clc_sub_group_reduce_min(uint x) { - return __builtin_amdgcn_wave_reduce_min_u32(x, 0); -} - -_CLC_DEF _CLC_OVERLOAD _CLC_CONST int __clc_sub_group_reduce_min(int x) { - return __builtin_amdgcn_wave_reduce_min_i32(x, 0); -} - -_CLC_DEF _CLC_OVERLOAD _CLC_CONST ulong __clc_sub_group_reduce_min(ulong x) { - return __builtin_amdgcn_wave_reduce_min_u64(x, 0); -} - -_CLC_DEF _CLC_OVERLOAD _CLC_CONST long __clc_sub_group_reduce_min(long x) { - return __builtin_amdgcn_wave_reduce_min_i64(x, 0); -} - -_CLC_DEF _CLC_OVERLOAD _CLC_CONST uint __clc_sub_group_reduce_max(uint x) { - return __builtin_amdgcn_wave_reduce_max_u32(x,... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/188929 _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
