https://github.com/arsenm created 
https://github.com/llvm/llvm-project/pull/188929

For AMDGPU these are identical to the uniform case. Stub out the missing
cases with traps to avoid test failures from undefined symbols while keeping
the structure consistent.

From 4b715b3428240636ae3db41c8d5d75ab175e1099 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <[email protected]>
Date: Wed, 25 Mar 2026 11:41:49 +0100
Subject: [PATCH] libclc: Partially implement nonuniform subgroup reduce
 functions

For AMDGPU these are identical to the uniform case. Stub out the missing
cases with traps to avoid test failures from undefined symbols while keeping
the structure consistent.
---
 .../clc_sub_group_non_uniform_reduce.h        |  29 ++
 .../clc_sub_group_non_uniform_reduce.inc      |  33 ++
 libclc/clc/lib/amdgpu/CMakeLists.txt          |   1 +
 .../clc_sub_group_non_uniform_reduce.cl       | 384 ++++++++++++++++++
 .../clc_sub_group_non_uniform_reduce.inc      |  87 ++++
 .../amdgpu/subgroup/clc_sub_group_reduce.cl   | 142 +------
 .../amdgpu/subgroup/clc_sub_group_reduce.inc  |  26 ++
 libclc/opencl/lib/generic/CMakeLists.txt      |   1 +
 .../subgroup/sub_group_non_uniform_reduce.cl  |  30 ++
 .../subgroup/sub_group_non_uniform_reduce.inc |  48 +++
 10 files changed, 647 insertions(+), 134 deletions(-)
 create mode 100644 libclc/clc/include/clc/subgroup/clc_sub_group_non_uniform_reduce.h
 create mode 100644 libclc/clc/include/clc/subgroup/clc_sub_group_non_uniform_reduce.inc
 create mode 100644 libclc/clc/lib/amdgpu/subgroup/clc_sub_group_non_uniform_reduce.cl
 create mode 100644 libclc/clc/lib/amdgpu/subgroup/clc_sub_group_non_uniform_reduce.inc
 create mode 100644 libclc/clc/lib/amdgpu/subgroup/clc_sub_group_reduce.inc
 create mode 100644 libclc/opencl/lib/generic/subgroup/sub_group_non_uniform_reduce.cl
 create mode 100644 libclc/opencl/lib/generic/subgroup/sub_group_non_uniform_reduce.inc

diff --git a/libclc/clc/include/clc/subgroup/clc_sub_group_non_uniform_reduce.h b/libclc/clc/include/clc/subgroup/clc_sub_group_non_uniform_reduce.h
new file mode 100644
index 0000000000000..cc73ea1f78bf1
--- /dev/null
+++ b/libclc/clc/include/clc/subgroup/clc_sub_group_non_uniform_reduce.h
@@ -0,0 +1,29 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CLC_SUBGROUP_CLC_SUB_GROUP_NON_UNIFORM_REDUCE_H__
+#define __CLC_SUBGROUP_CLC_SUB_GROUP_NON_UNIFORM_REDUCE_H__
+
+#include "clc/internal/clc.h"
+
+#define __CLC_BODY "clc/subgroup/clc_sub_group_non_uniform_reduce.inc"
+#include "clc/integer/gentype.inc"
+
+#define __CLC_BODY "clc/subgroup/clc_sub_group_non_uniform_reduce.inc"
+#include "clc/math/gentype.inc"
+
+// Logical reductions are declared for int operands only; they are not part of
+// the gentype expansion above.
+_CLC_DECL _CLC_OVERLOAD int
+__clc_sub_group_non_uniform_reduce_logical_and(int x);
+
+_CLC_DECL _CLC_OVERLOAD int
+__clc_sub_group_non_uniform_reduce_logical_or(int x);
+
+_CLC_DECL _CLC_OVERLOAD int
+__clc_sub_group_non_uniform_reduce_logical_xor(int x);
+
+#endif // __CLC_SUBGROUP_CLC_SUB_GROUP_NON_UNIFORM_REDUCE_H__
diff --git a/libclc/clc/include/clc/subgroup/clc_sub_group_non_uniform_reduce.inc b/libclc/clc/include/clc/subgroup/clc_sub_group_non_uniform_reduce.inc
new file mode 100644
index 0000000000000..f7b82d3119ed9
--- /dev/null
+++ b/libclc/clc/include/clc/subgroup/clc_sub_group_non_uniform_reduce.inc
@@ -0,0 +1,33 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// Declarations are emitted for scalar gentypes only.
+#ifdef __CLC_SCALAR
+// Arithmetic reductions, declared for every scalar gentype.
+_CLC_DECL _CLC_OVERLOAD __CLC_GENTYPE
+__clc_sub_group_non_uniform_reduce_add(__CLC_GENTYPE x);
+
+_CLC_DECL _CLC_OVERLOAD __CLC_GENTYPE
+__clc_sub_group_non_uniform_reduce_mul(__CLC_GENTYPE x);
+
+_CLC_DECL _CLC_OVERLOAD __CLC_GENTYPE
+__clc_sub_group_non_uniform_reduce_min(__CLC_GENTYPE x);
+
+_CLC_DECL _CLC_OVERLOAD __CLC_GENTYPE
+__clc_sub_group_non_uniform_reduce_max(__CLC_GENTYPE x);
+
+// Bitwise reductions are skipped whenever __CLC_FPSIZE is defined.
+#ifndef __CLC_FPSIZE
+_CLC_DECL _CLC_OVERLOAD __CLC_GENTYPE
+__clc_sub_group_non_uniform_reduce_and(__CLC_GENTYPE x);
+
+_CLC_DECL _CLC_OVERLOAD __CLC_GENTYPE
+__clc_sub_group_non_uniform_reduce_or(__CLC_GENTYPE x);
+
+_CLC_DECL _CLC_OVERLOAD __CLC_GENTYPE
+__clc_sub_group_non_uniform_reduce_xor(__CLC_GENTYPE x);
+#endif // __CLC_FPSIZE
+
+#endif // __CLC_SCALAR
diff --git a/libclc/clc/lib/amdgpu/CMakeLists.txt b/libclc/clc/lib/amdgpu/CMakeLists.txt
index 186b6ab0b85b1..a5cd47fab4462 100644
--- a/libclc/clc/lib/amdgpu/CMakeLists.txt
+++ b/libclc/clc/lib/amdgpu/CMakeLists.txt
@@ -29,6 +29,7 @@ libclc_configure_source_list(CLC_AMDGPU_SOURCES
   mem_fence/clc_mem_fence.cl
   subgroup/clc_subgroup.cl
   subgroup/clc_sub_group_broadcast.cl
+  subgroup/clc_sub_group_non_uniform_reduce.cl
   subgroup/clc_sub_group_reduce.cl
   subgroup/clc_sub_group_scan.cl
   synchronization/clc_sub_group_barrier.cl
diff --git a/libclc/clc/lib/amdgpu/subgroup/clc_sub_group_non_uniform_reduce.cl b/libclc/clc/lib/amdgpu/subgroup/clc_sub_group_non_uniform_reduce.cl
new file mode 100644
index 0000000000000..00a485e9405a3
--- /dev/null
+++ b/libclc/clc/lib/amdgpu/subgroup/clc_sub_group_non_uniform_reduce.cl
@@ -0,0 +1,384 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clc/subgroup/clc_sub_group_non_uniform_reduce.h"
+
+// 32-bit and 64-bit integer add/min/max forward to the AMDGPU wave reduce
+// builtins. Signed add reuses the unsigned overload through casts; signed
+// min/max call the dedicated _i32/_i64 builtins.
+// NOTE(review): the trailing 0 operand is presumably the default reduction
+// strategy - confirm against the builtin documentation.
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST uint
+__clc_sub_group_non_uniform_reduce_add(uint x) {
+  return __builtin_amdgcn_wave_reduce_add_u32(x, 0);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST int
+__clc_sub_group_non_uniform_reduce_add(int x) {
+  return (int)__clc_sub_group_non_uniform_reduce_add((uint)x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST ulong
+__clc_sub_group_non_uniform_reduce_add(ulong x) {
+  return __builtin_amdgcn_wave_reduce_add_u64(x, 0);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST long
+__clc_sub_group_non_uniform_reduce_add(long x) {
+  return (long)__clc_sub_group_non_uniform_reduce_add((ulong)x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST uint
+__clc_sub_group_non_uniform_reduce_min(uint x) {
+  return __builtin_amdgcn_wave_reduce_min_u32(x, 0);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST int
+__clc_sub_group_non_uniform_reduce_min(int x) {
+  return __builtin_amdgcn_wave_reduce_min_i32(x, 0);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST ulong
+__clc_sub_group_non_uniform_reduce_min(ulong x) {
+  return __builtin_amdgcn_wave_reduce_min_u64(x, 0);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST long
+__clc_sub_group_non_uniform_reduce_min(long x) {
+  return __builtin_amdgcn_wave_reduce_min_i64(x, 0);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST uint
+__clc_sub_group_non_uniform_reduce_max(uint x) {
+  return __builtin_amdgcn_wave_reduce_max_u32(x, 0);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST int
+__clc_sub_group_non_uniform_reduce_max(int x) {
+  return __builtin_amdgcn_wave_reduce_max_i32(x, 0);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST ulong
+__clc_sub_group_non_uniform_reduce_max(ulong x) {
+  return __builtin_amdgcn_wave_reduce_max_u64(x, 0);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST long
+__clc_sub_group_non_uniform_reduce_max(long x) {
+  return __builtin_amdgcn_wave_reduce_max_i64(x, 0);
+}
+
+// float and double add/min/max forward to the fadd/fmin/fmax wave reduce
+// builtins of the matching width.
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST float
+__clc_sub_group_non_uniform_reduce_add(float x) {
+  return __builtin_amdgcn_wave_reduce_fadd_f32(x, 0);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST double
+__clc_sub_group_non_uniform_reduce_add(double x) {
+  return __builtin_amdgcn_wave_reduce_fadd_f64(x, 0);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST float
+__clc_sub_group_non_uniform_reduce_min(float x) {
+  return __builtin_amdgcn_wave_reduce_fmin_f32(x, 0);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST double
+__clc_sub_group_non_uniform_reduce_min(double x) {
+  return __builtin_amdgcn_wave_reduce_fmin_f64(x, 0);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST float
+__clc_sub_group_non_uniform_reduce_max(float x) {
+  return __builtin_amdgcn_wave_reduce_fmax_f32(x, 0);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST double
+__clc_sub_group_non_uniform_reduce_max(double x) {
+  return __builtin_amdgcn_wave_reduce_fmax_f64(x, 0);
+}
+
+// half add/min/max are evaluated on the float overload and the result is
+// converted back to half.
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST half
+__clc_sub_group_non_uniform_reduce_add(half x) {
+  // FIXME: There should be a direct half builtin available.
+  return (half)__clc_sub_group_non_uniform_reduce_add((float)x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST half
+__clc_sub_group_non_uniform_reduce_min(half x) {
+  // FIXME: There should be a direct half builtin available.
+  return (half)__clc_sub_group_non_uniform_reduce_min((float)x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST half
+__clc_sub_group_non_uniform_reduce_max(half x) {
+  // FIXME: There should be a direct half builtin available.
+  return (half)__clc_sub_group_non_uniform_reduce_max((float)x);
+}
+
+// 8-bit and 16-bit add/min/max widen the operand to the 32-bit overload of the
+// same signedness and cast the result back to the narrow return type.
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST uchar
+__clc_sub_group_non_uniform_reduce_add(uchar x) {
+  return (uchar)__clc_sub_group_non_uniform_reduce_add((uint)x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST char
+__clc_sub_group_non_uniform_reduce_add(char x) {
+  return (char)__clc_sub_group_non_uniform_reduce_add((int)x);
+}
+
+// FIXME: There should be a direct short builtin available.
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST ushort
+__clc_sub_group_non_uniform_reduce_add(ushort x) {
+  return (ushort)__clc_sub_group_non_uniform_reduce_add((uint)x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST short
+__clc_sub_group_non_uniform_reduce_add(short x) {
+  return (short)__clc_sub_group_non_uniform_reduce_add((int)x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST uchar
+__clc_sub_group_non_uniform_reduce_min(uchar x) {
+  return (uchar)__clc_sub_group_non_uniform_reduce_min((uint)x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST char
+__clc_sub_group_non_uniform_reduce_min(char x) {
+  return (char)__clc_sub_group_non_uniform_reduce_min((int)x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST ushort
+__clc_sub_group_non_uniform_reduce_min(ushort x) {
+  return (ushort)__clc_sub_group_non_uniform_reduce_min((uint)x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST short
+__clc_sub_group_non_uniform_reduce_min(short x) {
+  return (short)__clc_sub_group_non_uniform_reduce_min((int)x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST uchar
+__clc_sub_group_non_uniform_reduce_max(uchar x) {
+  return (uchar)__clc_sub_group_non_uniform_reduce_max((uint)x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST char
+__clc_sub_group_non_uniform_reduce_max(char x) {
+  return (char)__clc_sub_group_non_uniform_reduce_max((int)x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST ushort
+__clc_sub_group_non_uniform_reduce_max(ushort x) {
+  return (ushort)__clc_sub_group_non_uniform_reduce_max((uint)x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST short
+__clc_sub_group_non_uniform_reduce_max(short x) {
+  return (short)__clc_sub_group_non_uniform_reduce_max((int)x);
+}
+
+// 32-bit and 64-bit and/or/xor: the unsigned overloads call the _b32/_b64 wave
+// reduce builtins; the signed overloads reuse them through casts.
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST uint
+__clc_sub_group_non_uniform_reduce_and(uint x) {
+  return __builtin_amdgcn_wave_reduce_and_b32(x, 0);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST int
+__clc_sub_group_non_uniform_reduce_and(int x) {
+  return (int)__clc_sub_group_non_uniform_reduce_and((uint)x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST ulong
+__clc_sub_group_non_uniform_reduce_and(ulong x) {
+  return __builtin_amdgcn_wave_reduce_and_b64(x, 0);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST long
+__clc_sub_group_non_uniform_reduce_and(long x) {
+  return (long)__clc_sub_group_non_uniform_reduce_and((ulong)x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST uint
+__clc_sub_group_non_uniform_reduce_or(uint x) {
+  return __builtin_amdgcn_wave_reduce_or_b32(x, 0);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST int
+__clc_sub_group_non_uniform_reduce_or(int x) {
+  return (int)__clc_sub_group_non_uniform_reduce_or((uint)x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST ulong
+__clc_sub_group_non_uniform_reduce_or(ulong x) {
+  return __builtin_amdgcn_wave_reduce_or_b64(x, 0);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST long
+__clc_sub_group_non_uniform_reduce_or(long x) {
+  return (long)__clc_sub_group_non_uniform_reduce_or((ulong)x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST uint
+__clc_sub_group_non_uniform_reduce_xor(uint x) {
+  return __builtin_amdgcn_wave_reduce_xor_b32(x, 0);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST int
+__clc_sub_group_non_uniform_reduce_xor(int x) {
+  return (int)__clc_sub_group_non_uniform_reduce_xor((uint)x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST ulong
+__clc_sub_group_non_uniform_reduce_xor(ulong x) {
+  return __builtin_amdgcn_wave_reduce_xor_b64(x, 0);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST long
+__clc_sub_group_non_uniform_reduce_xor(long x) {
+  return (long)__clc_sub_group_non_uniform_reduce_xor((ulong)x);
+}
+
+// 8-bit and 16-bit and/or/xor widen the operand to the 32-bit overload of the
+// same signedness and cast the result back to the narrow return type.
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST uchar
+__clc_sub_group_non_uniform_reduce_and(uchar x) {
+  return (uchar)__clc_sub_group_non_uniform_reduce_and((uint)x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST char
+__clc_sub_group_non_uniform_reduce_and(char x) {
+  return (char)__clc_sub_group_non_uniform_reduce_and((int)x);
+}
+
+// FIXME: There should be a direct short builtin available.
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST ushort
+__clc_sub_group_non_uniform_reduce_and(ushort x) {
+  return (ushort)__clc_sub_group_non_uniform_reduce_and((uint)x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST short
+__clc_sub_group_non_uniform_reduce_and(short x) {
+  return (short)__clc_sub_group_non_uniform_reduce_and((int)x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST uchar
+__clc_sub_group_non_uniform_reduce_or(uchar x) {
+  return (uchar)__clc_sub_group_non_uniform_reduce_or((uint)x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST char
+__clc_sub_group_non_uniform_reduce_or(char x) {
+  return (char)__clc_sub_group_non_uniform_reduce_or((int)x);
+}
+
+// FIXME: There should be a direct short builtin available.
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST ushort
+__clc_sub_group_non_uniform_reduce_or(ushort x) {
+  return (ushort)__clc_sub_group_non_uniform_reduce_or((uint)x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST short
+__clc_sub_group_non_uniform_reduce_or(short x) {
+  return (short)__clc_sub_group_non_uniform_reduce_or((int)x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST uchar
+__clc_sub_group_non_uniform_reduce_xor(uchar x) {
+  return (uchar)__clc_sub_group_non_uniform_reduce_xor((uint)x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST char
+__clc_sub_group_non_uniform_reduce_xor(char x) {
+  return (char)__clc_sub_group_non_uniform_reduce_xor((int)x);
+}
+
+// FIXME: There should be a direct short builtin available.
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST ushort
+__clc_sub_group_non_uniform_reduce_xor(ushort x) {
+  return (ushort)__clc_sub_group_non_uniform_reduce_xor((uint)x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST short
+__clc_sub_group_non_uniform_reduce_xor(short x) {
+  return (short)__clc_sub_group_non_uniform_reduce_xor((int)x);
+}
+
+// Integer multiply reductions are stubs: the uint and ulong overloads trap, and
+// every other integer overload forwards to one of them through casts.
+// NOTE(review): the trapping stubs are marked _CLC_CONST although they never
+// return - confirm the attribute is intended on them.
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST uint
+__clc_sub_group_non_uniform_reduce_mul(uint x) {
+  (void)x;
+  // TODO: Implement; traps for now so the symbol is at least defined.
+  __builtin_trap();
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST int
+__clc_sub_group_non_uniform_reduce_mul(int x) {
+  return (int)__clc_sub_group_non_uniform_reduce_mul((uint)x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST ulong
+__clc_sub_group_non_uniform_reduce_mul(ulong x) {
+  (void)x;
+  // TODO: Implement; traps for now so the symbol is at least defined.
+  __builtin_trap();
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST long
+__clc_sub_group_non_uniform_reduce_mul(long x) {
+  return (long)__clc_sub_group_non_uniform_reduce_mul((ulong)x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST char
+__clc_sub_group_non_uniform_reduce_mul(char x) {
+  return (char)__clc_sub_group_non_uniform_reduce_mul((int)x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST uchar
+__clc_sub_group_non_uniform_reduce_mul(uchar x) {
+  return (uchar)__clc_sub_group_non_uniform_reduce_mul((uint)x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST short
+__clc_sub_group_non_uniform_reduce_mul(short x) {
+  return (short)__clc_sub_group_non_uniform_reduce_mul((int)x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST ushort
+__clc_sub_group_non_uniform_reduce_mul(ushort x) {
+  return (ushort)__clc_sub_group_non_uniform_reduce_mul((uint)x);
+}
+
+// Logical and/or/xor reductions are stubs: each ignores its predicate and
+// traps.
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST int
+__clc_sub_group_non_uniform_reduce_logical_and(int predicate) {
+  // TODO: Implement; traps for now so the symbol is at least defined.
+  (void)predicate;
+  __builtin_trap();
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST int
+__clc_sub_group_non_uniform_reduce_logical_or(int predicate) {
+  // TODO: Implement; traps for now so the symbol is at least defined.
+  (void)predicate;
+  __builtin_trap();
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST int
+__clc_sub_group_non_uniform_reduce_logical_xor(int predicate) {
+  // TODO: Implement; traps for now so the symbol is at least defined.
+  (void)predicate;
+  __builtin_trap();
+}
+
+// Floating-point multiply reductions are stubs: each ignores its operand and
+// traps.
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST float
+__clc_sub_group_non_uniform_reduce_mul(float x) {
+  (void)x;
+  // TODO: Implement; traps for now so the symbol is at least defined.
+  __builtin_trap();
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST double
+__clc_sub_group_non_uniform_reduce_mul(double x) {
+  (void)x;
+  // TODO: Implement; traps for now so the symbol is at least defined.
+  __builtin_trap();
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST half
+__clc_sub_group_non_uniform_reduce_mul(half x) {
+  (void)x;
+  // TODO: Implement; traps for now so the symbol is at least defined.
+  __builtin_trap();
+}
diff --git a/libclc/clc/lib/amdgpu/subgroup/clc_sub_group_non_uniform_reduce.inc b/libclc/clc/lib/amdgpu/subgroup/clc_sub_group_non_uniform_reduce.inc
new file mode 100644
index 0000000000000..ac331c5268aa9
--- /dev/null
+++ b/libclc/clc/lib/amdgpu/subgroup/clc_sub_group_non_uniform_reduce.inc
@@ -0,0 +1,87 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// Non-uniform int/uint/long/ulong add/mul/min/max, each forwarding to the
+// uniform __clc_sub_group_reduce_* function of the same operation.
+// NOTE(review): no file in this patch includes this .inc. It repeats overloads
+// that clc_sub_group_non_uniform_reduce.cl already defines, and the uniform
+// add/min/max it calls are themselves defined in clc_sub_group_reduce.inc by
+// forwarding to the non-uniform functions. No __clc_sub_group_reduce_mul
+// definition is visible in this patch. Confirm whether this file should be
+// dropped.
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST int
+__clc_sub_group_non_uniform_reduce_add(int x) {
+  return __clc_sub_group_reduce_add(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST uint
+__clc_sub_group_non_uniform_reduce_add(uint x) {
+  return __clc_sub_group_reduce_add(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST int
+__clc_sub_group_non_uniform_reduce_mul(int x) {
+  return __clc_sub_group_reduce_mul(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST uint
+__clc_sub_group_non_uniform_reduce_mul(uint x) {
+  return __clc_sub_group_reduce_mul(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST int
+__clc_sub_group_non_uniform_reduce_min(int x) {
+  return __clc_sub_group_reduce_min(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST uint
+__clc_sub_group_non_uniform_reduce_min(uint x) {
+  return __clc_sub_group_reduce_min(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST int
+__clc_sub_group_non_uniform_reduce_max(int x) {
+  return __clc_sub_group_reduce_max(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST uint
+__clc_sub_group_non_uniform_reduce_max(uint x) {
+  return __clc_sub_group_reduce_max(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST long
+__clc_sub_group_non_uniform_reduce_add(long x) {
+  return __clc_sub_group_reduce_add(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST ulong
+__clc_sub_group_non_uniform_reduce_add(ulong x) {
+  return __clc_sub_group_reduce_add(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST long
+__clc_sub_group_non_uniform_reduce_mul(long x) {
+  return __clc_sub_group_reduce_mul(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST ulong
+__clc_sub_group_non_uniform_reduce_mul(ulong x) {
+  return __clc_sub_group_reduce_mul(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST long
+__clc_sub_group_non_uniform_reduce_min(long x) {
+  return __clc_sub_group_reduce_min(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST ulong
+__clc_sub_group_non_uniform_reduce_min(ulong x) {
+  return __clc_sub_group_reduce_min(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST long
+__clc_sub_group_non_uniform_reduce_max(long x) {
+  return __clc_sub_group_reduce_max(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST ulong
+__clc_sub_group_non_uniform_reduce_max(ulong x) {
+  return __clc_sub_group_reduce_max(x);
+}
diff --git a/libclc/clc/lib/amdgpu/subgroup/clc_sub_group_reduce.cl b/libclc/clc/lib/amdgpu/subgroup/clc_sub_group_reduce.cl
index 66d0130839d38..c74fe96459800 100644
--- a/libclc/clc/lib/amdgpu/subgroup/clc_sub_group_reduce.cl
+++ b/libclc/clc/lib/amdgpu/subgroup/clc_sub_group_reduce.cl
@@ -6,140 +6,14 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "clc/subgroup/clc_sub_group_broadcast.h"
+#include "clc/subgroup/clc_sub_group_non_uniform_reduce.h"
+#include "clc/subgroup/clc_sub_group_reduce.h"
 
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST uint __clc_sub_group_reduce_add(uint x) {
-  return __builtin_amdgcn_wave_reduce_add_u32(x, 0);
-}
+// The implementation is the same as the nonuniform case, so just call the
+// nonuniform versions of every function.
 
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST int __clc_sub_group_reduce_add(int x) {
-  return (int)__clc_sub_group_reduce_add((uint)x);
-}
+#define __CLC_BODY "clc_sub_group_reduce.inc"
+#include "clc/integer/gentype.inc"
 
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST ulong __clc_sub_group_reduce_add(ulong x) {
-  return __builtin_amdgcn_wave_reduce_add_u64(x, 0);
-}
-
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST long __clc_sub_group_reduce_add(long x) {
-  return (long)__clc_sub_group_reduce_add((ulong)x);
-}
-
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST uint __clc_sub_group_reduce_min(uint x) {
-  return __builtin_amdgcn_wave_reduce_min_u32(x, 0);
-}
-
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST int __clc_sub_group_reduce_min(int x) {
-  return __builtin_amdgcn_wave_reduce_min_i32(x, 0);
-}
-
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST ulong __clc_sub_group_reduce_min(ulong x) {
-  return __builtin_amdgcn_wave_reduce_min_u64(x, 0);
-}
-
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST long __clc_sub_group_reduce_min(long x) {
-  return __builtin_amdgcn_wave_reduce_min_i64(x, 0);
-}
-
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST uint __clc_sub_group_reduce_max(uint x) {
-  return __builtin_amdgcn_wave_reduce_max_u32(x, 0);
-}
-
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST int __clc_sub_group_reduce_max(int x) {
-  return __builtin_amdgcn_wave_reduce_max_i32(x, 0);
-}
-
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST ulong __clc_sub_group_reduce_max(ulong x) {
-  return __builtin_amdgcn_wave_reduce_max_u64(x, 0);
-}
-
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST long __clc_sub_group_reduce_max(long x) {
-  return __builtin_amdgcn_wave_reduce_max_i64(x, 0);
-}
-
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST float __clc_sub_group_reduce_add(float x) {
-  return __builtin_amdgcn_wave_reduce_fadd_f32(x, 0);
-}
-
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST double __clc_sub_group_reduce_add(double x) {
-  return __builtin_amdgcn_wave_reduce_fadd_f64(x, 0);
-}
-
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST float __clc_sub_group_reduce_min(float x) {
-  return __builtin_amdgcn_wave_reduce_fmin_f32(x, 0);
-}
-
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST double __clc_sub_group_reduce_min(double x) {
-  return __builtin_amdgcn_wave_reduce_fmin_f64(x, 0);
-}
-
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST float __clc_sub_group_reduce_max(float x) {
-  return __builtin_amdgcn_wave_reduce_fmax_f32(x, 0);
-}
-
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST double __clc_sub_group_reduce_max(double x) {
-  return __builtin_amdgcn_wave_reduce_fmax_f64(x, 0);
-}
-
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST half __clc_sub_group_reduce_add(half x) {
-  // FIXME: There should be a direct half builtin available.
-  return (float)__clc_sub_group_reduce_add((float)x);
-}
-
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST half __clc_sub_group_reduce_min(half x) {
-  // FIXME: There should be a direct half builtin available.
-  return (float)__clc_sub_group_reduce_min((float)x);
-}
-
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST half __clc_sub_group_reduce_max(half x) {
-  // FIXME: There should be a direct half builtin available.
-  return (float)__clc_sub_group_reduce_max((float)x);
-}
-
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST uchar __clc_sub_group_reduce_add(uchar x) {
-  return (uchar)__clc_sub_group_reduce_add((uint)x);
-}
-
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST char __clc_sub_group_reduce_add(char x) {
-  return (char)__clc_sub_group_reduce_add((int)x);
-}
-
-// FIXME: There should be a direct short builtin available.
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST ushort __clc_sub_group_reduce_add(ushort x) {
-  return (ushort)__clc_sub_group_reduce_add((uint)x);
-}
-
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST short __clc_sub_group_reduce_add(short x) {
-  return (int)__clc_sub_group_reduce_add((int)x);
-}
-
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST uchar __clc_sub_group_reduce_min(uchar x) {
-  return (uchar)__clc_sub_group_reduce_min((uint)x);
-}
-
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST char __clc_sub_group_reduce_min(char x) {
-  return (char)__clc_sub_group_reduce_min((int)x);
-}
-
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST ushort __clc_sub_group_reduce_min(ushort x) {
-  return (ushort)__clc_sub_group_reduce_min((uint)x);
-}
-
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST short __clc_sub_group_reduce_min(short x) {
-  return (int)__clc_sub_group_reduce_min((int)x);
-}
-
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST uchar __clc_sub_group_reduce_max(uchar x) {
-  return (uchar)__clc_sub_group_reduce_max((uint)x);
-}
-
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST char __clc_sub_group_reduce_max(char x) {
-  return (char)__clc_sub_group_reduce_max((int)x);
-}
-
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST ushort __clc_sub_group_reduce_max(ushort x) {
-  return (ushort)__clc_sub_group_reduce_max((uint)x);
-}
-
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST short __clc_sub_group_reduce_max(short x) {
-  return (int)__clc_sub_group_reduce_max((int)x);
-}
+#define __CLC_BODY "clc_sub_group_reduce.inc"
+#include "clc/math/gentype.inc"
diff --git a/libclc/clc/lib/amdgpu/subgroup/clc_sub_group_reduce.inc b/libclc/clc/lib/amdgpu/subgroup/clc_sub_group_reduce.inc
new file mode 100644
index 0000000000000..1d487fdd7e3e9
--- /dev/null
+++ b/libclc/clc/lib/amdgpu/subgroup/clc_sub_group_reduce.inc
@@ -0,0 +1,26 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// Uniform add/min/max reductions for each scalar gentype, implemented by
+// forwarding to the non-uniform function of the same operation.
+#ifdef __CLC_SCALAR
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE
+__clc_sub_group_reduce_add(__CLC_GENTYPE x) {
+  return __clc_sub_group_non_uniform_reduce_add(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE
+__clc_sub_group_reduce_min(__CLC_GENTYPE x) {
+  return __clc_sub_group_non_uniform_reduce_min(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE
+__clc_sub_group_reduce_max(__CLC_GENTYPE x) {
+  return __clc_sub_group_non_uniform_reduce_max(x);
+}
+
+#endif // __CLC_SCALAR
diff --git a/libclc/opencl/lib/generic/CMakeLists.txt b/libclc/opencl/lib/generic/CMakeLists.txt
index e6565c7f9ed67..4ad60248139ae 100644
--- a/libclc/opencl/lib/generic/CMakeLists.txt
+++ b/libclc/opencl/lib/generic/CMakeLists.txt
@@ -208,6 +208,7 @@ libclc_configure_source_list(OPENCL_GENERIC_SOURCES
   shared/vstore.cl
   subgroup/sub_group_broadcast.cl
+  subgroup/sub_group_non_uniform_reduce.cl
   subgroup/sub_group_reduce.cl
   subgroup/sub_group_scan_exclusive.cl
   subgroup/sub_group_scan_inclusive.cl
   subgroup/subgroup.cl
diff --git a/libclc/opencl/lib/generic/subgroup/sub_group_non_uniform_reduce.cl b/libclc/opencl/lib/generic/subgroup/sub_group_non_uniform_reduce.cl
new file mode 100644
index 0000000000000..e00717b979ea1
--- /dev/null
+++ b/libclc/opencl/lib/generic/subgroup/sub_group_non_uniform_reduce.cl
@@ -0,0 +1,30 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clc/subgroup/clc_sub_group_non_uniform_reduce.h"
+
+#define __CLC_BODY "sub_group_non_uniform_reduce.inc"
+#include "clc/integer/gentype.inc"
+
+#define __CLC_BODY "sub_group_non_uniform_reduce.inc"
+#include "clc/math/gentype.inc"
+
+// OpenCL entry points for the logical reductions; each forwards its int
+// predicate to the __clc_ function of the same name.
+_CLC_DEF _CLC_OVERLOAD int
+sub_group_non_uniform_reduce_logical_and(int predicate) {
+  return __clc_sub_group_non_uniform_reduce_logical_and(predicate);
+}
+
+_CLC_DEF _CLC_OVERLOAD int
+sub_group_non_uniform_reduce_logical_or(int predicate) {
+  return __clc_sub_group_non_uniform_reduce_logical_or(predicate);
+}
+
+_CLC_DEF _CLC_OVERLOAD int
+sub_group_non_uniform_reduce_logical_xor(int predicate) {
+  return __clc_sub_group_non_uniform_reduce_logical_xor(predicate);
+}
diff --git a/libclc/opencl/lib/generic/subgroup/sub_group_non_uniform_reduce.inc b/libclc/opencl/lib/generic/subgroup/sub_group_non_uniform_reduce.inc
new file mode 100644
index 0000000000000..4e5c8dcd8523f
--- /dev/null
+++ b/libclc/opencl/lib/generic/subgroup/sub_group_non_uniform_reduce.inc
@@ -0,0 +1,48 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// OpenCL entry points for the non-uniform reductions. Each forwards to the
+// __clc_ function of the same operation and is emitted for scalar gentypes
+// only.
+#ifdef __CLC_SCALAR
+
+_CLC_OVERLOAD _CLC_DEF _CLC_CONST __CLC_GENTYPE
+sub_group_non_uniform_reduce_add(__CLC_GENTYPE x) {
+  return __clc_sub_group_non_uniform_reduce_add(x);
+}
+
+_CLC_OVERLOAD _CLC_DEF _CLC_CONST __CLC_GENTYPE
+sub_group_non_uniform_reduce_mul(__CLC_GENTYPE x) {
+  return __clc_sub_group_non_uniform_reduce_mul(x);
+}
+
+_CLC_OVERLOAD _CLC_DEF _CLC_CONST __CLC_GENTYPE
+sub_group_non_uniform_reduce_min(__CLC_GENTYPE x) {
+  return __clc_sub_group_non_uniform_reduce_min(x);
+}
+
+_CLC_OVERLOAD _CLC_DEF _CLC_CONST __CLC_GENTYPE
+sub_group_non_uniform_reduce_max(__CLC_GENTYPE x) {
+  return __clc_sub_group_non_uniform_reduce_max(x);
+}
+
+// Bitwise reductions are skipped whenever __CLC_FPSIZE is defined.
+#ifndef __CLC_FPSIZE
+_CLC_OVERLOAD _CLC_DEF _CLC_CONST __CLC_GENTYPE
+sub_group_non_uniform_reduce_and(__CLC_GENTYPE x) {
+  return __clc_sub_group_non_uniform_reduce_and(x);
+}
+
+_CLC_OVERLOAD _CLC_DEF _CLC_CONST __CLC_GENTYPE
+sub_group_non_uniform_reduce_or(__CLC_GENTYPE x) {
+  return __clc_sub_group_non_uniform_reduce_or(x);
+}
+
+_CLC_OVERLOAD _CLC_DEF _CLC_CONST __CLC_GENTYPE
+sub_group_non_uniform_reduce_xor(__CLC_GENTYPE x) {
+  return __clc_sub_group_non_uniform_reduce_xor(x);
+}
+#endif // __CLC_FPSIZE
+
+#endif // __CLC_SCALAR

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to