https://github.com/jhuber6 updated 
https://github.com/llvm/llvm-project/pull/185525

>From e520a89bbe178c2d57319841de372a856e1a262c Mon Sep 17 00:00:00 2001
From: Joseph Huber <[email protected]>
Date: Mon, 9 Mar 2026 17:07:24 -0500
Subject: [PATCH 1/3] [Clang] Add more scan / reduce operations to
 'gpuintrin.h'

Summary:
This builds off the pattern to add support for more of the standard
operations. The reductions could conceivably use the AMDGPU builtins
later once we can enable DPP or other optimizations.
---
 clang/lib/Headers/gpuintrin.h                 |  49 +++++++--
 clang/test/Headers/Inputs/include/stdint.h    |   3 +
 .../src/__support/GPU/scan_reduce.cpp         | 104 ++++++++++++++++++
 3 files changed, 148 insertions(+), 8 deletions(-)

diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index 4f7eea0cf6188..00558d785c011 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -207,7 +207,7 @@ __gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, 
double __x,
 // unprocessed lanes, above or below the current lane in the case of a suffix 
or
 // prefix scan. Each iteration we shuffle in the unprocessed neighbors and then
 // clear the bits that this operation handled.
-#define __DO_LANE_OP(__type, __op, __identity, __prefix, __suffix)             
\
+#define __DO_LANE_OPS(__type, __op, __identity, __prefix, __suffix)            
\
   _DEFAULT_FN_ATTRS static __inline__ __type                                   
\
   __gpu_suffix_scan_##__prefix##_##__suffix(uint64_t __lane_mask,              
\
                                             __type __x) {                      
\
@@ -216,7 +216,7 @@ __gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, 
double __x,
       uint32_t __src = __above ? __builtin_ctzg(__above) : __gpu_lane_id();    
\
       __type __result = __gpu_shuffle_idx_##__suffix(__lane_mask, __src, __x,  
\
                                                      __gpu_num_lanes());       
\
-      __x = __x __op(__above ? __result : (__type)__identity);                 
\
+      __x = __op(__x, __above ? __result : (__type)__identity);                
\
       for (uint32_t __i = 0; __i < __step; ++__i)                              
\
         __above &= __above - 1;                                                
\
     }                                                                          
\
@@ -232,7 +232,7 @@ __gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, 
double __x,
           __below ? (63 - __builtin_clzg(__below)) : __gpu_lane_id();          
\
       __type __result = __gpu_shuffle_idx_##__suffix(__lane_mask, __src, __x,  
\
                                                      __gpu_num_lanes());       
\
-      __x = __x __op(__below ? __result : (__type)__identity);                 
\
+      __x = __op(__x, __below ? __result : (__type)__identity);                
\
       for (uint32_t __i = 0; __i < __step; ++__i)                              
\
         __below ^= (1ull << (63 - __builtin_clzg(__below, 0))) & __below;      
\
     }                                                                          
\
@@ -245,11 +245,44 @@ __gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t 
__idx, double __x,
         __lane_mask,                                                           
\
         __gpu_suffix_scan_##__prefix##_##__suffix(__lane_mask, __x));          
\
   }
-__DO_LANE_OP(uint32_t, +, 0, sum, u32);
-__DO_LANE_OP(uint64_t, +, 0, sum, u64);
-__DO_LANE_OP(float, +, 0, sum, f32);
-__DO_LANE_OP(double, +, 0, sum, f64);
-#undef __DO_LANE_OP
+
+#define __GPU_OP(__x, __y) ((__x) + (__y))
+__DO_LANE_OPS(uint32_t, __GPU_OP, 0, sum, u32);
+__DO_LANE_OPS(uint64_t, __GPU_OP, 0, sum, u64);
+__DO_LANE_OPS(float, __GPU_OP, 0, sum, f32);
+__DO_LANE_OPS(double, __GPU_OP, 0, sum, f64);
+#undef __GPU_OP
+
+#define __GPU_OP(__x, __y) ((__x) & (__y))
+__DO_LANE_OPS(uint32_t, __GPU_OP, UINT32_MAX, and, u32);
+__DO_LANE_OPS(uint64_t, __GPU_OP, UINT64_MAX, and, u64);
+#undef __GPU_OP
+
+#define __GPU_OP(__x, __y) ((__x) | (__y))
+__DO_LANE_OPS(uint32_t, __GPU_OP, 0, or, u32);
+__DO_LANE_OPS(uint64_t, __GPU_OP, 0, or, u64);
+#undef __GPU_OP
+
+#define __GPU_OP(__x, __y) ((__x) ^ (__y))
+__DO_LANE_OPS(uint32_t, __GPU_OP, 0, xor, u32);
+__DO_LANE_OPS(uint64_t, __GPU_OP, 0, xor, u64);
+#undef __GPU_OP
+
+#define __GPU_OP(__x, __y) ((__x) < (__y) ? (__x) : (__y))
+__DO_LANE_OPS(uint32_t, __GPU_OP, UINT32_MAX, min, u32);
+__DO_LANE_OPS(uint64_t, __GPU_OP, UINT64_MAX, min, u64);
+__DO_LANE_OPS(float, __GPU_OP, __builtin_inff(), min, f32);
+__DO_LANE_OPS(double, __GPU_OP, __builtin_inf(), min, f64);
+#undef __GPU_OP
+
+#define __GPU_OP(__x, __y) ((__x) > (__y) ? (__x) : (__y))
+__DO_LANE_OPS(uint32_t, __GPU_OP, 0, max, u32);
+__DO_LANE_OPS(uint64_t, __GPU_OP, 0, max, u64);
+__DO_LANE_OPS(float, __GPU_OP, -__builtin_inff(), max, f32);
+__DO_LANE_OPS(double, __GPU_OP, -__builtin_inf(), max, f64);
+#undef __GPU_OP
+
+#undef __DO_LANE_OPS
 
 // Returns a bitmask marking all lanes that have the same value of __x.
 _DEFAULT_FN_ATTRS static __inline__ uint64_t
diff --git a/clang/test/Headers/Inputs/include/stdint.h 
b/clang/test/Headers/Inputs/include/stdint.h
index eb09272b40c33..c4836441096b2 100644
--- a/clang/test/Headers/Inputs/include/stdint.h
+++ b/clang/test/Headers/Inputs/include/stdint.h
@@ -36,4 +36,7 @@ typedef unsigned __INTPTR_TYPE__ uintptr_t;
 #define UINTPTR_MAX   __UINTPTR_MAX__
 #endif
 
+#define UINT32_MAX  __UINT32_C(4294967295)
+#define UINT64_MAX  __UINT64_C(18446744073709551615)
+
 #endif /* STDINT_H */
diff --git a/libc/test/integration/src/__support/GPU/scan_reduce.cpp 
b/libc/test/integration/src/__support/GPU/scan_reduce.cpp
index 3861a612c587a..78b3f041e8ce8 100644
--- a/libc/test/integration/src/__support/GPU/scan_reduce.cpp
+++ b/libc/test/integration/src/__support/GPU/scan_reduce.cpp
@@ -97,13 +97,117 @@ static void test_scan_divergent() {
   }
 }
 
+static void test_reduce_bitwise() {
+  uint64_t mask = gpu::get_lane_mask();
+  uint32_t id = gpu::get_lane_id();
+
+  EXPECT_EQ(__gpu_lane_and_u32(mask, 0xFFu), 0xFFu);
+  EXPECT_EQ(__gpu_lane_and_u32(mask, id == 0 ? 0x0Fu : 0xFFu), 0x0Fu);
+
+  EXPECT_EQ(__gpu_lane_or_u32(mask, id == 0 ? 0xF0u : 0x0Fu), 0xFFu);
+  EXPECT_EQ(__gpu_lane_or_u32(mask, 0u), 0u);
+
+  EXPECT_EQ(__gpu_lane_xor_u32(mask, 1u), 0u);
+  EXPECT_EQ(__gpu_lane_xor_u32(mask, id == 0 ? 0xFFu : 0u), 0xFFu);
+}
+
+static void test_reduce_min_max() {
+  uint64_t mask = gpu::get_lane_mask();
+  uint32_t id = gpu::get_lane_id();
+  uint32_t n = gpu::get_lane_size();
+
+  EXPECT_EQ(__gpu_lane_min_u32(mask, id), 0u);
+  EXPECT_EQ(__gpu_lane_max_u32(mask, id), n - 1);
+  EXPECT_EQ(__gpu_lane_min_u32(mask, n - 1 - id), 0u);
+  EXPECT_EQ(__gpu_lane_max_u32(mask, n - 1 - id), n - 1);
+
+  EXPECT_EQ(__gpu_lane_min_f32(mask, static_cast<float>(id)), 0.0f);
+  EXPECT_EQ(__gpu_lane_max_f32(mask, static_cast<float>(id)),
+            static_cast<float>(n - 1));
+}
+
+static void test_scan_bitwise() {
+  uint64_t mask = gpu::get_lane_mask();
+  uint32_t id = gpu::get_lane_id();
+
+  EXPECT_EQ(__gpu_prefix_scan_and_u32(mask, 0xFFu), 0xFFu);
+  EXPECT_EQ(__gpu_prefix_scan_and_u32(mask, id == 0 ? 0x0Fu : 0xFFu), 0x0Fu);
+
+  EXPECT_EQ(__gpu_prefix_scan_or_u32(mask, 0x0Fu), 0x0Fu);
+  uint32_t or_expected = id == 0 ? 0xF0u : 0xFFu;
+  EXPECT_EQ(__gpu_prefix_scan_or_u32(mask, id == 0 ? 0xF0u : 0x0Fu),
+            or_expected);
+
+  uint32_t xor_expected = id % 2 == 0 ? 0x0Fu : 0u;
+  EXPECT_EQ(__gpu_prefix_scan_xor_u32(mask, 0x0Fu), xor_expected);
+}
+
+static void test_scan_min_max() {
+  uint64_t mask = gpu::get_lane_mask();
+  uint32_t id = gpu::get_lane_id();
+  uint32_t n = gpu::get_lane_size();
+
+  EXPECT_EQ(__gpu_prefix_scan_min_u32(mask, n - 1 - id), n - 1 - id);
+  EXPECT_EQ(__gpu_prefix_scan_max_u32(mask, id), id);
+
+  EXPECT_EQ(__gpu_prefix_scan_min_u32(mask, id), 0u);
+  EXPECT_EQ(__gpu_prefix_scan_max_u32(mask, n - 1 - id), n - 1);
+
+  EXPECT_EQ(__gpu_prefix_scan_min_f32(mask, static_cast<float>(n - 1 - id)),
+            static_cast<float>(n - 1 - id));
+  EXPECT_EQ(__gpu_prefix_scan_max_f32(mask, static_cast<float>(id)),
+            static_cast<float>(id));
+}
+
+static void test_float_min_max() {
+  uint64_t mask = gpu::get_lane_mask();
+  uint32_t id = gpu::get_lane_id();
+  uint32_t n = gpu::get_lane_size();
+
+  float centered = static_cast<float>(id) - static_cast<float>(n / 2);
+  EXPECT_EQ(__gpu_lane_min_f32(mask, centered), -static_cast<float>(n / 2));
+  EXPECT_EQ(__gpu_lane_max_f32(mask, centered), static_cast<float>(n / 2 - 1));
+
+  float alt =
+      id % 2 == 0 ? static_cast<float>(id + 1) : -static_cast<float>(id + 1);
+  EXPECT_EQ(__gpu_lane_min_f32(mask, alt), -static_cast<float>(n));
+  EXPECT_EQ(__gpu_lane_max_f32(mask, alt), static_cast<float>(n - 1));
+
+  float v_val = id < n / 2 ? static_cast<float>(n / 2 - id)
+                           : static_cast<float>(id - n / 2);
+  float min_expected = id < n / 2 ? static_cast<float>(n / 2 - id) : 0.0f;
+  EXPECT_EQ(__gpu_prefix_scan_min_f32(mask, v_val), min_expected);
+
+  float inv_v =
+      id < n / 2 ? static_cast<float>(id) : static_cast<float>(n - 1 - id);
+  float max_expected =
+      id < n / 2 ? static_cast<float>(id) : static_cast<float>(n / 2 - 1);
+  EXPECT_EQ(__gpu_prefix_scan_max_f32(mask, inv_v), max_expected);
+
+  double d_centered = static_cast<double>(id) - static_cast<double>(n / 2);
+  EXPECT_EQ(__gpu_lane_min_f64(mask, d_centered), -static_cast<double>(n / 2));
+  EXPECT_EQ(__gpu_lane_max_f64(mask, d_centered),
+            static_cast<double>(n / 2 - 1));
+
+  double desc = static_cast<double>(n - 1 - id);
+  EXPECT_EQ(__gpu_prefix_scan_min_f64(mask, desc),
+            static_cast<double>(n - 1 - id));
+  EXPECT_EQ(__gpu_prefix_scan_max_f64(mask, desc), static_cast<double>(n - 1));
+}
+
 TEST_MAIN(int, char **, char **) {
   if (gpu::get_thread_id() >= gpu::get_lane_size())
     return 0;
 
   test_reduce();
+  test_reduce_bitwise();
+  test_reduce_min_max();
 
   test_scan();
+  test_scan_bitwise();
+  test_scan_min_max();
+
+  test_float_min_max();
 
   test_scan_divergent();
 

>From 0991d66749c6ba8d6cecba413c63a4030fca264c Mon Sep 17 00:00:00 2001
From: Joseph Huber <[email protected]>
Date: Tue, 10 Mar 2026 07:44:47 -0500
Subject: [PATCH 2/3] fix floating point reductions

---
 clang/lib/Headers/gpuintrin.h                 | 20 ++++++++++---
 .../src/__support/GPU/scan_reduce.cpp         | 28 +++++++++----------
 2 files changed, 30 insertions(+), 18 deletions(-)

diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index 00558d785c011..083cadb5752da 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -271,15 +271,27 @@ __DO_LANE_OPS(uint64_t, __GPU_OP, 0, xor, u64);
 #define __GPU_OP(__x, __y) ((__x) < (__y) ? (__x) : (__y))
 __DO_LANE_OPS(uint32_t, __GPU_OP, UINT32_MAX, min, u32);
 __DO_LANE_OPS(uint64_t, __GPU_OP, UINT64_MAX, min, u64);
-__DO_LANE_OPS(float, __GPU_OP, __builtin_inff(), min, f32);
-__DO_LANE_OPS(double, __GPU_OP, __builtin_inf(), min, f64);
 #undef __GPU_OP
 
 #define __GPU_OP(__x, __y) ((__x) > (__y) ? (__x) : (__y))
 __DO_LANE_OPS(uint32_t, __GPU_OP, 0, max, u32);
 __DO_LANE_OPS(uint64_t, __GPU_OP, 0, max, u64);
-__DO_LANE_OPS(float, __GPU_OP, -__builtin_inff(), max, f32);
-__DO_LANE_OPS(double, __GPU_OP, -__builtin_inf(), max, f64);
+#undef __GPU_OP
+
+#define __GPU_OP(__x, __y) __builtin_fminf((__x), (__y))
+__DO_LANE_OPS(float, __GPU_OP, __builtin_inff(), fmin, f32);
+#undef __GPU_OP
+
+#define __GPU_OP(__x, __y) __builtin_fmin((__x), (__y))
+__DO_LANE_OPS(double, __GPU_OP, __builtin_inf(), fmin, f64);
+#undef __GPU_OP
+
+#define __GPU_OP(__x, __y) __builtin_fmaxf((__x), (__y))
+__DO_LANE_OPS(float, __GPU_OP, -__builtin_inff(), fmax, f32);
+#undef __GPU_OP
+
+#define __GPU_OP(__x, __y) __builtin_fmax((__x), (__y))
+__DO_LANE_OPS(double, __GPU_OP, -__builtin_inf(), fmax, f64);
 #undef __GPU_OP
 
 #undef __DO_LANE_OPS
diff --git a/libc/test/integration/src/__support/GPU/scan_reduce.cpp 
b/libc/test/integration/src/__support/GPU/scan_reduce.cpp
index 78b3f041e8ce8..1e03291585132 100644
--- a/libc/test/integration/src/__support/GPU/scan_reduce.cpp
+++ b/libc/test/integration/src/__support/GPU/scan_reduce.cpp
@@ -121,8 +121,8 @@ static void test_reduce_min_max() {
   EXPECT_EQ(__gpu_lane_min_u32(mask, n - 1 - id), 0u);
   EXPECT_EQ(__gpu_lane_max_u32(mask, n - 1 - id), n - 1);
 
-  EXPECT_EQ(__gpu_lane_min_f32(mask, static_cast<float>(id)), 0.0f);
-  EXPECT_EQ(__gpu_lane_max_f32(mask, static_cast<float>(id)),
+  EXPECT_EQ(__gpu_lane_fmin_f32(mask, static_cast<float>(id)), 0.0f);
+  EXPECT_EQ(__gpu_lane_fmax_f32(mask, static_cast<float>(id)),
             static_cast<float>(n - 1));
 }
 
@@ -153,9 +153,9 @@ static void test_scan_min_max() {
   EXPECT_EQ(__gpu_prefix_scan_min_u32(mask, id), 0u);
   EXPECT_EQ(__gpu_prefix_scan_max_u32(mask, n - 1 - id), n - 1);
 
-  EXPECT_EQ(__gpu_prefix_scan_min_f32(mask, static_cast<float>(n - 1 - id)),
+  EXPECT_EQ(__gpu_prefix_scan_fmin_f32(mask, static_cast<float>(n - 1 - id)),
             static_cast<float>(n - 1 - id));
-  EXPECT_EQ(__gpu_prefix_scan_max_f32(mask, static_cast<float>(id)),
+  EXPECT_EQ(__gpu_prefix_scan_fmax_f32(mask, static_cast<float>(id)),
             static_cast<float>(id));
 }
 
@@ -165,34 +165,34 @@ static void test_float_min_max() {
   uint32_t n = gpu::get_lane_size();
 
   float centered = static_cast<float>(id) - static_cast<float>(n / 2);
-  EXPECT_EQ(__gpu_lane_min_f32(mask, centered), -static_cast<float>(n / 2));
-  EXPECT_EQ(__gpu_lane_max_f32(mask, centered), static_cast<float>(n / 2 - 1));
+  EXPECT_EQ(__gpu_lane_fmin_f32(mask, centered), -static_cast<float>(n / 2));
+  EXPECT_EQ(__gpu_lane_fmax_f32(mask, centered), static_cast<float>(n / 2 - 
1));
 
   float alt =
       id % 2 == 0 ? static_cast<float>(id + 1) : -static_cast<float>(id + 1);
-  EXPECT_EQ(__gpu_lane_min_f32(mask, alt), -static_cast<float>(n));
-  EXPECT_EQ(__gpu_lane_max_f32(mask, alt), static_cast<float>(n - 1));
+  EXPECT_EQ(__gpu_lane_fmin_f32(mask, alt), -static_cast<float>(n));
+  EXPECT_EQ(__gpu_lane_fmax_f32(mask, alt), static_cast<float>(n - 1));
 
   float v_val = id < n / 2 ? static_cast<float>(n / 2 - id)
                            : static_cast<float>(id - n / 2);
   float min_expected = id < n / 2 ? static_cast<float>(n / 2 - id) : 0.0f;
-  EXPECT_EQ(__gpu_prefix_scan_min_f32(mask, v_val), min_expected);
+  EXPECT_EQ(__gpu_prefix_scan_fmin_f32(mask, v_val), min_expected);
 
   float inv_v =
       id < n / 2 ? static_cast<float>(id) : static_cast<float>(n - 1 - id);
   float max_expected =
       id < n / 2 ? static_cast<float>(id) : static_cast<float>(n / 2 - 1);
-  EXPECT_EQ(__gpu_prefix_scan_max_f32(mask, inv_v), max_expected);
+  EXPECT_EQ(__gpu_prefix_scan_fmax_f32(mask, inv_v), max_expected);
 
   double d_centered = static_cast<double>(id) - static_cast<double>(n / 2);
-  EXPECT_EQ(__gpu_lane_min_f64(mask, d_centered), -static_cast<double>(n / 2));
-  EXPECT_EQ(__gpu_lane_max_f64(mask, d_centered),
+  EXPECT_EQ(__gpu_lane_fmin_f64(mask, d_centered), -static_cast<double>(n / 
2));
+  EXPECT_EQ(__gpu_lane_fmax_f64(mask, d_centered),
             static_cast<double>(n / 2 - 1));
 
   double desc = static_cast<double>(n - 1 - id);
-  EXPECT_EQ(__gpu_prefix_scan_min_f64(mask, desc),
+  EXPECT_EQ(__gpu_prefix_scan_fmin_f64(mask, desc),
             static_cast<double>(n - 1 - id));
-  EXPECT_EQ(__gpu_prefix_scan_max_f64(mask, desc), static_cast<double>(n - 1));
+  EXPECT_EQ(__gpu_prefix_scan_fmax_f64(mask, desc), static_cast<double>(n - 
1));
 }
 
 TEST_MAIN(int, char **, char **) {

>From 38cc7cd6bd28296a7a2ea96a7d735e9417efb988 Mon Sep 17 00:00:00 2001
From: Joseph Huber <[email protected]>
Date: Tue, 10 Mar 2026 10:10:03 -0500
Subject: [PATCH 3/3] minnum/maxnum

---
 clang/lib/Headers/gpuintrin.h                 | 18 ++++-------
 .../src/__support/GPU/scan_reduce.cpp         | 31 ++++++++++---------
 2 files changed, 23 insertions(+), 26 deletions(-)

diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index 083cadb5752da..ef1446a3ac77b 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -278,20 +278,14 @@ __DO_LANE_OPS(uint32_t, __GPU_OP, 0, max, u32);
 __DO_LANE_OPS(uint64_t, __GPU_OP, 0, max, u64);
 #undef __GPU_OP
 
-#define __GPU_OP(__x, __y) __builtin_fminf((__x), (__y))
-__DO_LANE_OPS(float, __GPU_OP, __builtin_inff(), fmin, f32);
+#define __GPU_OP(__x, __y) __builtin_elementwise_minnum((__x), (__y))
+__DO_LANE_OPS(float, __GPU_OP, __builtin_inff(), minnum, f32);
+__DO_LANE_OPS(double, __GPU_OP, __builtin_inf(), minnum, f64);
 #undef __GPU_OP
 
-#define __GPU_OP(__x, __y) __builtin_fmin((__x), (__y))
-__DO_LANE_OPS(double, __GPU_OP, __builtin_inf(), fmin, f64);
-#undef __GPU_OP
-
-#define __GPU_OP(__x, __y) __builtin_fmaxf((__x), (__y))
-__DO_LANE_OPS(float, __GPU_OP, -__builtin_inff(), fmax, f32);
-#undef __GPU_OP
-
-#define __GPU_OP(__x, __y) __builtin_fmax((__x), (__y))
-__DO_LANE_OPS(double, __GPU_OP, -__builtin_inf(), fmax, f64);
+#define __GPU_OP(__x, __y) __builtin_elementwise_maxnum((__x), (__y))
+__DO_LANE_OPS(float, __GPU_OP, -__builtin_inff(), maxnum, f32);
+__DO_LANE_OPS(double, __GPU_OP, -__builtin_inf(), maxnum, f64);
 #undef __GPU_OP
 
 #undef __DO_LANE_OPS
diff --git a/libc/test/integration/src/__support/GPU/scan_reduce.cpp 
b/libc/test/integration/src/__support/GPU/scan_reduce.cpp
index 1e03291585132..ff92c9c888a43 100644
--- a/libc/test/integration/src/__support/GPU/scan_reduce.cpp
+++ b/libc/test/integration/src/__support/GPU/scan_reduce.cpp
@@ -121,8 +121,8 @@ static void test_reduce_min_max() {
   EXPECT_EQ(__gpu_lane_min_u32(mask, n - 1 - id), 0u);
   EXPECT_EQ(__gpu_lane_max_u32(mask, n - 1 - id), n - 1);
 
-  EXPECT_EQ(__gpu_lane_fmin_f32(mask, static_cast<float>(id)), 0.0f);
-  EXPECT_EQ(__gpu_lane_fmax_f32(mask, static_cast<float>(id)),
+  EXPECT_EQ(__gpu_lane_minnum_f32(mask, static_cast<float>(id)), 0.0f);
+  EXPECT_EQ(__gpu_lane_maxnum_f32(mask, static_cast<float>(id)),
             static_cast<float>(n - 1));
 }
 
@@ -153,9 +153,9 @@ static void test_scan_min_max() {
   EXPECT_EQ(__gpu_prefix_scan_min_u32(mask, id), 0u);
   EXPECT_EQ(__gpu_prefix_scan_max_u32(mask, n - 1 - id), n - 1);
 
-  EXPECT_EQ(__gpu_prefix_scan_fmin_f32(mask, static_cast<float>(n - 1 - id)),
+  EXPECT_EQ(__gpu_prefix_scan_minnum_f32(mask, static_cast<float>(n - 1 - id)),
             static_cast<float>(n - 1 - id));
-  EXPECT_EQ(__gpu_prefix_scan_fmax_f32(mask, static_cast<float>(id)),
+  EXPECT_EQ(__gpu_prefix_scan_maxnum_f32(mask, static_cast<float>(id)),
             static_cast<float>(id));
 }
 
@@ -165,34 +165,37 @@ static void test_float_min_max() {
   uint32_t n = gpu::get_lane_size();
 
   float centered = static_cast<float>(id) - static_cast<float>(n / 2);
-  EXPECT_EQ(__gpu_lane_fmin_f32(mask, centered), -static_cast<float>(n / 2));
-  EXPECT_EQ(__gpu_lane_fmax_f32(mask, centered), static_cast<float>(n / 2 - 
1));
+  EXPECT_EQ(__gpu_lane_minnum_f32(mask, centered), -static_cast<float>(n / 2));
+  EXPECT_EQ(__gpu_lane_maxnum_f32(mask, centered),
+            static_cast<float>(n / 2 - 1));
 
   float alt =
       id % 2 == 0 ? static_cast<float>(id + 1) : -static_cast<float>(id + 1);
-  EXPECT_EQ(__gpu_lane_fmin_f32(mask, alt), -static_cast<float>(n));
-  EXPECT_EQ(__gpu_lane_fmax_f32(mask, alt), static_cast<float>(n - 1));
+  EXPECT_EQ(__gpu_lane_minnum_f32(mask, alt), -static_cast<float>(n));
+  EXPECT_EQ(__gpu_lane_maxnum_f32(mask, alt), static_cast<float>(n - 1));
 
   float v_val = id < n / 2 ? static_cast<float>(n / 2 - id)
                            : static_cast<float>(id - n / 2);
   float min_expected = id < n / 2 ? static_cast<float>(n / 2 - id) : 0.0f;
-  EXPECT_EQ(__gpu_prefix_scan_fmin_f32(mask, v_val), min_expected);
+  EXPECT_EQ(__gpu_prefix_scan_minnum_f32(mask, v_val), min_expected);
 
   float inv_v =
       id < n / 2 ? static_cast<float>(id) : static_cast<float>(n - 1 - id);
   float max_expected =
       id < n / 2 ? static_cast<float>(id) : static_cast<float>(n / 2 - 1);
-  EXPECT_EQ(__gpu_prefix_scan_fmax_f32(mask, inv_v), max_expected);
+  EXPECT_EQ(__gpu_prefix_scan_maxnum_f32(mask, inv_v), max_expected);
 
   double d_centered = static_cast<double>(id) - static_cast<double>(n / 2);
-  EXPECT_EQ(__gpu_lane_fmin_f64(mask, d_centered), -static_cast<double>(n / 
2));
-  EXPECT_EQ(__gpu_lane_fmax_f64(mask, d_centered),
+  EXPECT_EQ(__gpu_lane_minnum_f64(mask, d_centered),
+            -static_cast<double>(n / 2));
+  EXPECT_EQ(__gpu_lane_maxnum_f64(mask, d_centered),
             static_cast<double>(n / 2 - 1));
 
   double desc = static_cast<double>(n - 1 - id);
-  EXPECT_EQ(__gpu_prefix_scan_fmin_f64(mask, desc),
+  EXPECT_EQ(__gpu_prefix_scan_minnum_f64(mask, desc),
             static_cast<double>(n - 1 - id));
-  EXPECT_EQ(__gpu_prefix_scan_fmax_f64(mask, desc), static_cast<double>(n - 
1));
+  EXPECT_EQ(__gpu_prefix_scan_maxnum_f64(mask, desc),
+            static_cast<double>(n - 1));
 }
 
 TEST_MAIN(int, char **, char **) {

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to