[clang] [llvm] [Clang][RISCV] packed reduction sum intrinsics (PR #206441)

via cfe-commits Mon, 29 Jun 2026 02:29:36 -0700

llvmorg-github-actions[bot] wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-x86

Author: SiHuaN (sihuan)

<details>
<summary>Changes</summary>

Add the __riscv_predsum/predsumu_* header wrappers over new
__builtin_riscv_* builtins, lowering to the llvm.riscv.predsum/predsumu
intrinsics.

Stacked on #206430 (its commit appears first here); please review/merge
that PR first. Once it lands I will rebase so only the Clang commit
remains.

---

Patch is 27.09 KiB, truncated to 20.00 KiB below, full version: 
https://github.com/llvm/llvm-project/pull/206441.diff


8 Files Affected:

- (modified) clang/include/clang/Basic/BuiltinsRISCV.td (+18) 
- (modified) clang/lib/CodeGen/TargetBuiltins/RISCV.cpp (+42) 
- (modified) clang/lib/Headers/riscv_packed_simd.h (+25) 
- (modified) clang/test/CodeGen/RISCV/rvp-intrinsics.c (+254) 
- (modified) cross-project-tests/intrinsic-header-tests/riscv_packed_simd.c 
(+114) 
- (modified) llvm/lib/Target/RISCV/RISCVISelLowering.cpp (+5-1) 
- (modified) llvm/test/CodeGen/RISCV/rvp-simd-32.ll (-4) 
- (modified) llvm/test/CodeGen/RISCV/rvp-simd-64.ll (-4) 


``````````diff
diff --git a/clang/include/clang/Basic/BuiltinsRISCV.td 
b/clang/include/clang/Basic/BuiltinsRISCV.td
index ee20fefadd7c3..3f84528fdca70 100644
--- a/clang/include/clang/Basic/BuiltinsRISCV.td
+++ b/clang/include/clang/Basic/BuiltinsRISCV.td
@@ -215,6 +215,24 @@ def pabd_i16x4 : RISCVBuiltin<"_Vector<4, unsigned 
short>(_Vector<4, short>, _Ve
 def pabdu_u8x8 : RISCVBuiltin<"_Vector<8, unsigned char>(_Vector<8, unsigned 
char>, _Vector<8, unsigned char>)">;
 def pabdu_u16x4 : RISCVBuiltin<"_Vector<4, unsigned short>(_Vector<4, unsigned 
short>, _Vector<4, unsigned short>)">;
 
+// Packed Reduction Sum (32-bit)
+def predsum_i8x4_i32 : RISCVBuiltin<"int(_Vector<4, signed char>, int)">;
+def predsumu_u8x4_u32 : RISCVBuiltin<"unsigned int(_Vector<4, unsigned char>, 
unsigned int)">;
+def predsum_i16x2_i32 : RISCVBuiltin<"int(_Vector<2, short>, int)">;
+def predsumu_u16x2_u32 : RISCVBuiltin<"unsigned int(_Vector<2, unsigned 
short>, unsigned int)">;
+
+// Packed Reduction Sum (64-bit)
+def predsum_i8x8_i32 : RISCVBuiltin<"int(_Vector<8, signed char>, int)">;
+def predsumu_u8x8_u32 : RISCVBuiltin<"unsigned int(_Vector<8, unsigned char>, 
unsigned int)">;
+def predsum_i16x4_i32 : RISCVBuiltin<"int(_Vector<4, short>, int)">;
+def predsumu_u16x4_u32 : RISCVBuiltin<"unsigned int(_Vector<4, unsigned 
short>, unsigned int)">;
+def predsum_i8x8_i64 : RISCVBuiltin<"int64_t(_Vector<8, signed char>, 
int64_t)">;
+def predsumu_u8x8_u64 : RISCVBuiltin<"uint64_t(_Vector<8, unsigned char>, 
uint64_t)">;
+def predsum_i16x4_i64 : RISCVBuiltin<"int64_t(_Vector<4, short>, int64_t)">;
+def predsumu_u16x4_u64 : RISCVBuiltin<"uint64_t(_Vector<4, unsigned short>, 
uint64_t)">;
+def predsum_i32x2_i64 : RISCVBuiltin<"int64_t(_Vector<2, int>, int64_t)">;
+def predsumu_u32x2_u64 : RISCVBuiltin<"uint64_t(_Vector<2, unsigned int>, 
uint64_t)">;
+
 } // Features = "experimental-p"
 
 
//===----------------------------------------------------------------------===//
diff --git a/clang/lib/CodeGen/TargetBuiltins/RISCV.cpp 
b/clang/lib/CodeGen/TargetBuiltins/RISCV.cpp
index d5b027fe5f8fe..bb8fa86e7a564 100644
--- a/clang/lib/CodeGen/TargetBuiltins/RISCV.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/RISCV.cpp
@@ -1327,6 +1327,48 @@ Value *CodeGenFunction::EmitRISCVBuiltinExpr(unsigned 
BuiltinID,
     break;
   }
 
+  // Packed Reduction Sum
+  case RISCV::BI__builtin_riscv_predsum_i8x4_i32:
+  case RISCV::BI__builtin_riscv_predsum_i16x2_i32:
+  case RISCV::BI__builtin_riscv_predsum_i8x8_i32:
+  case RISCV::BI__builtin_riscv_predsum_i16x4_i32:
+  case RISCV::BI__builtin_riscv_predsum_i8x8_i64:
+  case RISCV::BI__builtin_riscv_predsum_i16x4_i64:
+  case RISCV::BI__builtin_riscv_predsum_i32x2_i64:
+  case RISCV::BI__builtin_riscv_predsumu_u8x4_u32:
+  case RISCV::BI__builtin_riscv_predsumu_u16x2_u32:
+  case RISCV::BI__builtin_riscv_predsumu_u8x8_u32:
+  case RISCV::BI__builtin_riscv_predsumu_u16x4_u32:
+  case RISCV::BI__builtin_riscv_predsumu_u8x8_u64:
+  case RISCV::BI__builtin_riscv_predsumu_u16x4_u64:
+  case RISCV::BI__builtin_riscv_predsumu_u32x2_u64: {
+    switch (BuiltinID) {
+    default:
+      llvm_unreachable("unexpected builtin ID");
+    case RISCV::BI__builtin_riscv_predsum_i8x4_i32:
+    case RISCV::BI__builtin_riscv_predsum_i16x2_i32:
+    case RISCV::BI__builtin_riscv_predsum_i8x8_i32:
+    case RISCV::BI__builtin_riscv_predsum_i16x4_i32:
+    case RISCV::BI__builtin_riscv_predsum_i8x8_i64:
+    case RISCV::BI__builtin_riscv_predsum_i16x4_i64:
+    case RISCV::BI__builtin_riscv_predsum_i32x2_i64:
+      ID = Intrinsic::riscv_predsum;
+      break;
+    case RISCV::BI__builtin_riscv_predsumu_u8x4_u32:
+    case RISCV::BI__builtin_riscv_predsumu_u16x2_u32:
+    case RISCV::BI__builtin_riscv_predsumu_u8x8_u32:
+    case RISCV::BI__builtin_riscv_predsumu_u16x4_u32:
+    case RISCV::BI__builtin_riscv_predsumu_u8x8_u64:
+    case RISCV::BI__builtin_riscv_predsumu_u16x4_u64:
+    case RISCV::BI__builtin_riscv_predsumu_u32x2_u64:
+      ID = Intrinsic::riscv_predsumu;
+      break;
+    }
+
+    IntrinsicTypes = {ResultType, Ops[0]->getType()};
+    break;
+  }
+
   // Zk builtins
 
   // Zknh
diff --git a/clang/lib/Headers/riscv_packed_simd.h 
b/clang/lib/Headers/riscv_packed_simd.h
index 5aa00f1519671..c61e156ca6a7f 100644
--- a/clang/lib/Headers/riscv_packed_simd.h
+++ b/clang/lib/Headers/riscv_packed_simd.h
@@ -103,6 +103,12 @@ typedef uint32_t uint32x2_t 
__attribute__((__vector_size__(8)));
     return (rty)builtin(__rs1, __rs2);                                         
\
   }
 
+#define __packed_reduction(name, rty, ty, builtin)                             
\
+  static __inline__ rty __DEFAULT_FN_ATTRS __riscv_##name(ty __rs1,            
\
+                                                          rty __rs2) {         
\
+    return builtin(__rs1, __rs2);                                              
\
+  }
+
 // clang-format off: macro call sites have no trailing semicolons, which
 // confuses clang-format into a deeply nested expression.
 
@@ -424,6 +430,24 @@ __packed_binary_builtin_cast(pabd_i16x4, int16x4_t, 
uint16x4_t, __builtin_riscv_
 __packed_binary_builtin_cast(pabdu_u8x8, uint8x8_t, uint8x8_t, 
__builtin_riscv_pabdu_u8x8)
 __packed_binary_builtin_cast(pabdu_u16x4, uint16x4_t, uint16x4_t, 
__builtin_riscv_pabdu_u16x4)
 
+/* Packed Reduction Sum (32-bit) */
+__packed_reduction(predsum_i8x4_i32, int32_t, int8x4_t, 
__builtin_riscv_predsum_i8x4_i32)
+__packed_reduction(predsumu_u8x4_u32, uint32_t, uint8x4_t, 
__builtin_riscv_predsumu_u8x4_u32)
+__packed_reduction(predsum_i16x2_i32, int32_t, int16x2_t, 
__builtin_riscv_predsum_i16x2_i32)
+__packed_reduction(predsumu_u16x2_u32, uint32_t, uint16x2_t, 
__builtin_riscv_predsumu_u16x2_u32)
+
+/* Packed Reduction Sum (64-bit) */
+__packed_reduction(predsum_i8x8_i32, int32_t, int8x8_t, 
__builtin_riscv_predsum_i8x8_i32)
+__packed_reduction(predsumu_u8x8_u32, uint32_t, uint8x8_t, 
__builtin_riscv_predsumu_u8x8_u32)
+__packed_reduction(predsum_i16x4_i32, int32_t, int16x4_t, 
__builtin_riscv_predsum_i16x4_i32)
+__packed_reduction(predsumu_u16x4_u32, uint32_t, uint16x4_t, 
__builtin_riscv_predsumu_u16x4_u32)
+__packed_reduction(predsum_i8x8_i64, int64_t, int8x8_t, 
__builtin_riscv_predsum_i8x8_i64)
+__packed_reduction(predsumu_u8x8_u64, uint64_t, uint8x8_t, 
__builtin_riscv_predsumu_u8x8_u64)
+__packed_reduction(predsum_i16x4_i64, int64_t, int16x4_t, 
__builtin_riscv_predsum_i16x4_i64)
+__packed_reduction(predsumu_u16x4_u64, uint64_t, uint16x4_t, 
__builtin_riscv_predsumu_u16x4_u64)
+__packed_reduction(predsum_i32x2_i64, int64_t, int32x2_t, 
__builtin_riscv_predsum_i32x2_i64)
+__packed_reduction(predsumu_u32x2_u64, uint64_t, uint32x2_t, 
__builtin_riscv_predsumu_u32x2_u64)
+
 // clang-format on
 
 #undef __packed_splat2
@@ -443,6 +467,7 @@ __packed_binary_builtin_cast(pabdu_u16x4, uint16x4_t, 
uint16x4_t, __builtin_risc
 #undef __packed_cmp
 #undef __packed_pabs
 #undef __packed_binary_builtin_cast
+#undef __packed_reduction
 #undef __DEFAULT_FN_ATTRS
 
 #if defined(__cplusplus)
diff --git a/clang/test/CodeGen/RISCV/rvp-intrinsics.c 
b/clang/test/CodeGen/RISCV/rvp-intrinsics.c
index d3f153109b904..290f61787ceff 100644
--- a/clang/test/CodeGen/RISCV/rvp-intrinsics.c
+++ b/clang/test/CodeGen/RISCV/rvp-intrinsics.c
@@ -5889,3 +5889,257 @@ uint8x8_t test_pabdu_u8x8(uint8x8_t rs1, uint8x8_t rs2) 
{
 uint16x4_t test_pabdu_u16x4(uint16x4_t rs1, uint16x4_t rs2) {
   return __riscv_pabdu_u16x4(rs1, rs2);
 }
+
+/* Packed Reduction Sum (32-bit) */
+// RV32-LABEL: define dso_local i32 @test_predsum_i8x4_i32(
+// RV32-SAME: i32 noundef [[RS1_COERCE:%.*]], i32 noundef [[RS2:%.*]]) 
#[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[RS1_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = call i32 @llvm.riscv.predsum.i32.v4i8(<4 x i8> 
[[TMP0]], i32 [[RS2]])
+// RV32-NEXT:    ret i32 [[TMP1]]
+//
+// RV64-LABEL: define dso_local signext i32 @test_predsum_i8x4_i32(
+// RV64-SAME: i32 noundef [[RS1_COERCE:%.*]], i32 noundef signext [[RS2:%.*]]) 
#[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[RS1_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = call i32 @llvm.riscv.predsum.i32.v4i8(<4 x i8> 
[[TMP0]], i32 [[RS2]])
+// RV64-NEXT:    ret i32 [[TMP1]]
+//
+int32_t test_predsum_i8x4_i32(int8x4_t rs1, int32_t rs2) {
+  return __riscv_predsum_i8x4_i32(rs1, rs2);
+}
+
+// RV32-LABEL: define dso_local i32 @test_predsumu_u8x4_u32(
+// RV32-SAME: i32 noundef [[RS1_COERCE:%.*]], i32 noundef [[RS2:%.*]]) 
#[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[RS1_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = call i32 @llvm.riscv.predsumu.i32.v4i8(<4 x 
i8> [[TMP0]], i32 [[RS2]])
+// RV32-NEXT:    ret i32 [[TMP1]]
+//
+// RV64-LABEL: define dso_local signext i32 @test_predsumu_u8x4_u32(
+// RV64-SAME: i32 noundef [[RS1_COERCE:%.*]], i32 noundef signext [[RS2:%.*]]) 
#[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[RS1_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = call i32 @llvm.riscv.predsumu.i32.v4i8(<4 x 
i8> [[TMP0]], i32 [[RS2]])
+// RV64-NEXT:    ret i32 [[TMP1]]
+//
+uint32_t test_predsumu_u8x4_u32(uint8x4_t rs1, uint32_t rs2) {
+  return __riscv_predsumu_u8x4_u32(rs1, rs2);
+}
+
+// RV32-LABEL: define dso_local i32 @test_predsum_i16x2_i32(
+// RV32-SAME: i32 noundef [[RS1_COERCE:%.*]], i32 noundef [[RS2:%.*]]) 
#[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[RS1_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = call i32 @llvm.riscv.predsum.i32.v2i16(<2 x 
i16> [[TMP0]], i32 [[RS2]])
+// RV32-NEXT:    ret i32 [[TMP1]]
+//
+// RV64-LABEL: define dso_local signext i32 @test_predsum_i16x2_i32(
+// RV64-SAME: i32 noundef [[RS1_COERCE:%.*]], i32 noundef signext [[RS2:%.*]]) 
#[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[RS1_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = call i32 @llvm.riscv.predsum.i32.v2i16(<2 x 
i16> [[TMP0]], i32 [[RS2]])
+// RV64-NEXT:    ret i32 [[TMP1]]
+//
+int32_t test_predsum_i16x2_i32(int16x2_t rs1, int32_t rs2) {
+  return __riscv_predsum_i16x2_i32(rs1, rs2);
+}
+
+// RV32-LABEL: define dso_local i32 @test_predsumu_u16x2_u32(
+// RV32-SAME: i32 noundef [[RS1_COERCE:%.*]], i32 noundef [[RS2:%.*]]) 
#[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[RS1_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = call i32 @llvm.riscv.predsumu.i32.v2i16(<2 x 
i16> [[TMP0]], i32 [[RS2]])
+// RV32-NEXT:    ret i32 [[TMP1]]
+//
+// RV64-LABEL: define dso_local signext i32 @test_predsumu_u16x2_u32(
+// RV64-SAME: i32 noundef [[RS1_COERCE:%.*]], i32 noundef signext [[RS2:%.*]]) 
#[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[RS1_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = call i32 @llvm.riscv.predsumu.i32.v2i16(<2 x 
i16> [[TMP0]], i32 [[RS2]])
+// RV64-NEXT:    ret i32 [[TMP1]]
+//
+uint32_t test_predsumu_u16x2_u32(uint16x2_t rs1, uint32_t rs2) {
+  return __riscv_predsumu_u16x2_u32(rs1, rs2);
+}
+
+/* Packed Reduction Sum (64-bit) */
+// RV32-LABEL: define dso_local i32 @test_predsum_i8x8_i32(
+// RV32-SAME: i64 noundef [[RS1_COERCE:%.*]], i32 noundef [[RS2:%.*]]) 
#[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[RS1_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = call i32 @llvm.riscv.predsum.i32.v8i8(<8 x i8> 
[[TMP0]], i32 [[RS2]])
+// RV32-NEXT:    ret i32 [[TMP1]]
+//
+// RV64-LABEL: define dso_local signext i32 @test_predsum_i8x8_i32(
+// RV64-SAME: i64 noundef [[RS1_COERCE:%.*]], i32 noundef signext [[RS2:%.*]]) 
#[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[RS1_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = call i32 @llvm.riscv.predsum.i32.v8i8(<8 x i8> 
[[TMP0]], i32 [[RS2]])
+// RV64-NEXT:    ret i32 [[TMP1]]
+//
+int32_t test_predsum_i8x8_i32(int8x8_t rs1, int32_t rs2) {
+  return __riscv_predsum_i8x8_i32(rs1, rs2);
+}
+
+// RV32-LABEL: define dso_local i32 @test_predsumu_u8x8_u32(
+// RV32-SAME: i64 noundef [[RS1_COERCE:%.*]], i32 noundef [[RS2:%.*]]) 
#[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[RS1_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = call i32 @llvm.riscv.predsumu.i32.v8i8(<8 x 
i8> [[TMP0]], i32 [[RS2]])
+// RV32-NEXT:    ret i32 [[TMP1]]
+//
+// RV64-LABEL: define dso_local signext i32 @test_predsumu_u8x8_u32(
+// RV64-SAME: i64 noundef [[RS1_COERCE:%.*]], i32 noundef signext [[RS2:%.*]]) 
#[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[RS1_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = call i32 @llvm.riscv.predsumu.i32.v8i8(<8 x 
i8> [[TMP0]], i32 [[RS2]])
+// RV64-NEXT:    ret i32 [[TMP1]]
+//
+uint32_t test_predsumu_u8x8_u32(uint8x8_t rs1, uint32_t rs2) {
+  return __riscv_predsumu_u8x8_u32(rs1, rs2);
+}
+
+// RV32-LABEL: define dso_local i32 @test_predsum_i16x4_i32(
+// RV32-SAME: i64 noundef [[RS1_COERCE:%.*]], i32 noundef [[RS2:%.*]]) 
#[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[RS1_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = call i32 @llvm.riscv.predsum.i32.v4i16(<4 x 
i16> [[TMP0]], i32 [[RS2]])
+// RV32-NEXT:    ret i32 [[TMP1]]
+//
+// RV64-LABEL: define dso_local signext i32 @test_predsum_i16x4_i32(
+// RV64-SAME: i64 noundef [[RS1_COERCE:%.*]], i32 noundef signext [[RS2:%.*]]) 
#[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[RS1_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = call i32 @llvm.riscv.predsum.i32.v4i16(<4 x 
i16> [[TMP0]], i32 [[RS2]])
+// RV64-NEXT:    ret i32 [[TMP1]]
+//
+int32_t test_predsum_i16x4_i32(int16x4_t rs1, int32_t rs2) {
+  return __riscv_predsum_i16x4_i32(rs1, rs2);
+}
+
+// RV32-LABEL: define dso_local i32 @test_predsumu_u16x4_u32(
+// RV32-SAME: i64 noundef [[RS1_COERCE:%.*]], i32 noundef [[RS2:%.*]]) 
#[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[RS1_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = call i32 @llvm.riscv.predsumu.i32.v4i16(<4 x 
i16> [[TMP0]], i32 [[RS2]])
+// RV32-NEXT:    ret i32 [[TMP1]]
+//
+// RV64-LABEL: define dso_local signext i32 @test_predsumu_u16x4_u32(
+// RV64-SAME: i64 noundef [[RS1_COERCE:%.*]], i32 noundef signext [[RS2:%.*]]) 
#[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[RS1_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = call i32 @llvm.riscv.predsumu.i32.v4i16(<4 x 
i16> [[TMP0]], i32 [[RS2]])
+// RV64-NEXT:    ret i32 [[TMP1]]
+//
+uint32_t test_predsumu_u16x4_u32(uint16x4_t rs1, uint32_t rs2) {
+  return __riscv_predsumu_u16x4_u32(rs1, rs2);
+}
+
+// RV32-LABEL: define dso_local i64 @test_predsum_i8x8_i64(
+// RV32-SAME: i64 noundef [[RS1_COERCE:%.*]], i64 noundef [[RS2:%.*]]) 
#[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[RS1_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = call i64 @llvm.riscv.predsum.i64.v8i8(<8 x i8> 
[[TMP0]], i64 [[RS2]])
+// RV32-NEXT:    ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_predsum_i8x8_i64(
+// RV64-SAME: i64 noundef [[RS1_COERCE:%.*]], i64 noundef [[RS2:%.*]]) 
#[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[RS1_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = call i64 @llvm.riscv.predsum.i64.v8i8(<8 x i8> 
[[TMP0]], i64 [[RS2]])
+// RV64-NEXT:    ret i64 [[TMP1]]
+//
+int64_t test_predsum_i8x8_i64(int8x8_t rs1, int64_t rs2) {
+  return __riscv_predsum_i8x8_i64(rs1, rs2);
+}
+
+// RV32-LABEL: define dso_local i64 @test_predsumu_u8x8_u64(
+// RV32-SAME: i64 noundef [[RS1_COERCE:%.*]], i64 noundef [[RS2:%.*]]) 
#[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[RS1_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = call i64 @llvm.riscv.predsumu.i64.v8i8(<8 x 
i8> [[TMP0]], i64 [[RS2]])
+// RV32-NEXT:    ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_predsumu_u8x8_u64(
+// RV64-SAME: i64 noundef [[RS1_COERCE:%.*]], i64 noundef [[RS2:%.*]]) 
#[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[RS1_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = call i64 @llvm.riscv.predsumu.i64.v8i8(<8 x 
i8> [[TMP0]], i64 [[RS2]])
+// RV64-NEXT:    ret i64 [[TMP1]]
+//
+uint64_t test_predsumu_u8x8_u64(uint8x8_t rs1, uint64_t rs2) {
+  return __riscv_predsumu_u8x8_u64(rs1, rs2);
+}
+
+// RV32-LABEL: define dso_local i64 @test_predsum_i16x4_i64(
+// RV32-SAME: i64 noundef [[RS1_COERCE:%.*]], i64 noundef [[RS2:%.*]]) 
#[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[RS1_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = call i64 @llvm.riscv.predsum.i64.v4i16(<4 x 
i16> [[TMP0]], i64 [[RS2]])
+// RV32-NEXT:    ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_predsum_i16x4_i64(
+// RV64-SAME: i64 noundef [[RS1_COERCE:%.*]], i64 noundef [[RS2:%.*]]) 
#[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[RS1_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = call i64 @llvm.riscv.predsum.i64.v4i16(<4 x 
i16> [[TMP0]], i64 [[RS2]])
+// RV64-NEXT:    ret i64 [[TMP1]]
+//
+int64_t test_predsum_i16x4_i64(int16x4_t rs1, int64_t rs2) {
+  return __riscv_predsum_i16x4_i64(rs1, rs2);
+}
+
+// RV32-LABEL: define dso_local i64 @test_predsumu_u16x4_u64(
+// RV32-SAME: i64 noundef [[RS1_COERCE:%.*]], i64 noundef [[RS2:%.*]]) 
#[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[RS1_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = call i64 @llvm.riscv.predsumu.i64.v4i16(<4 x 
i16> [[TMP0]], i64 [[RS2]])
+// RV32-NEXT:    ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_predsumu_u16x4_u64(
+// RV64-SAME: i64 noundef [[RS1_COERCE:%.*]], i64 noundef [[RS2:%.*]]) 
#[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[RS1_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = call i64 @llvm.riscv.predsumu.i64.v4i16(<4 x 
i16> [[TMP0]], i64 [[RS2]])
+// RV64-NEXT:    ret i64 [[TMP1]]
+//
+uint64_t test_predsumu_u16x4_u64(uint16x4_t rs1, uint64_t rs2) {
+  return __riscv_predsumu_u16x4_u64(rs1, rs2);
+}
+
+// RV32-LABEL: define dso_local i64 @test_predsum_i32x2_i64(
+// RV32-SAME: i64 noundef [[RS1_COERCE:%.*]], i64 noundef [[RS2:%.*]]) 
#[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[RS1_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[TMP1:%.*]] = call i64 @llvm.riscv.predsum.i64.v2i32(<2 x 
i32> [[TMP0]], i64 [[RS2]])
+// RV32-NEXT:    ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_predsum_i32x2_i64(
+// RV64-SAME: i64 noundef [[RS1_COERCE:%.*]], i64 noundef [[RS2:%.*]]) 
#[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[RS1_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[TMP1:%.*]] = call i64 @llvm.riscv.predsum.i64.v2i32(<2 x 
i32> [[TMP0]], i64 [[RS2]])
+// RV64-NEXT:    ret i64 [[TMP1]]
+//
+int64_t test_predsum_i32x2_i64(int32x2_t rs1, int64_t rs2) {
+  return __riscv_predsum_i32x2_i64(rs1, rs2);
+}
+
+// RV32-LABEL: define dso_local i64 @test_predsumu_u32x2_u64(
+// RV32-SAME: i64 noundef [[RS1_COERCE:%.*]], i64 noundef [[RS2:%.*]]) 
#[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[RS1_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[TMP1:%.*]] = call i64 @llvm.riscv.predsumu.i64.v2i32(<2 x 
i32> [[TMP0]], i64 [[RS2]])
+// RV32-NEXT:    ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_predsumu_u32x2_u64(
+// RV64-SAME: i64 noundef [[RS1_COERCE:%.*]], i64 noundef [[RS2:%.*]]) 
#[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[RS1_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[TMP1:%.*]] = call i64 @llvm.riscv.predsumu.i64.v2i32(<2 x 
i32> [[TMP0]], i64 [[RS2]])
+// RV64-NEXT:    ret i64 [[TMP1]]
+//
+uint64_t test_predsumu_u32x2_u64(uint32x2_t rs1, uint64_t rs2) {
+  return __riscv_predsumu_u32x2_u64(rs1, rs2);
+}
diff --git a/cross-project-tests/intrinsic-header-tests/riscv_packed_simd.c 
b/cross-project-tests/intrinsic-header-tests/riscv_packed_simd.c
index 020a6be70aadb..4b939675cbeb5 100644
--- a/cross-project-tests/intrinsic-header-tests/ri...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/206441
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [Clang][RISCV] packed reduction sum intrinsics (PR #206441)

Reply via email to