https://github.com/sihuan created https://github.com/llvm/llvm-project/pull/174068

This patch adds initial intrinsic support for the RISC-V P extension, 
introducing padd and psub operations.

The implementation is based on the `Packed Addition and Subtraction` section of 
the P extension intrinsic specification: 
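
For context, a minimal usage sketch of the new builtins (illustrative only, not part of the patch; the `v4i8` typedef mirrors the one in the clang test, and the expected `padd.b`/`psub.b` lowering comes from the llc test below):

```c
#include <stdint.h>

// 32-bit container holding four packed int8 lanes, as in the clang test.
typedef int8_t v4i8 __attribute__((vector_size(4)));

// Lane-wise packed add/sub. Compiled with -target-feature +experimental-p
// (see the RUN lines below); expected to lower to padd.b / psub.b.
v4i8 add_then_sub(v4i8 a, v4i8 b) {
  v4i8 sum = __builtin_riscv_padd_v4i8(a, b);
  return __builtin_riscv_psub_v4i8(sum, b);
}
```

The wider 64-bit variants (`v8i8`, `v4i16`, `v2i32`) follow the same pattern; on RV32 they are carried in GPR pairs per the RISCVRegisterInfo.td and RISCVCallingConv.cpp changes.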

From 7bcd9d9f0f93157b37703f621c7bfda0a034ec78 Mon Sep 17 00:00:00 2001
From: SiHuaN <[email protected]>
Date: Wed, 31 Dec 2025 13:36:54 +0800
Subject: [PATCH] [RISCV] Preliminary P-ext intrinsics support

This patch adds initial intrinsic support for the RISC-V P extension,
introducing padd and psub operations.
---
 clang/include/clang/Basic/BuiltinsRISCV.td  |  17 ++
 clang/lib/CodeGen/TargetBuiltins/RISCV.cpp  |  31 ++-
 clang/test/CodeGen/RISCV/rvp-intrinsics.c   | 223 ++++++++++++++++++++
 llvm/include/llvm/IR/IntrinsicsRISCV.td     |  12 ++
 llvm/lib/Target/RISCV/RISCVCallingConv.cpp  |  75 ++++---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp |   5 +
 llvm/lib/Target/RISCV/RISCVInstrInfoP.td    |  36 ++++
 llvm/lib/Target/RISCV/RISCVRegisterInfo.td  |  14 +-
 llvm/test/CodeGen/RISCV/rvp-intrinsics.ll   | 135 ++++++++++++
 9 files changed, 514 insertions(+), 34 deletions(-)
 create mode 100644 clang/test/CodeGen/RISCV/rvp-intrinsics.c
 create mode 100644 llvm/test/CodeGen/RISCV/rvp-intrinsics.ll

diff --git a/clang/include/clang/Basic/BuiltinsRISCV.td b/clang/include/clang/Basic/BuiltinsRISCV.td
index 2dad5ede2d64b..1c43371cd52fc 100644
--- a/clang/include/clang/Basic/BuiltinsRISCV.td
+++ b/clang/include/clang/Basic/BuiltinsRISCV.td
@@ -137,6 +137,23 @@ def sm3p0 : RISCVBuiltin<"unsigned int(unsigned int)">;
 def sm3p1 : RISCVBuiltin<"unsigned int(unsigned int)">;
 } // Features = "zksh"
 
+//===----------------------------------------------------------------------===//
+// P extension.
+//===----------------------------------------------------------------------===//
+let Features = "experimental-p" in {
+def padd_v4i8  : RISCVBuiltin<"_Vector<4, char>(_Vector<4, char>, _Vector<4, char>)">;
+def padd_v2i16 : RISCVBuiltin<"_Vector<2, short>(_Vector<2, short>, _Vector<2, short>)">;
+def padd_v8i8  : RISCVBuiltin<"_Vector<8, char>(_Vector<8, char>, _Vector<8, char>)">;
+def padd_v4i16 : RISCVBuiltin<"_Vector<4, short>(_Vector<4, short>, _Vector<4, short>)">;
+def padd_v2i32 : RISCVBuiltin<"_Vector<2, int>(_Vector<2, int>, _Vector<2, int>)">;
+
+def psub_v4i8  : RISCVBuiltin<"_Vector<4, char>(_Vector<4, char>, _Vector<4, char>)">;
+def psub_v2i16 : RISCVBuiltin<"_Vector<2, short>(_Vector<2, short>, _Vector<2, short>)">;
+def psub_v8i8  : RISCVBuiltin<"_Vector<8, char>(_Vector<8, char>, _Vector<8, char>)">;
+def psub_v4i16 : RISCVBuiltin<"_Vector<4, short>(_Vector<4, short>, _Vector<4, short>)">;
+def psub_v2i32 : RISCVBuiltin<"_Vector<2, int>(_Vector<2, int>, _Vector<2, int>)">;
+} // Features = "experimental-p"
+
 } // Attributes = [Const, NoThrow]
 
 
 //===----------------------------------------------------------------------===//
diff --git a/clang/lib/CodeGen/TargetBuiltins/RISCV.cpp b/clang/lib/CodeGen/TargetBuiltins/RISCV.cpp
index 2e11037f0dcd0..8cc8b03db0137 100644
--- a/clang/lib/CodeGen/TargetBuiltins/RISCV.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/RISCV.cpp
@@ -1143,7 +1143,17 @@ Value *CodeGenFunction::EmitRISCVBuiltinExpr(unsigned BuiltinID,
   case RISCV::BI__builtin_riscv_brev8_32:
   case RISCV::BI__builtin_riscv_brev8_64:
   case RISCV::BI__builtin_riscv_zip_32:
-  case RISCV::BI__builtin_riscv_unzip_32: {
+  case RISCV::BI__builtin_riscv_unzip_32:
+  case RISCV::BI__builtin_riscv_padd_v4i8:
+  case RISCV::BI__builtin_riscv_padd_v2i16:
+  case RISCV::BI__builtin_riscv_padd_v8i8:
+  case RISCV::BI__builtin_riscv_padd_v4i16:
+  case RISCV::BI__builtin_riscv_padd_v2i32:
+  case RISCV::BI__builtin_riscv_psub_v4i8:
+  case RISCV::BI__builtin_riscv_psub_v2i16:
+  case RISCV::BI__builtin_riscv_psub_v8i8:
+  case RISCV::BI__builtin_riscv_psub_v4i16:
+  case RISCV::BI__builtin_riscv_psub_v2i32: {
     switch (BuiltinID) {
     default: llvm_unreachable("unexpected builtin ID");
     // Zbb
@@ -1187,11 +1197,26 @@ Value *CodeGenFunction::EmitRISCVBuiltinExpr(unsigned BuiltinID,
     case RISCV::BI__builtin_riscv_unzip_32:
       ID = Intrinsic::riscv_unzip;
       break;
-    }
 
+    // P extension
+    case RISCV::BI__builtin_riscv_padd_v4i8:
+    case RISCV::BI__builtin_riscv_padd_v2i16:
+    case RISCV::BI__builtin_riscv_padd_v8i8:
+    case RISCV::BI__builtin_riscv_padd_v4i16:
+    case RISCV::BI__builtin_riscv_padd_v2i32:
+      ID = Intrinsic::riscv_padd;
+      break;
+    case RISCV::BI__builtin_riscv_psub_v4i8:
+    case RISCV::BI__builtin_riscv_psub_v2i16:
+    case RISCV::BI__builtin_riscv_psub_v8i8:
+    case RISCV::BI__builtin_riscv_psub_v4i16:
+    case RISCV::BI__builtin_riscv_psub_v2i32:
+      ID = Intrinsic::riscv_psub;
+      break;
+    }
+  }
     IntrinsicTypes = {ResultType};
     break;
-  }
 
   // Zk builtins
 
diff --git a/clang/test/CodeGen/RISCV/rvp-intrinsics.c b/clang/test/CodeGen/RISCV/rvp-intrinsics.c
new file mode 100644
index 0000000000000..2d047f2438e8b
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/rvp-intrinsics.c
@@ -0,0 +1,223 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -triple riscv32 -target-feature +experimental-p -emit-llvm %s -O2 -o - | FileCheck %s --check-prefix=RV32
+// RUN: %clang_cc1 -triple riscv64 -target-feature +experimental-p -emit-llvm %s -O2 -o - | FileCheck %s --check-prefix=RV64
+
+#include <stdint.h>
+
+typedef int8_t v4i8 __attribute__((vector_size(4)));
+typedef int16_t v2i16 __attribute__((vector_size(4)));
+typedef int8_t v8i8 __attribute__((vector_size(8)));
+typedef int16_t v4i16 __attribute__((vector_size(8)));
+typedef int32_t v2i32 __attribute__((vector_size(8)));
+
+// RV32-LABEL: @test_padd_v4i8(
+// RV32-NEXT:  entry:
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE:%.*]] to <4 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE:%.*]] to <4 x i8>
+// RV32-NEXT:    [[TMP2:%.*]] = tail call <4 x i8> @llvm.riscv.padd.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV32-NEXT:    [[TMP3:%.*]] = bitcast <4 x i8> [[TMP2]] to i32
+// RV32-NEXT:    ret i32 [[TMP3]]
+//
+// RV64-LABEL: @test_padd_v4i8(
+// RV64-NEXT:  entry:
+// RV64-NEXT:    [[COERCE_VAL_II:%.*]] = trunc i64 [[A_COERCE:%.*]] to i32
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II]] to <4 x i8>
+// RV64-NEXT:    [[COERCE_VAL_II1:%.*]] = trunc i64 [[B_COERCE:%.*]] to i32
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[COERCE_VAL_II1]] to <4 x i8>
+// RV64-NEXT:    [[TMP2:%.*]] = tail call <4 x i8> @llvm.riscv.padd.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV64-NEXT:    [[TMP3:%.*]] = bitcast <4 x i8> [[TMP2]] to i32
+// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP3]] to i64
+// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+//
+v4i8 test_padd_v4i8(v4i8 a, v4i8 b) {
+  return __builtin_riscv_padd_v4i8(a, b);
+}
+
+// RV32-LABEL: @test_padd_v2i16(
+// RV32-NEXT:  entry:
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE:%.*]] to <2 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE:%.*]] to <2 x i16>
+// RV32-NEXT:    [[TMP2:%.*]] = tail call <2 x i16> @llvm.riscv.padd.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV32-NEXT:    [[TMP3:%.*]] = bitcast <2 x i16> [[TMP2]] to i32
+// RV32-NEXT:    ret i32 [[TMP3]]
+//
+// RV64-LABEL: @test_padd_v2i16(
+// RV64-NEXT:  entry:
+// RV64-NEXT:    [[COERCE_VAL_II:%.*]] = trunc i64 [[A_COERCE:%.*]] to i32
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II]] to <2 x i16>
+// RV64-NEXT:    [[COERCE_VAL_II1:%.*]] = trunc i64 [[B_COERCE:%.*]] to i32
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[COERCE_VAL_II1]] to <2 x i16>
+// RV64-NEXT:    [[TMP2:%.*]] = tail call <2 x i16> @llvm.riscv.padd.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV64-NEXT:    [[TMP3:%.*]] = bitcast <2 x i16> [[TMP2]] to i32
+// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP3]] to i64
+// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+//
+v2i16 test_padd_v2i16(v2i16 a, v2i16 b) {
+  return __builtin_riscv_padd_v2i16(a, b);
+}
+
+// RV32-LABEL: @test_padd_v8i8(
+// RV32-NEXT:  entry:
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE:%.*]] to <8 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE:%.*]] to <8 x i8>
+// RV32-NEXT:    [[TMP2:%.*]] = tail call <8 x i8> @llvm.riscv.padd.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV32-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to i64
+// RV32-NEXT:    ret i64 [[TMP3]]
+//
+// RV64-LABEL: @test_padd_v8i8(
+// RV64-NEXT:  entry:
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE:%.*]] to <8 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE:%.*]] to <8 x i8>
+// RV64-NEXT:    [[TMP2:%.*]] = tail call <8 x i8> @llvm.riscv.padd.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV64-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to i64
+// RV64-NEXT:    ret i64 [[TMP3]]
+//
+v8i8 test_padd_v8i8(v8i8 a, v8i8 b) {
+  return __builtin_riscv_padd_v8i8(a, b);
+}
+
+// RV32-LABEL: @test_padd_v4i16(
+// RV32-NEXT:  entry:
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE:%.*]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE:%.*]] to <4 x i16>
+// RV32-NEXT:    [[TMP2:%.*]] = tail call <4 x i16> @llvm.riscv.padd.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV32-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to i64
+// RV32-NEXT:    ret i64 [[TMP3]]
+//
+// RV64-LABEL: @test_padd_v4i16(
+// RV64-NEXT:  entry:
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE:%.*]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE:%.*]] to <4 x i16>
+// RV64-NEXT:    [[TMP2:%.*]] = tail call <4 x i16> @llvm.riscv.padd.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV64-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to i64
+// RV64-NEXT:    ret i64 [[TMP3]]
+//
+v4i16 test_padd_v4i16(v4i16 a, v4i16 b) {
+  return __builtin_riscv_padd_v4i16(a, b);
+}
+
+// RV32-LABEL: @test_padd_v2i32(
+// RV32-NEXT:  entry:
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE:%.*]] to <2 x i32>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE:%.*]] to <2 x i32>
+// RV32-NEXT:    [[TMP2:%.*]] = tail call <2 x i32> @llvm.riscv.padd.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV32-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to i64
+// RV32-NEXT:    ret i64 [[TMP3]]
+//
+// RV64-LABEL: @test_padd_v2i32(
+// RV64-NEXT:  entry:
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE:%.*]] to <2 x i32>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE:%.*]] to <2 x i32>
+// RV64-NEXT:    [[TMP2:%.*]] = tail call <2 x i32> @llvm.riscv.padd.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV64-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to i64
+// RV64-NEXT:    ret i64 [[TMP3]]
+//
+v2i32 test_padd_v2i32(v2i32 a, v2i32 b) {
+  return __builtin_riscv_padd_v2i32(a, b);
+}
+
+// RV32-LABEL: @test_psub_v4i8(
+// RV32-NEXT:  entry:
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE:%.*]] to <4 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE:%.*]] to <4 x i8>
+// RV32-NEXT:    [[TMP2:%.*]] = tail call <4 x i8> @llvm.riscv.psub.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV32-NEXT:    [[TMP3:%.*]] = bitcast <4 x i8> [[TMP2]] to i32
+// RV32-NEXT:    ret i32 [[TMP3]]
+//
+// RV64-LABEL: @test_psub_v4i8(
+// RV64-NEXT:  entry:
+// RV64-NEXT:    [[COERCE_VAL_II:%.*]] = trunc i64 [[A_COERCE:%.*]] to i32
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II]] to <4 x i8>
+// RV64-NEXT:    [[COERCE_VAL_II1:%.*]] = trunc i64 [[B_COERCE:%.*]] to i32
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[COERCE_VAL_II1]] to <4 x i8>
+// RV64-NEXT:    [[TMP2:%.*]] = tail call <4 x i8> @llvm.riscv.psub.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV64-NEXT:    [[TMP3:%.*]] = bitcast <4 x i8> [[TMP2]] to i32
+// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP3]] to i64
+// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+//
+v4i8 test_psub_v4i8(v4i8 a, v4i8 b) {
+  return __builtin_riscv_psub_v4i8(a, b);
+}
+
+// RV32-LABEL: @test_psub_v2i16(
+// RV32-NEXT:  entry:
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE:%.*]] to <2 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE:%.*]] to <2 x i16>
+// RV32-NEXT:    [[TMP2:%.*]] = tail call <2 x i16> @llvm.riscv.psub.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV32-NEXT:    [[TMP3:%.*]] = bitcast <2 x i16> [[TMP2]] to i32
+// RV32-NEXT:    ret i32 [[TMP3]]
+//
+// RV64-LABEL: @test_psub_v2i16(
+// RV64-NEXT:  entry:
+// RV64-NEXT:    [[COERCE_VAL_II:%.*]] = trunc i64 [[A_COERCE:%.*]] to i32
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II]] to <2 x i16>
+// RV64-NEXT:    [[COERCE_VAL_II1:%.*]] = trunc i64 [[B_COERCE:%.*]] to i32
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[COERCE_VAL_II1]] to <2 x i16>
+// RV64-NEXT:    [[TMP2:%.*]] = tail call <2 x i16> @llvm.riscv.psub.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV64-NEXT:    [[TMP3:%.*]] = bitcast <2 x i16> [[TMP2]] to i32
+// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP3]] to i64
+// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+//
+v2i16 test_psub_v2i16(v2i16 a, v2i16 b) {
+  return __builtin_riscv_psub_v2i16(a, b);
+}
+
+// RV32-LABEL: @test_psub_v8i8(
+// RV32-NEXT:  entry:
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE:%.*]] to <8 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE:%.*]] to <8 x i8>
+// RV32-NEXT:    [[TMP2:%.*]] = tail call <8 x i8> @llvm.riscv.psub.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV32-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to i64
+// RV32-NEXT:    ret i64 [[TMP3]]
+//
+// RV64-LABEL: @test_psub_v8i8(
+// RV64-NEXT:  entry:
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE:%.*]] to <8 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE:%.*]] to <8 x i8>
+// RV64-NEXT:    [[TMP2:%.*]] = tail call <8 x i8> @llvm.riscv.psub.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV64-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to i64
+// RV64-NEXT:    ret i64 [[TMP3]]
+//
+v8i8 test_psub_v8i8(v8i8 a, v8i8 b) {
+  return __builtin_riscv_psub_v8i8(a, b);
+}
+
+// RV32-LABEL: @test_psub_v4i16(
+// RV32-NEXT:  entry:
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE:%.*]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE:%.*]] to <4 x i16>
+// RV32-NEXT:    [[TMP2:%.*]] = tail call <4 x i16> @llvm.riscv.psub.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV32-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to i64
+// RV32-NEXT:    ret i64 [[TMP3]]
+//
+// RV64-LABEL: @test_psub_v4i16(
+// RV64-NEXT:  entry:
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE:%.*]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE:%.*]] to <4 x i16>
+// RV64-NEXT:    [[TMP2:%.*]] = tail call <4 x i16> @llvm.riscv.psub.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV64-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to i64
+// RV64-NEXT:    ret i64 [[TMP3]]
+//
+v4i16 test_psub_v4i16(v4i16 a, v4i16 b) {
+  return __builtin_riscv_psub_v4i16(a, b);
+}
+
+// RV32-LABEL: @test_psub_v2i32(
+// RV32-NEXT:  entry:
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE:%.*]] to <2 x i32>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE:%.*]] to <2 x i32>
+// RV32-NEXT:    [[TMP2:%.*]] = tail call <2 x i32> @llvm.riscv.psub.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV32-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to i64
+// RV32-NEXT:    ret i64 [[TMP3]]
+//
+// RV64-LABEL: @test_psub_v2i32(
+// RV64-NEXT:  entry:
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE:%.*]] to <2 x i32>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE:%.*]] to <2 x i32>
+// RV64-NEXT:    [[TMP2:%.*]] = tail call <2 x i32> @llvm.riscv.psub.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV64-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to i64
+// RV64-NEXT:    ret i64 [[TMP3]]
+//
+v2i32 test_psub_v2i32(v2i32 a, v2i32 b) {
+  return __builtin_riscv_psub_v2i32(a, b);
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsRISCV.td b/llvm/include/llvm/IR/IntrinsicsRISCV.td
index 9088e5e6a357b..c35e09e372e89 100644
--- a/llvm/include/llvm/IR/IntrinsicsRISCV.td
+++ b/llvm/include/llvm/IR/IntrinsicsRISCV.td
@@ -1978,6 +1978,18 @@ let TargetPrefix = "riscv" in {
   defm vfncvt_sat_f_f_q_alt : RISCVConversionRoundingMode;
 } // TargetPrefix = "riscv"
 
+//===----------------------------------------------------------------------===//
+// Packed SIMD (P) Extension
+
+let TargetPrefix = "riscv" in {
+  def int_riscv_padd : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+                                             [LLVMMatchType<0>, LLVMMatchType<0>],
+                                             [IntrNoMem]>;
+  def int_riscv_psub : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+                                             [LLVMMatchType<0>, LLVMMatchType<0>],
+                                             [IntrNoMem]>;
+}
+
 // Vendor extensions
 
 //===----------------------------------------------------------------------===//
 include "llvm/IR/IntrinsicsRISCVXTHead.td"
diff --git a/llvm/lib/Target/RISCV/RISCVCallingConv.cpp b/llvm/lib/Target/RISCV/RISCVCallingConv.cpp
index 78f47794a5b66..c8688d8aefaf3 100644
--- a/llvm/lib/Target/RISCV/RISCVCallingConv.cpp
+++ b/llvm/lib/Target/RISCV/RISCVCallingConv.cpp
@@ -545,37 +545,53 @@ bool llvm::CC_RISCV(unsigned ValNo, MVT ValVT, MVT LocVT,
   unsigned StoreSizeBytes = XLen / 8;
   Align StackAlign = Align(XLen / 8);
 
+  static const MCPhysReg ArgGPRPairs[] = {RISCV::X10_X11, RISCV::X12_X13,
+                                          RISCV::X14_X15, RISCV::X16_X17};
+
   if (ValVT.isVector() || ValVT.isRISCVVectorTuple()) {
-    Reg = allocateRVVReg(ValVT, ValNo, State, TLI);
-    if (Reg) {
-      // Fixed-length vectors are located in the corresponding scalable-vector
-      // container types.
-      if (ValVT.isFixedLengthVector()) {
-        LocVT = TLI.getContainerForFixedLengthVector(LocVT);
-        State.addLoc(
-            CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
-        return false;
-      }
-    } else {
-      // For return values, the vector must be passed fully via registers or
-      // via the stack.
-      // FIXME: The proposed vector ABI only mandates v8-v15 for return values,
-      // but we're using all of them.
-      if (IsRet)
-        return true;
-      // Try using a GPR to pass the address
-      if ((Reg = State.AllocateReg(ArgGPRs))) {
-        LocVT = XLenVT;
-        LocInfo = CCValAssign::Indirect;
-      } else if (ValVT.isScalableVector()) {
-        LocVT = XLenVT;
-        LocInfo = CCValAssign::Indirect;
+    bool IsPVectorInGPR = false;
+    if (Subtarget.enablePExtSIMDCodeGen() && ValVT.isVector()) {
+      const TargetRegisterClass *RC = TLI.getRegClassFor(ValVT);
+      if (RC == &RISCV::GPRRegClass || RC == &RISCV::GPRPairRegClass)
+        IsPVectorInGPR = true;
+    }
+
+    if (!IsPVectorInGPR) {
+      Reg = allocateRVVReg(ValVT, ValNo, State, TLI);
+      if (Reg) {
+        // Fixed-length vectors are located in the corresponding scalable-vector
+        // container types.
+        if (ValVT.isFixedLengthVector()) {
+          LocVT = TLI.getContainerForFixedLengthVector(LocVT);
+          State.addLoc(
+              CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+          return false;
+        }
       } else {
-        StoreSizeBytes = ValVT.getStoreSize();
-        // Align vectors to their element sizes, being careful for vXi1
-        // vectors.
-        StackAlign = MaybeAlign(ValVT.getScalarSizeInBits() / 8).valueOrOne();
+        // For return values, the vector must be passed fully via registers or
+        // via the stack.
+        // FIXME: The proposed vector ABI only mandates v8-v15 for return
+        // values, but we're using all of them.
+        if (IsRet)
+          return true;
+        // Try using a GPR to pass the address
+        if ((Reg = State.AllocateReg(ArgGPRs))) {
+          LocVT = XLenVT;
+          LocInfo = CCValAssign::Indirect;
+        } else if (ValVT.isScalableVector()) {
+          LocVT = XLenVT;
+          LocInfo = CCValAssign::Indirect;
+        } else {
+          StoreSizeBytes = ValVT.getStoreSize();
+          // Align vectors to their element sizes, being careful for vXi1
+          // vectors.
+          StackAlign = MaybeAlign(ValVT.getScalarSizeInBits() / 8).valueOrOne();
+        }
       }
+    } else if (XLen == 32 && ValVT.getSizeInBits() == 64) {
+      Reg = State.AllocateReg(ArgGPRPairs);
+    } else {
+      Reg = State.AllocateReg(ArgGPRs);
     }
   } else {
     Reg = State.AllocateReg(ArgGPRs);
@@ -604,7 +620,8 @@ bool llvm::CC_RISCV(unsigned ValNo, MVT ValVT, MVT LocVT,
 
   assert(((ValVT.isFloatingPoint() && !ValVT.isVector()) || LocVT == XLenVT ||
           (TLI.getSubtarget().hasVInstructions() &&
-           (ValVT.isVector() || ValVT.isRISCVVectorTuple()))) &&
+           (ValVT.isVector() || ValVT.isRISCVVectorTuple())) ||
+          (Subtarget.enablePExtSIMDCodeGen() && ValVT.isVector())) &&
          "Expected an XLenVT or vector types at this stage");
 
   if (Reg) {
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index c60f740d37576..d084c1cfdc8b4 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -293,9 +293,14 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
       addRegisterClass(MVT::v2i32, &RISCV::GPRRegClass);
       addRegisterClass(MVT::v4i16, &RISCV::GPRRegClass);
       addRegisterClass(MVT::v8i8, &RISCV::GPRRegClass);
+      addRegisterClass(MVT::v2i16, &RISCV::GPRRegClass);
+      addRegisterClass(MVT::v4i8, &RISCV::GPRRegClass);
     } else {
       addRegisterClass(MVT::v2i16, &RISCV::GPRRegClass);
       addRegisterClass(MVT::v4i8, &RISCV::GPRRegClass);
+      addRegisterClass(MVT::v8i8, &RISCV::GPRPairRegClass);
+      addRegisterClass(MVT::v4i16, &RISCV::GPRPairRegClass);
+      addRegisterClass(MVT::v2i32, &RISCV::GPRPairRegClass);
     }
   }
 
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
index 92a9c06fc534b..6ff47656d344e 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
@@ -1685,3 +1685,39 @@ let Predicates = [HasStdExtP, IsRV64] in {
   def : Pat<(v2i32 (build_vector (XLenVT GPR:$a), (XLenVT GPR:$b))),
             (PACK GPR:$a, GPR:$b)>;
 } // Predicates = [HasStdExtP, IsRV64]
+
+//===----------------------------------------------------------------------===//
+// P-extension Intrinsic patterns
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasStdExtP] in {
+  def : Pat<(XLenVecI8VT (int_riscv_padd (XLenVecI8VT GPR:$rs1), (XLenVecI8VT GPR:$rs2))), (PADD_B GPR:$rs1, GPR:$rs2)>;
+  def : Pat<(XLenVecI8VT (int_riscv_psub (XLenVecI8VT GPR:$rs1), (XLenVecI8VT GPR:$rs2))), (PSUB_B GPR:$rs1, GPR:$rs2)>;
+
+  def : Pat<(XLenVecI16VT (int_riscv_padd (XLenVecI16VT GPR:$rs1), (XLenVecI16VT GPR:$rs2))), (PADD_H GPR:$rs1, GPR:$rs2)>;
+  def : Pat<(XLenVecI16VT (int_riscv_psub (XLenVecI16VT GPR:$rs1), (XLenVecI16VT GPR:$rs2))), (PSUB_H GPR:$rs1, GPR:$rs2)>;
+}
+
+let Predicates = [HasStdExtP, IsRV64] in {
+  def : Pat<(v2i32 (int_riscv_padd (v2i32 GPR:$rs1), (v2i32 GPR:$rs2))), (PADD_W GPR:$rs1, GPR:$rs2)>;
+  def : Pat<(v2i32 (int_riscv_psub (v2i32 GPR:$rs1), (v2i32 GPR:$rs2))), (PSUB_W GPR:$rs1, GPR:$rs2)>;
+
+  // Sub-XLEN vectors on RV64
+  def : Pat<(v4i8 (int_riscv_padd (v4i8 GPR:$rs1), (v4i8 GPR:$rs2))), (PADD_B GPR:$rs1, GPR:$rs2)>;
+  def : Pat<(v4i8 (int_riscv_psub (v4i8 GPR:$rs1), (v4i8 GPR:$rs2))), (PSUB_B GPR:$rs1, GPR:$rs2)>;
+
+  def : Pat<(v2i16 (int_riscv_padd (v2i16 GPR:$rs1), (v2i16 GPR:$rs2))), (PADD_H GPR:$rs1, GPR:$rs2)>;
+  def : Pat<(v2i16 (int_riscv_psub (v2i16 GPR:$rs1), (v2i16 GPR:$rs2))), (PSUB_H GPR:$rs1, GPR:$rs2)>;
+}
+
+let Predicates = [HasStdExtP, IsRV32] in {
+  // 64-bit vectors (v8i8, v4i16, v2i32) using register pairs
+  def : Pat<(v8i8 (int_riscv_padd (v8i8 GPRPairRV32:$rs1), (v8i8 GPRPairRV32:$rs2))), (PADD_DB GPRPairRV32:$rs1, GPRPairRV32:$rs2)>;
+  def : Pat<(v8i8 (int_riscv_psub (v8i8 GPRPairRV32:$rs1), (v8i8 GPRPairRV32:$rs2))), (PSUB_DB GPRPairRV32:$rs1, GPRPairRV32:$rs2)>;
+
+  def : Pat<(v4i16 (int_riscv_padd (v4i16 GPRPairRV32:$rs1), (v4i16 GPRPairRV32:$rs2))), (PADD_DH GPRPairRV32:$rs1, GPRPairRV32:$rs2)>;
+  def : Pat<(v4i16 (int_riscv_psub (v4i16 GPRPairRV32:$rs1), (v4i16 GPRPairRV32:$rs2))), (PSUB_DH GPRPairRV32:$rs1, GPRPairRV32:$rs2)>;
+
+  def : Pat<(v2i32 (int_riscv_padd (v2i32 GPRPairRV32:$rs1), (v2i32 GPRPairRV32:$rs2))), (PADD_DW GPRPairRV32:$rs1, GPRPairRV32:$rs2)>;
+  def : Pat<(v2i32 (int_riscv_psub (v2i32 GPRPairRV32:$rs1), (v2i32 GPRPairRV32:$rs2))), (PSUB_DW GPRPairRV32:$rs1, GPRPairRV32:$rs2)>;
+}
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
index e3657badfa9a4..98eaa1420f338 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
@@ -230,6 +230,13 @@ def XLenVecI16VT : ValueTypeByHwMode<[RV32,  RV64],
                                      [v2i16, v4i16]>;
 def XLenVecI32VT : ValueTypeByHwMode<[RV64],
                                      [v2i32]>;
+
+def XLenPairVecI8VT : ValueTypeByHwMode<[RV32],
+                                        [v8i8]>;
+def XLenPairVecI16VT : ValueTypeByHwMode<[RV32],
+                                         [v4i16]>;
+def XLenPairVecI32VT : ValueTypeByHwMode<[RV32],
+                                         [v2i32]>;
 def XLenRI : RegInfoByHwMode<
       [RV32,              RV64],
       [RegInfo<32,32,32>, RegInfo<64,64,64>]>;
@@ -248,7 +255,8 @@ class RISCVRegisterClass<list<ValueType> regTypes, int align, dag regList>
 class GPRRegisterClass<dag regList>
     : RISCVRegisterClass<[XLenVT, XLenFVT,
                           // P extension packed vector types:
-                          XLenVecI8VT, XLenVecI16VT, XLenVecI32VT], 32, regList> {
+                          XLenVecI8VT, XLenVecI16VT, XLenVecI32VT,
+                          v4i8, v2i16], 32, regList> {
   let RegInfos = XLenRI;
 }
 
@@ -369,7 +377,9 @@ let RegAltNameIndices = [ABIRegAltName] in {
 }
 
 let RegInfos = XLenPairRI, CopyCost = 2 in {
-def GPRPair : RISCVRegisterClass<[XLenPairVT, XLenPairFVT], 64, (add
+def GPRPair : RISCVRegisterClass<[XLenPairVT, XLenPairFVT,
+                                XLenPairVecI8VT, XLenPairVecI16VT,
+                                XLenPairVecI32VT], 64, (add
     X10_X11, X12_X13, X14_X15, X16_X17,
     X6_X7,
     X28_X29, X30_X31,
diff --git a/llvm/test/CodeGen/RISCV/rvp-intrinsics.ll b/llvm/test/CodeGen/RISCV/rvp-intrinsics.ll
new file mode 100644
index 0000000000000..71d75de1a71d8
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvp-intrinsics.ll
@@ -0,0 +1,135 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+experimental-p -riscv-enable-p-ext-simd-codegen -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: llc -mtriple=riscv64 -mattr=+experimental-p -riscv-enable-p-ext-simd-codegen -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
+
+define <4 x i8> @test_padd_v4i8(<4 x i8> %a, <4 x i8> %b) {
+; CHECK-LABEL: test_padd_v4i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    padd.b a0, a0, a1
+; CHECK-NEXT:    ret
+  %res = call <4 x i8> @llvm.riscv.padd.v4i8(<4 x i8> %a, <4 x i8> %b)
+  ret <4 x i8> %res
+}
+
+define <2 x i16> @test_padd_v2i16(<2 x i16> %a, <2 x i16> %b) {
+; CHECK-LABEL: test_padd_v2i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    padd.h a0, a0, a1
+; CHECK-NEXT:    ret
+  %res = call <2 x i16> @llvm.riscv.padd.v2i16(<2 x i16> %a, <2 x i16> %b)
+  ret <2 x i16> %res
+}
+
+define <8 x i8> @test_padd_v8i8(<8 x i8> %a, <8 x i8> %b) {
+; RV32-LABEL: test_padd_v8i8:
+; RV32:       # %bb.0:
+; RV32-NEXT:    padd.db a0, a0, a2
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: test_padd_v8i8:
+; RV64:       # %bb.0:
+; RV64-NEXT:    padd.b a0, a0, a1
+; RV64-NEXT:    ret
+  %res = call <8 x i8> @llvm.riscv.padd.v8i8(<8 x i8> %a, <8 x i8> %b)
+  ret <8 x i8> %res
+}
+
+define <4 x i16> @test_padd_v4i16(<4 x i16> %a, <4 x i16> %b) {
+; RV32-LABEL: test_padd_v4i16:
+; RV32:       # %bb.0:
+; RV32-NEXT:    padd.dh a0, a0, a2
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: test_padd_v4i16:
+; RV64:       # %bb.0:
+; RV64-NEXT:    padd.h a0, a0, a1
+; RV64-NEXT:    ret
+  %res = call <4 x i16> @llvm.riscv.padd.v4i16(<4 x i16> %a, <4 x i16> %b)
+  ret <4 x i16> %res
+}
+
+define <2 x i32> @test_padd_v2i32(<2 x i32> %a, <2 x i32> %b) {
+; RV32-LABEL: test_padd_v2i32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    padd.dw a0, a0, a2
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: test_padd_v2i32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    padd.w a0, a0, a1
+; RV64-NEXT:    ret
+  %res = call <2 x i32> @llvm.riscv.padd.v2i32(<2 x i32> %a, <2 x i32> %b)
+  ret <2 x i32> %res
+}
+
+define <4 x i8> @test_psub_v4i8(<4 x i8> %a, <4 x i8> %b) {
+; CHECK-LABEL: test_psub_v4i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    psub.b a0, a0, a1
+; CHECK-NEXT:    ret
+  %res = call <4 x i8> @llvm.riscv.psub.v4i8(<4 x i8> %a, <4 x i8> %b)
+  ret <4 x i8> %res
+}
+
+define <2 x i16> @test_psub_v2i16(<2 x i16> %a, <2 x i16> %b) {
+; CHECK-LABEL: test_psub_v2i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    psub.h a0, a0, a1
+; CHECK-NEXT:    ret
+  %res = call <2 x i16> @llvm.riscv.psub.v2i16(<2 x i16> %a, <2 x i16> %b)
+  ret <2 x i16> %res
+}
+
+define <8 x i8> @test_psub_v8i8(<8 x i8> %a, <8 x i8> %b) {
+; RV32-LABEL: test_psub_v8i8:
+; RV32:       # %bb.0:
+; RV32-NEXT:    psub.db a0, a0, a2
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: test_psub_v8i8:
+; RV64:       # %bb.0:
+; RV64-NEXT:    psub.b a0, a0, a1
+; RV64-NEXT:    ret
+  %res = call <8 x i8> @llvm.riscv.psub.v8i8(<8 x i8> %a, <8 x i8> %b)
+  ret <8 x i8> %res
+}
+
+define <4 x i16> @test_psub_v4i16(<4 x i16> %a, <4 x i16> %b) {
+; RV32-LABEL: test_psub_v4i16:
+; RV32:       # %bb.0:
+; RV32-NEXT:    psub.dh a0, a0, a2
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: test_psub_v4i16:
+; RV64:       # %bb.0:
+; RV64-NEXT:    psub.h a0, a0, a1
+; RV64-NEXT:    ret
+  %res = call <4 x i16> @llvm.riscv.psub.v4i16(<4 x i16> %a, <4 x i16> %b)
+  ret <4 x i16> %res
+}
+
+define <2 x i32> @test_psub_v2i32(<2 x i32> %a, <2 x i32> %b) {
+; RV32-LABEL: test_psub_v2i32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    psub.dw a0, a0, a2
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: test_psub_v2i32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    psub.w a0, a0, a1
+; RV64-NEXT:    ret
+  %res = call <2 x i32> @llvm.riscv.psub.v2i32(<2 x i32> %a, <2 x i32> %b)
+  ret <2 x i32> %res
+}
+
+declare <4 x i8> @llvm.riscv.padd.v4i8(<4 x i8>, <4 x i8>)
+declare <2 x i16> @llvm.riscv.padd.v2i16(<2 x i16>, <2 x i16>)
+declare <8 x i8> @llvm.riscv.padd.v8i8(<8 x i8>, <8 x i8>)
+declare <4 x i16> @llvm.riscv.padd.v4i16(<4 x i16>, <4 x i16>)
+declare <2 x i32> @llvm.riscv.padd.v2i32(<2 x i32>, <2 x i32>)
+
+declare <4 x i8> @llvm.riscv.psub.v4i8(<4 x i8>, <4 x i8>)
+declare <2 x i16> @llvm.riscv.psub.v2i16(<2 x i16>, <2 x i16>)
+declare <8 x i8> @llvm.riscv.psub.v8i8(<8 x i8>, <8 x i8>)
+declare <4 x i16> @llvm.riscv.psub.v4i16(<4 x i16>, <4 x i16>)
+declare <2 x i32> @llvm.riscv.psub.v2i32(<2 x i32>, <2 x i32>)
