https://github.com/E00N777 updated https://github.com/llvm/llvm-project/pull/204088
>From d9703bac55a58e353aa111d84043a3f8dcb67142 Mon Sep 17 00:00:00 2001 From: E00N777 <[email protected]> Date: Tue, 16 Jun 2026 16:32:44 +0800 Subject: [PATCH] [CIR][AArch64] Lower NEON Widen && Widening subtraction intrinsics --- .../lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp | 13 +- clang/test/CodeGen/AArch64/neon-intrinsics.c | 230 ------------------ clang/test/CodeGen/AArch64/neon/subtraction.c | 200 ++++++++++++++- clang/test/CodeGen/AArch64/neon/widen.c | 98 ++++++++ 4 files changed, 309 insertions(+), 232 deletions(-) create mode 100644 clang/test/CodeGen/AArch64/neon/widen.c diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp index 84b9bb1007763..7d3ed537acdef 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp @@ -882,7 +882,18 @@ static mlir::Value emitCommonNeonBuiltinExpr( case NEON::BI__builtin_neon_vld3q_lane_v: case NEON::BI__builtin_neon_vld4_lane_v: case NEON::BI__builtin_neon_vld4q_lane_v: - case NEON::BI__builtin_neon_vmovl_v: + cgf.cgm.errorNYI(expr->getSourceRange(), + std::string("Reached code-path for ARM builtin call ") + + ctx.BuiltinInfo.getName(builtinID) + + "(ARM builtins are not supported ATM)"); + return mlir::Value{}; + case NEON::BI__builtin_neon_vmovl_v: { + cir::VectorType dTy = + cgf.getBuilder().getExtendedOrTruncatedElementVectorType( + ty, /*isExtended=*/false, !usgn); + ops[0] = cgf.getBuilder().createBitcast(loc, ops[0], dTy); + return cgf.getBuilder().createIntCast(ops[0], ty); + } case NEON::BI__builtin_neon_vmovn_v: case NEON::BI__builtin_neon_vmull_v: case NEON::BI__builtin_neon_vpadal_v: diff --git a/clang/test/CodeGen/AArch64/neon-intrinsics.c b/clang/test/CodeGen/AArch64/neon-intrinsics.c index bbcc8978804f7..9fde7f37fc192 100644 --- a/clang/test/CodeGen/AArch64/neon-intrinsics.c +++ b/clang/test/CodeGen/AArch64/neon-intrinsics.c @@ -5298,74 +5298,6 @@ uint64x2_t test_vshll_high_n_u32(uint32x4_t a) { return vshll_high_n_u32(a, 19); } -// CHECK-LABEL: define dso_local <8 x i16> @test_vmovl_s8( -// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[VMOVL_I:%.*]] = sext <8 x i8> [[A]] to <8 x i16> -// CHECK-NEXT: ret <8 x i16> [[VMOVL_I]] -// -int16x8_t test_vmovl_s8(int8x8_t a) { - return vmovl_s8(a); -} - -// CHECK-LABEL: define dso_local <4 x i32> @test_vmovl_s16( -// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK-NEXT: [[VMOVL_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> -// CHECK-NEXT: ret <4 x i32> [[VMOVL_I]] -// -int32x4_t test_vmovl_s16(int16x4_t a) { - return vmovl_s16(a); -} - -// CHECK-LABEL: define dso_local <2 x i64> @test_vmovl_s32( -// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK-NEXT: [[VMOVL_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> -// CHECK-NEXT: ret <2 x i64> [[VMOVL_I]] -// -int64x2_t test_vmovl_s32(int32x2_t a) { - return vmovl_s32(a); -} - -// CHECK-LABEL: define dso_local <8 x i16> @test_vmovl_u8( -// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <8 x i8> [[A]] to <8 x i16> -// CHECK-NEXT: ret <8 x i16> [[VMOVL_I]] -// -uint16x8_t test_vmovl_u8(uint8x8_t a) { - return vmovl_u8(a); -} - -// CHECK-LABEL: define dso_local <4 x i32> @test_vmovl_u16( -// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> -// CHECK-NEXT: ret <4 x i32> [[VMOVL_I]] -// -uint32x4_t test_vmovl_u16(uint16x4_t a) { - return vmovl_u16(a); -} - -// CHECK-LABEL: define dso_local <2 x i64> @test_vmovl_u32( -// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> -// CHECK-NEXT: ret <2 x i64> [[VMOVL_I]] -// -uint64x2_t test_vmovl_u32(uint32x2_t a) { - return vmovl_u32(a); -} - // CHECK-LABEL: define dso_local <8 x i16> @test_vmovl_high_s8( // CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] @@ -5782,94 +5714,6 @@ uint64x2_t test_vaddw_high_u32(uint64x2_t a, uint32x4_t b) { return vaddw_high_u32(a, b); } -// CHECK-LABEL: define dso_local <8 x i16> @test_vsubl_s8( -// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[VMOVL_I5_I:%.*]] = sext <8 x i8> [[A]] to <8 x i16> -// CHECK-NEXT: [[VMOVL_I_I:%.*]] = sext <8 x i8> [[B]] to <8 x i16> -// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[VMOVL_I5_I]], [[VMOVL_I_I]] -// CHECK-NEXT: ret <8 x i16> [[SUB_I]] -// -int16x8_t test_vsubl_s8(int8x8_t a, int8x8_t b) { - return vsubl_s8(a, b); -} - -// CHECK-LABEL: define dso_local <4 x i32> @test_vsubl_s16( -// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK-NEXT: [[VMOVL_I5_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> -// CHECK-NEXT: [[VMOVL_I_I:%.*]] = sext <4 x i16> [[TMP3]] to <4 x i32> -// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[VMOVL_I5_I]], [[VMOVL_I_I]] -// CHECK-NEXT: ret <4 x i32> [[SUB_I]] -// -int32x4_t test_vsubl_s16(int16x4_t a, int16x4_t b) { - return vsubl_s16(a, b); -} - -// CHECK-LABEL: define dso_local <2 x i64> @test_vsubl_s32( -// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK-NEXT: [[VMOVL_I5_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> -// CHECK-NEXT: [[VMOVL_I_I:%.*]] = sext <2 x i32> [[TMP3]] to <2 x i64> -// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[VMOVL_I5_I]], [[VMOVL_I_I]] -// CHECK-NEXT: ret <2 x i64> [[SUB_I]] -// -int64x2_t test_vsubl_s32(int32x2_t a, int32x2_t b) { - return vsubl_s32(a, b); -} - -// CHECK-LABEL: define dso_local <8 x i16> @test_vsubl_u8( -// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[VMOVL_I5_I:%.*]] = zext <8 x i8> [[A]] to <8 x i16> -// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <8 x i8> [[B]] to <8 x i16> -// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[VMOVL_I5_I]], [[VMOVL_I_I]] -// CHECK-NEXT: ret <8 x i16> [[SUB_I]] -// -uint16x8_t test_vsubl_u8(uint8x8_t a, uint8x8_t b) { - return vsubl_u8(a, b); -} - -// CHECK-LABEL: define dso_local <4 x i32> @test_vsubl_u16( -// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK-NEXT: [[VMOVL_I5_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> -// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32> -// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[VMOVL_I5_I]], [[VMOVL_I_I]] -// CHECK-NEXT: ret <4 x i32> [[SUB_I]] -// -uint32x4_t test_vsubl_u16(uint16x4_t a, uint16x4_t b) { - return vsubl_u16(a, b); -} - -// CHECK-LABEL: define dso_local <2 x i64> @test_vsubl_u32( -// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK-NEXT: [[VMOVL_I5_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> -// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64> -// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[VMOVL_I5_I]], [[VMOVL_I_I]] -// CHECK-NEXT: ret <2 x i64> [[SUB_I]] -// -uint64x2_t test_vsubl_u32(uint32x2_t a, uint32x2_t b) { - return vsubl_u32(a, b); -} - // CHECK-LABEL: define dso_local <8 x i16> @test_vsubl_high_s8( // CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] @@ -5970,80 +5814,6 @@ uint64x2_t test_vsubl_high_u32(uint32x4_t a, uint32x4_t b) { return vsubl_high_u32(a, b); } -// CHECK-LABEL: define dso_local <8 x i16> @test_vsubw_s8( -// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[VMOVL_I_I:%.*]] = sext <8 x i8> [[B]] to <8 x i16> -// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[A]], [[VMOVL_I_I]] -// CHECK-NEXT: ret <8 x i16> [[SUB_I]] -// -int16x8_t test_vsubw_s8(int16x8_t a, int8x8_t b) { - return vsubw_s8(a, b); -} - -// CHECK-LABEL: define dso_local <4 x i32> @test_vsubw_s16( -// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK-NEXT: [[VMOVL_I_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> -// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[VMOVL_I_I]] -// CHECK-NEXT: ret <4 x i32> [[SUB_I]] -// -int32x4_t test_vsubw_s16(int32x4_t a, int16x4_t b) { - return vsubw_s16(a, b); -} - -// CHECK-LABEL: define dso_local <2 x i64> @test_vsubw_s32( -// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK-NEXT: [[VMOVL_I_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> -// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A]], [[VMOVL_I_I]] -// CHECK-NEXT: ret <2 x i64> [[SUB_I]] -// -int64x2_t test_vsubw_s32(int64x2_t a, int32x2_t b) { - return vsubw_s32(a, b); -} - -// CHECK-LABEL: define dso_local <8 x i16> @test_vsubw_u8( -// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <8 x i8> [[B]] to <8 x i16> -// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[A]], [[VMOVL_I_I]] -// CHECK-NEXT: ret <8 x i16> [[SUB_I]] -// -uint16x8_t test_vsubw_u8(uint16x8_t a, uint8x8_t b) { - return vsubw_u8(a, b); -} - -// CHECK-LABEL: define dso_local <4 x i32> @test_vsubw_u16( -// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> -// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[VMOVL_I_I]] -// CHECK-NEXT: ret <4 x i32> [[SUB_I]] -// -uint32x4_t test_vsubw_u16(uint32x4_t a, uint16x4_t b) { - return vsubw_u16(a, b); -} - -// CHECK-LABEL: define dso_local <2 x i64> @test_vsubw_u32( -// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> -// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A]], [[VMOVL_I_I]] -// CHECK-NEXT: ret <2 x i64> [[SUB_I]] -// -uint64x2_t test_vsubw_u32(uint64x2_t a, uint32x2_t b) { - return vsubw_u32(a, b); -} - // CHECK-LABEL: define dso_local <8 x i16> @test_vsubw_high_s8( // CHECK-SAME: <8 x i16> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] diff --git a/clang/test/CodeGen/AArch64/neon/subtraction.c b/clang/test/CodeGen/AArch64/neon/subtraction.c index 8297e444b2bd9..7810445fa0094 100644 --- a/clang/test/CodeGen/AArch64/neon/subtraction.c +++ b/clang/test/CodeGen/AArch64/neon/subtraction.c @@ -7,7 +7,7 @@ //============================================================================= // NOTES // -// Tests for vector permutation intrinsics: Subtraction, Widening subtraction, Narrowing subtraction and Saturating subtract elements. +// Tests for vector subtraction intrinsics: Subtraction, Widening subtraction, Narrowing subtraction and Saturating subtract elements. // // ACLE section headings based on v2025Q2 of the ACLE specification: // * https://arm-software.github.io/acle/neon_intrinsics/advsimd.html#subtract @@ -264,3 +264,201 @@ uint64_t test_vsubd_u64(uint64_t a, uint64_t b) { // LLVM: ret i64 [[VSUBD_I]] return vsubd_u64(a, b); } + +//===------------------------------------------------------===// +// 2.1.1.5.3. Widening subtraction +// https://arm-software.github.io/acle/neon_intrinsics/advsimd.html#widening-subtraction +// TODO: Migrate the vsubl_high_* / vsubw_high_* intrinsics +//===------------------------------------------------------===// + +// LLVM-LABEL: @test_vsubl_s8( +// CIR-LABEL: @vsubl_s8( +int16x8_t test_vsubl_s8(int8x8_t a, int8x8_t b) { +// CIR: [[VMOVL0:%.*]] = cir.call @vmovl_s8({{.*}}) : {{.*}} -> !cir.vector<8 x !s16i> +// CIR: [[VMOVL1:%.*]] = cir.call @vmovl_s8({{.*}}) : {{.*}} -> !cir.vector<8 x !s16i> +// CIR: {{%.*}} = cir.sub [[VMOVL0]], [[VMOVL1]] : !cir.vector<8 x !s16i> + +// LLVM-SAME: <8 x i8> {{.*}} [[A:%.*]], <8 x i8> {{.*}} [[B:%.*]]) +// LLVM: [[VMOVL0:%.*]] = sext <8 x i8> [[A]] to <8 x i16> +// LLVM: [[VMOVL1:%.*]] = sext <8 x i8> [[B]] to <8 x i16> +// LLVM: [[SUB_I:%.*]] = sub <8 x i16> [[VMOVL0]], [[VMOVL1]] +// LLVM: ret <8 x i16> [[SUB_I]] + return vsubl_s8(a, b); +} + +// LLVM-LABEL: @test_vsubl_s16( +// CIR-LABEL: @vsubl_s16( +int32x4_t test_vsubl_s16(int16x4_t a, int16x4_t b) { +// CIR: [[VMOVL0:%.*]] = cir.call @vmovl_s16({{.*}}) : {{.*}} -> !cir.vector<4 x !s32i> +// CIR: [[VMOVL1:%.*]] = cir.call @vmovl_s16({{.*}}) : {{.*}} -> !cir.vector<4 x !s32i> +// CIR: {{%.*}} = cir.sub [[VMOVL0]], [[VMOVL1]] : !cir.vector<4 x !s32i> + +// LLVM-SAME: <4 x i16> {{.*}} [[A:%.*]], <4 x i16> {{.*}} [[B:%.*]]) +// LLVM: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// LLVM: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// LLVM: [[VMOVL0:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> +// LLVM: [[TMP2:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// LLVM: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// LLVM: [[VMOVL1:%.*]] = sext <4 x i16> [[TMP3]] to <4 x i32> +// LLVM: [[SUB_I:%.*]] = sub <4 x i32> [[VMOVL0]], [[VMOVL1]] +// LLVM: ret <4 x i32> [[SUB_I]] + return vsubl_s16(a, b); +} + +// LLVM-LABEL: @test_vsubl_s32( +// CIR-LABEL: @vsubl_s32( +int64x2_t test_vsubl_s32(int32x2_t a, int32x2_t b) { +// CIR: [[VMOVL0:%.*]] = cir.call @vmovl_s32({{.*}}) : {{.*}} -> !cir.vector<2 x !s64i> +// CIR: [[VMOVL1:%.*]] = cir.call @vmovl_s32({{.*}}) : {{.*}} -> !cir.vector<2 x !s64i> +// CIR: {{%.*}} = cir.sub [[VMOVL0]], [[VMOVL1]] : !cir.vector<2 x !s64i> + +// LLVM-SAME: <2 x i32> {{.*}} [[A:%.*]], <2 x i32> {{.*}} [[B:%.*]]) +// LLVM: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// LLVM: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// LLVM: [[VMOVL0:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> +// LLVM: [[TMP2:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// LLVM: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// LLVM: [[VMOVL1:%.*]] = sext <2 x i32> [[TMP3]] to <2 x i64> +// LLVM: [[SUB_I:%.*]] = sub <2 x i64> [[VMOVL0]], [[VMOVL1]] +// LLVM: ret <2 x i64> [[SUB_I]] + return vsubl_s32(a, b); +} + +// LLVM-LABEL: @test_vsubl_u8( +// CIR-LABEL: @vsubl_u8( +uint16x8_t test_vsubl_u8(uint8x8_t a, uint8x8_t b) { +// CIR: [[VMOVL0:%.*]] = cir.call @vmovl_u8({{.*}}) : {{.*}} -> !cir.vector<8 x !u16i> +// CIR: [[VMOVL1:%.*]] = cir.call @vmovl_u8({{.*}}) : {{.*}} -> !cir.vector<8 x !u16i> +// CIR: {{%.*}} = cir.sub [[VMOVL0]], [[VMOVL1]] : !cir.vector<8 x !u16i> + +// LLVM-SAME: <8 x i8> {{.*}} [[A:%.*]], <8 x i8> {{.*}} [[B:%.*]]) +// LLVM: [[VMOVL0:%.*]] = zext <8 x i8> [[A]] to <8 x i16> +// LLVM: [[VMOVL1:%.*]] = zext <8 x i8> [[B]] to <8 x i16> +// LLVM: [[SUB_I:%.*]] = sub <8 x i16> [[VMOVL0]], [[VMOVL1]] +// LLVM: ret <8 x i16> [[SUB_I]] + return vsubl_u8(a, b); +} + +// LLVM-LABEL: @test_vsubl_u16( +// CIR-LABEL: @vsubl_u16( +uint32x4_t test_vsubl_u16(uint16x4_t a, uint16x4_t b) { +// CIR: [[VMOVL0:%.*]] = cir.call @vmovl_u16({{.*}}) : {{.*}} -> !cir.vector<4 x !u32i> +// CIR: [[VMOVL1:%.*]] = cir.call @vmovl_u16({{.*}}) : {{.*}} -> !cir.vector<4 x !u32i> +// CIR: {{%.*}} = cir.sub [[VMOVL0]], [[VMOVL1]] : !cir.vector<4 x !u32i> + +// LLVM-SAME: <4 x i16> {{.*}} [[A:%.*]], <4 x i16> {{.*}} [[B:%.*]]) +// LLVM: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// LLVM: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// LLVM: [[VMOVL0:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> +// LLVM: [[TMP2:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// LLVM: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// LLVM: [[VMOVL1:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32> +// LLVM: [[SUB_I:%.*]] = sub <4 x i32> [[VMOVL0]], [[VMOVL1]] +// LLVM: ret <4 x i32> [[SUB_I]] + return vsubl_u16(a, b); +} + +// LLVM-LABEL: @test_vsubl_u32( +// CIR-LABEL: @vsubl_u32( +uint64x2_t test_vsubl_u32(uint32x2_t a, uint32x2_t b) { +// CIR: [[VMOVL0:%.*]] = cir.call @vmovl_u32({{.*}}) : {{.*}} -> !cir.vector<2 x !u64i> +// CIR: [[VMOVL1:%.*]] = cir.call @vmovl_u32({{.*}}) : {{.*}} -> !cir.vector<2 x !u64i> +// CIR: {{%.*}} = cir.sub [[VMOVL0]], [[VMOVL1]] : !cir.vector<2 x !u64i> + +// LLVM-SAME: <2 x i32> {{.*}} [[A:%.*]], <2 x i32> {{.*}} [[B:%.*]]) +// LLVM: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// LLVM: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// LLVM: [[VMOVL0:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> +// LLVM: [[TMP2:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// LLVM: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// LLVM: [[VMOVL1:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64> +// LLVM: [[SUB_I:%.*]] = sub <2 x i64> [[VMOVL0]], [[VMOVL1]] +// LLVM: ret <2 x i64> [[SUB_I]] + return vsubl_u32(a, b); +} + +// LLVM-LABEL: @test_vsubw_s8( +// CIR-LABEL: @vsubw_s8( +int16x8_t test_vsubw_s8(int16x8_t a, int8x8_t b) { +// CIR: [[VMOVL_I:%.*]] = cir.call @vmovl_s8({{.*}}) : {{.*}} -> !cir.vector<8 x !s16i> +// CIR: {{%.*}} = cir.sub {{%.*}}, [[VMOVL_I]] : !cir.vector<8 x !s16i> + +// LLVM-SAME: <8 x i16> {{.*}} [[A:%.*]], <8 x i8> {{.*}} [[B:%.*]]) +// LLVM: [[VMOVL_I:%.*]] = sext <8 x i8> [[B]] to <8 x i16> +// LLVM: [[SUB_I:%.*]] = sub <8 x i16> [[A]], [[VMOVL_I]] +// LLVM: ret <8 x i16> [[SUB_I]] + return vsubw_s8(a, b); +} + +// LLVM-LABEL: @test_vsubw_s16( +// CIR-LABEL: @vsubw_s16( +int32x4_t test_vsubw_s16(int32x4_t a, int16x4_t b) { +// CIR: [[VMOVL_I:%.*]] = cir.call @vmovl_s16({{.*}}) : {{.*}} -> !cir.vector<4 x !s32i> +// CIR: {{%.*}} = cir.sub {{%.*}}, [[VMOVL_I]] : !cir.vector<4 x !s32i> + +// LLVM-SAME: <4 x i32> {{.*}} [[A:%.*]], <4 x i16> {{.*}} [[B:%.*]]) +// LLVM: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// LLVM: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// LLVM: [[VMOVL_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> +// LLVM: [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[VMOVL_I]] +// LLVM: ret <4 x i32> [[SUB_I]] + return vsubw_s16(a, b); +} + +// LLVM-LABEL: @test_vsubw_s32( +// CIR-LABEL: @vsubw_s32( +int64x2_t test_vsubw_s32(int64x2_t a, int32x2_t b) { +// CIR: [[VMOVL_I:%.*]] = cir.call @vmovl_s32({{.*}}) : {{.*}} -> !cir.vector<2 x !s64i> +// CIR: {{%.*}} = cir.sub {{%.*}}, [[VMOVL_I]] : !cir.vector<2 x !s64i> + +// LLVM-SAME: <2 x i64> {{.*}} [[A:%.*]], <2 x i32> {{.*}} [[B:%.*]]) +// LLVM: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// LLVM: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// LLVM: [[VMOVL_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> +// LLVM: [[SUB_I:%.*]] = sub <2 x i64> [[A]], [[VMOVL_I]] +// LLVM: ret <2 x i64> [[SUB_I]] + return vsubw_s32(a, b); +} + +// LLVM-LABEL: @test_vsubw_u8( +// CIR-LABEL: @vsubw_u8( +uint16x8_t test_vsubw_u8(uint16x8_t a, uint8x8_t b) { +// CIR: [[VMOVL_I:%.*]] = cir.call @vmovl_u8({{.*}}) : {{.*}} -> !cir.vector<8 x !u16i> +// CIR: {{%.*}} = cir.sub {{%.*}}, [[VMOVL_I]] : !cir.vector<8 x !u16i> + +// LLVM-SAME: <8 x i16> {{.*}} [[A:%.*]], <8 x i8> {{.*}} [[B:%.*]]) +// LLVM: [[VMOVL_I:%.*]] = zext <8 x i8> [[B]] to <8 x i16> +// LLVM: [[SUB_I:%.*]] = sub <8 x i16> [[A]], [[VMOVL_I]] +// LLVM: ret <8 x i16> [[SUB_I]] + return vsubw_u8(a, b); +} + +// LLVM-LABEL: @test_vsubw_u16( +// CIR-LABEL: @vsubw_u16( +uint32x4_t test_vsubw_u16(uint32x4_t a, uint16x4_t b) { +// CIR: [[VMOVL_I:%.*]] = cir.call @vmovl_u16({{.*}}) : {{.*}} -> !cir.vector<4 x !u32i> +// CIR: {{%.*}} = cir.sub {{%.*}}, [[VMOVL_I]] : !cir.vector<4 x !u32i> + +// LLVM-SAME: <4 x i32> {{.*}} [[A:%.*]], <4 x i16> {{.*}} [[B:%.*]]) +// LLVM: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// LLVM: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// LLVM: [[VMOVL_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> +// LLVM: [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[VMOVL_I]] +// LLVM: ret <4 x i32> [[SUB_I]] + return vsubw_u16(a, b); +} + +// LLVM-LABEL: @test_vsubw_u32( +// CIR-LABEL: @vsubw_u32( +uint64x2_t test_vsubw_u32(uint64x2_t a, uint32x2_t b) { +// CIR: [[VMOVL_I:%.*]] = cir.call @vmovl_u32({{.*}}) : {{.*}} -> !cir.vector<2 x !u64i> +// CIR: {{%.*}} = cir.sub {{%.*}}, [[VMOVL_I]] : !cir.vector<2 x !u64i> + +// LLVM-SAME: <2 x i64> {{.*}} [[A:%.*]], <2 x i32> {{.*}} [[B:%.*]]) +// LLVM: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// LLVM: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// LLVM: [[VMOVL_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> +// LLVM: [[SUB_I:%.*]] = sub <2 x i64> [[A]], [[VMOVL_I]] +// LLVM: ret <2 x i64> [[SUB_I]] + return vsubw_u32(a, b); +} diff --git a/clang/test/CodeGen/AArch64/neon/widen.c b/clang/test/CodeGen/AArch64/neon/widen.c new file mode 100644 index 0000000000000..327506a771328 --- /dev/null +++ b/clang/test/CodeGen/AArch64/neon/widen.c @@ -0,0 +1,98 @@ +// REQUIRES: aarch64-registered-target || arm-registered-target + +// RUN: %clang_cc1_cg_arm64_neon -emit-llvm %s -disable-O0-optnone | opt -S -passes=mem2reg,sroa | FileCheck %s --check-prefixes=LLVM +// RUN: %if cir-enabled %{%clang_cc1_cg_arm64_neon -fclangir -emit-llvm %s -disable-O0-optnone | opt -S -passes=mem2reg,sroa | FileCheck %s --check-prefixes=LLVM %} +// RUN: %if cir-enabled %{%clang_cc1_cg_arm64_neon -fclangir -emit-cir %s -disable-O0-optnone | FileCheck %s --check-prefixes=CIR %} + +//============================================================================= +// NOTES +// +// Tests for vector Widen intrinsics +// +// ACLE section headings based on v2025Q2 of the ACLE specification: +// * https://arm-software.github.io/acle/neon_intrinsics/advsimd.html#widen +// +// TODO: Migrate the vmovl_high_* intrinsics, which depend on 'Vector shift left and widen' that has not yet been implemented. +// +//============================================================================= + +#include <arm_neon.h> + +//===------------------------------------------------------===// +// 5.1.5.2. Widen +// https://arm-software.github.io/acle/neon_intrinsics/advsimd.html#widen +//===------------------------------------------------------===// + +// LLVM-LABEL: @test_vmovl_s8( +// CIR-LABEL: @vmovl_s8( +int16x8_t test_vmovl_s8(int8x8_t a) { +// CIR: [[VMOVL_I:%.*]] = cir.cast integral {{%.*}} : !cir.vector<8 x !s8i> -> !cir.vector<8 x !s16i> + +// LLVM-SAME: <8 x i8> {{.*}} [[A:%.*]]) +// LLVM: [[VMOVL_I:%.*]] = sext <8 x i8> [[A]] to <8 x i16> +// LLVM: ret <8 x i16> [[VMOVL_I]] + return vmovl_s8(a); +} + +// LLVM-LABEL: @test_vmovl_s16( +// CIR-LABEL: @vmovl_s16( +int32x4_t test_vmovl_s16(int16x4_t a) { +// CIR: [[VMOVL_I:%.*]] = cir.cast integral {{%.*}} : !cir.vector<4 x !s16i> -> !cir.vector<4 x !s32i> + +// LLVM-SAME: <4 x i16> {{.*}} [[A:%.*]]) +// LLVM: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// LLVM: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// LLVM: [[VMOVL_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> +// LLVM: ret <4 x i32> [[VMOVL_I]] + return vmovl_s16(a); +} + +// LLVM-LABEL: @test_vmovl_s32( +// CIR-LABEL: @vmovl_s32( +int64x2_t test_vmovl_s32(int32x2_t a) { +// CIR: [[VMOVL_I:%.*]] = cir.cast integral {{%.*}} : !cir.vector<2 x !s32i> -> !cir.vector<2 x !s64i> + +// LLVM-SAME: <2 x i32> {{.*}} [[A:%.*]]) +// LLVM: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// LLVM: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// LLVM: [[VMOVL_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> +// LLVM: ret <2 x i64> [[VMOVL_I]] + return vmovl_s32(a); +} + +// LLVM-LABEL: @test_vmovl_u8( +// CIR-LABEL: @vmovl_u8( +uint16x8_t test_vmovl_u8(uint8x8_t a) { +// CIR: [[VMOVL_I:%.*]] = cir.cast integral {{%.*}} : !cir.vector<8 x !u8i> -> !cir.vector<8 x !u16i> + +// LLVM-SAME: <8 x i8> {{.*}} [[A:%.*]]) +// LLVM: [[VMOVL_I:%.*]] = zext <8 x i8> [[A]] to <8 x i16> +// LLVM: ret <8 x i16> [[VMOVL_I]] + return vmovl_u8(a); +} + +// LLVM-LABEL: @test_vmovl_u16( +// CIR-LABEL: @vmovl_u16( +uint32x4_t test_vmovl_u16(uint16x4_t a) { +// CIR: [[VMOVL_I:%.*]] = cir.cast integral {{%.*}} : !cir.vector<4 x !u16i> -> !cir.vector<4 x !u32i> + +// LLVM-SAME: <4 x i16> {{.*}} [[A:%.*]]) +// LLVM: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// LLVM: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// LLVM: [[VMOVL_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> +// LLVM: ret <4 x i32> [[VMOVL_I]] + return vmovl_u16(a); +} + +// LLVM-LABEL: @test_vmovl_u32( +// CIR-LABEL: @vmovl_u32( +uint64x2_t test_vmovl_u32(uint32x2_t a) { +// CIR: [[VMOVL_I:%.*]] = cir.cast integral {{%.*}} : !cir.vector<2 x !u32i> -> !cir.vector<2 x !u64i> + +// LLVM-SAME: <2 x i32> {{.*}} [[A:%.*]]) +// LLVM: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// LLVM: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// LLVM: [[VMOVL_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> +// LLVM: ret <2 x i64> [[VMOVL_I]] + return vmovl_u32(a); +} _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
