[clang] [CIR] Vector saturating rounding shift right and narrow intrinsics (PR #198947)

Kartik Ohlan via cfe-commits Mon, 25 May 2026 12:17:08 -0700

https://github.com/Ko496-glitch updated https://github.com/llvm/llvm-project/pull/198947


>From fbc5a8c1420b6e2975bb78959b625863599b9ff0 Mon Sep 17 00:00:00 2001
From: Kartik Ohlan <[email protected]>
Date: Wed, 20 May 2026 20:12:24 -0400
Subject: [PATCH 1/8] added test cases

---
 clang/test/CodeGen/AArch64/neon/intrinsics.c | 261 +++++++++++++++++++
 1 file changed, 261 insertions(+)

diff --git a/clang/test/CodeGen/AArch64/neon/intrinsics.c b/clang/test/CodeGen/AArch64/neon/intrinsics.c
index bf14d1abc9d8e..db813c4482ac0 100644
--- a/clang/test/CodeGen/AArch64/neon/intrinsics.c
+++ b/clang/test/CodeGen/AArch64/neon/intrinsics.c
@@ -24,6 +24,267 @@
 
 #include <arm_neon.h>
 
+//===------------------------------------------------------===//
+// 2.1.3.2.7  Vector saturating rounding shift right and narrow
+// 
+//===------------------------------------------------------===//
+
+// ALL-LABEL: @test_vqrshrun_n_s16(
+uint8x8_t test_vqrshrun_n_s16(int16x8_t a) {
+  // CIR:   {{%.*}} = cir.call_llvm_intrinsic "aarch64.neon.sqrshrun" {{%.*}}
+
+  // LLVM-SAME: <8 x i16> noundef [[A:%.*]])
+  // LLVM:      {{.*}} = bitcast <8 x i16> [[A]] to <16 x i8>
+  // LLVM:      [[TMP:%.*]] = bitcast <16 x i8> {{.*}} to <8 x i16>
+  // LLVM:      [[VQRSHRUN_N1:%.*]] = call <8 x i8> 
@llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> [[TMP]], i32 3)
+  // LLVM:      ret <8 x i8> [[VQRSHRUN_N1]]
+  return vqrshrun_n_s16(a, 3);
+}
+
+// ALL-LABEL: @test_vqrshrun_n_s32(
+uint16x4_t test_vqrshrun_n_s32(int32x4_t a) {
+  // CIR:   {{%.*}} = cir.call_llvm_intrinsic "aarch64.neon.sqrshrun" {{%.*}}
+
+  // LLVM-SAME: <4 x i32> {{.*}}[[A:%.*]])
+  // LLVM:      [[BC1:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+  // LLVM:      [[BC2:%.*]] = bitcast <16 x i8> [[BC1]] to <4 x i32>
+  // LLVM:      [[VQRSHRUN_N1:%.*]] = call <4 x i16> 
@llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> [[BC2]], i32 9)
+  // LLVM:      ret <4 x i16> [[VQRSHRUN_N1]]
+  return vqrshrun_n_s32(a, 9);
+}
+
+// ALL-LABEL: @test_vqrshrun_n_s64(
+uint32x2_t test_vqrshrun_n_s64(int64x2_t a) {
+  // CIR:   {{%.*}} = cir.call_llvm_intrinsic "aarch64.neon.sqrshrun" {{%.*}}
+
+  // LLVM-SAME: <2 x i64> {{.*}}[[A:%.*]])
+  // LLVM:      [[BC1:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+  // LLVM:      [[BC2:%.*]] = bitcast <16 x i8> [[BC1]] to <2 x i64>
+  // LLVM:      [[VQRSHRUN_N1:%.*]] = call <2 x i32> 
@llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64> [[BC2]], i32 19)
+  // LLVM:      ret <2 x i32> [[VQRSHRUN_N1]]
+  return vqrshrun_n_s64(a, 19);
+}
+
+// ALL-LABEL: @test_vqrshrund_n_s64(
+uint32_t test_vqrshrund_n_s64(int64_t a) {
+  // CIR:   {{%.*}} = cir.call_llvm_intrinsic "aarch64.neon.sqrshrun" {{%.*}}
+  // 
+  // 
+  // LLVM-SAME: i64 {{.*}}[[A:%.*]])
+  // LLVM:   [[VQRSHRUND_N_S64:%.*]] = call i32 
@llvm.aarch64.neon.sqrshrun.i32(i64 [[A]], i32 32)
+  // LLVM:  ret i32 [[VQRSHRUND_N_S64]]
+  return (uint32_t)vqrshrund_n_s64(a, 32);
+}
+
+// ALL-LABEL: @test_vqrshrn_n_s16(
+int8x8_t test_vqrshrn_n_s16(int16x8_t a) {
+  // CIR: cir.call_llvm_intrinsic "aarch64.neon.sqrshrn" {{%.*}}
+
+  // LLVM-SAME: <8 x i16> {{.*}}[[A:%.*]])
+  // LLVM:      [[BC1:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+  // LLVM:      [[BC2:%.*]] = bitcast <16 x i8> [[BC1]] to <8 x i16>
+  // LLVM:      [[VQRSHRN_N1:%.*]] = call <8 x i8> 
@llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16> [[BC2]], i32 3)
+  // LLVM:      ret <8 x i8> [[VQRSHRN_N1]]
+  return vqrshrn_n_s16(a, 3);
+}
+
+// ALL-LABEL: @test_vqrshrn_n_s32(
+int16x4_t test_vqrshrn_n_s32(int32x4_t a) {
+  // CIR: cir.call_llvm_intrinsic "aarch64.neon.sqrshrn" {{%.*}}
+
+  // LLVM-SAME: <4 x i32> {{.*}}[[A:%.*]])
+  // LLVM:      [[BC1:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+  // LLVM:      [[BC2:%.*]] = bitcast <16 x i8> [[BC1]] to <4 x i32>
+  // LLVM:      [[VQRSHRN_N1:%.*]] = call <4 x i16> 
@llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32> [[BC2]], i32 9)
+  // LLVM:      ret <4 x i16> [[VQRSHRN_N1]]
+  return vqrshrn_n_s32(a, 9);
+}
+
+// ALL-LABEL: @test_vqrshrn_n_s64(
+int32x2_t test_vqrshrn_n_s64(int64x2_t a) {
+  // CIR: cir.call_llvm_intrinsic "aarch64.neon.sqrshrn" {{%.*}}
+
+  // LLVM-SAME: <2 x i64> {{.*}}[[A:%.*]])
+  // LLVM:      [[BC1:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+  // LLVM:      [[BC2:%.*]] = bitcast <16 x i8> [[BC1]] to <2 x i64>
+  // LLVM:      [[VQRSHRN_N1:%.*]] = call <2 x i32> 
@llvm.aarch64.neon.sqrshrn.v2i32(<2 x i64> [[BC2]], i32 19)
+  // LLVM:      ret <2 x i32> [[VQRSHRN_N1]]
+  return vqrshrn_n_s64(a, 19);
+}
+
+// ALL-LABEL: @test_vqrshrun_high_n_s16(
+uint8x16_t test_vqrshrun_high_n_s16(uint8x8_t a, int16x8_t b) {
+  // CIR: cir.call_llvm_intrinsic "aarch64.neon.sqrshrun" {{%.*}}
+
+  // LLVM-SAME: <8 x i8> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]])
+  // LLVM:      [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+  // LLVM:      [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+  // LLVM:      [[VQRSHRUN_N3:%.*]] = call <8 x i8> 
@llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> [[VQRSHRUN_N]], i32 3)
+  // LLVM:      [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> 
[[VQRSHRUN_N3]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, 
i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  // LLVM:      ret <16 x i8> [[SHUFFLE_I]]
+  return vqrshrun_high_n_s16(a, b, 3);
+}
+
+// ALL-LABEL: @test_vqrshrun_high_n_s32(
+uint16x8_t test_vqrshrun_high_n_s32(uint16x4_t a, int32x4_t b) {
+  // CIR: cir.call_llvm_intrinsic "aarch64.neon.sqrshrun" {{%.*}}
+  
+  // LLVM-SAME: <4 x i16> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]])
+  // LLVM:      [[TMP0:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+  // LLVM:      [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+  // LLVM:      [[VQRSHRUN_N3:%.*]] = call <4 x i16> 
@llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> [[VQRSHRUN_N]], i32 9)
+  // LLVM:      [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> 
[[VQRSHRUN_N3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, 
i32 7>
+  // LLVM:      ret <8 x i16> [[SHUFFLE_I]]
+  return vqrshrun_high_n_s32(a, b, 9);
+}
+
+// ALL-LABEL: @test_vqrshrun_high_n_s64(
+uint32x4_t test_vqrshrun_high_n_s64(uint32x2_t a, int64x2_t b) {
+  // CIR:   {{%.*}} = cir.call_llvm_intrinsic "aarch64.neon.sqrshrun" {{%.*}}
+
+  // LLVM-SAME: <2 x i32> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]])
+  // LLVM:      [[TMP0:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+  // LLVM:      [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+  // LLVM:      [[VQRSHRUN_N3:%.*]] = call <2 x i32> 
@llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64> [[VQRSHRUN_N]], i32 19)
+  // LLVM:      [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> 
[[VQRSHRUN_N3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  // LLVM:      ret <4 x i32> [[SHUFFLE_I]]
+  return vqrshrun_high_n_s64(a, b, 19);
+}
+
+// ALL-LABEL: @test_vqrshrn_n_u16(
+uint8x8_t test_vqrshrn_n_u16(uint16x8_t a) {
+  // CIR: cir.call_llvm_intrinsic "aarch64.neon.uqrshrn" {{%.*}}
+
+  // LLVM-SAME: <8 x i16> {{.*}}[[A:%.*]])
+  // LLVM:      [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+  // LLVM:      [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+  // LLVM:      [[VQRSHRN_N1:%.*]] = call <8 x i8> 
@llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16> [[VQRSHRN_N]], i32 3)
+  // LLVM:      ret <8 x i8> [[VQRSHRN_N1]]
+  return vqrshrn_n_u16(a, 3);
+}
+
+// ALL-LABEL: @test_vqrshrn_n_u32(
+uint16x4_t test_vqrshrn_n_u32(uint32x4_t a) {
+  // CIR: cir.call_llvm_intrinsic "aarch64.neon.uqrshrn" {{%.*}}
+
+  // LLVM-SAME: <4 x i32> {{.*}}[[A:%.*]])
+  // LLVM:      [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+  // LLVM:      [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+  // LLVM:      [[VQRSHRN_N1:%.*]] = call <4 x i16> 
@llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32> [[VQRSHRN_N]], i32 9)
+  // LLVM:      ret <4 x i16> [[VQRSHRN_N1]]
+  return vqrshrn_n_u32(a, 9);
+}
+
+// ALL-LABEL: @test_vqrshrn_n_u64(
+uint32x2_t test_vqrshrn_n_u64(uint64x2_t a) {
+  // CIR: cir.call_llvm_intrinsic "aarch64.neon.uqrshrn" {{%.*}}
+
+  // LLVM-SAME: <2 x i64> {{.*}}[[A:%.*]])
+  // LLVM:      [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+  // LLVM:      [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+  // LLVM:      [[VQRSHRN_N1:%.*]] = call <2 x i32> 
@llvm.aarch64.neon.uqrshrn.v2i32(<2 x i64> [[VQRSHRN_N]], i32 19)
+  // LLVM:      ret <2 x i32> [[VQRSHRN_N1]]
+  return vqrshrn_n_u64(a, 19);
+}
+
+// ALL-LABEL: @test_vqrshrnd_n_s64(
+int32_t test_vqrshrnd_n_s64(int64_t a) {
+  // CIR:   {{%.*}} = cir.call_llvm_intrinsic "aarch64.neon.sqrshrn" {{%.*}}
+
+  // LLVM-SAME: i64 noundef [[A:%.*]])
+  // LLVM:      [[VQRSHRND_N_S64:%.*]] = call i32 
@llvm.aarch64.neon.sqrshrn.i32(i64 [[A]], i32 32)
+  // LLVM:      ret i32 [[VQRSHRND_N_S64]]
+  return (int32_t)vqrshrnd_n_s64(a, 32);
+}
+
+// ALL-LABEL: @test_vqrshrnd_n_u64(
+uint32_t test_vqrshrnd_n_u64(uint64_t a) {
+  // CIR:   {{%.*}} = cir.call_llvm_intrinsic "aarch64.neon.uqrshrn" {{%.*}}
+
+  // LLVM-SAME: i64 noundef [[A:%.*]])
+  // LLVM:      [[VQRSHRND_N_U64:%.*]] = call i32 
@llvm.aarch64.neon.uqrshrn.i32(i64 [[A]], i32 32)
+  // LLVM:      ret i32 [[VQRSHRND_N_U64]]
+  return (uint32_t)vqrshrnd_n_u64(a, 32);
+}
+
+// ALL-LABEL: @test_vqrshrn_high_n_s16(
+int8x16_t test_vqrshrn_high_n_s16(int8x8_t a, int16x8_t b) {
+  // CIR:   {{%.*}} = cir.call_llvm_intrinsic "aarch64.neon.sqrshrn" {{%.*}}
+
+  // LLVM-SAME: <8 x i8> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]])
+  // LLVM:      [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+  // LLVM:      [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+  // LLVM:      [[VQRSHRN_N3:%.*]] = call <8 x i8> 
@llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16> [[VQRSHRN_N]], i32 3)
+  // LLVM:      [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> 
[[VQRSHRN_N3]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, 
i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  // LLVM:      ret <16 x i8> [[SHUFFLE_I]]
+  return vqrshrn_high_n_s16(a, b, 3);
+}
+
+// ALL-LABEL: @test_vqrshrn_high_n_s32(
+int16x8_t test_vqrshrn_high_n_s32(int16x4_t a, int32x4_t b) {
+  // CIR:   {{%.*}} = cir.call_llvm_intrinsic "aarch64.neon.sqrshrn" {{%.*}}
+
+  // LLVM-SAME: <4 x i16> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]])
+  // LLVM:      [[TMP0:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+  // LLVM:      [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+  // LLVM:      [[VQRSHRN_N3:%.*]] = call <4 x i16> 
@llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32> [[VQRSHRN_N]], i32 9)
+  // LLVM:      [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> 
[[VQRSHRN_N3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 
7>
+  // LLVM:      ret <8 x i16> [[SHUFFLE_I]]
+  return vqrshrn_high_n_s32(a, b, 9);
+}
+
+// ALL-LABEL: @test_vqrshrn_high_n_s64(
+int32x4_t test_vqrshrn_high_n_s64(int32x2_t a, int64x2_t b) {
+  // CIR:   {{%.*}} = cir.call_llvm_intrinsic "aarch64.neon.sqrshrn" {{%.*}}
+
+  // LLVM-SAME: <2 x i32> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]])
+  // LLVM:      [[TMP0:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+  // LLVM:      [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+  // LLVM:      [[VQRSHRN_N3:%.*]] = call <2 x i32> 
@llvm.aarch64.neon.sqrshrn.v2i32(<2 x i64> [[VQRSHRN_N]], i32 19)
+  // LLVM:      [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> 
[[VQRSHRN_N3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  // LLVM:      ret <4 x i32> [[SHUFFLE_I]]
+  return vqrshrn_high_n_s64(a, b, 19);
+}
+
+// ALL-LABEL: @test_vqrshrn_high_n_u16(
+uint8x16_t test_vqrshrn_high_n_u16(uint8x8_t a, uint16x8_t b) {
+  // CIR:   {{%.*}} = cir.call_llvm_intrinsic "aarch64.neon.uqrshrn" {{%.*}}
+
+  // LLVM-SAME: <8 x i8> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]])
+  // LLVM:      [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+  // LLVM:      [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+  // LLVM:      [[VQRSHRN_N3:%.*]] = call <8 x i8> 
@llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16> [[VQRSHRN_N]], i32 3)
+  // LLVM:      [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> 
[[VQRSHRN_N3]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, 
i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  // LLVM:      ret <16 x i8> [[SHUFFLE_I]]
+  return vqrshrn_high_n_u16(a, b, 3);
+}
+
+// ALL-LABEL: @test_vqrshrn_high_n_u32(
+uint16x8_t test_vqrshrn_high_n_u32(uint16x4_t a, uint32x4_t b) {
+  // CIR:   {{%.*}} = cir.call_llvm_intrinsic "aarch64.neon.uqrshrn" {{%.*}}
+
+  // LLVM-SAME: <4 x i16> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]])
+  // LLVM:      [[TMP0:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+  // LLVM:      [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+  // LLVM:      [[VQRSHRN_N3:%.*]] = call <4 x i16> 
@llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32> [[VQRSHRN_N]], i32 9)
+  // LLVM:      [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> 
[[VQRSHRN_N3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 
7>
+  // LLVM:      ret <8 x i16> [[SHUFFLE_I]]
+  return vqrshrn_high_n_u32(a, b, 9);
+}
+
+// ALL-LABEL: @test_vqrshrn_high_n_u64(
+uint32x4_t test_vqrshrn_high_n_u64(uint32x2_t a, uint64x2_t b) {
+  // CIR:   {{%.*}} = cir.call_llvm_intrinsic "aarch64.neon.uqrshrn" {{%.*}}
+
+  // LLVM-SAME: <2 x i32> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]])
+  // LLVM:      [[TMP0:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+  // LLVM:      [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+  // LLVM:      [[VQRSHRN_N3:%.*]] = call <2 x i32> 
@llvm.aarch64.neon.uqrshrn.v2i32(<2 x i64> [[VQRSHRN_N]], i32 19)
+  // LLVM:      [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> 
[[VQRSHRN_N3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  // LLVM:      ret <4 x i32> [[SHUFFLE_I]]
+  return vqrshrn_high_n_u64(a, b, 19);
+}
+
 // LLVM-LABEL: @test_vnegd_s64
 // CIR-LABEL: @vnegd_s64
 int64_t test_vnegd_s64(int64_t a) {

>From 4e668c2e45ce7a1012bd236836404d1cd24783e8 Mon Sep 17 00:00:00 2001
From: Kartik Ohlan <[email protected]>
Date: Wed, 20 May 2026 20:40:30 -0400
Subject: [PATCH 2/8] added implementation

---
 .../lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp  | 28 +++++++++++++++++--
 1 file changed, 25 insertions(+), 3 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
index 294e87168b0e5..6dc9a4f91a50e 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
@@ -317,6 +317,9 @@ static mlir::Value emitCommonNeonSISDBuiltinExpr(
   case NEON::BI__builtin_neon_vcvtd_n_f64_u64:
   case NEON::BI__builtin_neon_vcvtd_n_s64_f64:
   case NEON::BI__builtin_neon_vcvtd_n_u64_f64:
+  case NEON::BI__builtin_neon_vqrshrund_n_s64:
+  case NEON::BI__builtin_neon_vqrshrnd_n_s64:
+  case NEON::BI__builtin_neon_vqrshrnd_n_u64:
     return emitNeonCall(cgf.cgm, cgf.getBuilder(),
                         {cgf.convertType(expr->getArg(0)->getType())}, ops,
                         llvmIntrName, cgf.convertType(expr->getType()), loc);
@@ -2553,10 +2556,7 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned builtinID, const CallExpr *expr,
   case NEON::BI__builtin_neon_vrecpsd_f64:
   case NEON::BI__builtin_neon_vrecpsh_f16:
   case NEON::BI__builtin_neon_vqshrun_n_v:
-  case NEON::BI__builtin_neon_vqrshrun_n_v:
-  case NEON::BI__builtin_neon_vqshrn_n_v:
   case NEON::BI__builtin_neon_vrshrn_n_v:
-  case NEON::BI__builtin_neon_vqrshrn_n_v:
   case NEON::BI__builtin_neon_vrndah_f16:
   case NEON::BI__builtin_neon_vrnda_v:
   case NEON::BI__builtin_neon_vrndaq_v:
@@ -2597,6 +2597,28 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned builtinID, const CallExpr *expr,
                  std::string("unimplemented AArch64 builtin call: ") +
                      getContext().BuiltinInfo.getName(builtinID));
     return mlir::Value{};
+  case NEON::BI__builtin_neon_vqrshrn_n_v: { // rounding narrow: [us]qrshrn
+    cir::VectorType argTy = builder.getExtendedOrTruncatedElementVectorType(
+        ty, /*isExtended=*/true, /*isSigned=*/!usgn);
+    llvm::StringRef intrName =
+        usgn ? "aarch64.neon.uqrshrn" : "aarch64.neon.sqrshrn";
+    return emitNeonCall(cgm, builder, {argTy, sInt32Ty}, ops, intrName, ty,
+                        loc);
+  }
+  case NEON::BI__builtin_neon_vqrshrun_n_v: { // source is always signed
+    cir::VectorType argTy = builder.getExtendedOrTruncatedElementVectorType(
+        ty, /*isExtended=*/true, /*isSigned=*/true);
+    return emitNeonCall(cgm, builder, {argTy, sInt32Ty}, ops,
+                        "aarch64.neon.sqrshrun", ty, loc);
+  }
+  case NEON::BI__builtin_neon_vqshrn_n_v: { // non-rounding: [us]qshrn
+    cir::VectorType argTy = builder.getExtendedOrTruncatedElementVectorType(
+        ty, /*isExtended=*/true, /*isSigned=*/!usgn);
+    llvm::StringRef intrName =
+        usgn ? "aarch64.neon.uqshrn" : "aarch64.neon.sqshrn";
+    return emitNeonCall(cgm, builder, {argTy, sInt32Ty}, ops, intrName, ty,
+                        loc);
+  }
   case NEON::BI__builtin_neon_vcvt_f64_v:
   case NEON::BI__builtin_neon_vcvtq_f64_v:
     ops[0] = builder.createBitcast(ops[0], ty);

>From 3ed69ef245006acc82a203ec5d8bc9ede38203e0 Mon Sep 17 00:00:00 2001
From: Kartik Ohlan <[email protected]>
Date: Wed, 20 May 2026 20:45:37 -0400
Subject: [PATCH 3/8] Update CIRGenBuiltinAArch64.cpp

restructured the cases
---
 clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
index aa356472c273e..33c6234971bf2 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
@@ -328,9 +328,6 @@ static mlir::Value emitCommonNeonSISDBuiltinExpr(
   case NEON::BI__builtin_neon_vcvtd_n_f64_u64:
   case NEON::BI__builtin_neon_vcvtd_n_s64_f64:
   case NEON::BI__builtin_neon_vcvtd_n_u64_f64:
-  case NEON::BI__builtin_neon_vqrshrund_n_s64:
-  case NEON::BI__builtin_neon_vqrshrnd_n_s64:
-  case NEON::BI__builtin_neon_vqrshrnd_n_u64:
   case NEON::BI__builtin_neon_vaddlv_s32:
   case NEON::BI__builtin_neon_vaddlv_u32:
   case NEON::BI__builtin_neon_vaddlvq_s32:
@@ -372,6 +369,9 @@ static mlir::Value emitCommonNeonSISDBuiltinExpr(
   case NEON::BI__builtin_neon_vmaxv_f32:
   case NEON::BI__builtin_neon_vmaxvq_f32:
   case NEON::BI__builtin_neon_vmaxvq_f64:
+  case NEON::BI__builtin_neon_vqrshrund_n_s64:
+  case NEON::BI__builtin_neon_vqrshrnd_n_s64:
+  case NEON::BI__builtin_neon_vqrshrnd_n_u64:
     return emitNeonCall(cgf.cgm, cgf.getBuilder(),
                         {cgf.convertType(expr->getArg(0)->getType())}, ops,
                         llvmIntrName, cgf.convertType(expr->getType()), loc);

>From ea39b6fdc9f32e107506415869a9f07297b19d17 Mon Sep 17 00:00:00 2001
From: Kartik Ohlan <[email protected]>
Date: Wed, 20 May 2026 21:26:40 -0400
Subject: [PATCH 4/8] removed test cases from  neon-intrinsics.c

---
 clang/test/CodeGen/AArch64/neon-intrinsics.c | 255 -------------------
 1 file changed, 255 deletions(-)

diff --git a/clang/test/CodeGen/AArch64/neon-intrinsics.c b/clang/test/CodeGen/AArch64/neon-intrinsics.c
index 424d476ad33c9..f9fd973ff6a4d 100644
--- a/clang/test/CodeGen/AArch64/neon-intrinsics.c
+++ b/clang/test/CodeGen/AArch64/neon-intrinsics.c
@@ -6252,81 +6252,6 @@ uint32x4_t test_vrshrn_high_n_u64(uint32x2_t a, uint64x2_t b) {
   return vrshrn_high_n_u64(a, b, 19);
 }
 
-// CHECK-LABEL: define dso_local <8 x i8> @test_vqrshrun_n_s16(
-// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT:    [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT:    [[VQRSHRUN_N1:%.*]] = call <8 x i8> 
@llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> [[VQRSHRUN_N]], i32 3)
-// CHECK-NEXT:    ret <8 x i8> [[VQRSHRUN_N1]]
-//
-uint8x8_t test_vqrshrun_n_s16(int16x8_t a) {
-  return vqrshrun_n_s16(a, 3);
-}
-
-// CHECK-LABEL: define dso_local <4 x i16> @test_vqrshrun_n_s32(
-// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT:    [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT:    [[VQRSHRUN_N1:%.*]] = call <4 x i16> 
@llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> [[VQRSHRUN_N]], i32 9)
-// CHECK-NEXT:    ret <4 x i16> [[VQRSHRUN_N1]]
-//
-uint16x4_t test_vqrshrun_n_s32(int32x4_t a) {
-  return vqrshrun_n_s32(a, 9);
-}
-
-// CHECK-LABEL: define dso_local <2 x i32> @test_vqrshrun_n_s64(
-// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT:    [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK-NEXT:    [[VQRSHRUN_N1:%.*]] = call <2 x i32> 
@llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64> [[VQRSHRUN_N]], i32 19)
-// CHECK-NEXT:    ret <2 x i32> [[VQRSHRUN_N1]]
-//
-uint32x2_t test_vqrshrun_n_s64(int64x2_t a) {
-  return vqrshrun_n_s64(a, 19);
-}
-
-// CHECK-LABEL: define dso_local <16 x i8> @test_vqrshrun_high_n_s16(
-// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) 
#[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
-// CHECK-NEXT:    [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT:    [[VQRSHRUN_N3:%.*]] = call <8 x i8> 
@llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> [[VQRSHRUN_N]], i32 3)
-// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> 
[[VQRSHRUN_N3]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, 
i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK-NEXT:    ret <16 x i8> [[SHUFFLE_I]]
-//
-uint8x16_t test_vqrshrun_high_n_s16(uint8x8_t a, int16x8_t b) {
-  return vqrshrun_high_n_s16(a, b, 3);
-}
-
-// CHECK-LABEL: define dso_local <8 x i16> @test_vqrshrun_high_n_s32(
-// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) 
#[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
-// CHECK-NEXT:    [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT:    [[VQRSHRUN_N3:%.*]] = call <4 x i16> 
@llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> [[VQRSHRUN_N]], i32 9)
-// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> 
[[VQRSHRUN_N3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, 
i32 7>
-// CHECK-NEXT:    ret <8 x i16> [[SHUFFLE_I]]
-//
-uint16x8_t test_vqrshrun_high_n_s32(uint16x4_t a, int32x4_t b) {
-  return vqrshrun_high_n_s32(a, b, 9);
-}
-
-// CHECK-LABEL: define dso_local <4 x i32> @test_vqrshrun_high_n_s64(
-// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) 
#[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
-// CHECK-NEXT:    [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK-NEXT:    [[VQRSHRUN_N3:%.*]] = call <2 x i32> 
@llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64> [[VQRSHRUN_N]], i32 19)
-// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> 
[[VQRSHRUN_N3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK-NEXT:    ret <4 x i32> [[SHUFFLE_I]]
-//
-uint32x4_t test_vqrshrun_high_n_s64(uint32x2_t a, int64x2_t b) {
-  return vqrshrun_high_n_s64(a, b, 19);
-}
-
 // CHECK-LABEL: define dso_local <8 x i8> @test_vqshrn_n_s16(
 // CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
@@ -6477,156 +6402,6 @@ uint32x4_t test_vqshrn_high_n_u64(uint32x2_t a, uint64x2_t b) {
   return vqshrn_high_n_u64(a, b, 19);
 }
 
-// CHECK-LABEL: define dso_local <8 x i8> @test_vqrshrn_n_s16(
-// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT:    [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT:    [[VQRSHRN_N1:%.*]] = call <8 x i8> 
@llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16> [[VQRSHRN_N]], i32 3)
-// CHECK-NEXT:    ret <8 x i8> [[VQRSHRN_N1]]
-//
-int8x8_t test_vqrshrn_n_s16(int16x8_t a) {
-  return vqrshrn_n_s16(a, 3);
-}
-
-// CHECK-LABEL: define dso_local <4 x i16> @test_vqrshrn_n_s32(
-// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT:    [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT:    [[VQRSHRN_N1:%.*]] = call <4 x i16> 
@llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32> [[VQRSHRN_N]], i32 9)
-// CHECK-NEXT:    ret <4 x i16> [[VQRSHRN_N1]]
-//
-int16x4_t test_vqrshrn_n_s32(int32x4_t a) {
-  return vqrshrn_n_s32(a, 9);
-}
-
-// CHECK-LABEL: define dso_local <2 x i32> @test_vqrshrn_n_s64(
-// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT:    [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK-NEXT:    [[VQRSHRN_N1:%.*]] = call <2 x i32> 
@llvm.aarch64.neon.sqrshrn.v2i32(<2 x i64> [[VQRSHRN_N]], i32 19)
-// CHECK-NEXT:    ret <2 x i32> [[VQRSHRN_N1]]
-//
-int32x2_t test_vqrshrn_n_s64(int64x2_t a) {
-  return vqrshrn_n_s64(a, 19);
-}
-
-// CHECK-LABEL: define dso_local <8 x i8> @test_vqrshrn_n_u16(
-// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT:    [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT:    [[VQRSHRN_N1:%.*]] = call <8 x i8> 
@llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16> [[VQRSHRN_N]], i32 3)
-// CHECK-NEXT:    ret <8 x i8> [[VQRSHRN_N1]]
-//
-uint8x8_t test_vqrshrn_n_u16(uint16x8_t a) {
-  return vqrshrn_n_u16(a, 3);
-}
-
-// CHECK-LABEL: define dso_local <4 x i16> @test_vqrshrn_n_u32(
-// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT:    [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT:    [[VQRSHRN_N1:%.*]] = call <4 x i16> 
@llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32> [[VQRSHRN_N]], i32 9)
-// CHECK-NEXT:    ret <4 x i16> [[VQRSHRN_N1]]
-//
-uint16x4_t test_vqrshrn_n_u32(uint32x4_t a) {
-  return vqrshrn_n_u32(a, 9);
-}
-
-// CHECK-LABEL: define dso_local <2 x i32> @test_vqrshrn_n_u64(
-// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT:    [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK-NEXT:    [[VQRSHRN_N1:%.*]] = call <2 x i32> 
@llvm.aarch64.neon.uqrshrn.v2i32(<2 x i64> [[VQRSHRN_N]], i32 19)
-// CHECK-NEXT:    ret <2 x i32> [[VQRSHRN_N1]]
-//
-uint32x2_t test_vqrshrn_n_u64(uint64x2_t a) {
-  return vqrshrn_n_u64(a, 19);
-}
-
-// CHECK-LABEL: define dso_local <16 x i8> @test_vqrshrn_high_n_s16(
-// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) 
#[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
-// CHECK-NEXT:    [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT:    [[VQRSHRN_N3:%.*]] = call <8 x i8> 
@llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16> [[VQRSHRN_N]], i32 3)
-// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> 
[[VQRSHRN_N3]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, 
i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK-NEXT:    ret <16 x i8> [[SHUFFLE_I]]
-//
-int8x16_t test_vqrshrn_high_n_s16(int8x8_t a, int16x8_t b) {
-  return vqrshrn_high_n_s16(a, b, 3);
-}
-
-// CHECK-LABEL: define dso_local <8 x i16> @test_vqrshrn_high_n_s32(
-// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) 
#[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
-// CHECK-NEXT:    [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT:    [[VQRSHRN_N3:%.*]] = call <4 x i16> 
@llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32> [[VQRSHRN_N]], i32 9)
-// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> 
[[VQRSHRN_N3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 
7>
-// CHECK-NEXT:    ret <8 x i16> [[SHUFFLE_I]]
-//
-int16x8_t test_vqrshrn_high_n_s32(int16x4_t a, int32x4_t b) {
-  return vqrshrn_high_n_s32(a, b, 9);
-}
-
-// CHECK-LABEL: define dso_local <4 x i32> @test_vqrshrn_high_n_s64(
-// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) 
#[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
-// CHECK-NEXT:    [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK-NEXT:    [[VQRSHRN_N3:%.*]] = call <2 x i32> 
@llvm.aarch64.neon.sqrshrn.v2i32(<2 x i64> [[VQRSHRN_N]], i32 19)
-// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> 
[[VQRSHRN_N3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK-NEXT:    ret <4 x i32> [[SHUFFLE_I]]
-//
-int32x4_t test_vqrshrn_high_n_s64(int32x2_t a, int64x2_t b) {
-  return vqrshrn_high_n_s64(a, b, 19);
-}
-
-// CHECK-LABEL: define dso_local <16 x i8> @test_vqrshrn_high_n_u16(
-// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) 
#[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
-// CHECK-NEXT:    [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT:    [[VQRSHRN_N3:%.*]] = call <8 x i8> 
@llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16> [[VQRSHRN_N]], i32 3)
-// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> 
[[VQRSHRN_N3]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, 
i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK-NEXT:    ret <16 x i8> [[SHUFFLE_I]]
-//
-uint8x16_t test_vqrshrn_high_n_u16(uint8x8_t a, uint16x8_t b) {
-  return vqrshrn_high_n_u16(a, b, 3);
-}
-
-// CHECK-LABEL: define dso_local <8 x i16> @test_vqrshrn_high_n_u32(
-// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) 
#[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
-// CHECK-NEXT:    [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT:    [[VQRSHRN_N3:%.*]] = call <4 x i16> 
@llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32> [[VQRSHRN_N]], i32 9)
-// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> 
[[VQRSHRN_N3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 
7>
-// CHECK-NEXT:    ret <8 x i16> [[SHUFFLE_I]]
-//
-uint16x8_t test_vqrshrn_high_n_u32(uint16x4_t a, uint32x4_t b) {
-  return vqrshrn_high_n_u32(a, b, 9);
-}
-
-// CHECK-LABEL: define dso_local <4 x i32> @test_vqrshrn_high_n_u64(
-// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
-// CHECK-NEXT:    [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK-NEXT:    [[VQRSHRN_N3:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqrshrn.v2i32(<2 x i64> [[VQRSHRN_N]], i32 19)
-// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[VQRSHRN_N3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK-NEXT:    ret <4 x i32> [[SHUFFLE_I]]
-//
-uint32x4_t test_vqrshrn_high_n_u64(uint32x2_t a, uint64x2_t b) {
-  return vqrshrn_high_n_u64(a, b, 19);
-}
-
 // CHECK-LABEL: define dso_local <8 x i16> @test_vshll_n_s8(
 // CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
@@ -15691,16 +15466,6 @@ int16_t test_vqrshrns_n_s32(int32_t a) {
   return (int16_t)vqrshrns_n_s32(a, 16);
 }
 
-// CHECK-LABEL: define dso_local i32 @test_vqrshrnd_n_s64(
-// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[VQRSHRND_N_S64:%.*]] = call i32 @llvm.aarch64.neon.sqrshrn.i32(i64 [[A]], i32 32)
-// CHECK-NEXT:    ret i32 [[VQRSHRND_N_S64]]
-//
-int32_t test_vqrshrnd_n_s64(int64_t a) {
-  return (int32_t)vqrshrnd_n_s64(a, 32);
-}
-
 // CHECK-LABEL: define dso_local i8 @test_vqrshrnh_n_u16(
 // CHECK-SAME: i16 noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
@@ -15725,16 +15490,6 @@ uint16_t test_vqrshrns_n_u32(uint32_t a) {
   return (uint16_t)vqrshrns_n_u32(a, 16);
 }
 
-// CHECK-LABEL: define dso_local i32 @test_vqrshrnd_n_u64(
-// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[VQRSHRND_N_U64:%.*]] = call i32 @llvm.aarch64.neon.uqrshrn.i32(i64 [[A]], i32 32)
-// CHECK-NEXT:    ret i32 [[VQRSHRND_N_U64]]
-//
-uint32_t test_vqrshrnd_n_u64(uint64_t a) {
-  return (uint32_t)vqrshrnd_n_u64(a, 32);
-}
-
 // CHECK-LABEL: define dso_local i8 @test_vqshrunh_n_s16(
 // CHECK-SAME: i16 noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
@@ -15793,16 +15548,6 @@ uint16_t test_vqrshruns_n_s32(int32_t a) {
   return (uint16_t)vqrshruns_n_s32(a, 16);
 }
 
-// CHECK-LABEL: define dso_local i32 @test_vqrshrund_n_s64(
-// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[VQRSHRUND_N_S64:%.*]] = call i32 @llvm.aarch64.neon.sqrshrun.i32(i64 [[A]], i32 32)
-// CHECK-NEXT:    ret i32 [[VQRSHRUND_N_S64]]
-//
-uint32_t test_vqrshrund_n_s64(int64_t a) {
-  return (uint32_t)vqrshrund_n_s64(a, 32);
-}
-
 // CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_s8_s16(
 // CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]

>From 10397b1b790d6d5f9e8a17bc248955cf8d3f9690 Mon Sep 17 00:00:00 2001
From: Kartik Ohlan <[email protected]>
Date: Thu, 21 May 2026 01:26:06 -0400
Subject: [PATCH 5/8] fixed the implementation

---
 clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
index 33c6234971bf2..4f564153481e9 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
@@ -2681,6 +2681,7 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned builtinID, const CallExpr *expr,
   case NEON::BI__builtin_neon_vrecpsd_f64:
   case NEON::BI__builtin_neon_vrecpsh_f16:
   case NEON::BI__builtin_neon_vqshrun_n_v:
+  case NEON::BI__builtin_neon_vqshrn_n_v:
   case NEON::BI__builtin_neon_vrshrn_n_v:
   case NEON::BI__builtin_neon_vrndah_f16:
   case NEON::BI__builtin_neon_vrnda_v:
@@ -2736,14 +2737,6 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned builtinID, const CallExpr *expr,
     return emitNeonCall(cgm, builder, {argTy, sInt32Ty}, ops,
                         "aarch64.neon.sqrshrun", ty, loc);
   }
-  case NEON::BI__builtin_neon_vqshrn_n_v: {
-    cir::VectorType argTy =
-        builder.getExtendedOrTruncatedElementVectorType(ty, true, !usgn);
-    llvm::StringRef intrName =
-        usgn ? "aarch64.neon.uqrshrn" : "aarch64.neon.sqrshrn";
-    return emitNeonCall(cgm, builder, {argTy, sInt32Ty}, ops, intrName, ty,
-                        loc);
-  }
   case NEON::BI__builtin_neon_vcvt_f64_v:
   case NEON::BI__builtin_neon_vcvtq_f64_v:
     ops[0] = builder.createBitcast(ops[0], ty);

>From 669016b184aeb57a7bf57ce8e59ec9fe6833fcdb Mon Sep 17 00:00:00 2001
From: Kartik Ohlan <[email protected]>
Date: Thu, 21 May 2026 11:50:41 -0400
Subject: [PATCH 6/8] added some cleanup

---
 .../lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp  |  2 +-
 clang/test/CodeGen/AArch64/neon/intrinsics.c  | 86 +++++++++----------
 2 files changed, 44 insertions(+), 44 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
index 4f564153481e9..d54d588d9bda6 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
@@ -2733,7 +2733,7 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned builtinID, const CallExpr *expr,
   }
   case NEON::BI__builtin_neon_vqrshrun_n_v: {
     cir::VectorType argTy =
-        builder.getExtendedOrTruncatedElementVectorType(ty, true, true);
+        builder.getExtendedOrTruncatedElementVectorType(ty, /*isExtended=*/true, /*isSigned=*/true);
     return emitNeonCall(cgm, builder, {argTy, sInt32Ty}, ops,
                         "aarch64.neon.sqrshrun", ty, loc);
   }
diff --git a/clang/test/CodeGen/AArch64/neon/intrinsics.c b/clang/test/CodeGen/AArch64/neon/intrinsics.c
index fe8df65b703d1..dd39864717c69 100644
--- a/clang/test/CodeGen/AArch64/neon/intrinsics.c
+++ b/clang/test/CodeGen/AArch64/neon/intrinsics.c
@@ -26,14 +26,14 @@
 
 //===------------------------------------------------------===//
 // 2.1.3.2.7  Vector saturating rounding shift right and narrow
-// 
+// TODO: Implement SISD variants
 //===------------------------------------------------------===//
 
 // ALL-LABEL: @test_vqrshrun_n_s16(
 uint8x8_t test_vqrshrun_n_s16(int16x8_t a) {
-  // CIR:   {{%.*}} = cir.call_llvm_intrinsic "aarch64.neon.sqrshrun" {{%.*}}
+  // CIR:   cir.call_llvm_intrinsic "aarch64.neon.sqrshrun" 
 
-  // LLVM-SAME: <8 x i16> noundef [[A:%.*]])
+  // LLVM-SAME: <8 x i16> {{.*}} [[A:%.*]])
   // LLVM:      {{.*}} = bitcast <8 x i16> [[A]] to <16 x i8>
   // LLVM:      [[TMP:%.*]] = bitcast <16 x i8> {{.*}} to <8 x i16>
   // LLVM:      [[VQRSHRUN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> [[TMP]], i32 3)
@@ -43,9 +43,9 @@ uint8x8_t test_vqrshrun_n_s16(int16x8_t a) {
 
 // ALL-LABEL: @test_vqrshrun_n_s32(
 uint16x4_t test_vqrshrun_n_s32(int32x4_t a) {
-  // CIR:   {{%.*}} = cir.call_llvm_intrinsic "aarch64.neon.sqrshrun" {{%.*}}
+  // CIR:   cir.call_llvm_intrinsic "aarch64.neon.sqrshrun" 
 
-  // LLVM-SAME: <4 x i32> {{.*}}[[A:%.*]])
+  // LLVM-SAME: <4 x i32> {{.*}} [[A:%.*]])
   // LLVM:      [[BC1:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
   // LLVM:      [[BC2:%.*]] = bitcast <16 x i8> [[BC1]] to <4 x i32>
   // LLVM:      [[VQRSHRUN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> [[BC2]], i32 9)
@@ -55,9 +55,9 @@ uint16x4_t test_vqrshrun_n_s32(int32x4_t a) {
 
 // ALL-LABEL: @test_vqrshrun_n_s64(
 uint32x2_t test_vqrshrun_n_s64(int64x2_t a) {
-  // CIR:   {{%.*}} = cir.call_llvm_intrinsic "aarch64.neon.sqrshrun" {{%.*}}
+  // CIR:   cir.call_llvm_intrinsic "aarch64.neon.sqrshrun" 
 
-  // LLVM-SAME: <2 x i64> {{.*}}[[A:%.*]])
+  // LLVM-SAME: <2 x i64> {{.*}} [[A:%.*]])
   // LLVM:      [[BC1:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
   // LLVM:      [[BC2:%.*]] = bitcast <16 x i8> [[BC1]] to <2 x i64>
   // LLVM:      [[VQRSHRUN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64> [[BC2]], i32 19)
@@ -67,10 +67,9 @@ uint32x2_t test_vqrshrun_n_s64(int64x2_t a) {
 
 //ALL-LABEL:  @test_vqrshrund_n_s64(
 uint32_t test_vqrshrund_n_s64(int64_t a) {
-  //CIR: {{%.*}} =  cir.call_llvm_intrinsic "aarch64.neon.sqrshrun" {{%.*}}
-  // 
-  // 
-  // LLVM-SAME: i64 {{.*}}[[A:%.*]])
+  //CIR:  cir.call_llvm_intrinsic "aarch64.neon.sqrshrun" 
+
+  // LLVM-SAME: i64 {{.*}} [[A:%.*]])
   // LLVM:   [[VQRSHRUND_N_S64:%.*]] = call i32 @llvm.aarch64.neon.sqrshrun.i32(i64 [[A]], i32 32)
   // LLVM:  ret i32 [[VQRSHRUND_N_S64]]
   return (uint32_t)vqrshrund_n_s64(a, 32);
@@ -78,9 +77,9 @@ uint32_t test_vqrshrund_n_s64(int64_t a) {
 
 // ALL-LABEL: @test_vqrshrn_n_s16(
 int8x8_t test_vqrshrn_n_s16(int16x8_t a) {
-  // CIR: cir.call_llvm_intrinsic "aarch64.neon.sqrshrn" {{%.*}}
+  // CIR: cir.call_llvm_intrinsic "aarch64.neon.sqrshrn" 
 
-  // LLVM-SAME: <8 x i16> {{.*}}[[A:%.*]])
+  // LLVM-SAME: <8 x i16> {{.*}} [[A:%.*]])
   // LLVM:      [[BC1:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
   // LLVM:      [[BC2:%.*]] = bitcast <16 x i8> [[BC1]] to <8 x i16>
   // LLVM:      [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16> [[BC2]], i32 3)
@@ -90,9 +89,9 @@ int8x8_t test_vqrshrn_n_s16(int16x8_t a) {
 
 // ALL-LABEL: @test_vqrshrn_n_s32(
 int16x4_t test_vqrshrn_n_s32(int32x4_t a) {
-  // CIR: cir.call_llvm_intrinsic "aarch64.neon.sqrshrn" {{%.*}}
+  // CIR: cir.call_llvm_intrinsic "aarch64.neon.sqrshrn" 
 
-  // LLVM-SAME: <4 x i32> {{.*}}[[A:%.*]])
+  // LLVM-SAME: <4 x i32> {{.*}} [[A:%.*]])
   // LLVM:      [[BC1:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
   // LLVM:      [[BC2:%.*]] = bitcast <16 x i8> [[BC1]] to <4 x i32>
   // LLVM:      [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32> [[BC2]], i32 9)
@@ -102,9 +101,9 @@ int16x4_t test_vqrshrn_n_s32(int32x4_t a) {
 
 // ALL-LABEL: @test_vqrshrn_n_s64(
 int32x2_t test_vqrshrn_n_s64(int64x2_t a) {
-  // CIR: cir.call_llvm_intrinsic "aarch64.neon.sqrshrn" {{%.*}}
+  // CIR: cir.call_llvm_intrinsic "aarch64.neon.sqrshrn" 
 
-  // LLVM-SAME: <2 x i64> {{.*}}[[A:%.*]])
+  // LLVM-SAME: <2 x i64> {{.*}} [[A:%.*]])
   // LLVM:      [[BC1:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
   // LLVM:      [[BC2:%.*]] = bitcast <16 x i8> [[BC1]] to <2 x i64>
   // LLVM:      [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrn.v2i32(<2 x i64> [[BC2]], i32 19)
@@ -114,9 +113,9 @@ int32x2_t test_vqrshrn_n_s64(int64x2_t a) {
 
 // ALL-LABEL: @test_vqrshrun_high_n_s16(
 uint8x16_t test_vqrshrun_high_n_s16(uint8x8_t a, int16x8_t b) {
-  // CIR: cir.call_llvm_intrinsic "aarch64.neon.sqrshrun" {{%.*}}
+  // CIR: cir.call_llvm_intrinsic "aarch64.neon.sqrshrun" 
 
-  // LLVM-SAME: <8 x i8> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]])
+  // LLVM-SAME: <8 x i8> {{.*}} [[A:%.*]], <8 x i16> {{.*}}  [[B:%.*]])
   // LLVM:      [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
   // LLVM:      [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
   // LLVM:      [[VQRSHRUN_N3:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> [[VQRSHRUN_N]], i32 3)
@@ -127,9 +126,9 @@ uint8x16_t test_vqrshrun_high_n_s16(uint8x8_t a, int16x8_t b) {
 
 // ALL-LABEL: @test_vqrshrun_high_n_s32(
 uint16x8_t test_vqrshrun_high_n_s32(uint16x4_t a, int32x4_t b) {
-  // CIR: cir.call_llvm_intrinsic "aarch64.neon.sqrshrun" {{%.*}}
+  // CIR: cir.call_llvm_intrinsic "aarch64.neon.sqrshrun" 
   
-  // LLVM-SAME: <4 x i16> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]])
+  // LLVM-SAME: <4 x i16> {{.*}} [[A:%.*]], <4 x i32> {{.*}}  [[B:%.*]])
   // LLVM:      [[TMP0:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
   // LLVM:      [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   // LLVM:      [[VQRSHRUN_N3:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> [[VQRSHRUN_N]], i32 9)
@@ -140,9 +139,9 @@ uint16x8_t test_vqrshrun_high_n_s32(uint16x4_t a, int32x4_t b) {
 
 // ALL-LABEL: @test_vqrshrun_high_n_s64(
 uint32x4_t test_vqrshrun_high_n_s64(uint32x2_t a, int64x2_t b) {
-  // CIR:   {{%.*}} = cir.call_llvm_intrinsic "aarch64.neon.sqrshrun" {{%.*}}
+  // CIR:    cir.call_llvm_intrinsic "aarch64.neon.sqrshrun" 
 
-  // LLVM-SAME: <2 x i32> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]])
+  // LLVM-SAME: <2 x i32> {{.*}} [[A:%.*]], <2 x i64> {{.*}} [[B:%.*]])
   // LLVM:      [[TMP0:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
   // LLVM:      [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
   // LLVM:      [[VQRSHRUN_N3:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64> [[VQRSHRUN_N]], i32 19)
@@ -153,9 +152,9 @@ uint32x4_t test_vqrshrun_high_n_s64(uint32x2_t a, int64x2_t b) {
 
 // ALL-LABEL: @test_vqrshrn_n_u16(
 uint8x8_t test_vqrshrn_n_u16(uint16x8_t a) {
-  // CIR: cir.call_llvm_intrinsic "aarch64.neon.uqrshrn" {{%.*}}
+  // CIR: cir.call_llvm_intrinsic "aarch64.neon.uqrshrn" 
 
-  // LLVM-SAME: <8 x i16> {{.*}}[[A:%.*]])
+  // LLVM-SAME: <8 x i16> {{.*}} [[A:%.*]])
   // LLVM:      [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
   // LLVM:      [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
   // LLVM:      [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16> [[VQRSHRN_N]], i32 3)
@@ -165,9 +164,9 @@ uint8x8_t test_vqrshrn_n_u16(uint16x8_t a) {
 
 // ALL-LABEL: @test_vqrshrn_n_u32(
 uint16x4_t test_vqrshrn_n_u32(uint32x4_t a) {
-  // CIR: cir.call_llvm_intrinsic "aarch64.neon.uqrshrn" {{%.*}}
+  // CIR: cir.call_llvm_intrinsic "aarch64.neon.uqrshrn" 
 
-  // LLVM-SAME: <4 x i32> {{.*}}[[A:%.*]])
+  // LLVM-SAME: <4 x i32> {{.*}} [[A:%.*]])
   // LLVM:      [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
   // LLVM:      [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   // LLVM:      [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32> [[VQRSHRN_N]], i32 9)
@@ -177,9 +176,9 @@ uint16x4_t test_vqrshrn_n_u32(uint32x4_t a) {
 
 // ALL-LABEL: @test_vqrshrn_n_u64(
 uint32x2_t test_vqrshrn_n_u64(uint64x2_t a) {
-  // CIR: cir.call_llvm_intrinsic "aarch64.neon.uqrshrn" {{%.*}}
+  // CIR: cir.call_llvm_intrinsic "aarch64.neon.uqrshrn" 
 
-  // LLVM-SAME: <2 x i64> {{.*}}[[A:%.*]])
+  // LLVM-SAME: <2 x i64> {{.*}} [[A:%.*]])
   // LLVM:      [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
   // LLVM:      [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
   // LLVM:      [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqrshrn.v2i32(<2 x i64> [[VQRSHRN_N]], i32 19)
@@ -189,7 +188,7 @@ uint32x2_t test_vqrshrn_n_u64(uint64x2_t a) {
 
 // ALL-LABEL: @test_vqrshrnd_n_s64(
 int32_t test_vqrshrnd_n_s64(int64_t a) {
-  // CIR:   {{%.*}} = cir.call_llvm_intrinsic "aarch64.neon.sqrshrn" {{%.*}}
+  // CIR:    cir.call_llvm_intrinsic "aarch64.neon.sqrshrn" 
 
   // LLVM-SAME: i64 noundef [[A:%.*]])
   // LLVM:      [[VQRSHRND_N_S64:%.*]] = call i32 @llvm.aarch64.neon.sqrshrn.i32(i64 [[A]], i32 32)
@@ -199,7 +198,7 @@ int32_t test_vqrshrnd_n_s64(int64_t a) {
 
 // ALL-LABEL: @test_vqrshrnd_n_u64(
 uint32_t test_vqrshrnd_n_u64(uint64_t a) {
-  // CIR:   {{%.*}} = cir.call_llvm_intrinsic "aarch64.neon.uqrshrn" {{%.*}}
+  // CIR:    cir.call_llvm_intrinsic "aarch64.neon.uqrshrn" 
 
   // LLVM-SAME: i64 noundef [[A:%.*]])
   // LLVM:      [[VQRSHRND_N_U64:%.*]] = call i32 @llvm.aarch64.neon.uqrshrn.i32(i64 [[A]], i32 32)
@@ -209,9 +208,9 @@ uint32_t test_vqrshrnd_n_u64(uint64_t a) {
 
 // ALL-LABEL: @test_vqrshrn_high_n_s16(
 int8x16_t test_vqrshrn_high_n_s16(int8x8_t a, int16x8_t b) {
-  // CIR:   {{%.*}} = cir.call_llvm_intrinsic "aarch64.neon.sqrshrn" {{%.*}}
+  // CIR:    cir.call_llvm_intrinsic "aarch64.neon.sqrshrn" 
 
-  // LLVM-SAME: <8 x i8> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]])
+  // LLVM-SAME: <8 x i8> {{.*}} [[A:%.*]], <8 x i16> {{.*}}  [[B:%.*]])
   // LLVM:      [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
   // LLVM:      [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
   // LLVM:      [[VQRSHRN_N3:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16> [[VQRSHRN_N]], i32 3)
@@ -222,9 +221,9 @@ int8x16_t test_vqrshrn_high_n_s16(int8x8_t a, int16x8_t b) {
 
 // ALL-LABEL: @test_vqrshrn_high_n_s32(
 int16x8_t test_vqrshrn_high_n_s32(int16x4_t a, int32x4_t b) {
-  // CIR:   {{%.*}} = cir.call_llvm_intrinsic "aarch64.neon.sqrshrn" {{%.*}}
+  // CIR:    cir.call_llvm_intrinsic "aarch64.neon.sqrshrn" 
 
-  // LLVM-SAME: <4 x i16> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]])
+  // LLVM-SAME: <4 x i16> {{.*}} [[A:%.*]], <4 x i32> {{.*}}  [[B:%.*]])
   // LLVM:      [[TMP0:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
   // LLVM:      [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   // LLVM:      [[VQRSHRN_N3:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32> [[VQRSHRN_N]], i32 9)
@@ -235,9 +234,9 @@ int16x8_t test_vqrshrn_high_n_s32(int16x4_t a, int32x4_t b) {
 
 // ALL-LABEL:  @test_vqrshrn_high_n_s64(
 int32x4_t test_vqrshrn_high_n_s64(int32x2_t a, int64x2_t b) {
-  // CIR:   {{%.*}} = cir.call_llvm_intrinsic "aarch64.neon.sqrshrn" {{%.*}}
+  // CIR:  cir.call_llvm_intrinsic "aarch64.neon.sqrshrn" 
 
-  // LLVM-SAME: <2 x i32> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]])
+  // LLVM-SAME: <2 x i32> {{.*}} [[A:%.*]], <2 x i64> {{.*}}  [[B:%.*]])
   // LLVM:      [[TMP0:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
   // LLVM:      [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
   // LLVM:      [[VQRSHRN_N3:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrn.v2i32(<2 x i64> [[VQRSHRN_N]], i32 19)
@@ -248,9 +247,9 @@ int32x4_t test_vqrshrn_high_n_s64(int32x2_t a, int64x2_t b) {
 
 // ALL-LABEL:  @test_vqrshrn_high_n_u16(
 uint8x16_t test_vqrshrn_high_n_u16(uint8x8_t a, uint16x8_t b) {
-  // CIR:   {{%.*}} = cir.call_llvm_intrinsic "aarch64.neon.uqrshrn" {{%.*}}
+  // CIR:    cir.call_llvm_intrinsic "aarch64.neon.uqrshrn" 
 
-  // LLVM-SAME: <8 x i8> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]])
+  // LLVM-SAME: <8 x i8> {{.*}} [[A:%.*]], <8 x i16> {{.*}}  [[B:%.*]])
   // LLVM:      [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
   // LLVM:      [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
   // LLVM:      [[VQRSHRN_N3:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16> [[VQRSHRN_N]], i32 3)
@@ -261,9 +260,9 @@ uint8x16_t test_vqrshrn_high_n_u16(uint8x8_t a, uint16x8_t b) {
 
 // ALL-LABEL: @test_vqrshrn_high_n_u32(
 uint16x8_t test_vqrshrn_high_n_u32(uint16x4_t a, uint32x4_t b) {
-  // CIR:   {{%.*}} = cir.call_llvm_intrinsic "aarch64.neon.uqrshrn" {{%.*}}
+  // CIR:   cir.call_llvm_intrinsic "aarch64.neon.uqrshrn" 
 
-  // LLVM-SAME: <4 x i16> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]])
+  // LLVM-SAME: <4 x i16> {{.*}} [[A:%.*]], <4 x i32> {{.*}}  [[B:%.*]])
   // LLVM:      [[TMP0:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
   // LLVM:      [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   // LLVM:      [[VQRSHRN_N3:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32> [[VQRSHRN_N]], i32 9)
@@ -274,9 +273,9 @@ uint16x8_t test_vqrshrn_high_n_u32(uint16x4_t a, uint32x4_t b) {
 
 // ALL-LABEL: @test_vqrshrn_high_n_u64(
 uint32x4_t test_vqrshrn_high_n_u64(uint32x2_t a, uint64x2_t b) {
-  // CIR:   {{%.*}} = cir.call_llvm_intrinsic "aarch64.neon.uqrshrn" {{%.*}}
+  // CIR:    cir.call_llvm_intrinsic "aarch64.neon.uqrshrn" 
 
-  // LLVM-SAME: <2 x i32> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]])
+  // LLVM-SAME: <2 x i32> {{.*}} [[A:%.*]], <2 x i64> {{.*}}  [[B:%.*]])
   // LLVM:      [[TMP0:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
   // LLVM:      [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
   // LLVM:      [[VQRSHRN_N3:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqrshrn.v2i32(<2 x i64> [[VQRSHRN_N]], i32 19)
@@ -285,6 +284,7 @@ uint32x4_t test_vqrshrn_high_n_u64(uint32x2_t a, uint64x2_t b) {
   return vqrshrn_high_n_u64(a, b, 19);
 }
 
+//===------------------------------------------------------===//
 // 2.1.1.13.3 Maximum across vector
 //===------------------------------------------------------===//
 

>From 352103c8de40dba88667f1f1f2a9afaa9407d708 Mon Sep 17 00:00:00 2001
From: Kartik Ohlan <[email protected]>
Date: Thu, 21 May 2026 11:57:40 -0400
Subject: [PATCH 7/8] ran clang-format

---
 clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
index d54d588d9bda6..40ce7e2d4e2e7 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
@@ -2732,8 +2732,8 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned builtinID, const CallExpr *expr,
                         loc);
   }
   case NEON::BI__builtin_neon_vqrshrun_n_v: {
-    cir::VectorType argTy =
-        builder.getExtendedOrTruncatedElementVectorType(ty, /*isExtended=*/true, /*isSigned=*/true);
+    cir::VectorType argTy = builder.getExtendedOrTruncatedElementVectorType(
+        ty, /*isExtended=*/true, /*isSigned=*/true);
     return emitNeonCall(cgm, builder, {argTy, sInt32Ty}, ops,
                         "aarch64.neon.sqrshrun", ty, loc);
   }

>From 778cc8f0c5858eb68d0ae54a86c3662324a341f1 Mon Sep 17 00:00:00 2001
From: Kartik Ohlan <[email protected]>
Date: Mon, 25 May 2026 11:45:01 -0400
Subject: [PATCH 8/8] ran clang-format on changes

---
 clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
index 40ce7e2d4e2e7..cd1e607cad4d3 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
@@ -2681,8 +2681,16 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned builtinID, const CallExpr *expr,
   case NEON::BI__builtin_neon_vrecpsd_f64:
   case NEON::BI__builtin_neon_vrecpsh_f16:
   case NEON::BI__builtin_neon_vqshrun_n_v:
+    cgm.errorNYI(expr->getSourceRange(),
+                 std::string("unimplemented AArch64 builtin call: ") +
+                     getContext().BuiltinInfo.getName(builtinID));
+  case NEON::BI__builtin_neon_vqrshrun_n_v:
   case NEON::BI__builtin_neon_vqshrn_n_v:
   case NEON::BI__builtin_neon_vrshrn_n_v:
+    cgm.errorNYI(expr->getSourceRange(),
+                 std::string("unimplemented AArch64 builtin call: ") +
+                     getContext().BuiltinInfo.getName(builtinID));
+  case NEON::BI__builtin_neon_vqrshrn_n_v:
   case NEON::BI__builtin_neon_vrndah_f16:
   case NEON::BI__builtin_neon_vrnda_v:
   case NEON::BI__builtin_neon_vrndaq_v:

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [CIR] Vector saturating rounding shift right and narrow intrinsics (PR #198947)

Reply via email to