kmclaughlin created this revision. kmclaughlin added reviewers: sdesmalen, c-rhodes, fpetrogalli, efriedma, stuij, david-arm. Herald added subscribers: llvm-commits, cfe-commits, danielkiss, psnobl, rkruppe, hiraditya, kristof.beyls, tschuett. Herald added projects: clang, LLVM.
Bfloat16 support added for the following intrinsics: - ST1 - STNT1 Repository: rG LLVM Github Monorepo https://reviews.llvm.org/D82448 Files: clang/include/clang/Basic/arm_sve.td clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st1-bfloat.c clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_stnt1-bfloat.c llvm/lib/Target/AArch64/AArch64ISelLowering.cpp llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td llvm/test/CodeGen/AArch64/sve-intrinsics-st1-addressing-mode-reg-imm.ll llvm/test/CodeGen/AArch64/sve-intrinsics-st1-addressing-mode-reg-reg.ll llvm/test/CodeGen/AArch64/sve-intrinsics-st1.ll llvm/test/CodeGen/AArch64/sve-intrinsics-stores.ll llvm/test/CodeGen/AArch64/sve-masked-ldst-nonext.ll llvm/test/CodeGen/AArch64/sve-pred-contiguous-ldst-addressing-mode-reg-imm.ll llvm/test/CodeGen/AArch64/sve-pred-non-temporal-ldst-addressing-mode-reg-imm.ll llvm/test/CodeGen/AArch64/sve-pred-non-temporal-ldst-addressing-mode-reg-reg.ll
Index: llvm/test/CodeGen/AArch64/sve-pred-non-temporal-ldst-addressing-mode-reg-reg.ll =================================================================== --- llvm/test/CodeGen/AArch64/sve-pred-non-temporal-ldst-addressing-mode-reg-reg.ll +++ llvm/test/CodeGen/AArch64/sve-pred-non-temporal-ldst-addressing-mode-reg-reg.ll @@ -94,6 +94,20 @@ ret void } +define void @test_masked_ldst_sv8bf16(bfloat* %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind { +; CHECK-LABEL: test_masked_ldst_sv8bf16: +; CHECK-NEXT: ldnt1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: stnt1h { z[[DATA]].h }, p0, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %gep = getelementptr bfloat, bfloat* %base, i64 %offset + %data = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ldnt1.nxv8bf16(<vscale x 8 x i1> %mask, + bfloat* %gep) + call void @llvm.aarch64.sve.stnt1.nxv8bf16(<vscale x 8 x bfloat> %data, + <vscale x 8 x i1> %mask, + bfloat* %gep) + ret void +} + ; 16-lane non-temporal load/stores. define void @test_masked_ldst_sv16i8(i8* %base, <vscale x 16 x i1> %mask, i64 %offset) nounwind { @@ -121,6 +135,7 @@ ; 8-element non-temporal loads. declare <vscale x 8 x i16> @llvm.aarch64.sve.ldnt1.nxv8i16(<vscale x 8 x i1>, i16*) declare <vscale x 8 x half> @llvm.aarch64.sve.ldnt1.nxv8f16(<vscale x 8 x i1>, half*) +declare <vscale x 8 x bfloat> @llvm.aarch64.sve.ldnt1.nxv8bf16(<vscale x 8 x i1>, bfloat*) ; 16-element non-temporal loads. declare <vscale x 16 x i8> @llvm.aarch64.sve.ldnt1.nxv16i8(<vscale x 16 x i1>, i8*) @@ -128,14 +143,15 @@ ; 2-element non-temporal stores. declare void @llvm.aarch64.sve.stnt1.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, i64*) declare void @llvm.aarch64.sve.stnt1.nxv2f64(<vscale x 2 x double>, <vscale x 2 x i1>, double*) - -; 4-element non-temporal stores. + +; 4-element non-temporal stores. declare void @llvm.aarch64.sve.stnt1.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i1>, i32*) declare void @llvm.aarch64.sve.stnt1.nxv4f32(<vscale x 4 x float>, <vscale x 4 x i1>, float*) - -; 8-element non-temporal stores. + +; 8-element non-temporal stores. declare void @llvm.aarch64.sve.stnt1.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i1>, i16*) declare void @llvm.aarch64.sve.stnt1.nxv8f16(<vscale x 8 x half>, <vscale x 8 x i1>, half*) +declare void @llvm.aarch64.sve.stnt1.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x i1>, bfloat*) ; 16-element non-temporal stores. declare void @llvm.aarch64.sve.stnt1.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i1>, i8*) Index: llvm/test/CodeGen/AArch64/sve-pred-non-temporal-ldst-addressing-mode-reg-imm.ll =================================================================== --- llvm/test/CodeGen/AArch64/sve-pred-non-temporal-ldst-addressing-mode-reg-imm.ll +++ llvm/test/CodeGen/AArch64/sve-pred-non-temporal-ldst-addressing-mode-reg-imm.ll @@ -139,6 +139,23 @@ ret void } +define void @test_masked_ldst_sv8bf16(<vscale x 8 x bfloat> * %base, <vscale x 8 x i1> %mask) nounwind { +; CHECK-LABEL: test_masked_ldst_sv8bf16: +; CHECK-NEXT: ldnt1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, #-1, mul vl] +; CHECK-NEXT: stnt1h { z[[DATA]].h }, p0, [x0, #2, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr <vscale x 8 x bfloat>, <vscale x 8 x bfloat>* %base, i64 -1 + %base_load_bc = bitcast <vscale x 8 x bfloat>* %base_load to bfloat* + %data = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ldnt1.nxv8bf16(<vscale x 8 x i1> %mask, + bfloat* %base_load_bc) + %base_store = getelementptr <vscale x 8 x bfloat>, <vscale x 8 x bfloat> * %base, i64 2 + %base_store_bc = bitcast <vscale x 8 x bfloat>* %base_store to bfloat* + call void @llvm.aarch64.sve.stnt1.nxv8bf16(<vscale x 8 x bfloat> %data, + <vscale x 8 x i1> %mask, + bfloat* %base_store_bc) + ret void +} + ; 16-lane non-temporal load/stores. define void @test_masked_ldst_sv16i8(<vscale x 16 x i8> * %base, <vscale x 16 x i1> %mask) nounwind { @@ -169,6 +186,7 @@ ; 8-element non-temporal loads. declare <vscale x 8 x i16> @llvm.aarch64.sve.ldnt1.nxv8i16(<vscale x 8 x i1>, i16*) declare <vscale x 8 x half> @llvm.aarch64.sve.ldnt1.nxv8f16(<vscale x 8 x i1>, half*) +declare <vscale x 8 x bfloat> @llvm.aarch64.sve.ldnt1.nxv8bf16(<vscale x 8 x i1>, bfloat*) ; 16-element non-temporal loads. declare <vscale x 16 x i8> @llvm.aarch64.sve.ldnt1.nxv16i8(<vscale x 16 x i1>, i8*) @@ -176,14 +194,15 @@ ; 2-element non-temporal stores. declare void @llvm.aarch64.sve.stnt1.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, i64*) declare void @llvm.aarch64.sve.stnt1.nxv2f64(<vscale x 2 x double>, <vscale x 2 x i1>, double*) - -; 4-element non-temporal stores. + +; 4-element non-temporal stores. declare void @llvm.aarch64.sve.stnt1.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i1>, i32*) declare void @llvm.aarch64.sve.stnt1.nxv4f32(<vscale x 4 x float>, <vscale x 4 x i1>, float*) - -; 8-element non-temporal stores. + +; 8-element non-temporal stores. declare void @llvm.aarch64.sve.stnt1.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i1>, i16*) declare void @llvm.aarch64.sve.stnt1.nxv8f16(<vscale x 8 x half>, <vscale x 8 x i1>, half*) +declare void @llvm.aarch64.sve.stnt1.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x i1>, bfloat*) ; 16-element non-temporal stores. declare void @llvm.aarch64.sve.stnt1.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i1>, i8*) Index: llvm/test/CodeGen/AArch64/sve-pred-contiguous-ldst-addressing-mode-reg-imm.ll =================================================================== --- llvm/test/CodeGen/AArch64/sve-pred-contiguous-ldst-addressing-mode-reg-imm.ll +++ llvm/test/CodeGen/AArch64/sve-pred-contiguous-ldst-addressing-mode-reg-imm.ll @@ -513,6 +513,24 @@ ret void } +define void @test_masked_ldst_sv8bf16(<vscale x 8 x bfloat> * %base, <vscale x 8 x i1> %mask) nounwind { +; CHECK-LABEL: test_masked_ldst_sv8bf16: +; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, #-1, mul vl] +; CHECK-NEXT: st1h { z[[DATA]].h }, p0, [x0, #2, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr <vscale x 8 x bfloat>, <vscale x 8 x bfloat>* %base, i64 -1 + %data = call <vscale x 8 x bfloat> @llvm.masked.load.nxv8bf16(<vscale x 8 x bfloat>* %base_load, + i32 1, + <vscale x 8 x i1> %mask, + <vscale x 8 x bfloat> undef) + %base_store = getelementptr <vscale x 8 x bfloat>, <vscale x 8 x bfloat> * %base, i64 2 + call void @llvm.masked.store.nxv8bf16(<vscale x 8 x bfloat> %data, + <vscale x 8 x bfloat>* %base_store, + i32 1, + <vscale x 8 x i1> %mask) + ret void +} + ; 8-lane zero/sign extended contiguous loads. define <vscale x 8 x i16> @masked_zload_sv8i8_to_sv8i16(<vscale x 8 x i8>* %base, <vscale x 8 x i1> %mask) nounwind { @@ -596,6 +614,7 @@ declare <vscale x 8 x i8> @llvm.masked.load.nxv8i8 (<vscale x 8 x i8>* , i32, <vscale x 8 x i1>, <vscale x 8 x i8> ) declare <vscale x 8 x i16> @llvm.masked.load.nxv8i16(<vscale x 8 x i16>*, i32, <vscale x 8 x i1>, <vscale x 8 x i16>) declare <vscale x 8 x half> @llvm.masked.load.nxv8f16(<vscale x 8 x half>*, i32, <vscale x 8 x i1>, <vscale x 8 x half>) +declare <vscale x 8 x bfloat> @llvm.masked.load.nxv8bf16(<vscale x 8 x bfloat>*, i32, <vscale x 8 x i1>, <vscale x 8 x bfloat>) ; 16-element contiguous loads. declare <vscale x 16 x i8> @llvm.masked.load.nxv16i8(<vscale x 16 x i8>*, i32, <vscale x 16 x i1>, <vscale x 16 x i8>) @@ -620,6 +639,7 @@ declare void @llvm.masked.store.nxv8i8 (<vscale x 8 x i8> , <vscale x 8 x i8>* , i32, <vscale x 8 x i1>) declare void @llvm.masked.store.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>*, i32, <vscale x 8 x i1>) declare void @llvm.masked.store.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>*, i32, <vscale x 8 x i1>) +declare void @llvm.masked.store.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>*, i32, <vscale x 8 x i1>) ; 16-element contiguous stores. declare void @llvm.masked.store.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>*, i32, <vscale x 16 x i1>) Index: llvm/test/CodeGen/AArch64/sve-masked-ldst-nonext.ll =================================================================== --- llvm/test/CodeGen/AArch64/sve-masked-ldst-nonext.ll +++ llvm/test/CodeGen/AArch64/sve-masked-ldst-nonext.ll @@ -179,6 +179,14 @@ ret void } +define void @masked_store_nxv8bf16(<vscale x 8 x bfloat> *%a, <vscale x 8 x bfloat> %val, <vscale x 8 x i1> %mask) nounwind { +; CHECK-LABEL: masked_store_nxv8bf16: +; CHECK-NEXT: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: ret + call void @llvm.masked.store.nxv8bf16(<vscale x 8 x bfloat> %val, <vscale x 8 x bfloat> *%a, i32 2, <vscale x 8 x i1> %mask) + ret void +} + declare <vscale x 2 x i64> @llvm.masked.load.nxv2i64(<vscale x 2 x i64>*, i32, <vscale x 2 x i1>, <vscale x 2 x i64>) declare <vscale x 4 x i32> @llvm.masked.load.nxv4i32(<vscale x 4 x i32>*, i32, <vscale x 4 x i1>, <vscale x 4 x i32>) declare <vscale x 8 x i16> @llvm.masked.load.nxv8i16(<vscale x 8 x i16>*, i32, <vscale x 8 x i1>, <vscale x 8 x i16>) @@ -203,3 +211,4 @@ declare void @llvm.masked.store.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>*, i32, <vscale x 4 x i1>) declare void @llvm.masked.store.nxv4f16(<vscale x 4 x half>, <vscale x 4 x half>*, i32, <vscale x 4 x i1>) declare void @llvm.masked.store.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>*, i32, <vscale x 8 x i1>) +declare void @llvm.masked.store.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>*, i32, <vscale x 8 x i1>) Index: llvm/test/CodeGen/AArch64/sve-intrinsics-stores.ll =================================================================== --- llvm/test/CodeGen/AArch64/sve-intrinsics-stores.ll +++ llvm/test/CodeGen/AArch64/sve-intrinsics-stores.ll @@ -377,6 +377,16 @@ ret void } +define void @stnt1h_bf16(<vscale x 8 x bfloat> %data, <vscale x 8 x i1> %pred, bfloat* %addr) { +; CHECK-LABEL: stnt1h_bf16: +; CHECK: stnt1h { z0.h }, p0, [x0] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.stnt1.nxv8bf16(<vscale x 8 x bfloat> %data, + <vscale x 8 x i1> %pred, + bfloat* %addr) + ret void +} + ; ; STNT1W ; @@ -458,5 +468,6 @@ declare void @llvm.aarch64.sve.stnt1.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i1>, i32*) declare void @llvm.aarch64.sve.stnt1.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, i64*) declare void @llvm.aarch64.sve.stnt1.nxv8f16(<vscale x 8 x half>, <vscale x 8 x i1>, half*) +declare void @llvm.aarch64.sve.stnt1.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x i1>, bfloat*) declare void @llvm.aarch64.sve.stnt1.nxv4f32(<vscale x 4 x float>, <vscale x 4 x i1>, float*) declare void @llvm.aarch64.sve.stnt1.nxv2f64(<vscale x 2 x double>, <vscale x 2 x i1>, double*) Index: llvm/test/CodeGen/AArch64/sve-intrinsics-st1.ll =================================================================== --- llvm/test/CodeGen/AArch64/sve-intrinsics-st1.ll +++ llvm/test/CodeGen/AArch64/sve-intrinsics-st1.ll @@ -75,6 +75,16 @@ ret void } +define void @st1h_bf16(<vscale x 8 x bfloat> %data, <vscale x 8 x i1> %pred, bfloat* %addr) { +; CHECK-LABEL: st1h_bf16: +; CHECK: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.nxv8bf16(<vscale x 8 x bfloat> %data, + <vscale x 8 x i1> %pred, + bfloat* %addr) + ret void +} + define void @st1h_s(<vscale x 4 x i32> %data, <vscale x 4 x i1> %pred, i16* %addr) { ; CHECK-LABEL: st1h_s: ; CHECK: st1h { z0.s }, p0, [x0] @@ -161,6 +171,7 @@ declare void @llvm.aarch64.sve.st1.nxv8i8(<vscale x 8 x i8>, <vscale x 8 x i1>, i8*) declare void @llvm.aarch64.sve.st1.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i1>, i16*) declare void @llvm.aarch64.sve.st1.nxv8f16(<vscale x 8 x half>, <vscale x 8 x i1>, half*) +declare void @llvm.aarch64.sve.st1.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x i1>, bfloat*) declare void @llvm.aarch64.sve.st1.nxv4i8(<vscale x 4 x i8>, <vscale x 4 x i1>, i8*) declare void @llvm.aarch64.sve.st1.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i1>, i16*) Index: llvm/test/CodeGen/AArch64/sve-intrinsics-st1-addressing-mode-reg-reg.ll =================================================================== --- llvm/test/CodeGen/AArch64/sve-intrinsics-st1-addressing-mode-reg-reg.ll +++ llvm/test/CodeGen/AArch64/sve-intrinsics-st1-addressing-mode-reg-reg.ll @@ -82,6 +82,17 @@ ret void } +define void @st1h_bf16(<vscale x 8 x bfloat> %data, <vscale x 8 x i1> %pred, bfloat* %a, i64 %index) { +; CHECK-LABEL: st1h_bf16: +; CHECK: st1h { z0.h }, p0, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %base = getelementptr bfloat, bfloat* %a, i64 %index + call void @llvm.aarch64.sve.st1.nxv8bf16(<vscale x 8 x bfloat> %data, + <vscale x 8 x i1> %pred, + bfloat* %base) + ret void +} + define void @st1h_s(<vscale x 4 x i32> %data, <vscale x 4 x i1> %pred, i16* %addr) { ; CHECK-LABEL: st1h_s: ; CHECK: st1h { z0.s }, p0, [x0] @@ -174,6 +185,7 @@ declare void @llvm.aarch64.sve.st1.nxv8i8(<vscale x 8 x i8>, <vscale x 8 x i1>, i8*) declare void @llvm.aarch64.sve.st1.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i1>, i16*) declare void @llvm.aarch64.sve.st1.nxv8f16(<vscale x 8 x half>, <vscale x 8 x i1>, half*) +declare void @llvm.aarch64.sve.st1.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x i1>, bfloat*) declare void @llvm.aarch64.sve.st1.nxv4i8(<vscale x 4 x i8>, <vscale x 4 x i1>, i8*) declare void @llvm.aarch64.sve.st1.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i1>, i16*) Index: llvm/test/CodeGen/AArch64/sve-intrinsics-st1-addressing-mode-reg-imm.ll =================================================================== --- llvm/test/CodeGen/AArch64/sve-intrinsics-st1-addressing-mode-reg-imm.ll +++ llvm/test/CodeGen/AArch64/sve-intrinsics-st1-addressing-mode-reg-imm.ll @@ -126,6 +126,17 @@ ret void } +define void @st1h_bf16_inbound(<vscale x 8 x bfloat> %data, <vscale x 8 x i1> %pg, bfloat* %a) { +; CHECK-LABEL: st1h_bf16_inbound: +; CHECK: st1h { z0.h }, p0, [x0, #-5, mul vl] +; CHECK-NEXT: ret + %base_scalable = bitcast bfloat* %a to <vscale x 8 x bfloat>* + %base = getelementptr <vscale x 8 x bfloat>, <vscale x 8 x bfloat>* %base_scalable, i64 -5 + %base_scalar = bitcast <vscale x 8 x bfloat>* %base to bfloat* + call void @llvm.aarch64.sve.st1.nxv8bf16(<vscale x 8 x bfloat> %data, <vscale x 8 x i1> %pg, bfloat* %base_scalar) + ret void +} + define void @st1h_s_inbound(<vscale x 4 x i32> %data, <vscale x 4 x i1> %pg, i16* %a) { ; CHECK-LABEL: st1h_s_inbound: ; CHECK: st1h { z0.s }, p0, [x0, #2, mul vl] @@ -219,6 +230,7 @@ declare void @llvm.aarch64.sve.st1.nxv8i8(<vscale x 8 x i8>, <vscale x 8 x i1>, i8*) declare void @llvm.aarch64.sve.st1.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i1>, i16*) declare void @llvm.aarch64.sve.st1.nxv8f16(<vscale x 8 x half>, <vscale x 8 x i1>, half*) +declare void @llvm.aarch64.sve.st1.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x i1>, bfloat*) declare void @llvm.aarch64.sve.st1.nxv4i8(<vscale x 4 x i8>, <vscale x 4 x i1>, i8*) declare void @llvm.aarch64.sve.st1.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i1>, i16*) Index: llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td =================================================================== --- llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -1416,6 +1416,7 @@ def : Pat<(nxv8i16 (bitconvert (nxv4i32 ZPR:$src))), (nxv8i16 ZPR:$src)>; def : Pat<(nxv8i16 (bitconvert (nxv2i64 ZPR:$src))), (nxv8i16 ZPR:$src)>; def : Pat<(nxv8i16 (bitconvert (nxv8f16 ZPR:$src))), (nxv8i16 ZPR:$src)>; + def : Pat<(nxv8i16 (bitconvert (nxv8bf16 ZPR:$src))), (nxv8i16 ZPR:$src)>; def : Pat<(nxv8i16 (bitconvert (nxv4f32 ZPR:$src))), (nxv8i16 ZPR:$src)>; def : Pat<(nxv8i16 (bitconvert (nxv2f64 ZPR:$src))), (nxv8i16 ZPR:$src)>; @@ -1560,9 +1561,10 @@ defm : pred_store<nxv4f32, nxv4i1, nontrunc_masked_store, ST1W, ST1W_IMM, am_sve_regreg_lsl2>; // 8-element contiguous stores - defm : pred_store<nxv8i16, nxv8i1, trunc_masked_store_i8, ST1B_H, ST1B_H_IMM, am_sve_regreg_lsl0>; - defm : pred_store<nxv8i16, nxv8i1, nontrunc_masked_store, ST1H, ST1H_IMM, am_sve_regreg_lsl1>; - defm : pred_store<nxv8f16, nxv8i1, nontrunc_masked_store, ST1H, ST1H_IMM, am_sve_regreg_lsl1>; + defm : pred_store<nxv8i16, nxv8i1, trunc_masked_store_i8, ST1B_H, ST1B_H_IMM, am_sve_regreg_lsl0>; + defm : pred_store<nxv8i16, nxv8i1, nontrunc_masked_store, ST1H, ST1H_IMM, am_sve_regreg_lsl1>; + defm : pred_store<nxv8f16, nxv8i1, nontrunc_masked_store, ST1H, ST1H_IMM, am_sve_regreg_lsl1>; + defm : pred_store<nxv8bf16, nxv8i1, nontrunc_masked_store, ST1H, ST1H_IMM, am_sve_regreg_lsl1>; // 16-element contiguous stores defm : pred_store<nxv16i8, nxv16i1, nontrunc_masked_store, ST1B, ST1B_IMM, am_sve_regreg_lsl0>; Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -12037,6 +12037,7 @@ case MVT::nxv8i8: case MVT::nxv8i16: case MVT::nxv8f16: + case MVT::nxv8bf16: return MVT::nxv8i16; case MVT::nxv16i8: return MVT::nxv16i8; Index: clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_stnt1-bfloat.c =================================================================== --- /dev/null +++ clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_stnt1-bfloat.c @@ -0,0 +1,34 @@ +// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -D__ARM_FEATURE_SVE_BF16 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -D__ARM_FEATURE_SVE_BF16 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -fsyntax-only -verify -verify-ignore-unexpected=error -verify-ignore-unexpected=note %s + +#include <arm_sve.h> + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 +#endif + +void test_svstnt1_bf16(svbool_t pg, bfloat16_t *base, svbfloat16_t data) +{ + // CHECK-LABEL: test_svstnt1_bf16 + // CHECK-DAG: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg) + // CHECK: call void @llvm.aarch64.sve.stnt1.nxv8bf16(<vscale x 8 x bfloat> %data, <vscale x 8 x i1> %[[PG]], bfloat* %base) + // CHECK-NEXT: ret + // expected-warning@+1 {{implicit declaration of function 'svstnt1_bf16'}} + return SVE_ACLE_FUNC(svstnt1,_bf16,,)(pg, base, data); +} + +void test_svstnt1_vnum_bf16(svbool_t pg, bfloat16_t *base, int64_t vnum, svbfloat16_t data) +{ + // CHECK-LABEL: test_svstnt1_vnum_bf16 + // CHECK-DAG: %[[BITCAST:.*]] = bitcast bfloat* %base to <vscale x 8 x bfloat>* + // CHECK-DAG: %[[GEP:.*]] = getelementptr <vscale x 8 x bfloat>, <vscale x 8 x bfloat>* %[[BITCAST]], i64 %vnum, i64 0 + // CHECK-DAG: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg) + // CHECK: call void @llvm.aarch64.sve.stnt1.nxv8bf16(<vscale x 8 x bfloat> %data, <vscale x 8 x i1> %[[PG]], bfloat* %[[GEP]]) + // CHECK-NEXT: ret + // expected-warning@+1 {{implicit declaration of function 'svstnt1_vnum_bf16'}} + return SVE_ACLE_FUNC(svstnt1_vnum,_bf16,,)(pg, base, vnum, data); +} Index: clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st1-bfloat.c =================================================================== --- /dev/null +++ clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st1-bfloat.c @@ -0,0 +1,34 @@ +// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -D__ARM_FEATURE_SVE_BF16 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -D__ARM_FEATURE_SVE_BF16 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -fsyntax-only -verify -verify-ignore-unexpected=error -verify-ignore-unexpected=note %s + +#include <arm_sve.h> + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 +#endif + +void test_svst1_bf16(svbool_t pg, bfloat16_t *base, svbfloat16_t data) +{ + // CHECK-LABEL: test_svst1_bf16 + // CHECK: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg) + // CHECK: call void @llvm.aarch64.sve.st1.nxv8bf16(<vscale x 8 x bfloat> %data, <vscale x 8 x i1> %[[PG]], bfloat* %base) + // CHECK: ret void + // expected-warning@+1 {{implicit declaration of function 'svst1_bf16'}} + return SVE_ACLE_FUNC(svst1,_bf16,,)(pg, base, data); +} + +void test_svst1_vnum_bf16(svbool_t pg, bfloat16_t *base, int64_t vnum, svbfloat16_t data) +{ + // CHECK-LABEL: test_svst1_vnum_bf16 + // CHECK-DAG: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg) + // CHECK-DAG: %[[BASE:.*]] = bitcast bfloat* %base to <vscale x 8 x bfloat>* + // CHECK-DAG: %[[GEP:.*]] = getelementptr <vscale x 8 x bfloat>, <vscale x 8 x bfloat>* %[[BASE]], i64 %vnum, i64 0 + // CHECK: call void @llvm.aarch64.sve.st1.nxv8bf16(<vscale x 8 x bfloat> %data, <vscale x 8 x i1> %[[PG]], bfloat* %[[GEP]]) + // CHECK: ret void + // expected-warning@+1 {{implicit declaration of function 'svst1_vnum_bf16'}} + return SVE_ACLE_FUNC(svst1_vnum,_bf16,,)(pg, base, vnum, data); +} Index: clang/include/clang/Basic/arm_sve.td =================================================================== --- clang/include/clang/Basic/arm_sve.td +++ clang/include/clang/Basic/arm_sve.td @@ -563,6 +563,11 @@ def SVST1W_VNUM_S : MInst<"svst1w_vnum[_{d}]", "vPCld", "l", [IsStore], MemEltTyInt32, "aarch64_sve_st1">; def SVST1W_VNUM_U : MInst<"svst1w_vnum[_{d}]", "vPGld", "Ul", [IsStore], MemEltTyInt32, "aarch64_sve_st1">; +let ArchGuard = "defined(__ARM_FEATURE_SVE_BF16)" in { + def SVST1_BF : MInst<"svst1[_{d}]", "vPpd", "b", [IsStore], MemEltTyDefault, "aarch64_sve_st1">; + def SVST1_VNUM_BF : MInst<"svst1_vnum[_{d}]", "vPpld", "b", [IsStore], MemEltTyDefault, "aarch64_sve_st1">; +} + // Store one vector (vector base) def SVST1_SCATTER_BASES_U : MInst<"svst1_scatter[_{2}base_{d}]", "vPud", "ilUiUlfd", [IsScatterStore], MemEltTyDefault, "aarch64_sve_st1_scatter_scalar_offset">; def SVST1B_SCATTER_BASES_U : MInst<"svst1b_scatter[_{2}base_{d}]", "vPud", "ilUiUl", [IsScatterStore], MemEltTyInt8, "aarch64_sve_st1_scatter_scalar_offset">; @@ -654,6 +659,11 @@ // Store one vector, with no truncation, non-temporal (scalar base, VL displacement) def SVSTNT1_VNUM : MInst<"svstnt1_vnum[_{d}]", "vPpld", "csilUcUsUiUlhfd", [IsStore], MemEltTyDefault, "aarch64_sve_stnt1">; +let ArchGuard = "defined(__ARM_FEATURE_SVE_BF16)" in { + def SVSTNT1_BF : MInst<"svstnt1[_{d}]", "vPpd", "b", [IsStore], MemEltTyDefault, "aarch64_sve_stnt1">; + def SVSTNT1_VNUM_BF : MInst<"svstnt1_vnum[_{d}]", "vPpld", "b", [IsStore], MemEltTyDefault, "aarch64_sve_stnt1">; +} + //////////////////////////////////////////////////////////////////////////////// // Prefetches
_______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits