https://github.com/llvmbot created https://github.com/llvm/llvm-project/pull/129996
Backport b673a59c9ae5 ab811e75734a Requested by: @davemgreen >From e0f31d9f2345b4ddf4ac96e8275524aac5e827d4 Mon Sep 17 00:00:00 2001 From: David Green <david.gr...@arm.com> Date: Wed, 5 Mar 2025 11:23:33 +0000 Subject: [PATCH 1/2] [AArch64] Add BE test coverage for popcount. NFC For #129843 (cherry picked from commit b673a59c9ae5583aa08a8d34a48f9409b660d826) --- llvm/test/CodeGen/AArch64/arm64-popcnt.ll | 161 ++++++++++++++++++++++ llvm/test/CodeGen/AArch64/popcount.ll | 104 ++++++++++++++ 2 files changed, 265 insertions(+) diff --git a/llvm/test/CodeGen/AArch64/arm64-popcnt.ll b/llvm/test/CodeGen/AArch64/arm64-popcnt.ll index ad0904ff98080..369667ec33f66 100644 --- a/llvm/test/CodeGen/AArch64/arm64-popcnt.ll +++ b/llvm/test/CodeGen/AArch64/arm64-popcnt.ll @@ -2,6 +2,7 @@ ; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s ; RUN: llc < %s -mtriple=aarch64 -mattr -neon -aarch64-neon-syntax=apple | FileCheck -check-prefix=CHECK-NONEON %s ; RUN: llc < %s -mtriple=aarch64 -mattr +cssc -aarch64-neon-syntax=apple | FileCheck -check-prefix=CHECK-CSSC %s +; RUN: llc < %s -mtriple=aarch64_be-none-eabi | FileCheck %s --check-prefix=CHECK-BE define i32 @cnt32_advsimd(i32 %x) nounwind readnone { ; CHECK-LABEL: cnt32_advsimd: @@ -32,6 +33,14 @@ define i32 @cnt32_advsimd(i32 %x) nounwind readnone { ; CHECK-CSSC: // %bb.0: ; CHECK-CSSC-NEXT: cnt w0, w0 ; CHECK-CSSC-NEXT: ret +; +; CHECK-BE-LABEL: cnt32_advsimd: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: fmov s0, w0 +; CHECK-BE-NEXT: cnt v0.8b, v0.8b +; CHECK-BE-NEXT: addv b0, v0.8b +; CHECK-BE-NEXT: fmov w0, s0 +; CHECK-BE-NEXT: ret %cnt = tail call i32 @llvm.ctpop.i32(i32 %x) ret i32 %cnt } @@ -69,6 +78,16 @@ define i32 @cnt32_advsimd_2(<2 x i32> %x) { ; CHECK-CSSC-NEXT: fmov w8, s0 ; CHECK-CSSC-NEXT: cnt w0, w8 ; CHECK-CSSC-NEXT: ret +; +; CHECK-BE-LABEL: cnt32_advsimd_2: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: rev64 v0.2s, v0.2s +; CHECK-BE-NEXT: fmov w8, s0 +; CHECK-BE-NEXT: fmov s0, w8 +; 
CHECK-BE-NEXT: cnt v0.8b, v0.8b +; CHECK-BE-NEXT: addv b0, v0.8b +; CHECK-BE-NEXT: fmov w0, s0 +; CHECK-BE-NEXT: ret %1 = extractelement <2 x i32> %x, i64 0 %2 = tail call i32 @llvm.ctpop.i32(i32 %1) ret i32 %2 @@ -103,6 +122,16 @@ define i64 @cnt64_advsimd(i64 %x) nounwind readnone { ; CHECK-CSSC: // %bb.0: ; CHECK-CSSC-NEXT: cnt x0, x0 ; CHECK-CSSC-NEXT: ret +; +; CHECK-BE-LABEL: cnt64_advsimd: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: fmov d0, x0 +; CHECK-BE-NEXT: rev64 v0.8b, v0.8b +; CHECK-BE-NEXT: cnt v0.8b, v0.8b +; CHECK-BE-NEXT: addv b0, v0.8b +; CHECK-BE-NEXT: rev64 v0.8b, v0.8b +; CHECK-BE-NEXT: fmov x0, d0 +; CHECK-BE-NEXT: ret %cnt = tail call i64 @llvm.ctpop.i64(i64 %x) ret i64 %cnt } @@ -147,6 +176,22 @@ define i32 @cnt32(i32 %x) nounwind readnone noimplicitfloat { ; CHECK-CSSC: // %bb.0: ; CHECK-CSSC-NEXT: cnt w0, w0 ; CHECK-CSSC-NEXT: ret +; +; CHECK-BE-LABEL: cnt32: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: lsr w9, w0, #1 +; CHECK-BE-NEXT: mov w8, #16843009 // =0x1010101 +; CHECK-BE-NEXT: and w9, w9, #0x55555555 +; CHECK-BE-NEXT: sub w9, w0, w9 +; CHECK-BE-NEXT: lsr w10, w9, #2 +; CHECK-BE-NEXT: and w9, w9, #0x33333333 +; CHECK-BE-NEXT: and w10, w10, #0x33333333 +; CHECK-BE-NEXT: add w9, w9, w10 +; CHECK-BE-NEXT: add w9, w9, w9, lsr #4 +; CHECK-BE-NEXT: and w9, w9, #0xf0f0f0f +; CHECK-BE-NEXT: mul w8, w9, w8 +; CHECK-BE-NEXT: lsr w0, w8, #24 +; CHECK-BE-NEXT: ret %cnt = tail call i32 @llvm.ctpop.i32(i32 %x) ret i32 %cnt } @@ -188,6 +233,22 @@ define i64 @cnt64(i64 %x) nounwind readnone noimplicitfloat { ; CHECK-CSSC: // %bb.0: ; CHECK-CSSC-NEXT: cnt x0, x0 ; CHECK-CSSC-NEXT: ret +; +; CHECK-BE-LABEL: cnt64: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: lsr x9, x0, #1 +; CHECK-BE-NEXT: mov x8, #72340172838076673 // =0x101010101010101 +; CHECK-BE-NEXT: and x9, x9, #0x5555555555555555 +; CHECK-BE-NEXT: sub x9, x0, x9 +; CHECK-BE-NEXT: lsr x10, x9, #2 +; CHECK-BE-NEXT: and x9, x9, #0x3333333333333333 +; CHECK-BE-NEXT: and x10, x10, #0x3333333333333333 +; 
CHECK-BE-NEXT: add x9, x9, x10 +; CHECK-BE-NEXT: add x9, x9, x9, lsr #4 +; CHECK-BE-NEXT: and x9, x9, #0xf0f0f0f0f0f0f0f +; CHECK-BE-NEXT: mul x8, x9, x8 +; CHECK-BE-NEXT: lsr x0, x8, #56 +; CHECK-BE-NEXT: ret %cnt = tail call i64 @llvm.ctpop.i64(i64 %x) ret i64 %cnt } @@ -215,6 +276,14 @@ define i32 @ctpop_eq_one(i64 %x) nounwind readnone { ; CHECK-CSSC-NEXT: cmp x8, #1 ; CHECK-CSSC-NEXT: cset w0, eq ; CHECK-CSSC-NEXT: ret +; +; CHECK-BE-LABEL: ctpop_eq_one: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: sub x8, x0, #1 +; CHECK-BE-NEXT: eor x9, x0, x8 +; CHECK-BE-NEXT: cmp x9, x8 +; CHECK-BE-NEXT: cset w0, hi +; CHECK-BE-NEXT: ret %count = tail call i64 @llvm.ctpop.i64(i64 %x) %cmp = icmp eq i64 %count, 1 %conv = zext i1 %cmp to i32 @@ -244,6 +313,14 @@ define i32 @ctpop_ne_one(i64 %x) nounwind readnone { ; CHECK-CSSC-NEXT: cmp x8, #1 ; CHECK-CSSC-NEXT: cset w0, ne ; CHECK-CSSC-NEXT: ret +; +; CHECK-BE-LABEL: ctpop_ne_one: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: sub x8, x0, #1 +; CHECK-BE-NEXT: eor x9, x0, x8 +; CHECK-BE-NEXT: cmp x9, x8 +; CHECK-BE-NEXT: cset w0, ls +; CHECK-BE-NEXT: ret %count = tail call i64 @llvm.ctpop.i64(i64 %x) %cmp = icmp ne i64 %count, 1 %conv = zext i1 %cmp to i32 @@ -273,6 +350,14 @@ define i1 @ctpop32_ne_one(i32 %x) nounwind readnone { ; CHECK-CSSC-NEXT: cmp w8, #1 ; CHECK-CSSC-NEXT: cset w0, ne ; CHECK-CSSC-NEXT: ret +; +; CHECK-BE-LABEL: ctpop32_ne_one: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: sub w8, w0, #1 +; CHECK-BE-NEXT: eor w9, w0, w8 +; CHECK-BE-NEXT: cmp w9, w8 +; CHECK-BE-NEXT: cset w0, ls +; CHECK-BE-NEXT: ret %count = tail call i32 @llvm.ctpop.i32(i32 %x) %cmp = icmp ne i32 %count, 1 ret i1 %cmp @@ -299,6 +384,13 @@ define i1 @ctpop32_eq_one_nonzero(i32 %x) { ; CHECK-CSSC-NEXT: tst w0, w8 ; CHECK-CSSC-NEXT: cset w0, eq ; CHECK-CSSC-NEXT: ret +; +; CHECK-BE-LABEL: ctpop32_eq_one_nonzero: +; CHECK-BE: // %bb.0: // %entry +; CHECK-BE-NEXT: sub w8, w0, #1 +; CHECK-BE-NEXT: tst w0, w8 +; CHECK-BE-NEXT: cset w0, eq +; CHECK-BE-NEXT: 
ret entry: %popcnt = call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 %x) %cmp = icmp eq i32 %popcnt, 1 @@ -326,11 +418,80 @@ define i1 @ctpop32_ne_one_nonzero(i32 %x) { ; CHECK-CSSC-NEXT: tst w0, w8 ; CHECK-CSSC-NEXT: cset w0, ne ; CHECK-CSSC-NEXT: ret +; +; CHECK-BE-LABEL: ctpop32_ne_one_nonzero: +; CHECK-BE: // %bb.0: // %entry +; CHECK-BE-NEXT: sub w8, w0, #1 +; CHECK-BE-NEXT: tst w0, w8 +; CHECK-BE-NEXT: cset w0, ne +; CHECK-BE-NEXT: ret entry: %popcnt = tail call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 %x) %cmp = icmp ne i32 %popcnt, 1 ret i1 %cmp } +define i128 @cnt128(i128 %x) nounwind readnone { +; CHECK-LABEL: cnt128: +; CHECK: // %bb.0: +; CHECK-NEXT: fmov d0, x0 +; CHECK-NEXT: mov.d v0[1], x1 +; CHECK-NEXT: cnt.16b v0, v0 +; CHECK-NEXT: addv.16b b0, v0 +; CHECK-NEXT: mov.d x1, v0[1] +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret +; +; CHECK-NONEON-LABEL: cnt128: +; CHECK-NONEON: // %bb.0: +; CHECK-NONEON-NEXT: lsr x9, x0, #1 +; CHECK-NONEON-NEXT: lsr x10, x1, #1 +; CHECK-NONEON-NEXT: mov x8, #72340172838076673 // =0x101010101010101 +; CHECK-NONEON-NEXT: and x9, x9, #0x5555555555555555 +; CHECK-NONEON-NEXT: and x10, x10, #0x5555555555555555 +; CHECK-NONEON-NEXT: sub x9, x0, x9 +; CHECK-NONEON-NEXT: sub x10, x1, x10 +; CHECK-NONEON-NEXT: mov x1, xzr +; CHECK-NONEON-NEXT: lsr x11, x9, #2 +; CHECK-NONEON-NEXT: lsr x12, x10, #2 +; CHECK-NONEON-NEXT: and x9, x9, #0x3333333333333333 +; CHECK-NONEON-NEXT: and x10, x10, #0x3333333333333333 +; CHECK-NONEON-NEXT: and x11, x11, #0x3333333333333333 +; CHECK-NONEON-NEXT: add x9, x9, x11 +; CHECK-NONEON-NEXT: and x11, x12, #0x3333333333333333 +; CHECK-NONEON-NEXT: add x9, x9, x9, lsr #4 +; CHECK-NONEON-NEXT: add x10, x10, x11 +; CHECK-NONEON-NEXT: add x10, x10, x10, lsr #4 +; CHECK-NONEON-NEXT: and x9, x9, #0xf0f0f0f0f0f0f0f +; CHECK-NONEON-NEXT: mul x9, x9, x8 +; CHECK-NONEON-NEXT: and x10, x10, #0xf0f0f0f0f0f0f0f +; CHECK-NONEON-NEXT: mul x8, x10, x8 +; CHECK-NONEON-NEXT: lsr x9, x9, #56 +; CHECK-NONEON-NEXT: add x0, 
x9, x8, lsr #56 +; CHECK-NONEON-NEXT: ret +; +; CHECK-CSSC-LABEL: cnt128: +; CHECK-CSSC: // %bb.0: +; CHECK-CSSC-NEXT: cnt x8, x1 +; CHECK-CSSC-NEXT: cnt x9, x0 +; CHECK-CSSC-NEXT: mov x1, xzr +; CHECK-CSSC-NEXT: add x0, x9, x8 +; CHECK-CSSC-NEXT: ret +; +; CHECK-BE-LABEL: cnt128: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: fmov d0, x0 +; CHECK-BE-NEXT: mov v0.d[1], x1 +; CHECK-BE-NEXT: rev64 v0.16b, v0.16b +; CHECK-BE-NEXT: cnt v0.16b, v0.16b +; CHECK-BE-NEXT: addv b0, v0.16b +; CHECK-BE-NEXT: rev64 v0.16b, v0.16b +; CHECK-BE-NEXT: mov x1, v0.d[1] +; CHECK-BE-NEXT: fmov x0, d0 +; CHECK-BE-NEXT: ret + %cnt = tail call i128 @llvm.ctpop.i128(i128 %x) + ret i128 %cnt +} + declare i32 @llvm.ctpop.i32(i32) nounwind readnone declare i64 @llvm.ctpop.i64(i64) nounwind readnone diff --git a/llvm/test/CodeGen/AArch64/popcount.ll b/llvm/test/CodeGen/AArch64/popcount.ll index 89b1ac0a0edf1..6cc925f0ae91f 100644 --- a/llvm/test/CodeGen/AArch64/popcount.ll +++ b/llvm/test/CodeGen/AArch64/popcount.ll @@ -3,6 +3,7 @@ ; RUN: llc < %s -mtriple=aarch64-unknown-unknown -mattr=+neon | FileCheck %s --check-prefixes=CHECK,NEON ; RUN: llc < %s -mtriple=aarch64-unknown-unknown -mattr=+neon,+dotprod | FileCheck %s --check-prefixes=CHECK,DOT ; RUN: llc < %s -mtriple=aarch64-unknown-unknown -mattr=+sve | FileCheck %s --check-prefixes=CHECK,SVE +; RUN: llc < %s -mtriple=aarch64_be-unknown-unknown | FileCheck %s --check-prefixes=BE ; RUN: llc < %s -global-isel -mtriple=aarch64-unknown-unknown | FileCheck %s --check-prefix=GISEL ; RUN: llc < %s -O0 -global-isel -mtriple=aarch64-unknown-unknown | FileCheck %s --check-prefix=GISELO0 ; RUN: llc < %s -global-isel -mtriple=aarch64-unknown-unknown -mattr=+neon | FileCheck %s --check-prefixes=GISEL,NEON-GISEL @@ -32,6 +33,18 @@ define i8 @popcount128(ptr nocapture nonnull readonly %0) { ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret ; +; BE-LABEL: popcount128: +; BE: // %bb.0: // %Entry +; BE-NEXT: ldr d0, [x0] +; BE-NEXT: add x8, x0, #8 +; BE-NEXT: ld1 { 
v0.d }[1], [x8] +; BE-NEXT: rev64 v0.16b, v0.16b +; BE-NEXT: cnt v0.16b, v0.16b +; BE-NEXT: addv b0, v0.16b +; BE-NEXT: rev32 v0.16b, v0.16b +; BE-NEXT: mov w0, v0.s[3] +; BE-NEXT: ret +; ; GISEL-LABEL: popcount128: ; GISEL: // %bb.0: // %Entry ; GISEL-NEXT: ldr q0, [x0] @@ -111,6 +124,27 @@ define i16 @popcount256(ptr nocapture nonnull readonly %0) { ; CHECK-NEXT: add w0, w9, w8 ; CHECK-NEXT: ret ; +; BE-LABEL: popcount256: +; BE: // %bb.0: // %Entry +; BE-NEXT: ldr d0, [x0] +; BE-NEXT: ldr d1, [x0, #16] +; BE-NEXT: add x8, x0, #24 +; BE-NEXT: add x9, x0, #8 +; BE-NEXT: ld1 { v0.d }[1], [x9] +; BE-NEXT: ld1 { v1.d }[1], [x8] +; BE-NEXT: rev64 v0.16b, v0.16b +; BE-NEXT: rev64 v1.16b, v1.16b +; BE-NEXT: cnt v0.16b, v0.16b +; BE-NEXT: cnt v1.16b, v1.16b +; BE-NEXT: addv b0, v0.16b +; BE-NEXT: addv b1, v1.16b +; BE-NEXT: rev32 v0.16b, v0.16b +; BE-NEXT: rev32 v1.16b, v1.16b +; BE-NEXT: mov w8, v0.s[3] +; BE-NEXT: mov w9, v1.s[3] +; BE-NEXT: add w0, w9, w8 +; BE-NEXT: ret +; ; GISEL-LABEL: popcount256: ; GISEL: // %bb.0: // %Entry ; GISEL-NEXT: ldp x8, x9, [x0] @@ -199,6 +233,18 @@ define <1 x i128> @popcount1x128(<1 x i128> %0) { ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret ; +; BE-LABEL: popcount1x128: +; BE: // %bb.0: // %Entry +; BE-NEXT: fmov d0, x0 +; BE-NEXT: mov v0.d[1], x1 +; BE-NEXT: rev64 v0.16b, v0.16b +; BE-NEXT: cnt v0.16b, v0.16b +; BE-NEXT: addv b0, v0.16b +; BE-NEXT: rev64 v0.16b, v0.16b +; BE-NEXT: mov x1, v0.d[1] +; BE-NEXT: fmov x0, d0 +; BE-NEXT: ret +; ; GISEL-LABEL: popcount1x128: ; GISEL: // %bb.0: // %Entry ; GISEL-NEXT: mov v0.d[0], x0 @@ -266,6 +312,17 @@ define <2 x i64> @popcount2x64(<2 x i64> %0) { ; SVE-NEXT: uaddlp v0.2d, v0.4s ; SVE-NEXT: ret ; +; BE-LABEL: popcount2x64: +; BE: // %bb.0: // %Entry +; BE-NEXT: rev64 v0.16b, v0.16b +; BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; BE-NEXT: cnt v0.16b, v0.16b +; BE-NEXT: uaddlp v0.8h, v0.16b +; BE-NEXT: uaddlp v0.4s, v0.8h +; BE-NEXT: uaddlp v0.2d, v0.4s +; BE-NEXT: ext v0.16b, v0.16b, v0.16b, 
#8 +; BE-NEXT: ret +; ; GISELO0-LABEL: popcount2x64: ; GISELO0: // %bb.0: // %Entry ; GISELO0-NEXT: cnt v0.16b, v0.16b @@ -326,6 +383,15 @@ define <1 x i64> @popcount1x64(<1 x i64> %0) { ; CHECK-NEXT: uaddlp v0.1d, v0.2s ; CHECK-NEXT: ret ; +; BE-LABEL: popcount1x64: +; BE: // %bb.0: // %Entry +; BE-NEXT: rev64 v0.8b, v0.8b +; BE-NEXT: cnt v0.8b, v0.8b +; BE-NEXT: uaddlp v0.4h, v0.8b +; BE-NEXT: uaddlp v0.2s, v0.4h +; BE-NEXT: uaddlp v0.1d, v0.2s +; BE-NEXT: ret +; ; GISEL-LABEL: popcount1x64: ; GISEL: // %bb.0: // %Entry ; GISEL-NEXT: cnt v0.8b, v0.8b @@ -382,6 +448,17 @@ define <4 x i32> @popcount4x32(<4 x i32> %0) { ; SVE-NEXT: uaddlp v0.4s, v0.8h ; SVE-NEXT: ret ; +; BE-LABEL: popcount4x32: +; BE: // %bb.0: // %Entry +; BE-NEXT: rev64 v0.16b, v0.16b +; BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; BE-NEXT: cnt v0.16b, v0.16b +; BE-NEXT: uaddlp v0.8h, v0.16b +; BE-NEXT: uaddlp v0.4s, v0.8h +; BE-NEXT: rev64 v0.4s, v0.4s +; BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; BE-NEXT: ret +; ; GISELO0-LABEL: popcount4x32: ; GISELO0: // %bb.0: // %Entry ; GISELO0-NEXT: cnt v0.16b, v0.16b @@ -449,6 +526,15 @@ define <2 x i32> @popcount2x32(<2 x i32> %0) { ; SVE-NEXT: uaddlp v0.2s, v0.4h ; SVE-NEXT: ret ; +; BE-LABEL: popcount2x32: +; BE: // %bb.0: // %Entry +; BE-NEXT: rev64 v0.8b, v0.8b +; BE-NEXT: cnt v0.8b, v0.8b +; BE-NEXT: uaddlp v0.4h, v0.8b +; BE-NEXT: uaddlp v0.2s, v0.4h +; BE-NEXT: rev64 v0.2s, v0.2s +; BE-NEXT: ret +; ; GISELO0-LABEL: popcount2x32: ; GISELO0: // %bb.0: // %Entry ; GISELO0-NEXT: cnt v0.8b, v0.8b @@ -498,6 +584,16 @@ define <8 x i16> @popcount8x16(<8 x i16> %0) { ; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: ret ; +; BE-LABEL: popcount8x16: +; BE: // %bb.0: // %Entry +; BE-NEXT: rev64 v0.16b, v0.16b +; BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; BE-NEXT: cnt v0.16b, v0.16b +; BE-NEXT: uaddlp v0.8h, v0.16b +; BE-NEXT: rev64 v0.8h, v0.8h +; BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; BE-NEXT: ret +; ; GISEL-LABEL: popcount8x16: ; GISEL: // %bb.0: // 
%Entry ; GISEL-NEXT: cnt v0.16b, v0.16b @@ -529,6 +625,14 @@ define <4 x i16> @popcount4x16(<4 x i16> %0) { ; CHECK-NEXT: uaddlp v0.4h, v0.8b ; CHECK-NEXT: ret ; +; BE-LABEL: popcount4x16: +; BE: // %bb.0: // %Entry +; BE-NEXT: rev64 v0.8b, v0.8b +; BE-NEXT: cnt v0.8b, v0.8b +; BE-NEXT: uaddlp v0.4h, v0.8b +; BE-NEXT: rev64 v0.4h, v0.4h +; BE-NEXT: ret +; ; GISEL-LABEL: popcount4x16: ; GISEL: // %bb.0: // %Entry ; GISEL-NEXT: cnt v0.8b, v0.8b >From 9c970ac147e674f5384c48c065ed5d04399bff1d Mon Sep 17 00:00:00 2001 From: David Green <david.gr...@arm.com> Date: Wed, 5 Mar 2025 20:08:34 +0000 Subject: [PATCH 2/2] [AArch64] Fix BE popcount casts. (#129879) A bitcast, being defined as a load and a store, can change the lane order. We need to use an NVCAST instead to keep the lanes coming out of the VADDV the same in big-endian. Extracting from a v2i64 vector keeps the types of the NVCAST legal, and also allows us to replace a lane mov with a mov 0. Fixes #129843 (cherry picked from commit ab811e75734a77247dae6df1579fa6f29394f200) --- .../Target/AArch64/AArch64ISelLowering.cpp | 10 ++++++++-- llvm/test/CodeGen/AArch64/arm64-popcnt.ll | 8 +++----- llvm/test/CodeGen/AArch64/parity.ll | 2 +- llvm/test/CodeGen/AArch64/popcount.ll | 19 +++++++++---------- 4 files changed, 21 insertions(+), 18 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index b5cca88b6b511..ca357382c472d 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -10783,7 +10783,10 @@ SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op, if (VT == MVT::i32) AddV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, AddV, DAG.getConstant(0, DL, MVT::i64)); - AddV = DAG.getNode(ISD::BITCAST, DL, VT, AddV); + else + AddV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, + DAG.getNode(AArch64ISD::NVCAST, DL, MVT::v1i64, AddV), + DAG.getConstant(0, DL, MVT::i64)); if (IsParity)
AddV = DAG.getNode(ISD::AND, DL, VT, AddV, DAG.getConstant(1, DL, VT)); return AddV; @@ -10792,7 +10795,10 @@ SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op, SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v16i8, Val); SDValue AddV = DAG.getNode(AArch64ISD::UADDV, DL, MVT::v16i8, CtPop); - AddV = DAG.getNode(ISD::BITCAST, DL, VT, AddV); + AddV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, + DAG.getNode(AArch64ISD::NVCAST, DL, MVT::v2i64, AddV), + DAG.getConstant(0, DL, MVT::i64)); + AddV = DAG.getZExtOrTrunc(AddV, DL, VT); if (IsParity) AddV = DAG.getNode(ISD::AND, DL, VT, AddV, DAG.getConstant(1, DL, VT)); return AddV; diff --git a/llvm/test/CodeGen/AArch64/arm64-popcnt.ll b/llvm/test/CodeGen/AArch64/arm64-popcnt.ll index 369667ec33f66..d06e42f5405ef 100644 --- a/llvm/test/CodeGen/AArch64/arm64-popcnt.ll +++ b/llvm/test/CodeGen/AArch64/arm64-popcnt.ll @@ -129,7 +129,6 @@ define i64 @cnt64_advsimd(i64 %x) nounwind readnone { ; CHECK-BE-NEXT: rev64 v0.8b, v0.8b ; CHECK-BE-NEXT: cnt v0.8b, v0.8b ; CHECK-BE-NEXT: addv b0, v0.8b -; CHECK-BE-NEXT: rev64 v0.8b, v0.8b ; CHECK-BE-NEXT: fmov x0, d0 ; CHECK-BE-NEXT: ret %cnt = tail call i64 @llvm.ctpop.i64(i64 %x) @@ -436,9 +435,9 @@ define i128 @cnt128(i128 %x) nounwind readnone { ; CHECK: // %bb.0: ; CHECK-NEXT: fmov d0, x0 ; CHECK-NEXT: mov.d v0[1], x1 +; CHECK-NEXT: mov x1, xzr ; CHECK-NEXT: cnt.16b v0, v0 ; CHECK-NEXT: addv.16b b0, v0 -; CHECK-NEXT: mov.d x1, v0[1] ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret ; @@ -481,13 +480,12 @@ define i128 @cnt128(i128 %x) nounwind readnone { ; CHECK-BE-LABEL: cnt128: ; CHECK-BE: // %bb.0: ; CHECK-BE-NEXT: fmov d0, x0 +; CHECK-BE-NEXT: mov x0, xzr ; CHECK-BE-NEXT: mov v0.d[1], x1 ; CHECK-BE-NEXT: rev64 v0.16b, v0.16b ; CHECK-BE-NEXT: cnt v0.16b, v0.16b ; CHECK-BE-NEXT: addv b0, v0.16b -; CHECK-BE-NEXT: rev64 v0.16b, v0.16b -; CHECK-BE-NEXT: mov x1, v0.d[1] -; CHECK-BE-NEXT: fmov x0, d0 +; CHECK-BE-NEXT: fmov x1, d0 ; CHECK-BE-NEXT: ret %cnt = tail call i128 
@llvm.ctpop.i128(i128 %x) ret i128 %cnt diff --git a/llvm/test/CodeGen/AArch64/parity.ll b/llvm/test/CodeGen/AArch64/parity.ll index 1e51793fb5f91..91515277cb3f6 100644 --- a/llvm/test/CodeGen/AArch64/parity.ll +++ b/llvm/test/CodeGen/AArch64/parity.ll @@ -159,7 +159,7 @@ define i32 @parity_64_trunc(i64 %x) { ; CHECK-NEXT: fmov d0, x0 ; CHECK-NEXT: cnt v0.8b, v0.8b ; CHECK-NEXT: addv b0, v0.8b -; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: and w0, w8, #0x1 ; CHECK-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/popcount.ll b/llvm/test/CodeGen/AArch64/popcount.ll index 6cc925f0ae91f..e664e73594923 100644 --- a/llvm/test/CodeGen/AArch64/popcount.ll +++ b/llvm/test/CodeGen/AArch64/popcount.ll @@ -41,8 +41,8 @@ define i8 @popcount128(ptr nocapture nonnull readonly %0) { ; BE-NEXT: rev64 v0.16b, v0.16b ; BE-NEXT: cnt v0.16b, v0.16b ; BE-NEXT: addv b0, v0.16b -; BE-NEXT: rev32 v0.16b, v0.16b -; BE-NEXT: mov w0, v0.s[3] +; BE-NEXT: rev64 v0.4s, v0.4s +; BE-NEXT: mov w0, v0.s[1] ; BE-NEXT: ret ; ; GISEL-LABEL: popcount128: @@ -138,10 +138,10 @@ define i16 @popcount256(ptr nocapture nonnull readonly %0) { ; BE-NEXT: cnt v1.16b, v1.16b ; BE-NEXT: addv b0, v0.16b ; BE-NEXT: addv b1, v1.16b -; BE-NEXT: rev32 v0.16b, v0.16b -; BE-NEXT: rev32 v1.16b, v1.16b -; BE-NEXT: mov w8, v0.s[3] -; BE-NEXT: mov w9, v1.s[3] +; BE-NEXT: rev64 v0.4s, v0.4s +; BE-NEXT: rev64 v1.4s, v1.4s +; BE-NEXT: mov w8, v0.s[1] +; BE-NEXT: mov w9, v1.s[1] ; BE-NEXT: add w0, w9, w8 ; BE-NEXT: ret ; @@ -227,22 +227,21 @@ define <1 x i128> @popcount1x128(<1 x i128> %0) { ; CHECK: // %bb.0: // %Entry ; CHECK-NEXT: fmov d0, x0 ; CHECK-NEXT: mov v0.d[1], x1 +; CHECK-NEXT: mov x1, xzr ; CHECK-NEXT: cnt v0.16b, v0.16b ; CHECK-NEXT: addv b0, v0.16b -; CHECK-NEXT: mov x1, v0.d[1] ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret ; ; BE-LABEL: popcount1x128: ; BE: // %bb.0: // %Entry ; BE-NEXT: fmov d0, x0 +; BE-NEXT: mov x0, xzr ; BE-NEXT: mov v0.d[1], x1 ; BE-NEXT: rev64 v0.16b, v0.16b ; BE-NEXT: 
cnt v0.16b, v0.16b ; BE-NEXT: addv b0, v0.16b -; BE-NEXT: rev64 v0.16b, v0.16b -; BE-NEXT: mov x1, v0.d[1] -; BE-NEXT: fmov x0, d0 +; BE-NEXT: fmov x1, d0 ; BE-NEXT: ret ; ; GISEL-LABEL: popcount1x128: _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits