Author: Amy Kwan Date: 2026-06-01T06:57:31Z New Revision: ad9524f3865770ed410bbc7d91fa50dc2d223027
URL: https://github.com/llvm/llvm-project/commit/ad9524f3865770ed410bbc7d91fa50dc2d223027 DIFF: https://github.com/llvm/llvm-project/commit/ad9524f3865770ed410bbc7d91fa50dc2d223027.diff LOG: [PowerPC] Fix i128 vcmpequb optimization for loads with range metadata and small constants (#196801) The combine introduced in 55aff64d2c6ef50d2ed725d7dd1fb34080486237 lowers scalar i128 compares into vector compares by reissuing the original loads as v16i8 loads. However, the combine was reusing the original MachineMemOperand without modification. If the original i128 load carries !range metadata, the MMO encodes that range using i128 values. Reusing this MMO for a v16i8 load is incorrect as range metadata is only valid for integer scalar types and its bitwidth must match the memory VT. This patch fixes this by creating a new MachineMemOperand for the vector load. Additionally, we restrict the combine for constant operands to avoid cases that are better handled by scalar lowering. Small constants (those that fit within 16 bits) are excluded to prevent generating suboptimal vector compares. (cherry picked from commit 1907b586384b51be2f6b44490c46941f08ff6974) Added: llvm/test/CodeGen/PowerPC/ppc-i128-cmp.ll Modified: llvm/lib/Target/PowerPC/PPCISelLowering.cpp Removed: ################################################################################ diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index bdba040529d00..56aa33fdd4098 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -15586,17 +15586,27 @@ SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N, } // The function check a i128 load can convert to 16i8 load for Vcmpequb. 
-static bool canConvertToVcmpequb(SDValue &LHS, SDValue &RHS) { +static bool canConvertToVcmpequb(SDValue &LHS, SDValue &RHS, bool IsPPC64) { - auto isValidForConvert = [](SDValue &Operand) { + auto isValidForConvert = [IsPPC64](SDValue &Operand) { if (!Operand.hasOneUse()) return false; if (Operand.getValueType() != MVT::i128) return false; - if (Operand.getOpcode() == ISD::Constant) + if (Operand.getOpcode() == ISD::Constant) { + auto *C = cast<ConstantSDNode>(Operand); + const APInt &Val = C->getAPIntValue(); + // On PPC64, comparing an i128 value loaded from memory against a + // constant smaller than 2^16 is usually better left to scalar lowering. + // In that case, the compare can be lowered using xori (since xori has a + // 16-bit immediate field), which is cheaper than materializing a vector + // constant and using vcmpequb. + if (IsPPC64 && Val.ult(1ULL << 16)) + return false; return true; + } auto *LoadNode = dyn_cast<LoadSDNode>(Operand); if (!LoadNode) @@ -15647,10 +15657,19 @@ SDValue convertTwoLoadsAndCmpToVCMPEQUB(SelectionDAG &DAG, SDNode *N, assert(Operand.getOpcode() == ISD::LOAD && "Must be LoadSDNode here."); auto *LoadNode = cast<LoadSDNode>(Operand); - SDValue NewLoad = - DAG.getLoad(MVT::v16i8, DL, LoadNode->getChain(), - LoadNode->getBasePtr(), LoadNode->getMemOperand()); - DAG.ReplaceAllUsesOfValueWith(Operand.getValue(1), NewLoad.getValue(1)); + // Create a new MachineMemOperand without range metadata. + // Range metadata is only valid for integer scalar types, not vectors. + // The original i128 load may have range metadata, but when we convert + // to v16i8, that metadata is no longer semantically valid. 
+ MachineMemOperand *MMO = LoadNode->getMemOperand(); + MachineFunction &MF = DAG.getMachineFunction(); + MachineMemOperand *NewMMO = MF.getMachineMemOperand( + MMO->getPointerInfo(), MMO->getFlags(), MMO->getSize(), MMO->getAlign(), + MMO->getAAInfo(), nullptr, MMO->getSyncScopeID(), + MMO->getSuccessOrdering(), MMO->getFailureOrdering()); + SDValue NewLoad = DAG.getLoad(MVT::v16i8, DL, LoadNode->getChain(), + LoadNode->getBasePtr(), NewMMO); + DAG.ReplaceAllUsesOfValueWith(SDValue(LoadNode, 1), NewLoad.getValue(1)); return NewLoad; }; @@ -15815,7 +15834,8 @@ SDValue PPCTargetLowering::combineSetCC(SDNode *N, // This transformation replaces memcmp(a, b, 16) with two vector loads // and one vector compare instruction. - if (Subtarget.hasAltivec() && canConvertToVcmpequb(LHS, RHS)) + if (Subtarget.hasAltivec() && + canConvertToVcmpequb(LHS, RHS, Subtarget.isPPC64())) return convertTwoLoadsAndCmpToVCMPEQUB(DCI.DAG, N, SDLoc(N)); } diff --git a/llvm/test/CodeGen/PowerPC/ppc-i128-cmp.ll b/llvm/test/CodeGen/PowerPC/ppc-i128-cmp.ll new file mode 100644 index 0000000000000..c661d7da690b4 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/ppc-i128-cmp.ll @@ -0,0 +1,282 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mcpu=pwr8 -ppc-asm-full-reg-names -mtriple=powerpc64-ibm-aix < %s | \ +; RUN: FileCheck %s --check-prefixes=COMMON,CHECK-AIX64 +; RUN: llc -mcpu=pwr8 -ppc-asm-full-reg-names -mtriple=powerpc64le-unknown-linux-gnu < %s | \ +; RUN: FileCheck %s --check-prefixes=COMMON,CHECK-LINUX +; RUN: llc -mcpu=pwr8 -ppc-asm-full-reg-names -mtriple=powerpc-ibm-aix < %s | \ +; RUN: FileCheck %s --check-prefixes=COMMON,CHECK-AIX32 + +define i1 @test1() { +; CHECK-AIX64-LABEL: test1: +; CHECK-AIX64: # %bb.0: # %bb +; CHECK-AIX64-NEXT: ld r3, 0(0) +; CHECK-AIX64-NEXT: ld r4, 8(0) +; CHECK-AIX64-NEXT: or r3, r4, r3 +; CHECK-AIX64-NEXT: cntlzd r3, r3 +; CHECK-AIX64-NEXT: rldicl r3, r3, 58, 63 +; CHECK-AIX64-NEXT: blr +; 
+; CHECK-LINUX-LABEL: test1: +; CHECK-LINUX: # %bb.0: # %bb +; CHECK-LINUX-NEXT: ld r3, 8(0) +; CHECK-LINUX-NEXT: ld r4, 0(0) +; CHECK-LINUX-NEXT: or r3, r4, r3 +; CHECK-LINUX-NEXT: cntlzd r3, r3 +; CHECK-LINUX-NEXT: rldicl r3, r3, 58, 63 +; CHECK-LINUX-NEXT: blr +; +; CHECK-AIX32-LABEL: test1: +; CHECK-AIX32: # %bb.0: # %bb +; CHECK-AIX32-NEXT: li r3, 0 +; CHECK-AIX32-NEXT: xxlxor vs35, vs35, vs35 +; CHECK-AIX32-NEXT: lxvw4x vs34, 0, r3 +; CHECK-AIX32-NEXT: vcmpequb. v2, v2, v3 +; CHECK-AIX32-NEXT: mfocrf r3, 2 +; CHECK-AIX32-NEXT: rlwinm r3, r3, 25, 31, 31 +; CHECK-AIX32-NEXT: blr +bb: + %load = load i128, ptr null, align 16 + %icmp = icmp eq i128 %load, 0 + ret i1 %icmp +} + +define i1 @test2() { +; CHECK-AIX64-LABEL: test2: +; CHECK-AIX64: # %bb.0: # %bb +; CHECK-AIX64-NEXT: ld r4, 8(0) +; CHECK-AIX64-NEXT: ld r3, 0(0) +; CHECK-AIX64-NEXT: xori r4, r4, 10 +; CHECK-AIX64-NEXT: or r3, r4, r3 +; CHECK-AIX64-NEXT: cntlzd r3, r3 +; CHECK-AIX64-NEXT: rldicl r3, r3, 58, 63 +; CHECK-AIX64-NEXT: blr +; +; CHECK-LINUX-LABEL: test2: +; CHECK-LINUX: # %bb.0: # %bb +; CHECK-LINUX-NEXT: ld r4, 0(0) +; CHECK-LINUX-NEXT: ld r3, 8(0) +; CHECK-LINUX-NEXT: xori r4, r4, 10 +; CHECK-LINUX-NEXT: or r3, r4, r3 +; CHECK-LINUX-NEXT: cntlzd r3, r3 +; CHECK-LINUX-NEXT: rldicl r3, r3, 58, 63 +; CHECK-LINUX-NEXT: blr +; +; CHECK-AIX32-LABEL: test2: +; CHECK-AIX32: # %bb.0: # %bb +; CHECK-AIX32-NEXT: li r3, 0 +; CHECK-AIX32-NEXT: lxvw4x vs34, 0, r3 +; CHECK-AIX32-NEXT: lwz r3, L..C0(r2) # %const.0 +; CHECK-AIX32-NEXT: lxvw4x vs35, 0, r3 +; CHECK-AIX32-NEXT: vcmpequb. 
v2, v2, v3 +; CHECK-AIX32-NEXT: mfocrf r3, 2 +; CHECK-AIX32-NEXT: rlwinm r3, r3, 25, 31, 31 +; CHECK-AIX32-NEXT: blr +bb: + %load = load i128, ptr null, align 16 + %icmp = icmp eq i128 %load, 10 + ret i1 %icmp +} + +define i1 @test3() { +; CHECK-AIX64-LABEL: test3: +; CHECK-AIX64: # %bb.0: # %bb +; CHECK-AIX64-NEXT: ld r4, 8(0) +; CHECK-AIX64-NEXT: ld r3, 0(0) +; CHECK-AIX64-NEXT: xori r4, r4, 65535 +; CHECK-AIX64-NEXT: or r3, r4, r3 +; CHECK-AIX64-NEXT: cntlzd r3, r3 +; CHECK-AIX64-NEXT: rldicl r3, r3, 58, 63 +; CHECK-AIX64-NEXT: blr +; +; CHECK-LINUX-LABEL: test3: +; CHECK-LINUX: # %bb.0: # %bb +; CHECK-LINUX-NEXT: ld r4, 0(0) +; CHECK-LINUX-NEXT: ld r3, 8(0) +; CHECK-LINUX-NEXT: xori r4, r4, 65535 +; CHECK-LINUX-NEXT: or r3, r4, r3 +; CHECK-LINUX-NEXT: cntlzd r3, r3 +; CHECK-LINUX-NEXT: rldicl r3, r3, 58, 63 +; CHECK-LINUX-NEXT: blr +; +; CHECK-AIX32-LABEL: test3: +; CHECK-AIX32: # %bb.0: # %bb +; CHECK-AIX32-NEXT: li r3, 0 +; CHECK-AIX32-NEXT: lxvw4x vs34, 0, r3 +; CHECK-AIX32-NEXT: lwz r3, L..C1(r2) # %const.0 +; CHECK-AIX32-NEXT: lxvw4x vs35, 0, r3 +; CHECK-AIX32-NEXT: vcmpequb. v2, v2, v3 +; CHECK-AIX32-NEXT: mfocrf r3, 2 +; CHECK-AIX32-NEXT: rlwinm r3, r3, 25, 31, 31 +; CHECK-AIX32-NEXT: blr +bb: + %load = load i128, ptr null, align 16 + %icmp = icmp eq i128 %load, 65535 + ret i1 %icmp +} + +define i1 @test4() { +; CHECK-AIX64-LABEL: test4: +; CHECK-AIX64: # %bb.0: # %bb +; CHECK-AIX64-NEXT: li r3, 0 +; CHECK-AIX64-NEXT: lxvw4x vs34, 0, r3 +; CHECK-AIX64-NEXT: ld r3, L..C0(r2) # %const.0 +; CHECK-AIX64-NEXT: lxvd2x vs35, 0, r3 +; CHECK-AIX64-NEXT: vcmpequb. 
v2, v2, v3 +; CHECK-AIX64-NEXT: mfocrf r3, 2 +; CHECK-AIX64-NEXT: rlwinm r3, r3, 25, 31, 31 +; CHECK-AIX64-NEXT: blr +; +; CHECK-LINUX-LABEL: test4: +; CHECK-LINUX: # %bb.0: # %bb +; CHECK-LINUX-NEXT: li r3, 0 +; CHECK-LINUX-NEXT: lxvd2x vs34, 0, r3 +; CHECK-LINUX-NEXT: addis r3, r2, .LCPI3_0@toc@ha +; CHECK-LINUX-NEXT: addi r3, r3, .LCPI3_0@toc@l +; CHECK-LINUX-NEXT: lxvd2x vs35, 0, r3 +; CHECK-LINUX-NEXT: vcmpequb. v2, v2, v3 +; CHECK-LINUX-NEXT: mfocrf r3, 2 +; CHECK-LINUX-NEXT: rlwinm r3, r3, 25, 31, 31 +; CHECK-LINUX-NEXT: blr +; +; CHECK-AIX32-LABEL: test4: +; CHECK-AIX32: # %bb.0: # %bb +; CHECK-AIX32-NEXT: li r3, 0 +; CHECK-AIX32-NEXT: lxvw4x vs34, 0, r3 +; CHECK-AIX32-NEXT: lwz r3, L..C2(r2) # %const.0 +; CHECK-AIX32-NEXT: lxvw4x vs35, 0, r3 +; CHECK-AIX32-NEXT: vcmpequb. v2, v2, v3 +; CHECK-AIX32-NEXT: mfocrf r3, 2 +; CHECK-AIX32-NEXT: rlwinm r3, r3, 25, 31, 31 +; CHECK-AIX32-NEXT: blr +bb: + %load = load i128, ptr null, align 16 + %icmp = icmp eq i128 %load, 65536 + ret i1 %icmp +} + +; Test using the !range metadata +define i1 @test5() { +; CHECK-AIX64-LABEL: test5: +; CHECK-AIX64: # %bb.0: # %bb +; CHECK-AIX64-NEXT: ld r3, 0(0) +; CHECK-AIX64-NEXT: ld r4, 8(0) +; CHECK-AIX64-NEXT: or r3, r4, r3 +; CHECK-AIX64-NEXT: cntlzd r3, r3 +; CHECK-AIX64-NEXT: rldicl r3, r3, 58, 63 +; CHECK-AIX64-NEXT: blr +; +; CHECK-LINUX-LABEL: test5: +; CHECK-LINUX: # %bb.0: # %bb +; CHECK-LINUX-NEXT: ld r3, 8(0) +; CHECK-LINUX-NEXT: ld r4, 0(0) +; CHECK-LINUX-NEXT: or r3, r4, r3 +; CHECK-LINUX-NEXT: cntlzd r3, r3 +; CHECK-LINUX-NEXT: rldicl r3, r3, 58, 63 +; CHECK-LINUX-NEXT: blr +; +; CHECK-AIX32-LABEL: test5: +; CHECK-AIX32: # %bb.0: # %bb +; CHECK-AIX32-NEXT: li r3, 0 +; CHECK-AIX32-NEXT: xxlxor vs35, vs35, vs35 +; CHECK-AIX32-NEXT: lxvw4x vs34, 0, r3 +; CHECK-AIX32-NEXT: vcmpequb. 
v2, v2, v3 +; CHECK-AIX32-NEXT: mfocrf r3, 2 +; CHECK-AIX32-NEXT: rlwinm r3, r3, 25, 31, 31 +; CHECK-AIX32-NEXT: blr +bb: + %load = load i128, ptr null, align 16, !range !0 + %icmp = icmp eq i128 %load, 0 + ret i1 %icmp +} + +define i1 @test6() { +; CHECK-AIX64-LABEL: test6: +; CHECK-AIX64: # %bb.0: # %bb +; CHECK-AIX64-NEXT: ld r4, 8(0) +; CHECK-AIX64-NEXT: ld r3, 0(0) +; CHECK-AIX64-NEXT: xori r4, r4, 65535 +; CHECK-AIX64-NEXT: or r3, r4, r3 +; CHECK-AIX64-NEXT: cntlzd r3, r3 +; CHECK-AIX64-NEXT: rldicl r3, r3, 58, 63 +; CHECK-AIX64-NEXT: blr +; +; CHECK-LINUX-LABEL: test6: +; CHECK-LINUX: # %bb.0: # %bb +; CHECK-LINUX-NEXT: ld r4, 0(0) +; CHECK-LINUX-NEXT: ld r3, 8(0) +; CHECK-LINUX-NEXT: xori r4, r4, 65535 +; CHECK-LINUX-NEXT: or r3, r4, r3 +; CHECK-LINUX-NEXT: cntlzd r3, r3 +; CHECK-LINUX-NEXT: rldicl r3, r3, 58, 63 +; CHECK-LINUX-NEXT: blr +; +; CHECK-AIX32-LABEL: test6: +; CHECK-AIX32: # %bb.0: # %bb +; CHECK-AIX32-NEXT: li r3, 0 +; CHECK-AIX32-NEXT: lxvw4x vs34, 0, r3 +; CHECK-AIX32-NEXT: lwz r3, L..C3(r2) # %const.0 +; CHECK-AIX32-NEXT: lxvw4x vs35, 0, r3 +; CHECK-AIX32-NEXT: vcmpequb. v2, v2, v3 +; CHECK-AIX32-NEXT: mfocrf r3, 2 +; CHECK-AIX32-NEXT: rlwinm r3, r3, 25, 31, 31 +; CHECK-AIX32-NEXT: blr +bb: + %load = load i128, ptr null, align 16, !range !1 + %icmp = icmp eq i128 %load, 65535 + ret i1 %icmp +} + +define i1 @test7() { +; COMMON-LABEL: test7: +; COMMON: # %bb.0: # %bb +; COMMON-NEXT: li r3, 0 +; COMMON-NEXT: blr +bb: + %load = load i128, ptr null, align 16, !range !1 + %icmp = icmp eq i128 %load, 65536 + ret i1 %icmp +} + +define i1 @test8() { +; CHECK-AIX64-LABEL: test8: +; CHECK-AIX64: # %bb.0: # %bb +; CHECK-AIX64-NEXT: li r3, 0 +; CHECK-AIX64-NEXT: lxvw4x vs34, 0, r3 +; CHECK-AIX64-NEXT: ld r3, L..C1(r2) # %const.0 +; CHECK-AIX64-NEXT: lxvd2x vs35, 0, r3 +; CHECK-AIX64-NEXT: vcmpequb. 
v2, v2, v3 +; CHECK-AIX64-NEXT: mfocrf r3, 2 +; CHECK-AIX64-NEXT: rlwinm r3, r3, 25, 31, 31 +; CHECK-AIX64-NEXT: blr +; +; CHECK-LINUX-LABEL: test8: +; CHECK-LINUX: # %bb.0: # %bb +; CHECK-LINUX-NEXT: li r3, 0 +; CHECK-LINUX-NEXT: lxvd2x vs34, 0, r3 +; CHECK-LINUX-NEXT: addis r3, r2, .LCPI7_0@toc@ha +; CHECK-LINUX-NEXT: addi r3, r3, .LCPI7_0@toc@l +; CHECK-LINUX-NEXT: lxvd2x vs35, 0, r3 +; CHECK-LINUX-NEXT: vcmpequb. v2, v2, v3 +; CHECK-LINUX-NEXT: mfocrf r3, 2 +; CHECK-LINUX-NEXT: rlwinm r3, r3, 25, 31, 31 +; CHECK-LINUX-NEXT: blr +; +; CHECK-AIX32-LABEL: test8: +; CHECK-AIX32: # %bb.0: # %bb +; CHECK-AIX32-NEXT: li r3, 0 +; CHECK-AIX32-NEXT: lxvw4x vs34, 0, r3 +; CHECK-AIX32-NEXT: lwz r3, L..C4(r2) # %const.0 +; CHECK-AIX32-NEXT: lxvw4x vs35, 0, r3 +; CHECK-AIX32-NEXT: vcmpequb. v2, v2, v3 +; CHECK-AIX32-NEXT: mfocrf r3, 2 +; CHECK-AIX32-NEXT: rlwinm r3, r3, 25, 31, 31 +; CHECK-AIX32-NEXT: blr +bb: + %load = load i128, ptr null, align 16, !range !2 + %icmp = icmp eq i128 %load, 65536 + ret i1 %icmp +} + +!0 = !{i128 0, i128 2} +!1 = !{i128 0, i128 65536} +!2 = !{i128 0, i128 65537} _______________________________________________ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
