https://github.com/jofrn updated https://github.com/llvm/llvm-project/pull/197862
>From 6c022f5972acc87806d2bc7118d82b80c08cba71 Mon Sep 17 00:00:00 2001 From: jofrn <[email protected]> Date: Wed, 3 Jun 2026 05:46:34 -0700 Subject: [PATCH 1/2] [SelectionDAG] Keep split vector atomic store value in a vector register When the value of an ATOMIC_STORE has a vector type whose legalization action is split (e.g. <4 x half>/<4 x bfloat> on X86 without F16C), SplitVecOp_ATOMIC_STORE bitcasts the value straight to a scalar integer spanning the memory width. For a split vector that bitcast is expanded element by element, reassembling the value in GPRs (a long pextrw/shl/or sequence) before the store. Instead, keep the value in a vector register when a legal vector form exists: reinterpret it as a same-shaped integer-element vector (an FP element type may have no legal vector form, e.g. bfloat on SSE2, while the integer-of-element-size form does), widen that to a legal vector, and extract the low integer element of the memory width. This issues the store directly from a vector register (a single MOVQ/MOVD on X86), matching the widen-path codegen already produced on AVX targets. This falls back to the scalar bitcast when no suitable legal vector type exists. 
--- .../SelectionDAG/LegalizeVectorTypes.cpp | 43 +- llvm/test/CodeGen/X86/atomic-load-store.ll | 506 +++++++----------- 2 files changed, 227 insertions(+), 322 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 5350be412176d..73cc5dc76c1eb 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -4742,16 +4742,47 @@ SDValue DAGTypeLegalizer::SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo) { SDValue DAGTypeLegalizer::SplitVecOp_ATOMIC_STORE(AtomicSDNode *N) { SDLoc DL(N); + LLVMContext &Ctx = *DAG.getContext(); SDValue StVal = N->getVal(); EVT VT = StVal.getValueType(); + EVT MemIntVT = EVT::getIntegerVT(Ctx, N->getMemoryVT().getSizeInBits()); + + // The store needs a single value spanning the full memory width. If the + // value can be held in a legal vector register, keep it there and extract + // the low integer element of the memory width. This lets the store be issued + // directly from a vector register (e.g. a single MOVQ/MOVD) instead of + // bitcasting the split vector straight to a scalar integer, which would + // reassemble the value element by element in GPRs. + // + // Reinterpret the value as a same-shaped integer vector first: an FP element + // type may not have a legal vector form (e.g. bfloat on SSE2) while the + // integer-of-element-size form does. 
+ unsigned NumElts = VT.getVectorNumElements(); + EVT IntEltVT = EVT::getIntegerVT(Ctx, VT.getScalarSizeInBits()); + EVT IntVecVT = EVT::getVectorVT(Ctx, IntEltVT, NumElts); + if (DAG.getDataLayout().isLittleEndian() && TLI.isTypeLegal(MemIntVT) && + IntEltVT.getSizeInBits() <= MemIntVT.getSizeInBits()) { + EVT WideVT = IntVecVT; + while (!TLI.isTypeLegal(WideVT) && WideVT.getSizeInBits() < 512) + WideVT = + EVT::getVectorVT(Ctx, IntEltVT, WideVT.getVectorNumElements() * 2); + if (TLI.isTypeLegal(WideVT) && + WideVT.getSizeInBits() % MemIntVT.getSizeInBits() == 0) { + SDValue Wide = ModifyToType(DAG.getBitcast(IntVecVT, StVal), WideVT); + unsigned NumMemElts = WideVT.getSizeInBits() / MemIntVT.getSizeInBits(); + EVT MemVecVT = EVT::getVectorVT(Ctx, MemIntVT, NumMemElts); + SDValue Elt = DAG.getExtractVectorElt(DL, MemIntVT, + DAG.getBitcast(MemVecVT, Wide), 0); + return DAG.getAtomic(ISD::ATOMIC_STORE, DL, MemIntVT, N->getChain(), Elt, + N->getBasePtr(), N->getMemOperand()); + } + } - // Issue a single atomic store of an integer that spans the full memory - // width. Bitcasting the (illegal) vector value to that integer lets the - // type legalizer further legalize the BITCAST input as needed, while the + // Otherwise issue a single atomic store of an integer that spans the full + // memory width. Bitcasting the (illegal) vector value to that integer lets + // the type legalizer further legalize the BITCAST input as needed, while the // ATOMIC_STORE itself uses only the legal integer type. 
- EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits()); - EVT MemIntVT = - EVT::getIntegerVT(*DAG.getContext(), N->getMemoryVT().getSizeInBits()); + EVT IntVT = EVT::getIntegerVT(Ctx, VT.getSizeInBits()); SDValue AsInt = DAG.getBitcast(IntVT, StVal); return DAG.getAtomic(ISD::ATOMIC_STORE, DL, MemIntVT, N->getChain(), AsInt, N->getBasePtr(), N->getMemOperand()); diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll b/llvm/test/CodeGen/X86/atomic-load-store.ll index 5be8d4d47fa9e..1ac3066a393a8 100644 --- a/llvm/test/CodeGen/X86/atomic-load-store.ll +++ b/llvm/test/CodeGen/X86/atomic-load-store.ll @@ -754,13 +754,7 @@ define <2 x float> @atomic_vec2_float_align(ptr %x) { define void @store_atomic_vec2_half(ptr %x, <2 x half> %v) { ; CHECK-SSE-O3-LABEL: store_atomic_vec2_half: ; CHECK-SSE-O3: # %bb.0: -; CHECK-SSE-O3-NEXT: pextrw $0, %xmm0, %eax -; CHECK-SSE-O3-NEXT: psrld $16, %xmm0 -; CHECK-SSE-O3-NEXT: pextrw $0, %xmm0, %ecx -; CHECK-SSE-O3-NEXT: shll $16, %ecx -; CHECK-SSE-O3-NEXT: movzwl %ax, %eax -; CHECK-SSE-O3-NEXT: orl %ecx, %eax -; CHECK-SSE-O3-NEXT: movl %eax, (%rdi) +; CHECK-SSE-O3-NEXT: movss %xmm0, (%rdi) ; CHECK-SSE-O3-NEXT: retq ; ; CHECK-AVX-O3-LABEL: store_atomic_vec2_half: @@ -773,16 +767,8 @@ define void @store_atomic_vec2_half(ptr %x, <2 x half> %v) { ; CHECK-SSE-O0-NEXT: movaps %xmm0, %xmm1 ; CHECK-SSE-O0-NEXT: movaps %xmm1, %xmm0 ; CHECK-SSE-O0-NEXT: psrld $16, %xmm1 -; CHECK-SSE-O0-NEXT: pextrw $0, %xmm1, %eax -; CHECK-SSE-O0-NEXT: # kill: def $ax killed $ax killed $eax -; CHECK-SSE-O0-NEXT: # implicit-def: $ecx -; CHECK-SSE-O0-NEXT: movw %ax, %cx -; CHECK-SSE-O0-NEXT: shll $16, %ecx -; CHECK-SSE-O0-NEXT: pextrw $0, %xmm0, %eax -; CHECK-SSE-O0-NEXT: # kill: def $ax killed $ax killed $eax -; CHECK-SSE-O0-NEXT: movzwl %ax, %eax -; CHECK-SSE-O0-NEXT: orl %ecx, %eax -; CHECK-SSE-O0-NEXT: movl %eax, (%rdi) +; CHECK-SSE-O0-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; 
CHECK-SSE-O0-NEXT: movd %xmm0, (%rdi) ; CHECK-SSE-O0-NEXT: retq ; ; CHECK-AVX-O0-LABEL: store_atomic_vec2_half: @@ -796,13 +782,7 @@ define void @store_atomic_vec2_half(ptr %x, <2 x half> %v) { define void @store_atomic_vec2_bfloat(ptr %x, <2 x bfloat> %v) nounwind { ; CHECK-SSE-O3-LABEL: store_atomic_vec2_bfloat: ; CHECK-SSE-O3: # %bb.0: -; CHECK-SSE-O3-NEXT: pextrw $0, %xmm0, %eax -; CHECK-SSE-O3-NEXT: psrld $16, %xmm0 -; CHECK-SSE-O3-NEXT: pextrw $0, %xmm0, %ecx -; CHECK-SSE-O3-NEXT: shll $16, %ecx -; CHECK-SSE-O3-NEXT: movzwl %ax, %eax -; CHECK-SSE-O3-NEXT: orl %ecx, %eax -; CHECK-SSE-O3-NEXT: movl %eax, (%rdi) +; CHECK-SSE-O3-NEXT: movss %xmm0, (%rdi) ; CHECK-SSE-O3-NEXT: retq ; ; CHECK-AVX-O3-LABEL: store_atomic_vec2_bfloat: @@ -810,68 +790,78 @@ define void @store_atomic_vec2_bfloat(ptr %x, <2 x bfloat> %v) nounwind { ; CHECK-AVX-O3-NEXT: vmovss %xmm0, (%rdi) ; CHECK-AVX-O3-NEXT: retq ; -; CHECK-SSE-O0-LABEL: store_atomic_vec2_bfloat: -; CHECK-SSE-O0: # %bb.0: -; CHECK-SSE-O0-NEXT: subq $24, %rsp -; CHECK-SSE-O0-NEXT: movaps %xmm0, %xmm1 -; CHECK-SSE-O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-SSE-O0-NEXT: pextrw $1, %xmm1, %eax -; CHECK-SSE-O0-NEXT: shll $16, %eax -; CHECK-SSE-O0-NEXT: movd %eax, %xmm0 -; CHECK-SSE-O0-NEXT: movd %xmm1, %eax -; CHECK-SSE-O0-NEXT: shll $16, %eax -; CHECK-SSE-O0-NEXT: movd %eax, %xmm1 -; CHECK-SSE-O0-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE-O0-NEXT: callq __truncsfbf2@PLT -; CHECK-SSE-O0-NEXT: movaps %xmm0, %xmm1 -; CHECK-SSE-O0-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-SSE-O0-NEXT: # xmm0 = mem[0],zero,zero,zero -; CHECK-SSE-O0-NEXT: pextrw $0, %xmm1, %eax -; CHECK-SSE-O0-NEXT: movw %ax, %cx -; CHECK-SSE-O0-NEXT: # implicit-def: $eax -; CHECK-SSE-O0-NEXT: movw %cx, %ax -; CHECK-SSE-O0-NEXT: shll $16, %eax -; CHECK-SSE-O0-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE-O0-NEXT: callq __truncsfbf2@PLT -; CHECK-SSE-O0-NEXT: 
movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload -; CHECK-SSE-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; CHECK-SSE-O0-NEXT: pextrw $0, %xmm0, %eax -; CHECK-SSE-O0-NEXT: # kill: def $ax killed $ax killed $eax -; CHECK-SSE-O0-NEXT: movzwl %ax, %eax -; CHECK-SSE-O0-NEXT: orl %ecx, %eax -; CHECK-SSE-O0-NEXT: movl %eax, (%rdi) -; CHECK-SSE-O0-NEXT: addq $24, %rsp -; CHECK-SSE-O0-NEXT: retq +; CHECK-SSE2-O0-LABEL: store_atomic_vec2_bfloat: +; CHECK-SSE2-O0: # %bb.0: +; CHECK-SSE2-O0-NEXT: subq $24, %rsp +; CHECK-SSE2-O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-SSE2-O0-NEXT: pextrw $1, %xmm0, %eax +; CHECK-SSE2-O0-NEXT: shll $16, %eax +; CHECK-SSE2-O0-NEXT: movd %eax, %xmm1 +; CHECK-SSE2-O0-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE2-O0-NEXT: movd %xmm0, %eax +; CHECK-SSE2-O0-NEXT: shll $16, %eax +; CHECK-SSE2-O0-NEXT: movd %eax, %xmm0 +; CHECK-SSE2-O0-NEXT: callq __truncsfbf2@PLT +; CHECK-SSE2-O0-NEXT: movaps %xmm0, %xmm1 +; CHECK-SSE2-O0-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-SSE2-O0-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-SSE2-O0-NEXT: pextrw $0, %xmm1, %eax +; CHECK-SSE2-O0-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-SSE2-O0-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; CHECK-SSE2-O0-NEXT: callq __truncsfbf2@PLT +; CHECK-SSE2-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; CHECK-SSE2-O0-NEXT: pextrw $0, %xmm0, %eax +; CHECK-SSE2-O0-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-SSE2-O0-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; CHECK-SSE2-O0-NEXT: movl {{[0-9]+}}(%rsp), %eax +; CHECK-SSE2-O0-NEXT: movl %eax, (%rdi) +; CHECK-SSE2-O0-NEXT: addq $24, %rsp +; CHECK-SSE2-O0-NEXT: retq +; +; CHECK-SSE4-O0-LABEL: store_atomic_vec2_bfloat: +; CHECK-SSE4-O0: # %bb.0: +; CHECK-SSE4-O0-NEXT: subq $24, %rsp +; CHECK-SSE4-O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-SSE4-O0-NEXT: pextrw $1, %xmm0, %eax +; CHECK-SSE4-O0-NEXT: shll 
$16, %eax +; CHECK-SSE4-O0-NEXT: movd %eax, %xmm1 +; CHECK-SSE4-O0-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE4-O0-NEXT: movd %xmm0, %eax +; CHECK-SSE4-O0-NEXT: shll $16, %eax +; CHECK-SSE4-O0-NEXT: movd %eax, %xmm0 +; CHECK-SSE4-O0-NEXT: callq __truncsfbf2@PLT +; CHECK-SSE4-O0-NEXT: movaps %xmm0, %xmm1 +; CHECK-SSE4-O0-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-SSE4-O0-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-SSE4-O0-NEXT: pextrw $0, %xmm1, {{[0-9]+}}(%rsp) +; CHECK-SSE4-O0-NEXT: callq __truncsfbf2@PLT +; CHECK-SSE4-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; CHECK-SSE4-O0-NEXT: pextrw $0, %xmm0, {{[0-9]+}}(%rsp) +; CHECK-SSE4-O0-NEXT: movl {{[0-9]+}}(%rsp), %eax +; CHECK-SSE4-O0-NEXT: movl %eax, (%rdi) +; CHECK-SSE4-O0-NEXT: addq $24, %rsp +; CHECK-SSE4-O0-NEXT: retq ; ; CHECK-AVX-O0-LABEL: store_atomic_vec2_bfloat: ; CHECK-AVX-O0: # %bb.0: ; CHECK-AVX-O0-NEXT: subq $24, %rsp -; CHECK-AVX-O0-NEXT: vmovaps %xmm0, %xmm1 ; CHECK-AVX-O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-AVX-O0-NEXT: vpextrw $1, %xmm1, %eax -; CHECK-AVX-O0-NEXT: shll $16, %eax -; CHECK-AVX-O0-NEXT: vmovd %eax, %xmm0 -; CHECK-AVX-O0-NEXT: vmovd %xmm1, %eax +; CHECK-AVX-O0-NEXT: vpextrw $1, %xmm0, %eax ; CHECK-AVX-O0-NEXT: shll $16, %eax ; CHECK-AVX-O0-NEXT: vmovd %eax, %xmm1 ; CHECK-AVX-O0-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-AVX-O0-NEXT: vmovd %xmm0, %eax +; CHECK-AVX-O0-NEXT: shll $16, %eax +; CHECK-AVX-O0-NEXT: vmovd %eax, %xmm0 ; CHECK-AVX-O0-NEXT: callq __truncsfbf2@PLT ; CHECK-AVX-O0-NEXT: vmovaps %xmm0, %xmm1 ; CHECK-AVX-O0-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload ; CHECK-AVX-O0-NEXT: # xmm0 = mem[0],zero,zero,zero -; CHECK-AVX-O0-NEXT: vpextrw $0, %xmm1, %eax -; CHECK-AVX-O0-NEXT: movw %ax, %cx -; CHECK-AVX-O0-NEXT: # implicit-def: $eax -; CHECK-AVX-O0-NEXT: movw %cx, %ax -; CHECK-AVX-O0-NEXT: shll $16, %eax -; CHECK-AVX-O0-NEXT: movl 
%eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-AVX-O0-NEXT: vpextrw $0, %xmm1, {{[0-9]+}}(%rsp) ; CHECK-AVX-O0-NEXT: callq __truncsfbf2@PLT -; CHECK-AVX-O0-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload ; CHECK-AVX-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; CHECK-AVX-O0-NEXT: vpextrw $0, %xmm0, %eax -; CHECK-AVX-O0-NEXT: # kill: def $ax killed $ax killed $eax -; CHECK-AVX-O0-NEXT: movzwl %ax, %eax -; CHECK-AVX-O0-NEXT: orl %ecx, %eax +; CHECK-AVX-O0-NEXT: vpextrw $0, %xmm0, {{[0-9]+}}(%rsp) +; CHECK-AVX-O0-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-AVX-O0-NEXT: movl %eax, (%rdi) ; CHECK-AVX-O0-NEXT: addq $24, %rsp ; CHECK-AVX-O0-NEXT: retq @@ -880,48 +870,10 @@ define void @store_atomic_vec2_bfloat(ptr %x, <2 x bfloat> %v) nounwind { } define void @store_atomic_vec4_half(ptr %x, <4 x half> %v) nounwind { -; CHECK-SSE2-O3-LABEL: store_atomic_vec4_half: -; CHECK-SSE2-O3: # %bb.0: -; CHECK-SSE2-O3-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-O3-NEXT: psrld $16, %xmm1 -; CHECK-SSE2-O3-NEXT: pextrw $0, %xmm1, %eax -; CHECK-SSE2-O3-NEXT: pextrw $0, %xmm0, %ecx -; CHECK-SSE2-O3-NEXT: shll $16, %eax -; CHECK-SSE2-O3-NEXT: movzwl %cx, %ecx -; CHECK-SSE2-O3-NEXT: orl %eax, %ecx -; CHECK-SSE2-O3-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-O3-NEXT: psrlq $48, %xmm1 -; CHECK-SSE2-O3-NEXT: pextrw $0, %xmm1, %eax -; CHECK-SSE2-O3-NEXT: shll $16, %eax -; CHECK-SSE2-O3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; CHECK-SSE2-O3-NEXT: pextrw $0, %xmm0, %edx -; CHECK-SSE2-O3-NEXT: movzwl %dx, %edx -; CHECK-SSE2-O3-NEXT: orl %eax, %edx -; CHECK-SSE2-O3-NEXT: shlq $32, %rdx -; CHECK-SSE2-O3-NEXT: orq %rcx, %rdx -; CHECK-SSE2-O3-NEXT: movq %rdx, (%rdi) -; CHECK-SSE2-O3-NEXT: retq -; -; CHECK-SSE4-O3-LABEL: store_atomic_vec4_half: -; CHECK-SSE4-O3: # %bb.0: -; CHECK-SSE4-O3-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE4-O3-NEXT: psrld $16, %xmm1 -; CHECK-SSE4-O3-NEXT: pextrw $0, %xmm1, %eax -; CHECK-SSE4-O3-NEXT: shll $16, %eax -; CHECK-SSE4-O3-NEXT: pextrw $0, 
%xmm0, %ecx -; CHECK-SSE4-O3-NEXT: movzwl %cx, %ecx -; CHECK-SSE4-O3-NEXT: orl %eax, %ecx -; CHECK-SSE4-O3-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE4-O3-NEXT: psrlq $48, %xmm0 -; CHECK-SSE4-O3-NEXT: pextrw $0, %xmm0, %eax -; CHECK-SSE4-O3-NEXT: shll $16, %eax -; CHECK-SSE4-O3-NEXT: pextrw $0, %xmm1, %edx -; CHECK-SSE4-O3-NEXT: movzwl %dx, %edx -; CHECK-SSE4-O3-NEXT: orl %eax, %edx -; CHECK-SSE4-O3-NEXT: shlq $32, %rdx -; CHECK-SSE4-O3-NEXT: orq %rcx, %rdx -; CHECK-SSE4-O3-NEXT: movq %rdx, (%rdi) -; CHECK-SSE4-O3-NEXT: retq +; CHECK-SSE-O3-LABEL: store_atomic_vec4_half: +; CHECK-SSE-O3: # %bb.0: +; CHECK-SSE-O3-NEXT: movlps %xmm0, (%rdi) +; CHECK-SSE-O3-NEXT: retq ; ; CHECK-AVX-O3-LABEL: store_atomic_vec4_half: ; CHECK-AVX-O3: # %bb.0: @@ -931,70 +883,30 @@ define void @store_atomic_vec4_half(ptr %x, <4 x half> %v) nounwind { ; CHECK-SSE2-O0-LABEL: store_atomic_vec4_half: ; CHECK-SSE2-O0: # %bb.0: ; CHECK-SSE2-O0-NEXT: movaps %xmm0, %xmm3 +; CHECK-SSE2-O0-NEXT: movaps %xmm3, %xmm0 ; CHECK-SSE2-O0-NEXT: movaps %xmm3, %xmm2 +; CHECK-SSE2-O0-NEXT: psrlq $48, %xmm2 ; CHECK-SSE2-O0-NEXT: movaps %xmm3, %xmm1 -; CHECK-SSE2-O0-NEXT: psrlq $48, %xmm1 -; CHECK-SSE2-O0-NEXT: movaps %xmm3, %xmm0 -; CHECK-SSE2-O0-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; CHECK-SSE2-O0-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] ; CHECK-SSE2-O0-NEXT: psrld $16, %xmm3 -; CHECK-SSE2-O0-NEXT: pextrw $0, %xmm3, %eax -; CHECK-SSE2-O0-NEXT: # kill: def $ax killed $ax killed $eax -; CHECK-SSE2-O0-NEXT: # implicit-def: $ecx -; CHECK-SSE2-O0-NEXT: movw %ax, %cx -; CHECK-SSE2-O0-NEXT: shll $16, %ecx -; CHECK-SSE2-O0-NEXT: pextrw $0, %xmm2, %eax -; CHECK-SSE2-O0-NEXT: # kill: def $ax killed $ax killed $eax -; CHECK-SSE2-O0-NEXT: movzwl %ax, %eax -; CHECK-SSE2-O0-NEXT: orl %ecx, %eax -; CHECK-SSE2-O0-NEXT: # kill: def $rax killed $eax -; CHECK-SSE2-O0-NEXT: pextrw $0, %xmm1, %ecx -; CHECK-SSE2-O0-NEXT: movw %cx, %dx -; CHECK-SSE2-O0-NEXT: # implicit-def: $ecx -; CHECK-SSE2-O0-NEXT: movw 
%dx, %cx -; CHECK-SSE2-O0-NEXT: shll $16, %ecx -; CHECK-SSE2-O0-NEXT: pextrw $0, %xmm0, %edx -; CHECK-SSE2-O0-NEXT: # kill: def $dx killed $dx killed $edx -; CHECK-SSE2-O0-NEXT: movzwl %dx, %edx -; CHECK-SSE2-O0-NEXT: orl %ecx, %edx -; CHECK-SSE2-O0-NEXT: # implicit-def: $rcx -; CHECK-SSE2-O0-NEXT: movl %edx, %ecx -; CHECK-SSE2-O0-NEXT: shlq $32, %rcx -; CHECK-SSE2-O0-NEXT: orq %rcx, %rax -; CHECK-SSE2-O0-NEXT: movq %rax, (%rdi) +; CHECK-SSE2-O0-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; CHECK-SSE2-O0-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; CHECK-SSE2-O0-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-O0-NEXT: movq %xmm0, (%rdi) ; CHECK-SSE2-O0-NEXT: retq ; ; CHECK-SSE4-O0-LABEL: store_atomic_vec4_half: ; CHECK-SSE4-O0: # %bb.0: ; CHECK-SSE4-O0-NEXT: movaps %xmm0, %xmm3 +; CHECK-SSE4-O0-NEXT: movaps %xmm3, %xmm0 ; CHECK-SSE4-O0-NEXT: movaps %xmm3, %xmm2 -; CHECK-SSE4-O0-NEXT: movaps %xmm3, %xmm1 -; CHECK-SSE4-O0-NEXT: psrlq $48, %xmm1 -; CHECK-SSE4-O0-NEXT: movshdup {{.*#+}} xmm0 = xmm3[1,1,3,3] +; CHECK-SSE4-O0-NEXT: psrlq $48, %xmm2 +; CHECK-SSE4-O0-NEXT: movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3] ; CHECK-SSE4-O0-NEXT: psrld $16, %xmm3 -; CHECK-SSE4-O0-NEXT: pextrw $0, %xmm3, %eax -; CHECK-SSE4-O0-NEXT: # kill: def $ax killed $ax killed $eax -; CHECK-SSE4-O0-NEXT: # implicit-def: $ecx -; CHECK-SSE4-O0-NEXT: movw %ax, %cx -; CHECK-SSE4-O0-NEXT: shll $16, %ecx -; CHECK-SSE4-O0-NEXT: pextrw $0, %xmm2, %eax -; CHECK-SSE4-O0-NEXT: # kill: def $ax killed $ax killed $eax -; CHECK-SSE4-O0-NEXT: movzwl %ax, %eax -; CHECK-SSE4-O0-NEXT: orl %ecx, %eax -; CHECK-SSE4-O0-NEXT: # kill: def $rax killed $eax -; CHECK-SSE4-O0-NEXT: pextrw $0, %xmm1, %ecx -; CHECK-SSE4-O0-NEXT: movw %cx, %dx -; CHECK-SSE4-O0-NEXT: # implicit-def: $ecx -; CHECK-SSE4-O0-NEXT: movw %dx, %cx -; CHECK-SSE4-O0-NEXT: shll $16, %ecx -; CHECK-SSE4-O0-NEXT: pextrw $0, 
%xmm0, %edx -; CHECK-SSE4-O0-NEXT: # kill: def $dx killed $dx killed $edx -; CHECK-SSE4-O0-NEXT: movzwl %dx, %edx -; CHECK-SSE4-O0-NEXT: orl %ecx, %edx -; CHECK-SSE4-O0-NEXT: # implicit-def: $rcx -; CHECK-SSE4-O0-NEXT: movl %edx, %ecx -; CHECK-SSE4-O0-NEXT: shlq $32, %rcx -; CHECK-SSE4-O0-NEXT: orq %rcx, %rax -; CHECK-SSE4-O0-NEXT: movq %rax, (%rdi) +; CHECK-SSE4-O0-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; CHECK-SSE4-O0-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; CHECK-SSE4-O0-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE4-O0-NEXT: movq %xmm0, (%rdi) ; CHECK-SSE4-O0-NEXT: retq ; ; CHECK-AVX-O0-LABEL: store_atomic_vec4_half: @@ -1006,184 +918,146 @@ define void @store_atomic_vec4_half(ptr %x, <4 x half> %v) nounwind { } define void @store_atomic_vec4_bfloat(ptr %x, <4 x bfloat> %v) nounwind { -; CHECK-SSE2-O3-LABEL: store_atomic_vec4_bfloat: -; CHECK-SSE2-O3: # %bb.0: -; CHECK-SSE2-O3-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-O3-NEXT: psrld $16, %xmm1 -; CHECK-SSE2-O3-NEXT: pextrw $0, %xmm1, %eax -; CHECK-SSE2-O3-NEXT: pextrw $0, %xmm0, %ecx -; CHECK-SSE2-O3-NEXT: shll $16, %eax -; CHECK-SSE2-O3-NEXT: movzwl %cx, %ecx -; CHECK-SSE2-O3-NEXT: orl %eax, %ecx -; CHECK-SSE2-O3-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-O3-NEXT: psrlq $48, %xmm1 -; CHECK-SSE2-O3-NEXT: pextrw $0, %xmm1, %eax -; CHECK-SSE2-O3-NEXT: shll $16, %eax -; CHECK-SSE2-O3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; CHECK-SSE2-O3-NEXT: pextrw $0, %xmm0, %edx -; CHECK-SSE2-O3-NEXT: movzwl %dx, %edx -; CHECK-SSE2-O3-NEXT: orl %eax, %edx -; CHECK-SSE2-O3-NEXT: shlq $32, %rdx -; CHECK-SSE2-O3-NEXT: orq %rcx, %rdx -; CHECK-SSE2-O3-NEXT: movq %rdx, (%rdi) -; CHECK-SSE2-O3-NEXT: retq -; -; CHECK-SSE4-O3-LABEL: store_atomic_vec4_bfloat: -; CHECK-SSE4-O3: # %bb.0: -; CHECK-SSE4-O3-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE4-O3-NEXT: psrld $16, %xmm1 -; CHECK-SSE4-O3-NEXT: 
pextrw $0, %xmm1, %eax -; CHECK-SSE4-O3-NEXT: shll $16, %eax -; CHECK-SSE4-O3-NEXT: pextrw $0, %xmm0, %ecx -; CHECK-SSE4-O3-NEXT: movzwl %cx, %ecx -; CHECK-SSE4-O3-NEXT: orl %eax, %ecx -; CHECK-SSE4-O3-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE4-O3-NEXT: psrlq $48, %xmm0 -; CHECK-SSE4-O3-NEXT: pextrw $0, %xmm0, %eax -; CHECK-SSE4-O3-NEXT: shll $16, %eax -; CHECK-SSE4-O3-NEXT: pextrw $0, %xmm1, %edx -; CHECK-SSE4-O3-NEXT: movzwl %dx, %edx -; CHECK-SSE4-O3-NEXT: orl %eax, %edx -; CHECK-SSE4-O3-NEXT: shlq $32, %rdx -; CHECK-SSE4-O3-NEXT: orq %rcx, %rdx -; CHECK-SSE4-O3-NEXT: movq %rdx, (%rdi) -; CHECK-SSE4-O3-NEXT: retq +; CHECK-SSE-O3-LABEL: store_atomic_vec4_bfloat: +; CHECK-SSE-O3: # %bb.0: +; CHECK-SSE-O3-NEXT: movlps %xmm0, (%rdi) +; CHECK-SSE-O3-NEXT: retq ; ; CHECK-AVX-O3-LABEL: store_atomic_vec4_bfloat: ; CHECK-AVX-O3: # %bb.0: ; CHECK-AVX-O3-NEXT: vmovlps %xmm0, (%rdi) ; CHECK-AVX-O3-NEXT: retq ; -; CHECK-SSE-O0-LABEL: store_atomic_vec4_bfloat: -; CHECK-SSE-O0: # %bb.0: -; CHECK-SSE-O0-NEXT: subq $40, %rsp -; CHECK-SSE-O0-NEXT: movaps %xmm0, %xmm1 -; CHECK-SSE-O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-SSE-O0-NEXT: pextrw $3, %xmm1, %eax -; CHECK-SSE-O0-NEXT: shll $16, %eax -; CHECK-SSE-O0-NEXT: movd %eax, %xmm0 -; CHECK-SSE-O0-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE-O0-NEXT: pextrw $2, %xmm1, %eax -; CHECK-SSE-O0-NEXT: shll $16, %eax -; CHECK-SSE-O0-NEXT: movd %eax, %xmm0 -; CHECK-SSE-O0-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE-O0-NEXT: pextrw $1, %xmm1, %eax -; CHECK-SSE-O0-NEXT: shll $16, %eax -; CHECK-SSE-O0-NEXT: movd %eax, %xmm0 -; CHECK-SSE-O0-NEXT: movd %xmm1, %eax -; CHECK-SSE-O0-NEXT: shll $16, %eax -; CHECK-SSE-O0-NEXT: movd %eax, %xmm1 -; CHECK-SSE-O0-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE-O0-NEXT: callq __truncsfbf2@PLT -; CHECK-SSE-O0-NEXT: movaps %xmm0, %xmm1 -; CHECK-SSE-O0-NEXT: movss 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-SSE-O0-NEXT: # xmm0 = mem[0],zero,zero,zero -; CHECK-SSE-O0-NEXT: pextrw $0, %xmm1, %eax -; CHECK-SSE-O0-NEXT: movw %ax, %cx -; CHECK-SSE-O0-NEXT: # implicit-def: $eax -; CHECK-SSE-O0-NEXT: movw %cx, %ax -; CHECK-SSE-O0-NEXT: shll $16, %eax -; CHECK-SSE-O0-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE-O0-NEXT: callq __truncsfbf2@PLT -; CHECK-SSE-O0-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload -; CHECK-SSE-O0-NEXT: movaps %xmm0, %xmm1 -; CHECK-SSE-O0-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-SSE-O0-NEXT: # xmm0 = mem[0],zero,zero,zero -; CHECK-SSE-O0-NEXT: pextrw $0, %xmm1, %eax -; CHECK-SSE-O0-NEXT: # kill: def $ax killed $ax killed $eax -; CHECK-SSE-O0-NEXT: movzwl %ax, %eax -; CHECK-SSE-O0-NEXT: orl %ecx, %eax -; CHECK-SSE-O0-NEXT: # kill: def $rax killed $eax -; CHECK-SSE-O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-SSE-O0-NEXT: callq __truncsfbf2@PLT -; CHECK-SSE-O0-NEXT: movaps %xmm0, %xmm1 -; CHECK-SSE-O0-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-SSE-O0-NEXT: # xmm0 = mem[0],zero,zero,zero -; CHECK-SSE-O0-NEXT: pextrw $0, %xmm1, %eax -; CHECK-SSE-O0-NEXT: movw %ax, %cx -; CHECK-SSE-O0-NEXT: # implicit-def: $eax -; CHECK-SSE-O0-NEXT: movw %cx, %ax -; CHECK-SSE-O0-NEXT: shll $16, %eax -; CHECK-SSE-O0-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE-O0-NEXT: callq __truncsfbf2@PLT -; CHECK-SSE-O0-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload -; CHECK-SSE-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; CHECK-SSE-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; CHECK-SSE-O0-NEXT: pextrw $0, %xmm0, %edx -; CHECK-SSE-O0-NEXT: # kill: def $dx killed $dx killed $edx -; CHECK-SSE-O0-NEXT: movzwl %dx, %edx -; CHECK-SSE-O0-NEXT: orl %ecx, %edx -; CHECK-SSE-O0-NEXT: # implicit-def: $rcx -; CHECK-SSE-O0-NEXT: movl %edx, %ecx -; 
CHECK-SSE-O0-NEXT: shlq $32, %rcx -; CHECK-SSE-O0-NEXT: orq %rcx, %rax -; CHECK-SSE-O0-NEXT: movq %rax, (%rdi) -; CHECK-SSE-O0-NEXT: addq $40, %rsp -; CHECK-SSE-O0-NEXT: retq +; CHECK-SSE2-O0-LABEL: store_atomic_vec4_bfloat: +; CHECK-SSE2-O0: # %bb.0: +; CHECK-SSE2-O0-NEXT: subq $40, %rsp +; CHECK-SSE2-O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-SSE2-O0-NEXT: pextrw $3, %xmm0, %eax +; CHECK-SSE2-O0-NEXT: shll $16, %eax +; CHECK-SSE2-O0-NEXT: movd %eax, %xmm1 +; CHECK-SSE2-O0-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE2-O0-NEXT: pextrw $2, %xmm0, %eax +; CHECK-SSE2-O0-NEXT: shll $16, %eax +; CHECK-SSE2-O0-NEXT: movd %eax, %xmm1 +; CHECK-SSE2-O0-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE2-O0-NEXT: pextrw $1, %xmm0, %eax +; CHECK-SSE2-O0-NEXT: shll $16, %eax +; CHECK-SSE2-O0-NEXT: movd %eax, %xmm1 +; CHECK-SSE2-O0-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE2-O0-NEXT: movd %xmm0, %eax +; CHECK-SSE2-O0-NEXT: shll $16, %eax +; CHECK-SSE2-O0-NEXT: movd %eax, %xmm0 +; CHECK-SSE2-O0-NEXT: callq __truncsfbf2@PLT +; CHECK-SSE2-O0-NEXT: movaps %xmm0, %xmm1 +; CHECK-SSE2-O0-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-SSE2-O0-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-SSE2-O0-NEXT: pextrw $0, %xmm1, %eax +; CHECK-SSE2-O0-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-SSE2-O0-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; CHECK-SSE2-O0-NEXT: callq __truncsfbf2@PLT +; CHECK-SSE2-O0-NEXT: movaps %xmm0, %xmm1 +; CHECK-SSE2-O0-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-SSE2-O0-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-SSE2-O0-NEXT: pextrw $0, %xmm1, %eax +; CHECK-SSE2-O0-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-SSE2-O0-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; CHECK-SSE2-O0-NEXT: callq __truncsfbf2@PLT +; CHECK-SSE2-O0-NEXT: movaps %xmm0, %xmm1 +; CHECK-SSE2-O0-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte 
Reload +; CHECK-SSE2-O0-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-SSE2-O0-NEXT: pextrw $0, %xmm1, %eax +; CHECK-SSE2-O0-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-SSE2-O0-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; CHECK-SSE2-O0-NEXT: callq __truncsfbf2@PLT +; CHECK-SSE2-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; CHECK-SSE2-O0-NEXT: pextrw $0, %xmm0, %eax +; CHECK-SSE2-O0-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-SSE2-O0-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; CHECK-SSE2-O0-NEXT: movq {{[0-9]+}}(%rsp), %rax +; CHECK-SSE2-O0-NEXT: movq %rax, (%rdi) +; CHECK-SSE2-O0-NEXT: addq $40, %rsp +; CHECK-SSE2-O0-NEXT: retq +; +; CHECK-SSE4-O0-LABEL: store_atomic_vec4_bfloat: +; CHECK-SSE4-O0: # %bb.0: +; CHECK-SSE4-O0-NEXT: subq $40, %rsp +; CHECK-SSE4-O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-SSE4-O0-NEXT: pextrw $3, %xmm0, %eax +; CHECK-SSE4-O0-NEXT: shll $16, %eax +; CHECK-SSE4-O0-NEXT: movd %eax, %xmm1 +; CHECK-SSE4-O0-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE4-O0-NEXT: pextrw $2, %xmm0, %eax +; CHECK-SSE4-O0-NEXT: shll $16, %eax +; CHECK-SSE4-O0-NEXT: movd %eax, %xmm1 +; CHECK-SSE4-O0-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE4-O0-NEXT: pextrw $1, %xmm0, %eax +; CHECK-SSE4-O0-NEXT: shll $16, %eax +; CHECK-SSE4-O0-NEXT: movd %eax, %xmm1 +; CHECK-SSE4-O0-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE4-O0-NEXT: movd %xmm0, %eax +; CHECK-SSE4-O0-NEXT: shll $16, %eax +; CHECK-SSE4-O0-NEXT: movd %eax, %xmm0 +; CHECK-SSE4-O0-NEXT: callq __truncsfbf2@PLT +; CHECK-SSE4-O0-NEXT: movaps %xmm0, %xmm1 +; CHECK-SSE4-O0-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-SSE4-O0-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-SSE4-O0-NEXT: pextrw $0, %xmm1, {{[0-9]+}}(%rsp) +; CHECK-SSE4-O0-NEXT: callq __truncsfbf2@PLT +; CHECK-SSE4-O0-NEXT: movaps %xmm0, %xmm1 +; CHECK-SSE4-O0-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 
4-byte Reload +; CHECK-SSE4-O0-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-SSE4-O0-NEXT: pextrw $0, %xmm1, {{[0-9]+}}(%rsp) +; CHECK-SSE4-O0-NEXT: callq __truncsfbf2@PLT +; CHECK-SSE4-O0-NEXT: movaps %xmm0, %xmm1 +; CHECK-SSE4-O0-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-SSE4-O0-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-SSE4-O0-NEXT: pextrw $0, %xmm1, {{[0-9]+}}(%rsp) +; CHECK-SSE4-O0-NEXT: callq __truncsfbf2@PLT +; CHECK-SSE4-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; CHECK-SSE4-O0-NEXT: pextrw $0, %xmm0, {{[0-9]+}}(%rsp) +; CHECK-SSE4-O0-NEXT: movq {{[0-9]+}}(%rsp), %rax +; CHECK-SSE4-O0-NEXT: movq %rax, (%rdi) +; CHECK-SSE4-O0-NEXT: addq $40, %rsp +; CHECK-SSE4-O0-NEXT: retq ; ; CHECK-AVX-O0-LABEL: store_atomic_vec4_bfloat: ; CHECK-AVX-O0: # %bb.0: ; CHECK-AVX-O0-NEXT: subq $40, %rsp -; CHECK-AVX-O0-NEXT: vmovaps %xmm0, %xmm1 ; CHECK-AVX-O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-AVX-O0-NEXT: vpextrw $3, %xmm1, %eax -; CHECK-AVX-O0-NEXT: shll $16, %eax -; CHECK-AVX-O0-NEXT: vmovd %eax, %xmm0 -; CHECK-AVX-O0-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-AVX-O0-NEXT: vpextrw $2, %xmm1, %eax +; CHECK-AVX-O0-NEXT: vpextrw $3, %xmm0, %eax ; CHECK-AVX-O0-NEXT: shll $16, %eax -; CHECK-AVX-O0-NEXT: vmovd %eax, %xmm0 -; CHECK-AVX-O0-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-AVX-O0-NEXT: vpextrw $1, %xmm1, %eax +; CHECK-AVX-O0-NEXT: vmovd %eax, %xmm1 +; CHECK-AVX-O0-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-AVX-O0-NEXT: vpextrw $2, %xmm0, %eax ; CHECK-AVX-O0-NEXT: shll $16, %eax -; CHECK-AVX-O0-NEXT: vmovd %eax, %xmm0 -; CHECK-AVX-O0-NEXT: vmovd %xmm1, %eax +; CHECK-AVX-O0-NEXT: vmovd %eax, %xmm1 +; CHECK-AVX-O0-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-AVX-O0-NEXT: vpextrw $1, %xmm0, %eax ; CHECK-AVX-O0-NEXT: shll $16, %eax ; CHECK-AVX-O0-NEXT: vmovd %eax, %xmm1 ; CHECK-AVX-O0-NEXT: vmovss 
%xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-AVX-O0-NEXT: vmovd %xmm0, %eax +; CHECK-AVX-O0-NEXT: shll $16, %eax +; CHECK-AVX-O0-NEXT: vmovd %eax, %xmm0 ; CHECK-AVX-O0-NEXT: callq __truncsfbf2@PLT ; CHECK-AVX-O0-NEXT: vmovaps %xmm0, %xmm1 ; CHECK-AVX-O0-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload ; CHECK-AVX-O0-NEXT: # xmm0 = mem[0],zero,zero,zero -; CHECK-AVX-O0-NEXT: vpextrw $0, %xmm1, %eax -; CHECK-AVX-O0-NEXT: movw %ax, %cx -; CHECK-AVX-O0-NEXT: # implicit-def: $eax -; CHECK-AVX-O0-NEXT: movw %cx, %ax -; CHECK-AVX-O0-NEXT: shll $16, %eax -; CHECK-AVX-O0-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-AVX-O0-NEXT: vpextrw $0, %xmm1, {{[0-9]+}}(%rsp) ; CHECK-AVX-O0-NEXT: callq __truncsfbf2@PLT -; CHECK-AVX-O0-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload ; CHECK-AVX-O0-NEXT: vmovaps %xmm0, %xmm1 ; CHECK-AVX-O0-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload ; CHECK-AVX-O0-NEXT: # xmm0 = mem[0],zero,zero,zero -; CHECK-AVX-O0-NEXT: vpextrw $0, %xmm1, %eax -; CHECK-AVX-O0-NEXT: # kill: def $ax killed $ax killed $eax -; CHECK-AVX-O0-NEXT: movzwl %ax, %eax -; CHECK-AVX-O0-NEXT: orl %ecx, %eax -; CHECK-AVX-O0-NEXT: # kill: def $rax killed $eax -; CHECK-AVX-O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-AVX-O0-NEXT: vpextrw $0, %xmm1, {{[0-9]+}}(%rsp) ; CHECK-AVX-O0-NEXT: callq __truncsfbf2@PLT ; CHECK-AVX-O0-NEXT: vmovaps %xmm0, %xmm1 ; CHECK-AVX-O0-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload ; CHECK-AVX-O0-NEXT: # xmm0 = mem[0],zero,zero,zero -; CHECK-AVX-O0-NEXT: vpextrw $0, %xmm1, %eax -; CHECK-AVX-O0-NEXT: movw %ax, %cx -; CHECK-AVX-O0-NEXT: # implicit-def: $eax -; CHECK-AVX-O0-NEXT: movw %cx, %ax -; CHECK-AVX-O0-NEXT: shll $16, %eax -; CHECK-AVX-O0-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-AVX-O0-NEXT: vpextrw $0, %xmm1, {{[0-9]+}}(%rsp) ; CHECK-AVX-O0-NEXT: callq __truncsfbf2@PLT -; CHECK-AVX-O0-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), 
%ecx # 4-byte Reload -; CHECK-AVX-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; CHECK-AVX-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; CHECK-AVX-O0-NEXT: vpextrw $0, %xmm0, %edx -; CHECK-AVX-O0-NEXT: # kill: def $dx killed $dx killed $edx -; CHECK-AVX-O0-NEXT: movzwl %dx, %edx -; CHECK-AVX-O0-NEXT: orl %ecx, %edx -; CHECK-AVX-O0-NEXT: # implicit-def: $rcx -; CHECK-AVX-O0-NEXT: movl %edx, %ecx -; CHECK-AVX-O0-NEXT: shlq $32, %rcx -; CHECK-AVX-O0-NEXT: orq %rcx, %rax +; CHECK-AVX-O0-NEXT: vpextrw $0, %xmm0, {{[0-9]+}}(%rsp) +; CHECK-AVX-O0-NEXT: movq {{[0-9]+}}(%rsp), %rax ; CHECK-AVX-O0-NEXT: movq %rax, (%rdi) ; CHECK-AVX-O0-NEXT: addq $40, %rsp ; CHECK-AVX-O0-NEXT: retq >From 82d1e80b007bac2554ad0b475747999c340b54c6 Mon Sep 17 00:00:00 2001 From: jofrn <[email protected]> Date: Thu, 14 May 2026 20:42:42 -0700 Subject: [PATCH 2/2] [AtomicExpand] Add bitcasts when expanding store atomic vector AtomicExpand fails for aligned `store atomic <n x T>` because it does not find a compatible library call. This change adds appropriate ptrtoint + bitcast so that the call can be lowered, mirroring the load-side handling from #148900. 
--- llvm/lib/CodeGen/AtomicExpandPass.cpp | 6 +- llvm/test/CodeGen/ARM/atomic-load-store.ll | 49 ++++++++ llvm/test/CodeGen/X86/atomic-load-store.ll | 105 +++++++++++++++++- .../X86/expand-atomic-non-integer.ll | 98 ++++++++++++++++ 4 files changed, 250 insertions(+), 8 deletions(-) diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp index 960d2492c2856..db048e0c5ab5c 100644 --- a/llvm/lib/CodeGen/AtomicExpandPass.cpp +++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp @@ -711,7 +711,9 @@ StoreInst *AtomicExpandImpl::convertAtomicStoreToIntegerType(StoreInst *SI) { auto *M = SI->getModule(); Type *NewTy = getCorrespondingIntegerType(SI->getValueOperand()->getType(), M->getDataLayout()); - Value *NewVal = Builder.CreateBitCast(SI->getValueOperand(), NewTy); + Value *NewVal = SI->getValueOperand()->getType()->isPtrOrPtrVectorTy() + ? Builder.CreatePtrToInt(SI->getValueOperand(), NewTy) + : Builder.CreateBitCast(SI->getValueOperand(), NewTy); Value *Addr = SI->getPointerOperand(); @@ -2191,7 +2193,7 @@ bool AtomicExpandImpl::expandAtomicOpToLibcall( if (ValueOperand) { if (UseSizedLibcall) { Value *IntValue = - Builder.CreateBitOrPointerCast(ValueOperand, SizedIntTy); + Builder.CreateBitPreservingCastChain(DL, ValueOperand, SizedIntTy); Args.push_back(IntValue); } else { AllocaValue = AllocaBuilder.CreateAlloca(ValueOperand->getType()); diff --git a/llvm/test/CodeGen/ARM/atomic-load-store.ll b/llvm/test/CodeGen/ARM/atomic-load-store.ll index 1af2832702296..0c787a4ca05c3 100644 --- a/llvm/test/CodeGen/ARM/atomic-load-store.ll +++ b/llvm/test/CodeGen/ARM/atomic-load-store.ll @@ -1038,3 +1038,52 @@ define <1 x ptr> @atomic_vec1_ptr(ptr %x) #0 { %ret = load atomic <1 x ptr>, ptr %x acquire, align 4 ret <1 x ptr> %ret } + +define void @store_atomic_vec1_ptr(ptr %x, <1 x ptr> %v) #0 { +; ARM-LABEL: store_atomic_vec1_ptr: +; ARM: @ %bb.0: +; ARM-NEXT: dmb ish +; ARM-NEXT: str r1, [r0] +; ARM-NEXT: bx lr +; +; ARMOPTNONE-LABEL: 
store_atomic_vec1_ptr: +; ARMOPTNONE: @ %bb.0: +; ARMOPTNONE-NEXT: dmb ish +; ARMOPTNONE-NEXT: str r1, [r0] +; ARMOPTNONE-NEXT: bx lr +; +; THUMBTWO-LABEL: store_atomic_vec1_ptr: +; THUMBTWO: @ %bb.0: +; THUMBTWO-NEXT: dmb ish +; THUMBTWO-NEXT: str r1, [r0] +; THUMBTWO-NEXT: bx lr +; +; THUMBONE-LABEL: store_atomic_vec1_ptr: +; THUMBONE: @ %bb.0: +; THUMBONE-NEXT: push {r7, lr} +; THUMBONE-NEXT: bl __sync_lock_test_and_set_4 +; THUMBONE-NEXT: pop {r7, pc} +; +; ARMV4-LABEL: store_atomic_vec1_ptr: +; ARMV4: @ %bb.0: +; ARMV4-NEXT: push {r11, lr} +; ARMV4-NEXT: mov r2, #3 +; ARMV4-NEXT: bl __atomic_store_4 +; ARMV4-NEXT: pop {r11, lr} +; ARMV4-NEXT: mov pc, lr +; +; ARMV6-LABEL: store_atomic_vec1_ptr: +; ARMV6: @ %bb.0: +; ARMV6-NEXT: mov r2, #0 +; ARMV6-NEXT: mcr p15, #0, r2, c7, c10, #5 +; ARMV6-NEXT: str r1, [r0] +; ARMV6-NEXT: bx lr +; +; THUMBM-LABEL: store_atomic_vec1_ptr: +; THUMBM: @ %bb.0: +; THUMBM-NEXT: dmb sy +; THUMBM-NEXT: str r1, [r0] +; THUMBM-NEXT: bx lr + store atomic <1 x ptr> %v, ptr %x release, align 4 + ret void +} diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll b/llvm/test/CodeGen/X86/atomic-load-store.ll index 1ac3066a393a8..47f8a15fdf267 100644 --- a/llvm/test/CodeGen/X86/atomic-load-store.ll +++ b/llvm/test/CodeGen/X86/atomic-load-store.ll @@ -657,6 +657,53 @@ define <2 x ptr> @atomic_vec2_ptr_align(ptr %x) nounwind { %ret = load atomic <2 x ptr>, ptr %x acquire, align 16 ret <2 x ptr> %ret } + +define void @store_atomic_vec2_ptr_align(ptr %x, <2 x ptr> %v) nounwind { +; CHECK-SSE2-O3-LABEL: store_atomic_vec2_ptr_align: +; CHECK-SSE2-O3: # %bb.0: +; CHECK-SSE2-O3-NEXT: pushq %rax +; CHECK-SSE2-O3-NEXT: movq %xmm0, %rsi +; CHECK-SSE2-O3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; CHECK-SSE2-O3-NEXT: movq %xmm0, %rdx +; CHECK-SSE2-O3-NEXT: movl $3, %ecx +; CHECK-SSE2-O3-NEXT: callq __atomic_store_16@PLT +; CHECK-SSE2-O3-NEXT: popq %rax +; CHECK-SSE2-O3-NEXT: retq +; +; CHECK-SSE4-O3-LABEL: store_atomic_vec2_ptr_align: +; 
CHECK-SSE4-O3: # %bb.0: +; CHECK-SSE4-O3-NEXT: movaps %xmm0, (%rdi) +; CHECK-SSE4-O3-NEXT: retq +; +; CHECK-AVX-O3-LABEL: store_atomic_vec2_ptr_align: +; CHECK-AVX-O3: # %bb.0: +; CHECK-AVX-O3-NEXT: vmovaps %xmm0, (%rdi) +; CHECK-AVX-O3-NEXT: retq +; +; CHECK-SSE2-O0-LABEL: store_atomic_vec2_ptr_align: +; CHECK-SSE2-O0: # %bb.0: +; CHECK-SSE2-O0-NEXT: pushq %rax +; CHECK-SSE2-O0-NEXT: movq %xmm0, %rsi +; CHECK-SSE2-O0-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; CHECK-SSE2-O0-NEXT: movq %xmm0, %rdx +; CHECK-SSE2-O0-NEXT: movl $3, %ecx +; CHECK-SSE2-O0-NEXT: callq __atomic_store_16@PLT +; CHECK-SSE2-O0-NEXT: popq %rax +; CHECK-SSE2-O0-NEXT: retq +; +; CHECK-SSE4-O0-LABEL: store_atomic_vec2_ptr_align: +; CHECK-SSE4-O0: # %bb.0: +; CHECK-SSE4-O0-NEXT: movaps %xmm0, (%rdi) +; CHECK-SSE4-O0-NEXT: retq +; +; CHECK-AVX-O0-LABEL: store_atomic_vec2_ptr_align: +; CHECK-AVX-O0: # %bb.0: +; CHECK-AVX-O0-NEXT: vmovdqa %xmm0, (%rdi) +; CHECK-AVX-O0-NEXT: retq + store atomic <2 x ptr> %v, ptr %x release, align 16 + ret void +} + define <4 x ptr addrspace(270)> @atomic_vec4_ptr270(ptr %x) nounwind { ; CHECK-SSE2-O3-LABEL: atomic_vec4_ptr270: ; CHECK-SSE2-O3: # %bb.0: @@ -703,6 +750,52 @@ define <4 x ptr addrspace(270)> @atomic_vec4_ptr270(ptr %x) nounwind { ret <4 x ptr addrspace(270)> %ret } +define void @store_atomic_vec4_ptr270_align(ptr %x, <4 x ptr addrspace(270)> %v) nounwind { +; CHECK-SSE2-O3-LABEL: store_atomic_vec4_ptr270_align: +; CHECK-SSE2-O3: # %bb.0: +; CHECK-SSE2-O3-NEXT: pushq %rax +; CHECK-SSE2-O3-NEXT: movq %xmm0, %rsi +; CHECK-SSE2-O3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; CHECK-SSE2-O3-NEXT: movq %xmm0, %rdx +; CHECK-SSE2-O3-NEXT: movl $3, %ecx +; CHECK-SSE2-O3-NEXT: callq __atomic_store_16@PLT +; CHECK-SSE2-O3-NEXT: popq %rax +; CHECK-SSE2-O3-NEXT: retq +; +; CHECK-SSE4-O3-LABEL: store_atomic_vec4_ptr270_align: +; CHECK-SSE4-O3: # %bb.0: +; CHECK-SSE4-O3-NEXT: movaps %xmm0, (%rdi) +; CHECK-SSE4-O3-NEXT: retq +; +; CHECK-AVX-O3-LABEL: 
store_atomic_vec4_ptr270_align: +; CHECK-AVX-O3: # %bb.0: +; CHECK-AVX-O3-NEXT: vmovaps %xmm0, (%rdi) +; CHECK-AVX-O3-NEXT: retq +; +; CHECK-SSE2-O0-LABEL: store_atomic_vec4_ptr270_align: +; CHECK-SSE2-O0: # %bb.0: +; CHECK-SSE2-O0-NEXT: pushq %rax +; CHECK-SSE2-O0-NEXT: movq %xmm0, %rsi +; CHECK-SSE2-O0-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; CHECK-SSE2-O0-NEXT: movq %xmm0, %rdx +; CHECK-SSE2-O0-NEXT: movl $3, %ecx +; CHECK-SSE2-O0-NEXT: callq __atomic_store_16@PLT +; CHECK-SSE2-O0-NEXT: popq %rax +; CHECK-SSE2-O0-NEXT: retq +; +; CHECK-SSE4-O0-LABEL: store_atomic_vec4_ptr270_align: +; CHECK-SSE4-O0: # %bb.0: +; CHECK-SSE4-O0-NEXT: movaps %xmm0, (%rdi) +; CHECK-SSE4-O0-NEXT: retq +; +; CHECK-AVX-O0-LABEL: store_atomic_vec4_ptr270_align: +; CHECK-AVX-O0: # %bb.0: +; CHECK-AVX-O0-NEXT: vmovdqa %xmm0, (%rdi) +; CHECK-AVX-O0-NEXT: retq + store atomic <4 x ptr addrspace(270)> %v, ptr %x release, align 16 + ret void +} + define <2 x i32> @atomic_vec2_i32_align(ptr %x) { ; CHECK-SSE-O3-LABEL: atomic_vec2_i32_align: ; CHECK-SSE-O3: # %bb.0: @@ -1084,7 +1177,7 @@ define void @store_atomic_vec4_float_align(ptr %x, <4 x float> %v) nounwind { ; CHECK-SSE4-O3-NEXT: pextrq $1, %xmm0, %rcx ; CHECK-SSE4-O3-NEXT: movq %xmm0, %rbx ; CHECK-SSE4-O3-NEXT: .p2align 4 -; CHECK-SSE4-O3-NEXT: .LBB39_1: # %atomicrmw.start +; CHECK-SSE4-O3-NEXT: .LBB41_1: # %atomicrmw.start ; CHECK-SSE4-O3-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-SSE4-O3-NEXT: movq %xmm1, %rax ; CHECK-SSE4-O3-NEXT: pextrq $1, %xmm1, %rdx @@ -1092,7 +1185,7 @@ define void @store_atomic_vec4_float_align(ptr %x, <4 x float> %v) nounwind { ; CHECK-SSE4-O3-NEXT: movq %rdx, %xmm0 ; CHECK-SSE4-O3-NEXT: movq %rax, %xmm1 ; CHECK-SSE4-O3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; CHECK-SSE4-O3-NEXT: jne .LBB39_1 +; CHECK-SSE4-O3-NEXT: jne .LBB41_1 ; CHECK-SSE4-O3-NEXT: # %bb.2: # %atomicrmw.end ; CHECK-SSE4-O3-NEXT: popq %rbx ; CHECK-SSE4-O3-NEXT: retq @@ -1120,7 +1213,7 @@ define void 
@store_atomic_vec4_float_align(ptr %x, <4 x float> %v) nounwind { ; CHECK-SSE4-O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-SSE4-O0-NEXT: movaps (%rdi), %xmm0 ; CHECK-SSE4-O0-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-SSE4-O0-NEXT: .LBB39_1: # %atomicrmw.start +; CHECK-SSE4-O0-NEXT: .LBB41_1: # %atomicrmw.start ; CHECK-SSE4-O0-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-SSE4-O0-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-SSE4-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload @@ -1137,9 +1230,9 @@ define void @store_atomic_vec4_float_align(ptr %x, <4 x float> %v) nounwind { ; CHECK-SSE4-O0-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-SSE4-O0-NEXT: testb $1, %al ; CHECK-SSE4-O0-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-SSE4-O0-NEXT: jne .LBB39_2 -; CHECK-SSE4-O0-NEXT: jmp .LBB39_1 -; CHECK-SSE4-O0-NEXT: .LBB39_2: # %atomicrmw.end +; CHECK-SSE4-O0-NEXT: jne .LBB41_2 +; CHECK-SSE4-O0-NEXT: jmp .LBB41_1 +; CHECK-SSE4-O0-NEXT: .LBB41_2: # %atomicrmw.end ; CHECK-SSE4-O0-NEXT: popq %rbx ; CHECK-SSE4-O0-NEXT: retq ; diff --git a/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-non-integer.ll b/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-non-integer.ll index 17c20e211e508..46cc641246610 100644 --- a/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-non-integer.ll +++ b/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-non-integer.ll @@ -454,3 +454,101 @@ define void @store_i128_volatile_syncscope(ptr %p, i128 %x) { store atomic volatile i128 %x, ptr %p syncscope("singlethread") seq_cst, align 16 ret void } + +define void @store_atomic_vec2_ptr_align(ptr %x, <2 x ptr> %v) nounwind { +; CHECK64-LABEL: define void @store_atomic_vec2_ptr_align( +; CHECK64-SAME: ptr [[X:%.*]], <2 x ptr> [[V:%.*]]) #[[ATTR0]] { +; CHECK64-NEXT: [[TMP1:%.*]] = ptrtoint <2 x ptr> [[V]] to <2 x i64> +; CHECK64-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> 
[[TMP1]] to i128 +; CHECK64-NEXT: call void @__atomic_store_16(ptr [[X]], i128 [[TMP2]], i32 3) +; CHECK64-NEXT: ret void +; +; CHECK32-LABEL: define void @store_atomic_vec2_ptr_align( +; CHECK32-SAME: ptr [[X:%.*]], <2 x ptr> [[V:%.*]]) #[[ATTR0]] { +; CHECK32-NEXT: store atomic <2 x ptr> [[V]], ptr [[X]] release, align 16 +; CHECK32-NEXT: ret void +; + store atomic <2 x ptr> %v, ptr %x release, align 16 + ret void +} + +define void @store_atomic_vec4_ptr270_align(ptr %x, <4 x ptr addrspace(270)> %v) nounwind { +; CHECK64-LABEL: define void @store_atomic_vec4_ptr270_align( +; CHECK64-SAME: ptr [[X:%.*]], <4 x ptr addrspace(270)> [[V:%.*]]) #[[ATTR0]] { +; CHECK64-NEXT: [[TMP1:%.*]] = ptrtoint <4 x ptr addrspace(270)> [[V]] to <4 x i32> +; CHECK64-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK64-NEXT: call void @__atomic_store_16(ptr [[X]], i128 [[TMP2]], i32 3) +; CHECK64-NEXT: ret void +; +; CHECK32-LABEL: define void @store_atomic_vec4_ptr270_align( +; CHECK32-SAME: ptr [[X:%.*]], <4 x ptr addrspace(270)> [[V:%.*]]) #[[ATTR0]] { +; CHECK32-NEXT: [[TMP1:%.*]] = alloca <4 x ptr addrspace(270)>, align 16 +; CHECK32-NEXT: call void @llvm.lifetime.start.p0(ptr [[TMP1]]) +; CHECK32-NEXT: store <4 x ptr addrspace(270)> [[V]], ptr [[TMP1]], align 16 +; CHECK32-NEXT: call void @__atomic_store(i32 16, ptr [[X]], ptr [[TMP1]], i32 3) +; CHECK32-NEXT: call void @llvm.lifetime.end.p0(ptr [[TMP1]]) +; CHECK32-NEXT: ret void +; + store atomic <4 x ptr addrspace(270)> %v, ptr %x release, align 16 + ret void +} + +define void @store_atomic_vec2_i16(ptr %x, <2 x i16> %v) nounwind { +; CHECK-LABEL: define void @store_atomic_vec2_i16( +; CHECK-SAME: ptr [[X:%.*]], <2 x i16> [[V:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: store atomic <2 x i16> [[V]], ptr [[X]] release, align 8 +; CHECK-NEXT: ret void +; + store atomic <2 x i16> %v, ptr %x release, align 8 + ret void +} + +define void @store_atomic_vec2_half(ptr %x, <2 x half> %v) nounwind { +; CHECK-LABEL: define void 
@store_atomic_vec2_half( +; CHECK-SAME: ptr [[X:%.*]], <2 x half> [[V:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: store atomic <2 x half> [[V]], ptr [[X]] release, align 8 +; CHECK-NEXT: ret void +; + store atomic <2 x half> %v, ptr %x release, align 8 + ret void +} + +define void @store_atomic_vec4_i32(ptr %x, <4 x i32> %v) nounwind { +; CHECK64-LABEL: define void @store_atomic_vec4_i32( +; CHECK64-SAME: ptr [[X:%.*]], <4 x i32> [[V:%.*]]) #[[ATTR0]] { +; CHECK64-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V]] to i128 +; CHECK64-NEXT: call void @__atomic_store_16(ptr [[X]], i128 [[TMP1]], i32 3) +; CHECK64-NEXT: ret void +; +; CHECK32-LABEL: define void @store_atomic_vec4_i32( +; CHECK32-SAME: ptr [[X:%.*]], <4 x i32> [[V:%.*]]) #[[ATTR0]] { +; CHECK32-NEXT: [[TMP1:%.*]] = alloca <4 x i32>, align 16 +; CHECK32-NEXT: call void @llvm.lifetime.start.p0(ptr [[TMP1]]) +; CHECK32-NEXT: store <4 x i32> [[V]], ptr [[TMP1]], align 16 +; CHECK32-NEXT: call void @__atomic_store(i32 16, ptr [[X]], ptr [[TMP1]], i32 3) +; CHECK32-NEXT: call void @llvm.lifetime.end.p0(ptr [[TMP1]]) +; CHECK32-NEXT: ret void +; + store atomic <4 x i32> %v, ptr %x release, align 16 + ret void +} + +define void @store_atomic_vec4_float(ptr %x, <4 x float> %v) nounwind { +; CHECK64-LABEL: define void @store_atomic_vec4_float( +; CHECK64-SAME: ptr [[X:%.*]], <4 x float> [[V:%.*]]) #[[ATTR0]] { +; CHECK64-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[V]] to i128 +; CHECK64-NEXT: call void @__atomic_store_16(ptr [[X]], i128 [[TMP1]], i32 3) +; CHECK64-NEXT: ret void +; +; CHECK32-LABEL: define void @store_atomic_vec4_float( +; CHECK32-SAME: ptr [[X:%.*]], <4 x float> [[V:%.*]]) #[[ATTR0]] { +; CHECK32-NEXT: [[TMP1:%.*]] = alloca <4 x float>, align 16 +; CHECK32-NEXT: call void @llvm.lifetime.start.p0(ptr [[TMP1]]) +; CHECK32-NEXT: store <4 x float> [[V]], ptr [[TMP1]], align 16 +; CHECK32-NEXT: call void @__atomic_store(i32 16, ptr [[X]], ptr [[TMP1]], i32 3) +; CHECK32-NEXT: call void @llvm.lifetime.end.p0(ptr 
[[TMP1]]) +; CHECK32-NEXT: ret void +; + store atomic <4 x float> %v, ptr %x release, align 16 + ret void +} _______________________________________________ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
