[PATCH] D99675: [llvm][clang] Create new intrinsic llvm.arith.fence to control FP optimization at expression level
pengfei added inline comments. Comment at: llvm/docs/LangRef.rst:21195 +'``llvm.arithmetic_fence``' Intrinsic +^^ + pengfei wrote: > Should be equal to the text? Yeah, a good catch. But I initially meant `^^^` should be equal to the title. :) Repository: rG LLVM Github Monorepo CHANGES SINCE LAST ACTION https://reviews.llvm.org/D99675/new/ https://reviews.llvm.org/D99675 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D99675: [llvm][clang] Create new intrinsic llvm.arith.fence to control FP optimization at expression level
mibintc updated this revision to Diff 350700. mibintc added a comment. I corrected error in LangRef documentation that @pengfei pointed out. Repository: rG LLVM Github Monorepo CHANGES SINCE LAST ACTION https://reviews.llvm.org/D99675/new/ https://reviews.llvm.org/D99675 Files: llvm/docs/LangRef.rst llvm/include/llvm/Analysis/TargetTransformInfoImpl.h llvm/include/llvm/CodeGen/BasicTTIImpl.h llvm/include/llvm/CodeGen/ISDOpcodes.h llvm/include/llvm/CodeGen/SelectionDAGISel.h llvm/include/llvm/IR/IRBuilder.h llvm/include/llvm/IR/Intrinsics.td llvm/include/llvm/Support/TargetOpcodes.def llvm/include/llvm/Target/Target.td llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp llvm/test/CodeGen/X86/arithmetic_fence.ll llvm/test/CodeGen/X86/arithmetic_fence2.ll Index: llvm/test/CodeGen/X86/arithmetic_fence2.ll === --- /dev/null +++ llvm/test/CodeGen/X86/arithmetic_fence2.ll @@ -0,0 +1,170 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64 + +define double @f1(double %a) { +; X86-LABEL: f1: +; X86: # %bb.0: +; X86-NEXT:pushl %ebp +; X86-NEXT:.cfi_def_cfa_offset 8 +; X86-NEXT:.cfi_offset %ebp, -8 +; X86-NEXT:movl %esp, %ebp +; X86-NEXT:.cfi_def_cfa_register %ebp +; X86-NEXT:andl $-8, %esp +; X86-NEXT:subl $8, %esp +; X86-NEXT:movsd {{.*#+}} xmm0 = mem[0],zero +; X86-NEXT:mulsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-NEXT:movsd %xmm0, (%esp) +; X86-NEXT:fldl (%esp) +; X86-NEXT:movl %ebp, %esp +; X86-NEXT:popl %ebp +; X86-NEXT:.cfi_def_cfa %esp, 4 +; X86-NEXT:retl +; +; X64-LABEL: f1: +; X64: # %bb.0: +; X64-NEXT:mulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-NEXT:retq + %1 = fadd fast double %a, %a + %2 = fadd 
fast double %a, %a + %3 = fadd fast double %1, %2 + ret double %3 +} + +define double @f2(double %a) { +; X86-LABEL: f2: +; X86: # %bb.0: +; X86-NEXT:pushl %ebp +; X86-NEXT:.cfi_def_cfa_offset 8 +; X86-NEXT:.cfi_offset %ebp, -8 +; X86-NEXT:movl %esp, %ebp +; X86-NEXT:.cfi_def_cfa_register %ebp +; X86-NEXT:andl $-8, %esp +; X86-NEXT:subl $8, %esp +; X86-NEXT:movsd {{.*#+}} xmm0 = mem[0],zero +; X86-NEXT:addsd %xmm0, %xmm0 +; X86-NEXT:movapd %xmm0, %xmm1 +; X86-NEXT:#ARITH_FENCE +; X86-NEXT:addsd %xmm0, %xmm1 +; X86-NEXT:movsd %xmm1, (%esp) +; X86-NEXT:fldl (%esp) +; X86-NEXT:movl %ebp, %esp +; X86-NEXT:popl %ebp +; X86-NEXT:.cfi_def_cfa %esp, 4 +; X86-NEXT:retl +; +; X64-LABEL: f2: +; X64: # %bb.0: +; X64-NEXT:addsd %xmm0, %xmm0 +; X64-NEXT:movapd %xmm0, %xmm1 +; X64-NEXT:#ARITH_FENCE +; X64-NEXT:addsd %xmm0, %xmm1 +; X64-NEXT:movapd %xmm1, %xmm0 +; X64-NEXT:retq + %1 = fadd fast double %a, %a + %t = call double @llvm.arithmetic.fence.f64(double %1) + %2 = fadd fast double %a, %a + %3 = fadd fast double %t, %2 + ret double %3 +} + +define <2 x float> @f3(<2 x float> %a) { +; X86-LABEL: f3: +; X86: # %bb.0: +; X86-NEXT:mulps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-NEXT:retl +; +; X64-LABEL: f3: +; X64: # %bb.0: +; X64-NEXT:mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-NEXT:retq + %1 = fadd fast <2 x float> %a, %a + %2 = fadd fast <2 x float> %a, %a + %3 = fadd fast <2 x float> %1, %2 + ret <2 x float> %3 +} + +define <2 x float> @f4(<2 x float> %a) { +; X86-LABEL: f4: +; X86: # %bb.0: +; X86-NEXT:addps %xmm0, %xmm0 +; X86-NEXT:movaps %xmm0, %xmm1 +; X86-NEXT:#ARITH_FENCE +; X86-NEXT:addps %xmm0, %xmm1 +; X86-NEXT:movaps %xmm1, %xmm0 +; X86-NEXT:retl +; +; X64-LABEL: f4: +; X64: # %bb.0: +; X64-NEXT:addps %xmm0, %xmm0 +; X64-NEXT:movaps %xmm0, %xmm1 +; X64-NEXT:#ARITH_FENCE +; X64-NEXT:addps %xmm0, %xmm1 +; X64-NEXT:movaps %xmm1, %xmm0 +; X64-NEXT:retq + %1 = fadd fast <2 x float> %a, %a + %t = call <2 x float> @llvm.arithmetic.fence.v2f32(<2 x float> %1) + %2 = 
fadd fast <2 x float> %a, %a + %3 = fadd fast <2 x float> %t, %2 + ret <2 x float> %3 +} + +define <8 x float> @f5(<8 x float> %a) { +; X86-LABEL: f5: +; X86: # %bb.0: +; X86-NEXT:movaps {{.*#+}} xmm2 = [4.0E+0,4.0E+0,4.0E+0,4.0E+0] +; X86-NEXT:mulps %xmm2, %xmm0 +; X86-NEXT:mulps %xmm2, %xmm1 +; X86-NEXT:retl +; +; X64-LABEL: f5: +; X64: # %bb.0: +; X64-NEXT:movaps {{.*#+}} xmm2 = [4.0E+0,4.0E+0,4.0E+0,4.0E+0] +; X64-NEXT:mulps %xmm2, %xmm0 +; X64-NEXT:mulps %xmm2, %xmm1 +; X64-NEXT:retq + %1 = fadd fast <8 x float> %a, %a + %2 = fadd fast <8 x
[PATCH] D99675: [llvm][clang] Create new intrinsic llvm.arith.fence to control FP optimization at expression level
pengfei added inline comments. Comment at: llvm/docs/LangRef.rst:21195 +'``llvm.arithmetic_fence``' Intrinsic +^^ + Should be equal to the text? Repository: rG LLVM Github Monorepo CHANGES SINCE LAST ACTION https://reviews.llvm.org/D99675/new/ https://reviews.llvm.org/D99675 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D99675: [llvm][clang] Create new intrinsic llvm.arith.fence to control FP optimization at expression level
mibintc updated this revision to Diff 350307. mibintc added a comment. This patch addresses all of @craig.topper comments and adds documentation for the new intrinsic to the language reference as requested by @LuoYuanke Repository: rG LLVM Github Monorepo CHANGES SINCE LAST ACTION https://reviews.llvm.org/D99675/new/ https://reviews.llvm.org/D99675 Files: llvm/docs/LangRef.rst llvm/include/llvm/Analysis/TargetTransformInfoImpl.h llvm/include/llvm/CodeGen/BasicTTIImpl.h llvm/include/llvm/CodeGen/ISDOpcodes.h llvm/include/llvm/CodeGen/SelectionDAGISel.h llvm/include/llvm/IR/IRBuilder.h llvm/include/llvm/IR/Intrinsics.td llvm/include/llvm/Support/TargetOpcodes.def llvm/include/llvm/Target/Target.td llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp llvm/test/CodeGen/X86/arithmetic_fence.ll llvm/test/CodeGen/X86/arithmetic_fence2.ll Index: llvm/test/CodeGen/X86/arithmetic_fence2.ll === --- /dev/null +++ llvm/test/CodeGen/X86/arithmetic_fence2.ll @@ -0,0 +1,170 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64 + +define double @f1(double %a) { +; X86-LABEL: f1: +; X86: # %bb.0: +; X86-NEXT:pushl %ebp +; X86-NEXT:.cfi_def_cfa_offset 8 +; X86-NEXT:.cfi_offset %ebp, -8 +; X86-NEXT:movl %esp, %ebp +; X86-NEXT:.cfi_def_cfa_register %ebp +; X86-NEXT:andl $-8, %esp +; X86-NEXT:subl $8, %esp +; X86-NEXT:movsd {{.*#+}} xmm0 = mem[0],zero +; X86-NEXT:mulsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-NEXT:movsd %xmm0, (%esp) +; X86-NEXT:fldl (%esp) +; X86-NEXT:movl %ebp, %esp +; X86-NEXT:popl %ebp +; X86-NEXT:.cfi_def_cfa %esp, 4 +; X86-NEXT:retl +; +; X64-LABEL: f1: +; X64: # %bb.0: +; X64-NEXT:mulsd 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-NEXT:retq + %1 = fadd fast double %a, %a + %2 = fadd fast double %a, %a + %3 = fadd fast double %1, %2 + ret double %3 +} + +define double @f2(double %a) { +; X86-LABEL: f2: +; X86: # %bb.0: +; X86-NEXT:pushl %ebp +; X86-NEXT:.cfi_def_cfa_offset 8 +; X86-NEXT:.cfi_offset %ebp, -8 +; X86-NEXT:movl %esp, %ebp +; X86-NEXT:.cfi_def_cfa_register %ebp +; X86-NEXT:andl $-8, %esp +; X86-NEXT:subl $8, %esp +; X86-NEXT:movsd {{.*#+}} xmm0 = mem[0],zero +; X86-NEXT:addsd %xmm0, %xmm0 +; X86-NEXT:movapd %xmm0, %xmm1 +; X86-NEXT:#ARITH_FENCE +; X86-NEXT:addsd %xmm0, %xmm1 +; X86-NEXT:movsd %xmm1, (%esp) +; X86-NEXT:fldl (%esp) +; X86-NEXT:movl %ebp, %esp +; X86-NEXT:popl %ebp +; X86-NEXT:.cfi_def_cfa %esp, 4 +; X86-NEXT:retl +; +; X64-LABEL: f2: +; X64: # %bb.0: +; X64-NEXT:addsd %xmm0, %xmm0 +; X64-NEXT:movapd %xmm0, %xmm1 +; X64-NEXT:#ARITH_FENCE +; X64-NEXT:addsd %xmm0, %xmm1 +; X64-NEXT:movapd %xmm1, %xmm0 +; X64-NEXT:retq + %1 = fadd fast double %a, %a + %t = call double @llvm.arithmetic.fence.f64(double %1) + %2 = fadd fast double %a, %a + %3 = fadd fast double %t, %2 + ret double %3 +} + +define <2 x float> @f3(<2 x float> %a) { +; X86-LABEL: f3: +; X86: # %bb.0: +; X86-NEXT:mulps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-NEXT:retl +; +; X64-LABEL: f3: +; X64: # %bb.0: +; X64-NEXT:mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-NEXT:retq + %1 = fadd fast <2 x float> %a, %a + %2 = fadd fast <2 x float> %a, %a + %3 = fadd fast <2 x float> %1, %2 + ret <2 x float> %3 +} + +define <2 x float> @f4(<2 x float> %a) { +; X86-LABEL: f4: +; X86: # %bb.0: +; X86-NEXT:addps %xmm0, %xmm0 +; X86-NEXT:movaps %xmm0, %xmm1 +; X86-NEXT:#ARITH_FENCE +; X86-NEXT:addps %xmm0, %xmm1 +; X86-NEXT:movaps %xmm1, %xmm0 +; X86-NEXT:retl +; +; X64-LABEL: f4: +; X64: # %bb.0: +; X64-NEXT:addps %xmm0, %xmm0 +; X64-NEXT:movaps %xmm0, %xmm1 +; X64-NEXT:#ARITH_FENCE +; X64-NEXT:addps %xmm0, %xmm1 +; X64-NEXT:movaps %xmm1, %xmm0 +; X64-NEXT:retq + %1 = fadd 
fast <2 x float> %a, %a + %t = call <2 x float> @llvm.arithmetic.fence.v2f32(<2 x float> %1) + %2 = fadd fast <2 x float> %a, %a + %3 = fadd fast <2 x float> %t, %2 + ret <2 x float> %3 +} + +define <8 x float> @f5(<8 x float> %a) { +; X86-LABEL: f5: +; X86: # %bb.0: +; X86-NEXT:movaps {{.*#+}} xmm2 = [4.0E+0,4.0E+0,4.0E+0,4.0E+0] +; X86-NEXT:mulps %xmm2, %xmm0 +; X86-NEXT:mulps %xmm2, %xmm1 +; X86-NEXT:retl +; +; X64-LABEL: f5: +; X64: # %bb.0: +; X64-NEXT:movaps {{.*#+}} xmm2 = [4.0E+0,4.0E+0,4.0E+0,4.0E+0] +; X64-NEXT:mulps %xmm2, %xmm0 +; X64-NEXT:mulps %xmm2, %xmm1
[PATCH] D99675: [llvm][clang] Create new intrinsic llvm.arith.fence to control FP optimization at expression level
craig.topper added inline comments. Comment at: llvm/include/llvm/IR/IRBuilder.h:903 + const Twine &Name = "") { +return CreateIntrinsic(Intrinsic::arithmetic_fence, {DstType}, {Val}, nullptr, + Name); Do you really need curly braces around DstType and Val? A single value should be implicitly convertible to ArrayRef. Comment at: llvm/include/llvm/IR/Intrinsics.td:1333 + + // Intrinsics to support half precision floating point format // Intrinsics to support half precision floating point format This comment got duplicated. Comment at: llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp:1332 + case TargetOpcode::ARITH_FENCE: +OutStreamer->emitRawComment("ARITH_FENCE"); +break; I think you should check isVerbose() before printing this. Comment at: llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp:3149 case ISD::FREEZE: + case ISD::ARITH_FENCE: case ISD::FCANONICALIZE: What about splitting a vector like v8f32 on SSE2? Comment at: llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp:6296 + case Intrinsic::arithmetic_fence: { +auto DL = getCurSDLoc(); + There's already a variable called sdl that contains this. It's used in the surrounding cases. Comment at: llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp:6299 +SDValue Val = getValue(I.getArgOperand(0)); +EVT ResultVT = TLI.getValueType(DAG.getDataLayout(), I.getType()); + Why isn't this just Val.getValueType()? Repository: rG LLVM Github Monorepo CHANGES SINCE LAST ACTION https://reviews.llvm.org/D99675/new/ https://reviews.llvm.org/D99675 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D99675: [llvm][clang] Create new intrinsic llvm.arith.fence to control FP optimization at expression level
LuoYuanke added a comment. We may add description on the intrinsic in docs/LangRef.rst. Repository: rG LLVM Github Monorepo CHANGES SINCE LAST ACTION https://reviews.llvm.org/D99675/new/ https://reviews.llvm.org/D99675 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D99675: [llvm][clang] Create new intrinsic llvm.arith.fence to control FP optimization at expression level
mibintc updated this revision to Diff 348046. mibintc retitled this revision from "RFC [llvm][clang] Create new intrinsic llvm.arith.fence to control FP optimization at expression level" to "[llvm][clang] Create new intrinsic llvm.arith.fence to control FP optimization at expression level". mibintc edited the summary of this revision. mibintc added a comment. Rebased to ToT. It fixes the previous illegal type lowering problems. It also updates the tests to show the functionality in a better way as well as fixes a newly found problem. Ready for your code review and +1 We think this patch provides basic functionality for the intrinsic, and enhancements can be added in future patches. Thanks! Repository: rG LLVM Github Monorepo CHANGES SINCE LAST ACTION https://reviews.llvm.org/D99675/new/ https://reviews.llvm.org/D99675 Files: llvm/include/llvm/Analysis/TargetTransformInfoImpl.h llvm/include/llvm/CodeGen/BasicTTIImpl.h llvm/include/llvm/CodeGen/ISDOpcodes.h llvm/include/llvm/CodeGen/SelectionDAGISel.h llvm/include/llvm/IR/IRBuilder.h llvm/include/llvm/IR/Intrinsics.td llvm/include/llvm/Support/TargetOpcodes.def llvm/include/llvm/Target/Target.td llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp llvm/test/CodeGen/X86/arithmetic_fence.ll Index: llvm/test/CodeGen/X86/arithmetic_fence.ll === --- /dev/null +++ llvm/test/CodeGen/X86/arithmetic_fence.ll @@ -0,0 +1,161 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+fma | FileCheck %s --check-prefix=X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+fma | FileCheck %s --check-prefix=X64 + +define float @f1(float %a, float %b, float %c) { +; X86-LABEL: f1: +; X86: # %bb.0: +; X86-NEXT:pushl %eax +; X86-NEXT:.cfi_def_cfa_offset 8 +; X86-NEXT:vmovss {{.*#+}} xmm0 = 
mem[0],zero,zero,zero +; X86-NEXT:vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-NEXT:vfmadd213ss {{.*#+}} xmm1 = (xmm0 * xmm1) + mem +; X86-NEXT:vmovss %xmm1, (%esp) +; X86-NEXT:flds (%esp) +; X86-NEXT:popl %eax +; X86-NEXT:.cfi_def_cfa_offset 4 +; X86-NEXT:retl +; +; X64-LABEL: f1: +; X64: # %bb.0: +; X64-NEXT:vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 +; X64-NEXT:retq + %mul = fmul fast float %b, %a + %add = fadd fast float %mul, %c + ret float %add +} + +define float @f2(float %a, float %b, float %c) { +; X86-LABEL: f2: +; X86: # %bb.0: +; X86-NEXT:pushl %eax +; X86-NEXT:.cfi_def_cfa_offset 8 +; X86-NEXT:vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT:vmulss {{[0-9]+}}(%esp), %xmm0, %xmm0 +; X86-NEXT:#ARITH_FENCE +; X86-NEXT:vaddss {{[0-9]+}}(%esp), %xmm0, %xmm0 +; X86-NEXT:vmovss %xmm0, (%esp) +; X86-NEXT:flds (%esp) +; X86-NEXT:popl %eax +; X86-NEXT:.cfi_def_cfa_offset 4 +; X86-NEXT:retl +; +; X64-LABEL: f2: +; X64: # %bb.0: +; X64-NEXT:vmulss %xmm0, %xmm1, %xmm0 +; X64-NEXT:#ARITH_FENCE +; X64-NEXT:vaddss %xmm2, %xmm0, %xmm0 +; X64-NEXT:retq + %mul = fmul fast float %b, %a + %tmp = call float @llvm.arithmetic.fence.f32(float %mul) + %add = fadd fast float %tmp, %c + ret float %add +} + +define double @f3(double %a) { +; X86-LABEL: f3: +; X86: # %bb.0: +; X86-NEXT:pushl %ebp +; X86-NEXT:.cfi_def_cfa_offset 8 +; X86-NEXT:.cfi_offset %ebp, -8 +; X86-NEXT:movl %esp, %ebp +; X86-NEXT:.cfi_def_cfa_register %ebp +; X86-NEXT:andl $-8, %esp +; X86-NEXT:subl $8, %esp +; X86-NEXT:vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-NEXT:vmulsd {{\.LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-NEXT:vmovsd %xmm0, (%esp) +; X86-NEXT:fldl (%esp) +; X86-NEXT:movl %ebp, %esp +; X86-NEXT:popl %ebp +; X86-NEXT:.cfi_def_cfa %esp, 4 +; X86-NEXT:retl +; +; X64-LABEL: f3: +; X64: # %bb.0: +; X64-NEXT:vmulsd {{.*}}(%rip), %xmm0, %xmm0 +; X64-NEXT:retq + %1 = fadd fast double %a, %a + %2 = fadd fast double %a, %a + %3 = fadd fast double %1, %2 + ret double %3 +} + +define double 
@f4(double %a) { +; X86-LABEL: f4: +; X86: # %bb.0: +; X86-NEXT:pushl %ebp +; X86-NEXT:.cfi_def_cfa_offset 8 +; X86-NEXT:.cfi_offset %ebp, -8 +; X86-NEXT:movl %esp, %ebp +; X86-NEXT:.cfi_def_cfa_register %ebp +; X86-NEXT:andl $-8, %esp +; X86-NEXT:subl $8, %esp +; X86-NEXT:vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-NEXT:vaddsd %xmm0, %xmm0, %xmm0 +; X86-NEXT:vmovapd %xmm0, %xmm1 +; X86-NEXT:#ARITH_FENCE +; X86-NEXT:vaddsd %xmm0, %xmm1, %xmm0 +; X86-NEXT:vmovsd %xmm0, (%esp) +; X86-NEXT:fldl (%esp) +; X86-NEXT:movl %ebp, %esp +; X86-NEXT:popl %ebp +; X86-NEXT:.cfi_def_cfa %esp, 4 +; X86-NEXT:retl +;