[PATCH] D99675: [llvm][clang] Create new intrinsic llvm.arith.fence to control FP optimization at expression level

2021-06-09 Thread Pengfei Wang via Phabricator via cfe-commits
pengfei added inline comments.



Comment at: llvm/docs/LangRef.rst:21195
+'``llvm.arithmetic_fence``' Intrinsic
+^^
+

pengfei wrote:
> Should be equal to the text?
Yeah, a good catch. But I initially meant `^^^` should be equal to the title. :)


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D99675/new/

https://reviews.llvm.org/D99675

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D99675: [llvm][clang] Create new intrinsic llvm.arith.fence to control FP optimization at expression level

2021-06-08 Thread Melanie Blower via Phabricator via cfe-commits
mibintc updated this revision to Diff 350700.
mibintc added a comment.

I corrected the error in the LangRef documentation that @pengfei pointed out.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D99675/new/

https://reviews.llvm.org/D99675

Files:
  llvm/docs/LangRef.rst
  llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
  llvm/include/llvm/CodeGen/BasicTTIImpl.h
  llvm/include/llvm/CodeGen/ISDOpcodes.h
  llvm/include/llvm/CodeGen/SelectionDAGISel.h
  llvm/include/llvm/IR/IRBuilder.h
  llvm/include/llvm/IR/Intrinsics.td
  llvm/include/llvm/Support/TargetOpcodes.def
  llvm/include/llvm/Target/Target.td
  llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
  llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
  llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
  llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
  llvm/test/CodeGen/X86/arithmetic_fence.ll
  llvm/test/CodeGen/X86/arithmetic_fence2.ll

Index: llvm/test/CodeGen/X86/arithmetic_fence2.ll
===
--- /dev/null
+++ llvm/test/CodeGen/X86/arithmetic_fence2.ll
@@ -0,0 +1,170 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64
+
+define double @f1(double %a) {
+; X86-LABEL: f1:
+; X86:   # %bb.0:
+; X86-NEXT:pushl %ebp
+; X86-NEXT:.cfi_def_cfa_offset 8
+; X86-NEXT:.cfi_offset %ebp, -8
+; X86-NEXT:movl %esp, %ebp
+; X86-NEXT:.cfi_def_cfa_register %ebp
+; X86-NEXT:andl $-8, %esp
+; X86-NEXT:subl $8, %esp
+; X86-NEXT:movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT:mulsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-NEXT:movsd %xmm0, (%esp)
+; X86-NEXT:fldl (%esp)
+; X86-NEXT:movl %ebp, %esp
+; X86-NEXT:popl %ebp
+; X86-NEXT:.cfi_def_cfa %esp, 4
+; X86-NEXT:retl
+;
+; X64-LABEL: f1:
+; X64:   # %bb.0:
+; X64-NEXT:mulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT:retq
+  %1 = fadd fast double %a, %a
+  %2 = fadd fast double %a, %a
+  %3 = fadd fast double %1, %2
+  ret double %3
+}
+
+define double @f2(double %a) {
+; X86-LABEL: f2:
+; X86:   # %bb.0:
+; X86-NEXT:pushl %ebp
+; X86-NEXT:.cfi_def_cfa_offset 8
+; X86-NEXT:.cfi_offset %ebp, -8
+; X86-NEXT:movl %esp, %ebp
+; X86-NEXT:.cfi_def_cfa_register %ebp
+; X86-NEXT:andl $-8, %esp
+; X86-NEXT:subl $8, %esp
+; X86-NEXT:movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT:addsd %xmm0, %xmm0
+; X86-NEXT:movapd %xmm0, %xmm1
+; X86-NEXT:#ARITH_FENCE
+; X86-NEXT:addsd %xmm0, %xmm1
+; X86-NEXT:movsd %xmm1, (%esp)
+; X86-NEXT:fldl (%esp)
+; X86-NEXT:movl %ebp, %esp
+; X86-NEXT:popl %ebp
+; X86-NEXT:.cfi_def_cfa %esp, 4
+; X86-NEXT:retl
+;
+; X64-LABEL: f2:
+; X64:   # %bb.0:
+; X64-NEXT:addsd %xmm0, %xmm0
+; X64-NEXT:movapd %xmm0, %xmm1
+; X64-NEXT:#ARITH_FENCE
+; X64-NEXT:addsd %xmm0, %xmm1
+; X64-NEXT:movapd %xmm1, %xmm0
+; X64-NEXT:retq
+  %1 = fadd fast double %a, %a
+  %t = call double @llvm.arithmetic.fence.f64(double %1)
+  %2 = fadd fast double %a, %a
+  %3 = fadd fast double %t, %2
+  ret double %3
+}
+
+define <2 x float> @f3(<2 x float> %a) {
+; X86-LABEL: f3:
+; X86:   # %bb.0:
+; X86-NEXT:mulps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-NEXT:retl
+;
+; X64-LABEL: f3:
+; X64:   # %bb.0:
+; X64-NEXT:mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT:retq
+  %1 = fadd fast <2 x float> %a, %a
+  %2 = fadd fast <2 x float> %a, %a
+  %3 = fadd fast <2 x float> %1, %2
+  ret <2 x float> %3
+}
+
+define <2 x float> @f4(<2 x float> %a) {
+; X86-LABEL: f4:
+; X86:   # %bb.0:
+; X86-NEXT:addps %xmm0, %xmm0
+; X86-NEXT:movaps %xmm0, %xmm1
+; X86-NEXT:#ARITH_FENCE
+; X86-NEXT:addps %xmm0, %xmm1
+; X86-NEXT:movaps %xmm1, %xmm0
+; X86-NEXT:retl
+;
+; X64-LABEL: f4:
+; X64:   # %bb.0:
+; X64-NEXT:addps %xmm0, %xmm0
+; X64-NEXT:movaps %xmm0, %xmm1
+; X64-NEXT:#ARITH_FENCE
+; X64-NEXT:addps %xmm0, %xmm1
+; X64-NEXT:movaps %xmm1, %xmm0
+; X64-NEXT:retq
+  %1 = fadd fast <2 x float> %a, %a
+  %t = call <2 x float> @llvm.arithmetic.fence.v2f32(<2 x float> %1)
+  %2 = fadd fast <2 x float> %a, %a
+  %3 = fadd fast <2 x float> %t, %2
+  ret <2 x float> %3
+}
+
+define <8 x float> @f5(<8 x float> %a) {
+; X86-LABEL: f5:
+; X86:   # %bb.0:
+; X86-NEXT:movaps {{.*#+}} xmm2 = [4.0E+0,4.0E+0,4.0E+0,4.0E+0]
+; X86-NEXT:mulps %xmm2, %xmm0
+; X86-NEXT:mulps %xmm2, %xmm1
+; X86-NEXT:retl
+;
+; X64-LABEL: f5:
+; X64:   # %bb.0:
+; X64-NEXT:movaps {{.*#+}} xmm2 = [4.0E+0,4.0E+0,4.0E+0,4.0E+0]
+; X64-NEXT:mulps %xmm2, %xmm0
+; X64-NEXT:mulps %xmm2, %xmm1
+; X64-NEXT:retq
+  %1 = fadd fast <8 x float> %a, %a
+  %2 = fadd fast <8 x 

[PATCH] D99675: [llvm][clang] Create new intrinsic llvm.arith.fence to control FP optimization at expression level

2021-06-07 Thread Pengfei Wang via Phabricator via cfe-commits
pengfei added inline comments.



Comment at: llvm/docs/LangRef.rst:21195
+'``llvm.arithmetic_fence``' Intrinsic
+^^
+

Should be equal to the text?


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D99675/new/

https://reviews.llvm.org/D99675

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D99675: [llvm][clang] Create new intrinsic llvm.arith.fence to control FP optimization at expression level

2021-06-07 Thread Melanie Blower via Phabricator via cfe-commits
mibintc updated this revision to Diff 350307.
mibintc added a comment.

This patch addresses all of @craig.topper's comments and adds documentation for 
the new intrinsic to the language reference, as requested by @LuoYuanke.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D99675/new/

https://reviews.llvm.org/D99675

Files:
  llvm/docs/LangRef.rst
  llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
  llvm/include/llvm/CodeGen/BasicTTIImpl.h
  llvm/include/llvm/CodeGen/ISDOpcodes.h
  llvm/include/llvm/CodeGen/SelectionDAGISel.h
  llvm/include/llvm/IR/IRBuilder.h
  llvm/include/llvm/IR/Intrinsics.td
  llvm/include/llvm/Support/TargetOpcodes.def
  llvm/include/llvm/Target/Target.td
  llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
  llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
  llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
  llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
  llvm/test/CodeGen/X86/arithmetic_fence.ll
  llvm/test/CodeGen/X86/arithmetic_fence2.ll

Index: llvm/test/CodeGen/X86/arithmetic_fence2.ll
===
--- /dev/null
+++ llvm/test/CodeGen/X86/arithmetic_fence2.ll
@@ -0,0 +1,170 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64
+
+define double @f1(double %a) {
+; X86-LABEL: f1:
+; X86:   # %bb.0:
+; X86-NEXT:pushl %ebp
+; X86-NEXT:.cfi_def_cfa_offset 8
+; X86-NEXT:.cfi_offset %ebp, -8
+; X86-NEXT:movl %esp, %ebp
+; X86-NEXT:.cfi_def_cfa_register %ebp
+; X86-NEXT:andl $-8, %esp
+; X86-NEXT:subl $8, %esp
+; X86-NEXT:movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT:mulsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-NEXT:movsd %xmm0, (%esp)
+; X86-NEXT:fldl (%esp)
+; X86-NEXT:movl %ebp, %esp
+; X86-NEXT:popl %ebp
+; X86-NEXT:.cfi_def_cfa %esp, 4
+; X86-NEXT:retl
+;
+; X64-LABEL: f1:
+; X64:   # %bb.0:
+; X64-NEXT:mulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT:retq
+  %1 = fadd fast double %a, %a
+  %2 = fadd fast double %a, %a
+  %3 = fadd fast double %1, %2
+  ret double %3
+}
+
+define double @f2(double %a) {
+; X86-LABEL: f2:
+; X86:   # %bb.0:
+; X86-NEXT:pushl %ebp
+; X86-NEXT:.cfi_def_cfa_offset 8
+; X86-NEXT:.cfi_offset %ebp, -8
+; X86-NEXT:movl %esp, %ebp
+; X86-NEXT:.cfi_def_cfa_register %ebp
+; X86-NEXT:andl $-8, %esp
+; X86-NEXT:subl $8, %esp
+; X86-NEXT:movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT:addsd %xmm0, %xmm0
+; X86-NEXT:movapd %xmm0, %xmm1
+; X86-NEXT:#ARITH_FENCE
+; X86-NEXT:addsd %xmm0, %xmm1
+; X86-NEXT:movsd %xmm1, (%esp)
+; X86-NEXT:fldl (%esp)
+; X86-NEXT:movl %ebp, %esp
+; X86-NEXT:popl %ebp
+; X86-NEXT:.cfi_def_cfa %esp, 4
+; X86-NEXT:retl
+;
+; X64-LABEL: f2:
+; X64:   # %bb.0:
+; X64-NEXT:addsd %xmm0, %xmm0
+; X64-NEXT:movapd %xmm0, %xmm1
+; X64-NEXT:#ARITH_FENCE
+; X64-NEXT:addsd %xmm0, %xmm1
+; X64-NEXT:movapd %xmm1, %xmm0
+; X64-NEXT:retq
+  %1 = fadd fast double %a, %a
+  %t = call double @llvm.arithmetic.fence.f64(double %1)
+  %2 = fadd fast double %a, %a
+  %3 = fadd fast double %t, %2
+  ret double %3
+}
+
+define <2 x float> @f3(<2 x float> %a) {
+; X86-LABEL: f3:
+; X86:   # %bb.0:
+; X86-NEXT:mulps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-NEXT:retl
+;
+; X64-LABEL: f3:
+; X64:   # %bb.0:
+; X64-NEXT:mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT:retq
+  %1 = fadd fast <2 x float> %a, %a
+  %2 = fadd fast <2 x float> %a, %a
+  %3 = fadd fast <2 x float> %1, %2
+  ret <2 x float> %3
+}
+
+define <2 x float> @f4(<2 x float> %a) {
+; X86-LABEL: f4:
+; X86:   # %bb.0:
+; X86-NEXT:addps %xmm0, %xmm0
+; X86-NEXT:movaps %xmm0, %xmm1
+; X86-NEXT:#ARITH_FENCE
+; X86-NEXT:addps %xmm0, %xmm1
+; X86-NEXT:movaps %xmm1, %xmm0
+; X86-NEXT:retl
+;
+; X64-LABEL: f4:
+; X64:   # %bb.0:
+; X64-NEXT:addps %xmm0, %xmm0
+; X64-NEXT:movaps %xmm0, %xmm1
+; X64-NEXT:#ARITH_FENCE
+; X64-NEXT:addps %xmm0, %xmm1
+; X64-NEXT:movaps %xmm1, %xmm0
+; X64-NEXT:retq
+  %1 = fadd fast <2 x float> %a, %a
+  %t = call <2 x float> @llvm.arithmetic.fence.v2f32(<2 x float> %1)
+  %2 = fadd fast <2 x float> %a, %a
+  %3 = fadd fast <2 x float> %t, %2
+  ret <2 x float> %3
+}
+
+define <8 x float> @f5(<8 x float> %a) {
+; X86-LABEL: f5:
+; X86:   # %bb.0:
+; X86-NEXT:movaps {{.*#+}} xmm2 = [4.0E+0,4.0E+0,4.0E+0,4.0E+0]
+; X86-NEXT:mulps %xmm2, %xmm0
+; X86-NEXT:mulps %xmm2, %xmm1
+; X86-NEXT:retl
+;
+; X64-LABEL: f5:
+; X64:   # %bb.0:
+; X64-NEXT:movaps {{.*#+}} xmm2 = [4.0E+0,4.0E+0,4.0E+0,4.0E+0]
+; X64-NEXT:mulps %xmm2, %xmm0
+; X64-NEXT:mulps %xmm2, %xmm1

[PATCH] D99675: [llvm][clang] Create new intrinsic llvm.arith.fence to control FP optimization at expression level

2021-06-03 Thread Craig Topper via Phabricator via cfe-commits
craig.topper added inline comments.



Comment at: llvm/include/llvm/IR/IRBuilder.h:903
+  const Twine &Name = "") {
+return CreateIntrinsic(Intrinsic::arithmetic_fence, {DstType}, {Val}, nullptr,
+   Name);

Do you really need curly braces around DstType and Val? A single value should 
be implicitly convertible to ArrayRef.



Comment at: llvm/include/llvm/IR/Intrinsics.td:1333
+
+ // Intrinsics to support half precision floating point format
 // Intrinsics to support half precision floating point format

This comment got duplicated.



Comment at: llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp:1332
+  case TargetOpcode::ARITH_FENCE:
+OutStreamer->emitRawComment("ARITH_FENCE");
+break;

I think you should check isVerbose() before printing this.



Comment at: llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp:3149
   case ISD::FREEZE:
+  case ISD::ARITH_FENCE:
   case ISD::FCANONICALIZE:

What about splitting a vector like v8f32 on SSE2?



Comment at: llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp:6296
+  case Intrinsic::arithmetic_fence: {
+auto DL = getCurSDLoc();
+

There's already a variable called sdl that contains this. It's used in the 
surrounding cases.



Comment at: llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp:6299
+SDValue Val = getValue(I.getArgOperand(0));
+EVT ResultVT = TLI.getValueType(DAG.getDataLayout(), I.getType());
+

Why isn't this just Val.getValueType()?


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D99675/new/

https://reviews.llvm.org/D99675

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D99675: [llvm][clang] Create new intrinsic llvm.arith.fence to control FP optimization at expression level

2021-06-03 Thread LuoYuanke via Phabricator via cfe-commits
LuoYuanke added a comment.

We may add a description of the intrinsic to docs/LangRef.rst.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D99675/new/

https://reviews.llvm.org/D99675

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D99675: [llvm][clang] Create new intrinsic llvm.arith.fence to control FP optimization at expression level

2021-05-26 Thread Melanie Blower via Phabricator via cfe-commits
mibintc updated this revision to Diff 348046.
mibintc retitled this revision from "RFC [llvm][clang] Create new intrinsic 
llvm.arith.fence to control FP optimization at expression level" to 
"[llvm][clang] Create new intrinsic llvm.arith.fence to control FP optimization 
at expression level".
mibintc edited the summary of this revision.
mibintc added a comment.

Rebased to ToT. It fixes the previous illegal type lowering problems. It also 
updates the tests to show the functionality in a better way as well as fixes a 
newly found problem.

Ready for your code review and +1

We think this patch provides basic functionality for the intrinsic, and 
enhancements can be added in future patches.

Thanks!


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D99675/new/

https://reviews.llvm.org/D99675

Files:
  llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
  llvm/include/llvm/CodeGen/BasicTTIImpl.h
  llvm/include/llvm/CodeGen/ISDOpcodes.h
  llvm/include/llvm/CodeGen/SelectionDAGISel.h
  llvm/include/llvm/IR/IRBuilder.h
  llvm/include/llvm/IR/Intrinsics.td
  llvm/include/llvm/Support/TargetOpcodes.def
  llvm/include/llvm/Target/Target.td
  llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
  llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
  llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
  llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
  llvm/test/CodeGen/X86/arithmetic_fence.ll

Index: llvm/test/CodeGen/X86/arithmetic_fence.ll
===
--- /dev/null
+++ llvm/test/CodeGen/X86/arithmetic_fence.ll
@@ -0,0 +1,161 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+fma | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+fma | FileCheck %s --check-prefix=X64
+
+define float @f1(float %a, float %b, float %c) {
+; X86-LABEL: f1:
+; X86:   # %bb.0:
+; X86-NEXT:pushl %eax
+; X86-NEXT:.cfi_def_cfa_offset 8
+; X86-NEXT:vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT:vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NEXT:vfmadd213ss {{.*#+}} xmm1 = (xmm0 * xmm1) + mem
+; X86-NEXT:vmovss %xmm1, (%esp)
+; X86-NEXT:flds (%esp)
+; X86-NEXT:popl %eax
+; X86-NEXT:.cfi_def_cfa_offset 4
+; X86-NEXT:retl
+;
+; X64-LABEL: f1:
+; X64:   # %bb.0:
+; X64-NEXT:vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
+; X64-NEXT:retq
+  %mul = fmul fast float %b, %a
+  %add = fadd fast float %mul, %c
+  ret float %add
+}
+
+define float @f2(float %a, float %b, float %c) {
+; X86-LABEL: f2:
+; X86:   # %bb.0:
+; X86-NEXT:pushl %eax
+; X86-NEXT:.cfi_def_cfa_offset 8
+; X86-NEXT:vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT:vmulss {{[0-9]+}}(%esp), %xmm0, %xmm0
+; X86-NEXT:#ARITH_FENCE
+; X86-NEXT:vaddss {{[0-9]+}}(%esp), %xmm0, %xmm0
+; X86-NEXT:vmovss %xmm0, (%esp)
+; X86-NEXT:flds (%esp)
+; X86-NEXT:popl %eax
+; X86-NEXT:.cfi_def_cfa_offset 4
+; X86-NEXT:retl
+;
+; X64-LABEL: f2:
+; X64:   # %bb.0:
+; X64-NEXT:vmulss %xmm0, %xmm1, %xmm0
+; X64-NEXT:#ARITH_FENCE
+; X64-NEXT:vaddss %xmm2, %xmm0, %xmm0
+; X64-NEXT:retq
+  %mul = fmul fast float %b, %a
+  %tmp = call float @llvm.arithmetic.fence.f32(float %mul)
+  %add = fadd fast float %tmp, %c
+  ret float %add
+}
+
+define double @f3(double %a) {
+; X86-LABEL: f3:
+; X86:   # %bb.0:
+; X86-NEXT:pushl %ebp
+; X86-NEXT:.cfi_def_cfa_offset 8
+; X86-NEXT:.cfi_offset %ebp, -8
+; X86-NEXT:movl %esp, %ebp
+; X86-NEXT:.cfi_def_cfa_register %ebp
+; X86-NEXT:andl $-8, %esp
+; X86-NEXT:subl $8, %esp
+; X86-NEXT:vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT:vmulsd {{\.LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-NEXT:vmovsd %xmm0, (%esp)
+; X86-NEXT:fldl (%esp)
+; X86-NEXT:movl %ebp, %esp
+; X86-NEXT:popl %ebp
+; X86-NEXT:.cfi_def_cfa %esp, 4
+; X86-NEXT:retl
+;
+; X64-LABEL: f3:
+; X64:   # %bb.0:
+; X64-NEXT:vmulsd {{.*}}(%rip), %xmm0, %xmm0
+; X64-NEXT:retq
+  %1 = fadd fast double %a, %a
+  %2 = fadd fast double %a, %a
+  %3 = fadd fast double %1, %2
+  ret double %3
+}
+
+define double @f4(double %a) {
+; X86-LABEL: f4:
+; X86:   # %bb.0:
+; X86-NEXT:pushl %ebp
+; X86-NEXT:.cfi_def_cfa_offset 8
+; X86-NEXT:.cfi_offset %ebp, -8
+; X86-NEXT:movl %esp, %ebp
+; X86-NEXT:.cfi_def_cfa_register %ebp
+; X86-NEXT:andl $-8, %esp
+; X86-NEXT:subl $8, %esp
+; X86-NEXT:vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT:vaddsd %xmm0, %xmm0, %xmm0
+; X86-NEXT:vmovapd %xmm0, %xmm1
+; X86-NEXT:#ARITH_FENCE
+; X86-NEXT:vaddsd %xmm0, %xmm1, %xmm0
+; X86-NEXT:vmovsd %xmm0, (%esp)
+; X86-NEXT:fldl (%esp)
+; X86-NEXT:movl %ebp, %esp
+; X86-NEXT:popl %ebp
+; X86-NEXT:.cfi_def_cfa %esp, 4
+; X86-NEXT:retl
+;