This is mainly about the x86 target platform. On x86, I wrote the following code to test floating-point exception handling:
```rust #![feature(asm)] use core::arch::x86_64::*; use std::time::Instant; /* 00011111 10100000 00011111 10000000 54321098 76543210 */ /* Pnemonic Bit Location Description FZ bit 15 Flush To Zero R+ bit 14 Round Positive R- bit 13 Round Negative RZ bits 13 and 14 Round To Zero RN bits 13 and 14 are 0 Round To Nearest PM bit 12 Precision Mask UM bit 11 Underflow Mask OM bit 10 Overflow Mask ZM bit 9 Divide By Zero Mask DM bit 8 Denormal Mask IM bit 7 Invalid Operation Mask DAZ bit 6 Denormals Are Zero PE bit 5 Precision Flag UE bit 4 Underflow Flag OE bit 3 Overflow Flag ZE bit 2 Divide By Zero Flag DE bit 1 Denormal Flag IE bit 0 Invalid Operation Flag */ fn test_double_div(a: f64, b:f64, init_csr: u32, ops: u32) { let mut final_csr: u32 = 0; let mut count = 0; let mut c: f64 = 0.0; let now = Instant::now(); unsafe { for _i in 0..ops { final_csr = _mm_getcsr(); if final_csr != init_csr { count += 1; _mm_setcsr(init_csr); } let ma = _mm_set1_pd(a); let mb = _mm_set1_pd(b); let mc = _mm_div_pd(ma, mb); c += _mm_cvtsd_f64(mc); } } let m_opts = ops as f64 / 1024.0/1024.8; println!("final csr: {} flops:{} count:{} c:{}", final_csr, m_opts / now.elapsed().as_secs_f64(), count, c); } fn main() { let init_csr: u32; unsafe { println!("first csr:{}", _mm_getcsr()); // _mm_setcsr((_mm_getcsr() & !0x8040) | 0b100000); // Initially set inexact bit _mm_setcsr(_mm_getcsr() & !0x8040); // Disable `Flush To Zero` and `Denormals Are Zero` init_csr = _mm_getcsr(); println!("init_csr:{}", init_csr); } let ops = 256 * 1024 * 1024; for _ in 1..5 { test_double_div(6.0f64, 6.0f64, init_csr, ops); test_double_div(6.0f64, 5.0f64, init_csr, ops); test_double_div(6.0f64, 6.0f64, init_csr, ops); test_double_div(6.0f64, 7.0f64, init_csr, ops); } } ``` Amd ryzen 3950x test result: ``` cargo run --release Finished release [optimized] target(s) in 0.00s Running `target\release\xemu-float.exe` first csr:8064 init_csr:8064 final csr: 8064 flops:199.95805114656875 count:0 c:268435456 final csr: 8096 
flops:183.20469224891 count:268435454 c:322122545.5686275 final csr: 8064 flops:201.86001192537793 count:1 c:268435456 final csr: 8096 flops:183.08332320794779 count:268435454 c:230087534.2222222 final csr: 8064 flops:196.64067223704072 count:1 c:268435456 final csr: 8096 flops:181.63963551098507 count:268435454 c:322122545.5686275 final csr: 8064 flops:201.74742194716032 count:1 c:268435456 final csr: 8096 flops:182.57995318163572 count:268435454 c:230087534.2222222 final csr: 8064 flops:200.94880783384463 count:1 c:268435456 final csr: 8096 flops:182.80824315169866 count:268435454 c:322122545.5686275 final csr: 8064 flops:201.20301405393934 count:1 c:268435456 final csr: 8096 flops:182.99063466809466 count:268435454 c:230087534.2222222 final csr: 8064 flops:201.89608256173372 count:1 c:268435456 final csr: 8096 flops:182.46839074321227 count:268435454 c:322122545.5686275 final csr: 8064 flops:201.6361495464766 count:1 c:268435456 final csr: 8096 flops:181.2901173261191 count:268435454 c:230087534.2222222 ``` Intel i7-8750H test result: ``` cargo run --release Finished release [optimized] target(s) in 0.01s Running `target\release\xemu-float.exe` first csr:8064 init_csr:8064 final csr: 8064 flops:919.039615426437 count:0 c:268435456 final csr: 8096 flops:128.06695571065126 count:268435454 c:322122545.5686275 final csr: 8064 flops:908.9774298981789 count:1 c:268435456 final csr: 8096 flops:127.67450912501586 count:268435454 c:230087534.2222222 final csr: 8064 flops:911.2272033144177 count:1 c:268435456 final csr: 8096 flops:124.61355141244842 count:268435454 c:322122545.5686275 final csr: 8064 flops:921.2731718357994 count:1 c:268435456 final csr: 8096 flops:128.68833918242356 count:268435454 c:230087534.2222222 final csr: 8064 flops:936.8098900696564 count:1 c:268435456 final csr: 8096 flops:126.0582659518433 count:268435454 c:322122545.5686275 final csr: 8064 flops:915.392964318394 count:1 c:268435456 final csr: 8096 flops:126.88957390875404 count:268435454 
c:230087534.2222222 final csr: 8064 flops:923.3882629699333 count:1 c:268435456 final csr: 8096 flops:120.6875203037403 count:268435454 c:322122545.5686275 final csr: 8064 flops:915.0317886715162 count:1 c:268435456 final csr: 8096 flops:125.19570682160303 count:268435454 c:230087534.2222222
```

That means:

- Reading the MXCSR register is very fast on Intel x64 processors.
- Reading the MXCSR register is slow on AMD x64 processors.
- Reading and writing the MXCSR register is about 4x slower on the AMD processor and about 8x slower on the Intel processor.

We can take advantage of this when accelerating the target's floating-point instructions.

--
此致 礼 罗勇刚
Yours sincerely,
Yonggang Luo