Mainly talking about the x86 target platform: on x86, I wrote the following code
for testing floating-point exceptions.

use core::arch::x86_64::*;
use std::time::Instant;

00011111 10100000
00011111 10000000
54321098 76543210

Mnemonic Bit Location Description
FZ bit 15 Flush To Zero

R+ bit 14 Round Positive
R- bit 13 Round Negative
RZ bits 13 and 14 Round To Zero
RN bits 13 and 14 are 0 Round To Nearest

PM bit 12 Precision Mask
UM bit 11 Underflow Mask
OM bit 10 Overflow Mask
ZM bit 9 Divide By Zero Mask
DM bit 8 Denormal Mask
IM bit 7 Invalid Operation Mask
DAZ bit 6 Denormals Are Zero
PE bit 5 Precision Flag
UE bit 4 Underflow Flag
OE bit 3 Overflow Flag
ZE bit 2 Divide By Zero Flag
DE bit 1 Denormal Flag
IE bit 0 Invalid Operation Flag

/// Times `ops` packed double-precision divisions of `a / b` while polling MXCSR.
///
/// On every iteration the MXCSR register is read and compared with `init_csr`;
/// iterations on which it differs are counted. The low lane of each quotient is
/// accumulated into a running sum so the divisions have an observable result.
///
/// Prints the last MXCSR value read, the throughput in Mops/s (where one Mop is
/// 1024 * 1024 operations), the number of iterations on which MXCSR differed
/// from `init_csr`, and the accumulated sum.
///
/// * `a`, `b` - dividend and divisor, broadcast to both lanes of the vector.
/// * `init_csr` - MXCSR value to compare against on each iteration.
/// * `ops` - number of loop iterations (one division each).
fn test_double_div(a: f64, b: f64, init_csr: u32, ops: u32) {
    // Stays 0 if `ops` is 0, because the loop body never runs.
    let mut final_csr: u32 = 0;
    let mut count = 0;
    let mut c: f64 = 0.0;
    let now = Instant::now();
    // SAFETY: these intrinsics only require SSE/SSE2, which is part of the
    // x86_64 baseline; `_mm_getcsr` only reads the MXCSR register.
    unsafe {
        for _i in 0..ops {
            final_csr = _mm_getcsr();
            if final_csr != init_csr {
                count += 1;
            }
            let ma = _mm_set1_pd(a);
            let mb = _mm_set1_pd(b);
            let mc = _mm_div_pd(ma, mb);
            c += _mm_cvtsd_f64(mc);
        }
    }
    // Scale the operation count to units of 1024 * 1024 operations.
    let m_opts = f64::from(ops) / 1024.0 / 1024.0;

    println!(
        "final csr: {} flops:{} count:{} c:{}",
        final_csr,
        m_opts / now.elapsed().as_secs_f64(),
        count,
        c
    );
}

fn main() {
    let init_csr: u32;
    unsafe {
        println!("first csr:{}", _mm_getcsr());
        // _mm_setcsr((_mm_getcsr() & !0x8040) | 0b100000); // Initially
set inexact bit
        _mm_setcsr(_mm_getcsr() & !0x8040); // Disable `Flush To Zero` and
`Denormals Are Zero`
        init_csr = _mm_getcsr();
        println!("init_csr:{}", init_csr);
    let ops = 256 * 1024 * 1024;
    for _ in 1..5 {
        test_double_div(6.0f64, 6.0f64, init_csr, ops);
        test_double_div(6.0f64, 5.0f64, init_csr, ops);
        test_double_div(6.0f64, 6.0f64, init_csr, ops);
        test_double_div(6.0f64, 7.0f64, init_csr, ops);

Amd ryzen 3950x test result:
cargo run --release
    Finished release [optimized] target(s) in 0.00s
     Running `target\release\xemu-float.exe`
first csr:8064
final csr: 8064 flops:199.95805114656875 count:0 c:268435456
final csr: 8096 flops:183.20469224891 count:268435454 c:322122545.5686275
final csr: 8064 flops:201.86001192537793 count:1 c:268435456
final csr: 8096 flops:183.08332320794779 count:268435454 c:230087534.2222222
final csr: 8064 flops:196.64067223704072 count:1 c:268435456
final csr: 8096 flops:181.63963551098507 count:268435454 c:322122545.5686275
final csr: 8064 flops:201.74742194716032 count:1 c:268435456
final csr: 8096 flops:182.57995318163572 count:268435454 c:230087534.2222222
final csr: 8064 flops:200.94880783384463 count:1 c:268435456
final csr: 8096 flops:182.80824315169866 count:268435454 c:322122545.5686275
final csr: 8064 flops:201.20301405393934 count:1 c:268435456
final csr: 8096 flops:182.99063466809466 count:268435454 c:230087534.2222222
final csr: 8064 flops:201.89608256173372 count:1 c:268435456
final csr: 8096 flops:182.46839074321227 count:268435454 c:322122545.5686275
final csr: 8064 flops:201.6361495464766 count:1 c:268435456
final csr: 8096 flops:181.2901173261191 count:268435454 c:230087534.2222222

Intel i7-8750H test result:
cargo run --release
    Finished release [optimized] target(s) in 0.01s
     Running `target\release\xemu-float.exe`
first csr:8064
final csr: 8064 flops:919.039615426437 count:0 c:268435456
final csr: 8096 flops:128.06695571065126 count:268435454 c:322122545.5686275
final csr: 8064 flops:908.9774298981789 count:1 c:268435456
final csr: 8096 flops:127.67450912501586 count:268435454 c:230087534.2222222
final csr: 8064 flops:911.2272033144177 count:1 c:268435456
final csr: 8096 flops:124.61355141244842 count:268435454 c:322122545.5686275
final csr: 8064 flops:921.2731718357994 count:1 c:268435456
final csr: 8096 flops:128.68833918242356 count:268435454 c:230087534.2222222
final csr: 8064 flops:936.8098900696564 count:1 c:268435456
final csr: 8096 flops:126.0582659518433 count:268435454 c:322122545.5686275
final csr: 8064 flops:915.392964318394 count:1 c:268435456
final csr: 8096 flops:126.88957390875404 count:268435454 c:230087534.2222222
final csr: 8064 flops:923.3882629699333 count:1 c:268435456
final csr: 8096 flops:120.6875203037403 count:268435454 c:322122545.5686275
final csr: 8064 flops:915.0317886715162 count:1 c:268435456
final csr: 8096 flops:125.19570682160303 count:268435454 c:230087534.2222222

This means that reading the MXCSR register is very fast on the Intel x64
processor, while reading the MXCSR register is slow on the AMD x64 processor.
Read/write of the MXCSR register is about 4x slower on the AMD processor and 8x
slower on the Intel processor; we can take advantage of this when
accelerating target float instructions.

Yonggang Luo

Reply via email to