Oh, yeah. That is the other thing that I did on Friday. I wrote a test case to compare __rw_atomic_add32() against InterlockedIncrement() on Win32. There is a performance penalty...
C:\Temp>t 2 && t 4 && t 8 ---------- locked inc ---- atomic_add ---- 2 threads ms 4266 4469 ms/op 0.00003178 0.00003330 -4.7586% thr ms 18117 18437 thr ms/op 0.00013498 0.00013737 -1.7663% ---------- locked inc ---- atomic_add ---- 4 threads ms 7969 8609 ms/op 0.00005937 0.00006414 -8.0311% thr ms 36359 37019 thr ms/op 0.00027090 0.00027581 -1.8152% ---------- locked inc ---- atomic_add ---- 8 threads ms 5016 5484 ms/op 0.00003737 0.00004086 -9.3301% thr ms 60846 66130 thr ms/op 0.00045334 0.00049271 -8.6842% C:\Temp>t 2 && t 4 && t 8 ---------- locked inc ---- atomic_add ---- 2 threads ms 2781 2906 ms/op 0.00002072 0.00002165 -4.4948% thr ms 14961 16093 thr ms/op 0.00011147 0.00011990 -7.5663% ---------- locked inc ---- atomic_add ---- 4 threads ms 2781 2891 ms/op 0.00002072 0.00002154 -3.9554% thr ms 30867 31328 thr ms/op 0.00022998 0.00023341 -1.4935% ---------- locked inc ---- atomic_add ---- 8 threads ms 2782 2890 ms/op 0.00002073 0.00002153 -3.8821% thr ms 64318 64341 thr ms/op 0.00047921 0.00047938 -0.0358% I will do a quick run using the string performance test after lunch. I'll report the results on that later. I've pasted the source for the bulk of my test below. If someone wants the entire thing, let me know and I'll provide everything. Travis Martin Sebor wrote: >Subject: Re: [PATCH] Use __rw_atomic_xxx() on Windows > >What's the status of this? We need to decide if we can put this >in 4.2 or defer it for 4.2.1. To put it in 4.2 we need to make >sure the new functions don't cause a performance regression in >basic_string. I.e., we need to see the before and after numbers. > >Martin > >Martin Sebor wrote: >> >> One concern I have is performance. Does replacing the intrinsics with >> out of line function call whose semantics the compiler has no idea >> about have any impact on the runtime efficiency of the >generated code? >> I would be especially interested in "real life" scenarios such as the >> usage of the atomic operations in basic_string. 
>> >> It would be good to see some before and after numbers. If you don't >> have all the platforms to run the test post your benchmark and Travis >> can help you put them together. > #include <stdio.h> #include <stdlib.h> #define WIN32_LEAN_AND_MEAN #include <windows.h> #include <process.h> #include "lib.h" #define MIN_THREADS 2 #define MAX_THREADS 16 unsigned long locked_inc(long* val, long iters) { const unsigned long t0 = GetTickCount (); long n; for (n = 0; n < iters; ++n) { InterlockedIncrement(val); } const unsigned long t1 = GetTickCount (); return (t1 - t0); } unsigned long atomic_add(long* val, long iters) { const unsigned long t0 = GetTickCount (); long n; for (n = 0; n < iters; ++n) { __rw_atomic_add32(val, 1); } const unsigned long t1 = GetTickCount (); return (t1 - t0); } struct thread_param { // atomic variable long* variable; // number of iterations long iters; // function to invoke unsigned long (*fun)(long*, long); // result of function unsigned long result; // thread handle used by main thread HANDLE thread; }; extern "C" { void thread_func(void* p) { thread_param* param = (thread_param*)p; param->result = (param->fun)(param->variable, param->iters); } } // extern "C" unsigned long run_threads(int nthreads, unsigned long (*fun)(long*, long), long iters) { thread_param params[MAX_THREADS]; long thread_var = 0; int i; for (i = 0; i < nthreads; ++i) { params[i].variable = &thread_var; params[i].result = 0; params[i].fun = fun; params[i].iters = iters; } int n; for (n = 0; n < nthreads; ++n) { params[n].thread = (HANDLE)_beginthread(thread_func, 0, ¶ms[n]); } unsigned long thread_time = 0; for (n = 0; n < nthreads; ++n) { WaitForSingleObject (params[n].thread, INFINITE); thread_time += params[n].result; } return thread_time; } int main(int argc, char* argv[]) { int nthreads = MIN_THREADS; if (1 < argc) nthreads = atoi(argv[1]); // cap thread count if (nthreads < MIN_THREADS) nthreads = MIN_THREADS; else if (MAX_THREADS < nthreads) nthreads = 
MAX_THREADS; const long ops = 0x7ffffff; long thread_var; thread_var = 0; unsigned long locked_inc_ms = locked_inc (&thread_var, ops); thread_var = 0; unsigned long atomic_add_ms = atomic_add (&thread_var, ops); printf("---------- locked inc ---- atomic_add ---- %d threads\n", nthreads); printf("ms %8.u %8.u\n", locked_inc_ms, atomic_add_ms); float locked_inc_ops_p_ms = 1.f * locked_inc_ms / ops; float atomic_add_ops_p_ms = 1.f * atomic_add_ms / ops; printf("ms/op %8.8f %8.8f %.4f%%\n", locked_inc_ops_p_ms, atomic_add_ops_p_ms, 100.f * (locked_inc_ops_p_ms - atomic_add_ops_p_ms) / locked_inc_ops_p_ms); // do it with threads locked_inc_ms = run_threads(nthreads, locked_inc, ops); atomic_add_ms = run_threads(nthreads, atomic_add, ops); locked_inc_ms /= nthreads; atomic_add_ms /= nthreads; printf("thr ms %8.u %8.u\n", locked_inc_ms, atomic_add_ms); locked_inc_ops_p_ms = 1.f * locked_inc_ms / ops; atomic_add_ops_p_ms = 1.f * atomic_add_ms / ops; printf("thr ms/op %8.8f %8.8f %.4f%%\n", locked_inc_ops_p_ms, atomic_add_ops_p_ms, 100.f * (locked_inc_ops_p_ms - atomic_add_ops_p_ms) / locked_inc_ops_p_ms); return 0; }