Oh, yeah, that's the other thing I did on Friday. I wrote a test case
to compare __rw_atomic_add32() against InterlockedIncrement() on Win32.
There is a performance penalty...
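
For context, what's being measured is an out-of-line function call
versus the compiler intrinsic. Roughly along these lines -- this is a
hypothetical sketch, not the library's actual implementation of
__rw_atomic_add32():

// Sketch only: stands in for the real out-of-line
// __rw_atomic_add32(), whose semantics the compiler can't see into
// the way it can the InterlockedIncrement() intrinsic.
extern "C" long __rw_atomic_add32_sketch(long* x, long delta)
{
    // InterlockedExchangeAdd() returns the old value, so add delta
    // back to return the new one
    return InterlockedExchangeAdd(x, delta) + delta;
}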

  C:\Temp>t 2 && t 4 && t 8
  ---------- locked inc ---- atomic_add ---- 2 threads
  ms               4266            4469
  ms/op      0.00003178      0.00003330      -4.7586%
  thr ms          18117           18437
  thr ms/op  0.00013498      0.00013737      -1.7663%
  ---------- locked inc ---- atomic_add ---- 4 threads
  ms               7969            8609
  ms/op      0.00005937      0.00006414      -8.0311%
  thr ms          36359           37019
  thr ms/op  0.00027090      0.00027581      -1.8152%
  ---------- locked inc ---- atomic_add ---- 8 threads
  ms               5016            5484
  ms/op      0.00003737      0.00004086      -9.3301%
  thr ms          60846           66130
  thr ms/op  0.00045334      0.00049271      -8.6842%

  C:\Temp>t 2 && t 4 && t 8
  ---------- locked inc ---- atomic_add ---- 2 threads
  ms               2781            2906
  ms/op      0.00002072      0.00002165      -4.4948%
  thr ms          14961           16093
  thr ms/op  0.00011147      0.00011990      -7.5663%
  ---------- locked inc ---- atomic_add ---- 4 threads
  ms               2781            2891
  ms/op      0.00002072      0.00002154      -3.9554%
  thr ms          30867           31328
  thr ms/op  0.00022998      0.00023341      -1.4935%
  ---------- locked inc ---- atomic_add ---- 8 threads
  ms               2782            2890
  ms/op      0.00002073      0.00002153      -3.8821%
  thr ms          64318           64341
  thr ms/op  0.00047921      0.00047938      -0.0358%
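
(How to read these: ms/op is the elapsed ms divided by the
0x7ffffff = 134,217,727 iterations, e.g. 4266 / 134217727 ~=
0.00003178 ms/op for the first locked inc run above. The last column
is 100 * (locked - atomic) / locked, so negative means atomic_add is
slower.)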

I'll do a quick run with the string performance test after lunch and
report those results later. I've pasted the source for the bulk of my
test below; if anyone wants the entire thing, let me know and I'll
send everything.

Travis


Martin Sebor wrote:
>Subject: Re: [PATCH] Use __rw_atomic_xxx() on Windows
>
>What's the status of this? We need to decide if we can put this
>in 4.2 or defer it for 4.2.1. To put it in 4.2 we need to make
>sure the new functions don't cause a performance regression in
>basic_string. I.e., we need to see the before and after numbers.
>
>Martin
>
>Martin Sebor wrote:
>>
>> One concern I have is performance. Does replacing the intrinsics with
>> out of line function call whose semantics the compiler has no idea
>> about have any impact on the runtime efficiency of the generated
>> code?
>> I would be especially interested in "real life" scenarios such as the
>> usage of the atomic operations in basic_string.
>> 
>> It would be good to see some before and after numbers. If you don't
>> have all the platforms to run the test post your benchmark and Travis
>> can help you put them together.
>

#include <stdio.h>
#include <stdlib.h>

#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#include <process.h>

#include "lib.h"

#define MIN_THREADS 2
#define MAX_THREADS 16

// time `iters' InterlockedIncrement() calls on *val; returns elapsed ms
unsigned long locked_inc(long* val, long iters)
{
    const unsigned long t0 = GetTickCount ();

    long n;
    for (n = 0; n < iters; ++n)
    {
        InterlockedIncrement(val);
    }

    const unsigned long t1 = GetTickCount ();

    return (t1 - t0);
}

// time `iters' __rw_atomic_add32() calls on *val; returns elapsed ms
unsigned long atomic_add(long* val, long iters)
{
    const unsigned long t0 = GetTickCount ();

    long n;
    for (n = 0; n < iters; ++n)
    {
        __rw_atomic_add32(val, 1);
    }

    const unsigned long t1 = GetTickCount ();

    return (t1 - t0);
}

struct thread_param {

    // atomic variable
    long* variable;

    // number of iterations
    long iters;

    // function to invoke
    unsigned long (*fun)(long*, long);

    // result of function
    unsigned long result;

    // thread handle used by main thread
    HANDLE thread;
};

extern "C" {

    void thread_func(void* p)
    {
        thread_param* param = (thread_param*)p;
        param->result = (param->fun)(param->variable, param->iters);
    }

} // extern "C"


unsigned long run_threads(int nthreads, unsigned long (*fun)(long*, long),
                          long iters)
{
    thread_param params[MAX_THREADS];
    long thread_var = 0;   // shared counter all the threads hammer on

    int i;
    for (i = 0; i < nthreads; ++i) {
        params[i].variable = &thread_var;
        params[i].result   = 0;
        params[i].fun      = fun;
        params[i].iters    = iters;
    }

    int n;
    for (n = 0; n < nthreads; ++n) {
        // use _beginthreadex(): unlike _beginthread(), the handle it
        // returns stays valid until we close it, so waiting on it
        // below doesn't race with thread exit
        params[n].thread = (HANDLE)_beginthreadex(0, 0, thread_func,
                                                  &params[n], 0, 0);
    }

    // sum the per-thread elapsed times
    unsigned long thread_time = 0;

    for (n = 0; n < nthreads; ++n) {
        WaitForSingleObject (params[n].thread, INFINITE);
        CloseHandle (params[n].thread);
        thread_time += params[n].result;
    }

    return thread_time;
}


int main(int argc, char* argv[])
{
    int nthreads = MIN_THREADS;
    if (1 < argc)
        nthreads = atoi(argv[1]);

    // cap thread count
    if (nthreads < MIN_THREADS)
        nthreads = MIN_THREADS;
    else if (MAX_THREADS < nthreads)
        nthreads = MAX_THREADS;

    const long ops = 0x7ffffff;   // 134,217,727 iterations
    long thread_var;
    
    thread_var = 0;
    unsigned long locked_inc_ms = locked_inc (&thread_var, ops);
    
    thread_var = 0;
    unsigned long atomic_add_ms = atomic_add (&thread_var, ops);

    printf("---------- locked inc ---- atomic_add ---- %d threads\n",
nthreads);
    printf("ms           %8.u        %8.u\n", locked_inc_ms,
atomic_add_ms);

    float locked_inc_ops_p_ms = 1.f * locked_inc_ms / ops;
    float atomic_add_ops_p_ms = 1.f * atomic_add_ms / ops;

    printf("ms/op      %8.8f      %8.8f      %.4f%%\n", 
        locked_inc_ops_p_ms, atomic_add_ops_p_ms,
        100.f * (locked_inc_ops_p_ms - atomic_add_ops_p_ms) /
locked_inc_ops_p_ms);

    // do it with threads

    locked_inc_ms = run_threads(nthreads, locked_inc, ops);
    atomic_add_ms = run_threads(nthreads, atomic_add, ops);

    locked_inc_ms /= nthreads;
    atomic_add_ms /= nthreads;

    printf("thr ms       %8.u        %8.u\n", locked_inc_ms,
atomic_add_ms);

    locked_inc_ops_p_ms = 1.f * locked_inc_ms / ops;
    atomic_add_ops_p_ms = 1.f * atomic_add_ms / ops;

     printf("thr ms/op  %8.8f      %8.8f      %.4f%%\n", 
        locked_inc_ops_p_ms, atomic_add_ops_p_ms,
        100.f * (locked_inc_ops_p_ms - atomic_add_ops_p_ms) /
locked_inc_ops_p_ms);

    return 0;
}
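
In case anyone wants to run it: build as a plain console app and
invoke it as `t <nthreads>' (that's the `t 2 && t 4 && t 8' above).
Something like this should do, assuming an object that provides
__rw_atomic_add32() -- the object name here is just a placeholder:

  cl /MT /O2 t.cpp atomic.obj
  t 2 && t 4 && t 8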
