On 10/03/12 11:10, Martin Sebor wrote:
> [...]
> I was just thinking of a few simple loops along the lines of:


tl;dr: I attribute the results of the multi-threaded performance tests (12S, 
Intel/AMD multicores) to heavy contention in the copying of reference-counted 
std::string objects.

The long version: I went back to the original test case (attached, t.cpp). As a 
reminder, the program is a simplification of the library's numpunct facet 
creation process, and it can be compiled with the data caching either enabled 
or disabled.

I ran the perf tool (https://perf.wiki.kernel.org/index.php/Main_Page) on the 
program (16 threads, 10000000 loops each) and noticed marked differences in the 
numbers of cycles and instructions (cycles differ by almost an order of 
magnitude) and, even more so, in the numbers of CPU-migrations and 
context-switches (roughly three orders of magnitude larger for the cached 
version). First, the cached version:

 Performance counter stats for './t':

      121168.026301  task-clock-msecs         #      3.778 CPUs
            6133377  context-switches         #      0.051 M/sec
              35457  CPU-migrations           #      0.000 M/sec
                298  page-faults              #      0.000 M/sec
       338006437277  cycles                   #   2789.568 M/sec
       136554855096  instructions             #      0.404 IPC
        31298346444  branches                 #    258.305 M/sec
         2305761092  branch-misses            #      7.367 %
         1592157186  cache-references         #     13.140 M/sec
            1729755  cache-misses             #      0.014 M/sec

       32.070381363  seconds time elapsed

vs. the non-cached version:

 Performance counter stats for './t':

       16194.235282  task-clock-msecs         #      3.883 CPUs
               7491  context-switches         #      0.000 M/sec
                 47  CPU-migrations           #      0.000 M/sec
                311  page-faults              #      0.000 M/sec
        45210281987  cycles                   #   2791.752 M/sec
        76784904688  instructions             #      1.698 IPC
        18891711522  branches                 #   1166.570 M/sec
         3162251552  branch-misses            #     16.739 %
            3332839  cache-references         #      0.206 M/sec
             119424  cache-misses             #      0.007 M/sec

        4.170904279  seconds time elapsed
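
(For reference, the counter blocks come from perf stat and the symbol 
breakdowns further below from perf record / perf report; the exact command 
lines are a guess on my part, but presumably along the lines of:

    $ perf stat ./t 16 10000000
    $ perf record ./t 16 10000000
    $ perf report

with the thread and loop counts passed as the two program arguments.)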


I then ran more detailed reports on the two runs. The report for the cached 
version is dominated by what appears to be thread synchronization overhead, 
whereas the report for the non-cached version is not:

# Events: 134K cycles
#
# Overhead  Command       Shared Object                               Symbol
# ........  .......  ..................  ...................................
#
    50.69%        t  [kernel.kallsyms]   [k] _raw_spin_lock
     5.46%        t  libpthread-2.13.so  [.] pthread_mutex_lock
     4.16%        t  libpthread-2.13.so  [.] __pthread_mutex_unlock
     3.65%        t  libpthread-2.13.so  [.] __lll_lock_wait
     3.35%        t  [kernel.kallsyms]   [k] copy_user_generic_string
     3.09%        t              40395b  [.]           40395b
     2.91%        t  [kernel.kallsyms]   [k] hash_futex
     2.87%        t  [kernel.kallsyms]   [k] futex_wake
     2.67%        t  t                   [.] f
     2.37%        t  [kernel.kallsyms]   [k] futex_wait_setup
     1.61%        t  [kernel.kallsyms]   [k] system_call
     1.00%        t  [kernel.kallsyms]   [k] system_call_after_swapgs
     0.98%        t  [kernel.kallsyms]   [k] __switch_to
     0.96%        t  libpthread-2.13.so  [.] __lll_unlock_wake
     0.95%        t  [kernel.kallsyms]   [k] get_futex_key
     0.92%        t  libc-2.13.so        [.] __strlen_sse42
     [...]


vs. the non-cached version:


# Events: 15K cycles
#
# Overhead  Command       Shared Object                               Symbol
# ........  .......  ..................  ...................................
#
    49.55%   :29571        7f1ed3830a86  [.]     7f1ed3830a86
     8.67%        t  libc-2.13.so        [.] __malloc
     8.03%        t  libc-2.13.so        [.] cfree
     4.88%        t  t                   [.] std::basic_string<char, std::char_traits<char>, std::allocator<char> >::basic_string(char const*, std::allocator<char> const&)
     4.67%        t  libc-2.13.so        [.] __strlen_sse42
     4.14%        t  libpthread-2.13.so  [.] __pthread_mutex_unlock
     3.62%        t  libpthread-2.13.so  [.] pthread_mutex_lock
     2.64%        t  t                   [.] f
     2.40%        t  libc-2.13.so        [.] _int_malloc
     2.10%        t  libc-2.13.so        [.] _int_free
     1.53%        t  libpthread-2.13.so  [.] __pthread_mutex_init_internal
     1.33%        t  t                   [.] operator new(unsigned long)
     1.08%        t  t                   [.] X::numpunct::do_grouping() const
     1.04%        t  libc-2.13.so        [.] __memcpy_sse2
     0.89%        t  t                   [.] X::get_data(X::facet*)
     0.51%        t  t                   [.] __rw::__rw_allocate(unsigned long, int)
     0.45%        t  t                   [.] strlen@plt
     0.33%        t  t                   [.] __rw::__rw_deallocate(void*, unsigned long, int)
     0.32%        t  libpthread-2.13.so  [.] pthread_mutex_destroy
     0.21%        t  t                   [.] pthread_mutex_unlock@plt
     0.21%        t  t                   [.] pthread_mutex_init@plt
     0.18%        t  t                   [.] pthread_mutex_destroy@plt
     0.17%        t  t                   [.] operator delete(void*)
     0.17%        t  t                   [.] malloc@plt
     0.16%        t  t                   [.] memcpy@plt
     0.15%   :29571  [kernel.kallsyms]   [k] hpet_msi_next_event
     0.12%        t  libc-2.13.so        [.] __strrchr_sse2
     0.12%        t  t                   [.] pthread_mutex_lock@plt
     0.11%        t  t                   [.] free@plt
     [...]


Now, if you remember from the previous measurements, the relative performance 
of the two versions was reversed in the single-threaded case, i.e., with one 
thread the cached version performed better. Here is the report for a 
single-threaded run of the cached version (one thread doing 1000M loops; with 
fewer iterations the profile registers next to nothing, see the uncontended 
run further below):

# Events: 36K cycles
#
# Overhead  Command       Shared Object                    Symbol
# ........  .......  ..................  ........................
#
    37.68%        t  libpthread-2.13.so  [.] __pthread_mutex_unlock
    31.04%        t  libpthread-2.13.so  [.] pthread_mutex_lock
    11.87%        t  t                   [.] f
     9.64%   :27362        7fd94e2ebd6d  [.]     7fd94e2ebd6d
     3.21%        t  libc-2.13.so        [.] __strlen_sse42
     2.38%        t  t                   [.] pthread_mutex_unlock@plt
     2.27%        t  t                   [.] pthread_mutex_lock@plt
     1.61%        t  t                   [.] strlen@plt
     [...]

The single-threaded run of the non-cached version yields a report identical to 
the multi-threaded one.
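
The single-threaded cached profile above, with pthread_mutex_lock and 
__pthread_mutex_unlock on top but none of the kernel-side _raw_spin_lock or 
futex_* symbols, looks like the signature of an uncontended futex-based mutex, 
whose fast path never enters the kernel. A trivial sketch (mine, not part of 
t.cpp) that should reproduce that shape of profile:

    #include <pthread.h>

    int main ()
    {
        pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;

        // uncontended lock/unlock: the futex fast path stays entirely in
        // user space, so a profile shows only the pthread_mutex_* calls
        for (long i = 0; i < 1000000000; ++i) {
            pthread_mutex_lock (&mtx);
            pthread_mutex_unlock (&mtx);
        }

        return 0;
    }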


I take two things away from this round of measurements: first, that 
performance suffers on multicores under heavy contention, possibly as a direct 
result of std::string object copying; I will follow up with direct 
measurements. Second, that perhaps we should not focus on this particular 
usage model for the numpunct facet, because it may not be very representative 
of `normal' usage.
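
As a first stab at such a direct measurement, here is a minimal sketch of what 
I have in mind (mine, not part of t.cpp): 16 threads copying a single shared, 
reference-counted object, with std::shared_ptr<std::string> standing in for 
the reference-counted string. Its use count is updated atomically rather than 
under a mutex, so a profile would show cache-line contention instead of futex 
traffic, but the sharing pattern is the same:

    #include <cstring>
    #include <memory>
    #include <string>
    #include <thread>
    #include <vector>

    // one shared, reference-counted object that every thread copies
    static std::shared_ptr<std::string> grouping =
        std::make_shared<std::string> ("\3\3");

    static void f ()
    {
        volatile unsigned long n = 0;

        for (long i = 0; i < 10000000; ++i) {
            // each copy bumps the shared use count, just as copying a
            // reference-counted std::string bumps its reference count
            std::shared_ptr<std::string> copy (grouping);
            n += strlen (copy->c_str ());
        }
    }

    int main ()
    {
        std::vector<std::thread> threads;

        for (int i = 0; i < 16; ++i)
            threads.push_back (std::thread (f));

        for (std::size_t i = 0; i < threads.size (); ++i)
            threads [i].join ();

        return 0;
    }

Comparing this against a variant where each thread constructs its own string 
inside the loop should isolate the cost of sharing the reference count.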


HTH

Liviu
* With caching

(The stats counters and report for the contended run, 16 threads and 10000000 
loops each, are quoted in full above.)

*** Uncontended run 

1, 100000000

# Events: 3K cycles
#
# Overhead  Command      Shared Object                 Symbol
# ........  .......  .................  .....................
#
    99.35%   :15870       7f082cdda456  [.]     7f082cdda456
     0.46%        t  [kernel.kallsyms]  [k] recalc_sigpending_tsk
     0.19%   :15870  [kernel.kallsyms]  [k] hpet_msi_next_event
     0.01%        t  [kernel.kallsyms]  [k] __free_pipe_info
     0.00%        t  [kernel.kallsyms]  [k] intel_pmu_enable_all


* Without caching

(Stats counters and report as quoted in full above; same run parameters.)


// t.cpp - the test case attached to this message
#include <iostream>
#include <locale>

#include <cstdio>
#include <cstdlib>
#include <cstring>

#include <pthread.h>
#include <unistd.h>

#define MAX_THREADS 128

// defaults: 16 threads, 10000000 loops each (overridable on the command line)
static long nloops = 10000000, nthreads = 16;

// start flag: worker threads spin on this until main() releases them
static bool volatile pwait = true;

////////////////////////////////////////////////////////////////////////

namespace X {

// scoped lock: acquires the mutex on construction, releases it on destruction
struct guard 
{
    guard (pthread_mutex_t* ptr) : lock_ (ptr) { 
        pthread_mutex_lock (lock_);
    }

    ~guard () { 
        pthread_mutex_unlock (lock_); 
    }

private:

    guard (guard const&);
    guard& operator= (guard const&);

private:

    pthread_mutex_t* lock_;
};

struct facet
{
    facet () {
        pthread_mutex_init (&_C_mutex, 0);
    }

    // return the facet data, initializing it on first use
    void* _C_data () {
        return _C_impsize ? _C_impdata : _C_get_data ();
    }

    void* _C_get_data ();

    size_t _C_impsize;
    void*  _C_impdata;

    pthread_mutex_t _C_mutex;
};

void* facet::_C_get_data ()
{
    // double-checked initialization: an unsynchronized fast path first,
    // then a re-check under the mutex before initializing
    if (_C_impsize)
        return _C_impdata;

    guard g (&_C_mutex);

    if (_C_impsize)
        return _C_impdata;

#if !defined (NO_STDCXX_LOCALES)
    _C_impdata = (void*)"\3\3";
#endif // NO_STDCXX_LOCALES
    _C_impsize = (size_t)(-1);

    return _C_impdata;
}

void* get_data (facet*);

struct numpunct : facet
{
    numpunct () : _C_flags () { }

    virtual std::string do_grouping () const;

    std::string grouping () const {
        numpunct* const __self = const_cast< numpunct* > (this);

#if !defined (NO_CACHE)
        // cached version: compute the grouping once, then return a copy
        // of the cached string on every subsequent call
        if (!(_C_flags & 1)) {
            __self->_C_flags  |= 1;
            __self->_C_grouping = do_grouping ();
        }

        return _C_grouping;
#else
        // non-cached version: construct a fresh std::string on every call
        return do_grouping ();
#endif // NO_CACHE
    }

private:

    int          _C_flags;
    std::string  _C_grouping;
};

std::string numpunct::do_grouping () const
{
    return (const char*)get_data (const_cast<numpunct*>(this));
}

void* get_data (facet* pfacet)
{
    void* pdata = pfacet->_C_data ();

    if (pdata) 
        return pdata;

    // __rw_setlocale loc (...); <- other mutex

    if (pfacet->_C_data ())
        return get_data (pfacet);

    pfacet->_C_impdata = const_cast<char*>("\4\4");
    pfacet->_C_impsize = (size_t)(-1);

    return 0;
}

} // namespace X

////////////////////////////////////////////////////////////////////////

typedef X::numpunct Numpunct;

extern "C" {

static void* 
f (void* pv)
{
    Numpunct const& fac = *reinterpret_cast< Numpunct* > (pv);

    volatile unsigned long n = 0;

    // spin until main() has created all threads and releases them at once
    while (pwait) ;

    for (long i = 0; i < nloops; ++i) {
        const std::string grouping = fac.grouping ();
        n += strlen (grouping.c_str ());
    }

    return (void*)n;
}

} // extern "C"

int
main (int argc, char** argv)
{
    switch (argc) {
    case 3:
        nloops = atol (argv [2]);
        // fall through
    case 2:
        nthreads = atol (argv [1]);
        break;
    }

    pthread_t tid [MAX_THREADS] = { 0 };

    if (nthreads > MAX_THREADS)
        nthreads = MAX_THREADS;

    printf ("%ld, %ld\n", nthreads, nloops);

    pthread_setconcurrency (nthreads);

    Numpunct fac;
    fac.grouping ();   // prime the facet data (and, if enabled, the cache)
    
    for (int i = 0; i < nthreads; ++i) {
        if (pthread_create (tid + i, 0, f, const_cast<Numpunct*> (&fac)))
            exit (-1);
    }

    usleep (50);       // give the threads a moment to reach the spin loop,
    pwait = false;     // then release them all at once

    for (int i = 0; i < nthreads; ++i) {
        if (tid [i])
            pthread_join (tid [i], 0);
    }

    return 0;
}
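
For completeness, the two variants can be built along these lines (the 
compiler and flags are my guess, not copied from the original runs; NO_CACHE 
is the macro from the source above, and the std::string implementation picked 
up depends on which library the compiler links against):

    $ g++ -O2 -pthread t.cpp -o t             # caching enabled
    $ g++ -O2 -pthread -DNO_CACHE t.cpp -o t  # caching disabled
    $ ./t 16 10000000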
