On 10/03/12 11:10, Martin Sebor wrote:
> [...] I was just thinking of a few simple loops along the lines of:
tl;dr: I believe the results of the multi-threaded performance tests (12S, Intel/AMD multicores) come from heavy contention in the copying of reference-counted std::string objects.

The long version: I went back to the original test case (attached, t.cpp). As a reminder, the program is a simplification of the library's numpunct facet creation process, and it can be compiled with data caching either enabled or disabled.

I ran the perf tool (https://perf.wiki.kernel.org/index.php/Main_Page) on the program.
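For reference, the numbers below come from invocations along these lines (t takes the number of threads and the number of loops as its arguments; the exact counts varied per run, as noted):

    $ perf stat ./t 16 10000000
    $ perf record ./t 16 10000000
    $ perf report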
I noticed marked differences in the number of instructions and cycles (an order of magnitude), and also in the number of CPU-migrations and context-switches (far larger for the cached version). For the cached version:

 Performance counter stats for './t':

  121168.026301  task-clock-msecs         #      3.778 CPUs
        6133377  context-switches         #      0.051 M/sec
          35457  CPU-migrations           #      0.000 M/sec
            298  page-faults              #      0.000 M/sec
   338006437277  cycles                   #   2789.568 M/sec
   136554855096  instructions             #      0.404 IPC
    31298346444  branches                 #    258.305 M/sec
     2305761092  branch-misses            #      7.367 %
     1592157186  cache-references         #     13.140 M/sec
        1729755  cache-misses             #      0.014 M/sec

   32.070381363  seconds time elapsed

vs. the non-cached version:

 Performance counter stats for './t':

   16194.235282  task-clock-msecs         #      3.883 CPUs
           7491  context-switches         #      0.000 M/sec
             47  CPU-migrations           #      0.000 M/sec
            311  page-faults              #      0.000 M/sec
    45210281987  cycles                   #   2791.752 M/sec
    76784904688  instructions             #      1.698 IPC
    18891711522  branches                 #   1166.570 M/sec
     3162251552  branch-misses            #     16.739 %
        3332839  cache-references         #      0.206 M/sec
         119424  cache-misses             #      0.007 M/sec

    4.170904279  seconds time elapsed

Then I ran more detailed reports on the two runs. The report for the cached version is dominated by what appears to be thread synchronization overhead, whereas the non-cached version's is not:

# Events: 134K cycles
#
# Overhead  Command       Shared Object                               Symbol
# ........  .......  ..................  ...................................
#
    50.69%        t  [kernel.kallsyms]   [k] _raw_spin_lock
     5.46%        t  libpthread-2.13.so  [.] pthread_mutex_lock
     4.16%        t  libpthread-2.13.so  [.] __pthread_mutex_unlock
     3.65%        t  libpthread-2.13.so  [.] __lll_lock_wait
     3.35%        t  [kernel.kallsyms]   [k] copy_user_generic_string
     3.09%        t  40395b              [.] 40395b
     2.91%        t  [kernel.kallsyms]   [k] hash_futex
     2.87%        t  [kernel.kallsyms]   [k] futex_wake
     2.67%        t  t                   [.] f
     2.37%        t  [kernel.kallsyms]   [k] futex_wait_setup
     1.61%        t  [kernel.kallsyms]   [k] system_call
     1.00%        t  [kernel.kallsyms]   [k] system_call_after_swapgs
     0.98%        t  [kernel.kallsyms]   [k] __switch_to
     0.96%        t  libpthread-2.13.so  [.] __lll_unlock_wake
     0.95%        t  [kernel.kallsyms]   [k] get_futex_key
     0.92%        t  libc-2.13.so        [.] __strlen_sse42
[...]

vs.

# Events: 15K cycles
#
# Overhead  Command       Shared Object  Symbol
# ........  .......  ..................  ......
#
    49.55%   :29571  7f1ed3830a86        [.] 7f1ed3830a86
     8.67%        t  libc-2.13.so        [.] __malloc
     8.03%        t  libc-2.13.so        [.] cfree
     4.88%        t  t                   [.] std::basic_string<char, std::char_traits<char>, std::allocator<char> >::basic_string(char const*, std::allocator<char> const&)
     4.67%        t  libc-2.13.so        [.] __strlen_sse42
     4.14%        t  libpthread-2.13.so  [.] __pthread_mutex_unlock
     3.62%        t  libpthread-2.13.so  [.] pthread_mutex_lock
     2.64%        t  t                   [.] f
     2.40%        t  libc-2.13.so        [.] _int_malloc
     2.10%        t  libc-2.13.so        [.] _int_free
     1.53%        t  libpthread-2.13.so  [.] __pthread_mutex_init_internal
     1.33%        t  t                   [.] operator new(unsigned long)
     1.08%        t  t                   [.] X::numpunct::do_grouping() const
     1.04%        t  libc-2.13.so        [.] __memcpy_sse2
     0.89%        t  t                   [.] X::get_data(X::facet*)
     0.51%        t  t                   [.] __rw::__rw_allocate(unsigned long, int)
     0.45%        t  t                   [.] strlen@plt
     0.33%        t  t                   [.] __rw::__rw_deallocate(void*, unsigned long, int)
     0.32%        t  libpthread-2.13.so  [.] pthread_mutex_destroy
     0.21%        t  t                   [.] pthread_mutex_unlock@plt
     0.21%        t  t                   [.] pthread_mutex_init@plt
     0.18%        t  t                   [.] pthread_mutex_destroy@plt
     0.17%        t  t                   [.] operator delete(void*)
     0.17%        t  t                   [.] malloc@plt
     0.16%        t  t                   [.] memcpy@plt
     0.15%   :29571  [kernel.kallsyms]   [k] hpet_msi_next_event
     0.12%        t  libc-2.13.so        [.] __strrchr_sse2
     0.12%        t  t                   [.] pthread_mutex_lock@plt
     0.11%        t  t                   [.] free@plt
[...]

Now, if you remember from the previous measurements, the relative performance of the two versions was reversed in the single-threaded case, i.e., with one thread the cached version performed better. Here is the report for a single-threaded run of the cached version (run with one thread and 1000M loops; with fewer iterations perf barely registers any samples):

# Events: 36K cycles
#
# Overhead  Command       Shared Object                    Symbol
# ........  .......  ..................  ........................
#
    37.68%        t  libpthread-2.13.so  [.] __pthread_mutex_unlock
    31.04%        t  libpthread-2.13.so  [.] pthread_mutex_lock
    11.87%        t  t                   [.] f
     9.64%   :27362  7fd94e2ebd6d        [.] 7fd94e2ebd6d
     3.21%        t  libc-2.13.so        [.] __strlen_sse42
     2.38%        t  t                   [.] pthread_mutex_unlock@plt
     2.27%        t  t                   [.] pthread_mutex_lock@plt
     1.61%        t  t                   [.] strlen@plt
[...]

The single-threaded run of the non-cached version yields a report identical to the multi-threaded one.

I take two things from this round of measurements. First, performance suffers on multicores under heavy contention, possibly as a direct result of the copying of std::string objects; I will follow up with direct measurements. Second, perhaps we should not focus on this particular use model for the numpunct facet, because it may not be very representative of `normal' usage.
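To make the follow-up concrete, below is a rough sketch of the kind of direct measurement I have in mind (not the final test; copy_loop and the default counts are placeholders). A number of threads do nothing but copy one shared std::string: with a reference-counted implementation each copy is just an atomic bump of a reference count shared by all threads, so the contention should reproduce without any of the facet machinery. With a non-reference-counted string the same loop degenerates into an allocation test, which is a useful baseline in itself.

    #include <string>
    #include <cstdlib>
    #include <pthread.h>

    // one string shared by every thread; with a reference-counted
    // (COW) std::string, each copy below increments and decrements
    // a count that lives on a single cache line
    static std::string shared ("\3\3");
    static long nloops = 10000000;

    extern "C" {

    static void* copy_loop (void*)
    {
        volatile unsigned long n = 0;

        for (long i = 0; i < nloops; ++i) {
            const std::string s (shared);   // refcount traffic here
            n += s.size ();
        }

        return (void*)n;
    }

    }   // extern "C"

    int main (int argc, char** argv)
    {
        long nthreads = 1 < argc ? atol (argv [1]) : 16;

        if (2 < argc)
            nloops = atol (argv [2]);

        if (nthreads > 128)
            nthreads = 128;

        pthread_t tid [128] = { 0 };

        for (long i = 0; i < nthreads; ++i)
            if (pthread_create (tid + i, 0, copy_loop, 0))
                exit (-1);

        for (long i = 0; i < nthreads; ++i)
            pthread_join (tid [i], 0);

        return 0;
    }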
HTH,
Liviu


* With caching

** Stats counters

*** Contended run

16, 10000000

 Performance counter stats for './t':

  121168.026301  task-clock-msecs         #      3.778 CPUs
        6133377  context-switches         #      0.051 M/sec
          35457  CPU-migrations           #      0.000 M/sec
            298  page-faults              #      0.000 M/sec
   338006437277  cycles                   #   2789.568 M/sec
   136554855096  instructions             #      0.404 IPC
    31298346444  branches                 #    258.305 M/sec
     2305761092  branch-misses            #      7.367 %
     1592157186  cache-references         #     13.140 M/sec
        1729755  cache-misses             #      0.014 M/sec

   32.070381363  seconds time elapsed

** Report

# Events: 134K cycles
#
# Overhead  Command       Shared Object                               Symbol
# ........  .......  ..................  ...................................
#
    50.69%        t  [kernel.kallsyms]   [k] _raw_spin_lock
     5.46%        t  libpthread-2.13.so  [.] pthread_mutex_lock
     4.16%        t  libpthread-2.13.so  [.] __pthread_mutex_unlock
     3.65%        t  libpthread-2.13.so  [.] __lll_lock_wait
     3.35%        t  [kernel.kallsyms]   [k] copy_user_generic_string
     3.09%        t  40395b              [.] 40395b
     2.91%        t  [kernel.kallsyms]   [k] hash_futex
     2.87%        t  [kernel.kallsyms]   [k] futex_wake
     2.67%        t  t                   [.] f
     2.37%        t  [kernel.kallsyms]   [k] futex_wait_setup
     1.61%        t  [kernel.kallsyms]   [k] system_call
     1.00%        t  [kernel.kallsyms]   [k] system_call_after_swapgs
     0.98%        t  [kernel.kallsyms]   [k] __switch_to
     0.96%        t  libpthread-2.13.so  [.] __lll_unlock_wake
     0.95%        t  [kernel.kallsyms]   [k] get_futex_key
     0.92%        t  libc-2.13.so        [.] __strlen_sse42
[...]

*** Uncontended run

1, 100000000

# Events: 3K cycles
#
# Overhead  Command      Shared Object                 Symbol
# ........  .......  .................  .....................
#
    99.35%   :15870  7f082cdda456       [.] 7f082cdda456
     0.46%        t  [kernel.kallsyms]  [k] recalc_sigpending_tsk
     0.19%   :15870  [kernel.kallsyms]  [k] hpet_msi_next_event
     0.01%        t  [kernel.kallsyms]  [k] __free_pipe_info
     0.00%        t  [kernel.kallsyms]  [k] intel_pmu_enable_all

* Without caching

** Stats counters

16, 10000000

 Performance counter stats for './t':

   16194.235282  task-clock-msecs         #      3.883 CPUs
           7491  context-switches         #      0.000 M/sec
             47  CPU-migrations           #      0.000 M/sec
            311  page-faults              #      0.000 M/sec
    45210281987  cycles                   #   2791.752 M/sec
    76784904688  instructions             #      1.698 IPC
    18891711522  branches                 #   1166.570 M/sec
     3162251552  branch-misses            #     16.739 %
        3332839  cache-references         #      0.206 M/sec
         119424  cache-misses             #      0.007 M/sec

    4.170904279  seconds time elapsed

** Report

# Events: 15K cycles
#
# Overhead  Command       Shared Object  Symbol
# ........  .......  ..................  ......
#
    49.55%   :29571  7f1ed3830a86        [.] 7f1ed3830a86
     8.67%        t  libc-2.13.so        [.] __malloc
     8.03%        t  libc-2.13.so        [.] cfree
     4.88%        t  t                   [.] std::basic_string<char, std::char_traits<char>, std::allocator<char> >::basic_string(char const*, std::allocator<char> const&)
     4.67%        t  libc-2.13.so        [.] __strlen_sse42
     4.14%        t  libpthread-2.13.so  [.] __pthread_mutex_unlock
     3.62%        t  libpthread-2.13.so  [.] pthread_mutex_lock
     2.64%        t  t                   [.] f
     2.40%        t  libc-2.13.so        [.] _int_malloc
     2.10%        t  libc-2.13.so        [.] _int_free
     1.53%        t  libpthread-2.13.so  [.] __pthread_mutex_init_internal
     1.33%        t  t                   [.] operator new(unsigned long)
     1.08%        t  t                   [.] X::numpunct::do_grouping() const
     1.04%        t  libc-2.13.so        [.] __memcpy_sse2
     0.89%        t  t                   [.] X::get_data(X::facet*)
     0.51%        t  t                   [.] __rw::__rw_allocate(unsigned long, int)
     0.45%        t  t                   [.] strlen@plt
     0.33%        t  t                   [.] __rw::__rw_deallocate(void*, unsigned long, int)
     0.32%        t  libpthread-2.13.so  [.] pthread_mutex_destroy
     0.21%        t  t                   [.] pthread_mutex_unlock@plt
     0.21%        t  t                   [.] pthread_mutex_init@plt
     0.18%        t  t                   [.] pthread_mutex_destroy@plt
     0.17%        t  t                   [.] operator delete(void*)
     0.17%        t  t                   [.] malloc@plt
     0.16%        t  t                   [.] memcpy@plt
     0.15%   :29571  [kernel.kallsyms]   [k] hpet_msi_next_event
     0.12%        t  libc-2.13.so        [.] __strrchr_sse2
     0.12%        t  t                   [.] pthread_mutex_lock@plt
     0.11%        t  t                   [.] free@plt
[...]
// t.cpp

#include <iostream>
#include <locale>
#include <cstdio>
#include <cstdlib>
#include <cstring>

#include <pthread.h>
#include <unistd.h>

#define MAX_THREADS 128

static long nloops = 10000000, nthreads = 16;
static bool volatile pwait = true;

////////////////////////////////////////////////////////////////////////

namespace X {

struct guard
{
    guard (pthread_mutex_t* ptr)
        : lock_ (ptr) {
        pthread_mutex_lock (lock_);
    }

    ~guard () {
        pthread_mutex_unlock (lock_);
    }

private:
    guard (guard const&);
    guard& operator= (guard const&);

private:
    pthread_mutex_t* lock_;
};

struct facet
{
    facet () {
        pthread_mutex_init (&_C_mutex, 0);
    }

    void* _C_data () {
        return _C_impsize ? _C_impdata : _C_get_data ();
    }

    void* _C_get_data ();

    size_t          _C_impsize;
    void*           _C_impdata;
    pthread_mutex_t _C_mutex;
};

// double-checked locking: test, lock, test again
void* facet::_C_get_data ()
{
    if (_C_impsize)
        return _C_impdata;

    guard g (&_C_mutex);

    if (_C_impsize)
        return _C_impdata;

#if !defined (NO_STDCXX_LOCALES)
    _C_impdata = (void*)"\3\3";
#endif   // NO_STDCXX_LOCALES

    _C_impsize = (size_t)(-1);

    return _C_impdata;
}

void* get_data (facet*);

struct numpunct : facet
{
    numpunct ()
        : _C_flags () { }

    virtual std::string do_grouping () const;

    std::string grouping () const {
        numpunct* const __self = const_cast< numpunct* > (this);

#if !defined (NO_CACHE)
        if (!(_C_flags & 1)) {
            __self->_C_flags |= 1;
            __self->_C_grouping = do_grouping ();
        }
        return _C_grouping;
#else
        return do_grouping ();
#endif   // NO_CACHE
    }

private:
    int         _C_flags;
    std::string _C_grouping;
};

std::string numpunct::do_grouping () const
{
    return (const char*)get_data (const_cast<numpunct*>(this));
}

void* get_data (facet* pfacet)
{
    void* pdata = pfacet->_C_data ();
    if (pdata)
        return pdata;

    // __rw_setlocale loc (...); <- other mutex

    if (pfacet->_C_data ())
        return get_data (pfacet);

    pfacet->_C_impdata = const_cast<char*>("\4\4");
    pfacet->_C_impsize = (size_t)(-1);

    return 0;
}

}   // namespace X

////////////////////////////////////////////////////////////////////////

typedef X::numpunct Numpunct;

extern "C" {

static void* f (void* pv)
{
    Numpunct const& fac = *reinterpret_cast< Numpunct* > (pv);

    volatile unsigned long n = 0;

    // spin until main() releases all threads at once
    while (pwait)
        ;

    for (int i = 0; i < nloops; ++i) {
        const std::string grouping = fac.grouping ();
        n += strlen (grouping.c_str ());
    }

    return (void*)n;
}

}   // extern "C"

int main (int argc, char** argv)
{
    switch (argc) {
    case 3:
        nloops = atol (argv [2]);
        // fall through
    case 2:
        nthreads = atol (argv [1]);
        break;
    }

    pthread_t tid [MAX_THREADS] = { 0 };

    if (nthreads > MAX_THREADS)
        nthreads = MAX_THREADS;

    printf ("%ld, %ld\n", nthreads, nloops);

    pthread_setconcurrency (nthreads);

    Numpunct fac;
    fac.grouping ();

    for (int i = 0; i < nthreads; ++i) {
        if (pthread_create (tid + i, 0, f, const_cast<Numpunct*> (&fac)))
            exit (-1);
    }

    usleep (50);
    pwait = false;

    for (int i = 0; i < nthreads; ++i) {
        if (tid [i])
            pthread_join (tid [i], 0);
    }

    return 0;
}
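For anyone who wants to reproduce the numbers, the two variants are built and run along these lines (the g++ invocation is illustrative, any reasonably recent compiler should do; NO_CACHE is the macro from the source above):

    $ g++ -O2 -pthread -o t t.cpp                 # caching enabled
    $ ./t 16 10000000

    $ g++ -O2 -pthread -DNO_CACHE -o t t.cpp      # caching disabled
    $ ./t 16 10000000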