On 10/3/12 11:10 AM, Martin Sebor wrote:
On 10/03/2012 07:01 AM, Liviu Nicoara wrote:
I am gathering some more measurements along these lines but it's time
consuming. I estimate I will have some ready for review later today or
tomorrow. In the meantime could you please post your kernel, glibc and
compiler versions?
I was just thinking of a few simple loops along the lines of:
void* thread_func (void*) {
for (int i = 0; i < N; ++i)
test 1: do some simple stuff inline
test 2: call a virtual function to do the same stuff
test 3: lock and unlock a mutex and do the same stuff
}
Test 1 should be the fastest and test 3 the slowest. This should
hold regardless of what "simple stuff" is (eventually, even when
it's getting numpunct::grouping() data).
That is expected; I attached test case x.cpp and results-x.txt.
I did not find it too interesting on its own, though. The difference between the
cached and non-cached data is that in the case of the cached data the copying of
the string involves nothing more than a bump in the reference counter, whereas
in the non-cached version a string object is constructed anew, and memory gets
allocated for its body. Yet, in my measurements, the cached version is the one
which shows the worse performance.
So, I extracted the std::string class, simplified it, and put it in
another test case. That would be u.cpp and the results are results-u.txt. The
results show the same performance trends although the absolute values have
skewed. Will get back to this after I digest the results a bit more.
Liviu
-*- mode: org -*-
* iMac, 4x Core i5 , 12S, gcc 4.5.4:
$ nice make u
16, 100000000 1m18.811s 5m12.329s 0m0.263s
8, 100000000 0m39.919s 2m36.198s 0m0.150s
4, 100000000 0m20.449s 1m20.797s 0m0.050s
2, 100000000 0m9.888s 0m19.725s 0m0.005s
1, 100000000 0m2.483s 0m2.480s 0m0.002s
$ nice make CPPOPTS="-DNO_CACHE" u
16, 100000000 0m37.418s 2m27.822s 0m0.872s
8, 100000000 0m18.844s 1m14.607s 0m0.261s
4, 100000000 0m10.165s 0m40.147s 0m0.023s
2, 100000000 0m8.652s 0m17.278s 0m0.003s
1, 100000000 0m8.482s 0m8.473s 0m0.007s
$ nice make CPPOPTS="-DNO_VIRTUAL_CALL" u
16, 100000000 1m2.770s 4m9.307s 0m0.179s
8, 100000000 0m31.890s 2m6.792s 0m0.087s
4, 100000000 0m16.427s 1m5.133s 0m0.039s
2, 100000000 0m8.497s 0m16.981s 0m0.007s
1, 100000000 0m2.291s 0m2.288s 0m0.002s
$ nice make CPPOPTS="-DNO_CACHE -DNO_VIRTUAL_CALL" u
16, 100000000 0m35.838s 2m21.406s 0m0.877s
8, 100000000 0m19.007s 1m14.920s 0m0.255s
4, 100000000 0m10.099s 0m39.504s 0m0.042s
2, 100000000 0m8.599s 0m17.190s 0m0.003s
1, 100000000 0m8.986s 0m8.980s 0m0.005s
* Linux Slackware, 16x AMD Opteron, 12S, gcc 4.5.2
$ nice make u
$ nice make CPPOPTS="-DNO_CACHE" u
$ nice make CPPOPTS="-DNO_VIRTUAL_CALL" u
$ nice make CPPOPTS="-DNO_CACHE -DNO_VIRTUAL_CALL" u
-*- mode: org -*-
* iMac, 4x Core i5 , 12S, gcc 4.5.4:
$ nice make CPPOPTS="-DNO_LOCK -DNO_VIRTUAL_CALL" u
16, 100000000 0m7.864s 0m30.259s 0m0.035s
8, 100000000 0m4.396s 0m17.034s 0m0.016s
4, 100000000 0m2.729s 0m10.473s 0m0.011s
2, 100000000 0m2.481s 0m4.929s 0m0.003s
1, 100000000 0m2.461s 0m2.455s 0m0.002s
$ nice make CPPOPTS="-DNO_LOCK" u
16, 100000000 0m9.724s 0m37.455s 0m0.043s
8, 100000000 0m5.559s 0m20.309s 0m0.048s
4, 100000000 0m3.160s 0m12.213s 0m0.013s
2, 100000000 0m2.872s 0m5.694s 0m0.004s
1, 100000000 0m2.845s 0m2.838s 0m0.002s
$ nice make u
16, 100000000 1m3.745s 3m58.570s 0m0.351s
8, 100000000 0m32.351s 1m55.740s 0m0.203s
4, 100000000 0m16.852s 1m1.633s 0m0.092s
2, 100000000 0m8.419s 0m16.699s 0m0.010s
1, 100000000 0m4.214s 0m4.179s 0m0.005s
* Linux Slackware, 16x AMD Opteron, 12S, gcc 4.7.1
$ nice make CPPOPTS="-DNO_LOCK -DNO_VIRTUAL_CALL" u
16, 100000000 0m4.382s 1m9.896s 0m0.004s
8, 100000000 0m4.374s 0m34.904s 0m0.002s
4, 100000000 0m4.368s 0m17.445s 0m0.002s
2, 100000000 0m4.366s 0m8.720s 0m0.003s
1, 100000000 0m4.355s 0m4.351s 0m0.001s
$ nice make CPPOPTS="-DNO_LOCK" u
16, 100000000 0m5.415s 1m19.833s 0m0.005s
8, 100000000 0m4.939s 0m39.438s 0m0.003s
4, 100000000 0m4.936s 0m19.712s 0m0.001s
2, 100000000 0m4.930s 0m9.847s 0m0.002s
1, 100000000 0m4.921s 0m4.917s 0m0.002s
$ nice make u
16, 100000000 1m40.769s 24m17.198s 0m0.006s
8, 100000000 0m51.702s 6m15.400s 0m0.003s
4, 100000000 0m26.033s 1m37.651s 0m0.002s
2, 100000000 0m13.534s 0m25.164s 0m0.003s
1, 100000000 0m4.964s 0m4.961s 0m0.002s
#include <iostream>
#include <locale>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <pthread.h>
#include <unistd.h>
#define MAX_THREADS 128
static long nloops = 100000000, nthreads = 16;
static bool volatile pwait = true;
////////////////////////////////////////////////////////////////////////
//
// Benchmark stand-in for a facet that hands out a piece of string data.
//
// Compile-time switches used inside this struct:
//   NO_CACHE         get() allocates a fresh buffer and copies the data
//                    on every call; otherwise get() only bumps a
//                    reference counter and returns the stored pointer.
//   NO_VIRTUAL_CALL  get() calls _C_copy() directly; otherwise it goes
//                    through the virtual do_get().
//
struct facet
{
private:

#if !defined (NO_CACHE)

    //
    // Mimic the ref-counting in std::string where the copy ctor and
    // the dtor yank the counter up and down, respectively. This is
    // what's happening when copying a cached std::string object.
    //

    // Increments _C_refs and returns the stored data pointer. Constness
    // is cast away because the counter is updated from a const member.
    char* _C_copy () const {
        facet* self = const_cast< facet* > (this);
        __rw::__rw_atomic_preincrement (self->_C_refs, false);
        return const_cast< char* > (self->_C_data);
    }

    // Decrements _C_refs; the pointer argument is not used in this mode.
    void _C_discard (char*) const {
        facet* self = const_cast< facet* > (this);
        __rw::__rw_atomic_predecrement (self->_C_refs, false);
    }

#else

    //
    // Mimic the construction of an std::string object, anew:
    // allocation of space, copying of content over. This is what's
    // happening when bypassing the cache and creating std::string
    // objects directly out of locale data.
    //

    // Returns a newly allocated copy of _C_data, including the
    // terminating NUL. The caller releases it through _C_discard().
    char* _C_copy () const {
        size_t n = strlen (_C_data) + 1;
        // operator new returns void*, for which static_cast is the
        // appropriate named cast.
        char* p = static_cast< char* > (::operator new (n));
        memcpy (p, _C_data, n);
        return p;
    }

    // Releases a buffer previously returned by _C_copy().
    void _C_discard (char* p) const {
        ::operator delete (p);
    }

#endif // NO_CACHE

public:

    // Stores the pointer s without copying the characters it refers to.
    // The reference counter starts at zero; it was previously left
    // uninitialized, so the atomic updates operated on an indeterminate
    // value.
    facet (char const* s) : _C_data (s), _C_refs (0) { }

    // Returns the data, either via the virtual do_get() or directly via
    // _C_copy() when NO_VIRTUAL_CALL is defined. Every pointer obtained
    // here is expected to be handed back to discard().
    char* get () const {
#if !defined (NO_VIRTUAL_CALL)
        return do_get ();
#else
        return _C_copy ();
#endif // NO_VIRTUAL_CALL
    }

    // Counterpart of get(): forwards p to _C_discard().
    void discard (char* p) const {
        return _C_discard (p);
    }

protected:

    // Virtual hook behind get(); defined out of line.
    virtual char* do_get () const;

private:

    char const*      _C_data;   // not owned; set once by the ctor
    unsigned long    _C_refs;   // counter touched by the cached _C_copy/_C_discard
    __rw::__rw_mutex _C_mutex;  // not referenced by any member shown in this struct
};
/* virtual */ char*
facet::do_get () const
{
return _C_copy ();
}
extern "C" {

// Thread entry point. pv points to the facet shared by all threads.
// Each thread performs nloops rounds of get()/strlen()/discard() and
// returns the accumulated length total converted to a void*.
// NOTE(review): the return value presumably exists only to keep the
// loop's result observable; main() does not read it.
static void*
f (void* pv)
{
    facet& fac = *static_cast< facet* > (pv);

    unsigned long n = 0;

    // Busy-wait until main() clears the start flag so that all threads
    // begin the measured loop at roughly the same time.
    while (pwait) ;

    // nloops is a long, so the index is a long as well: an int index
    // would overflow for loop counts above INT_MAX.
    for (long i = 0; i < nloops; ++i) {
        char* const s = fac.get ();
        n += strlen (s);
        fac.discard (s);
    }

    return reinterpret_cast< void* > (n);
}

} // extern "C"
// Benchmark driver.
//
// Usage: prog [nthreads [nloops]]
// With one argument only the thread count is overridden; with two, both
// the thread count and the loop count are. Any other argument count
// leaves the defaults in place.
int
main (int argc, char** argv)
{
    if (argc == 3)
        nloops = atol (argv [2]);

    if (argc == 2 || argc == 3)
        nthreads = atol (argv [1]);

    pthread_t tid [MAX_THREADS] = { 0 };

    // The thread table has a fixed size; clamp the request to fit it.
    if (MAX_THREADS < nthreads)
        nthreads = MAX_THREADS;

    // No trailing newline is printed here.
    printf ("%ld, %ld", nthreads, nloops);

    pthread_setconcurrency (nthreads);

    facet fac ("\3\3");

    // Start all workers; they spin on pwait until released below.
    for (int started = 0; started < nthreads; ++started) {
        const int rc = pthread_create (&tid [started], 0, f, &fac);
        if (rc != 0)
            exit (-1);
    }

    // Give every thread time to reach the spin loop, then let them go.
    sleep (1);
    pwait = false;

    for (int k = 0; k < nthreads; ++k) {
        if (!tid [k])
            continue;
        pthread_join (tid [k], 0);
    }

    return 0;
}
#include <iostream>
#include <locale>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <pthread.h>
#include <unistd.h>
#define MAX_THREADS 128
static long nloops = 100000000, nthreads = 16;
static bool volatile pwait = true;
////////////////////////////////////////////////////////////////////////
//
// Stripped-down stand-in for a ref-counted string body.
//
// Compile-time switches used inside this struct:
//   NO_LOCK          get() skips the ref()/unref() pair.
//   NO_VIRTUAL_CALL  get() reads the value member directly instead of
//                    going through the virtual do_get().
//
struct S
{
    // Stores the pointer s without copying the characters; the
    // reference counter starts at zero.
    S (char const* s) : refs (), value (s) { }

    // Atomically increments the counter and returns the helper's result.
    unsigned long ref () {
        return __rw::__rw_atomic_preincrement (refs, false);
    }

    // Atomically decrements the counter and returns the helper's result.
    unsigned long unref () {
        return __rw::__rw_atomic_predecrement (refs, false);
    }

    // Returns the stored pointer, bracketing the access with ref() and
    // unref() unless NO_LOCK is defined.
    //
    // The value is captured in a local so that unref() executes before
    // returning. Previously unref() was placed after the return
    // statement and was never reached, leaving the counter to grow by
    // one on every call.
    char const* get () {

#if !defined (NO_LOCK)
        ref ();
#endif // NO_LOCK

#if !defined (NO_VIRTUAL_CALL)
        char const* const result = do_get ();
#else
        char const* const result = value;
#endif // NO_VIRTUAL_CALL

#if !defined (NO_LOCK)
        unref ();
#endif // NO_LOCK

        return result;
    }

    // Virtual accessor behind get(); defined out of line.
    virtual char const* do_get () const;

    unsigned long refs;    // counter updated by ref()/unref()
    char const*   value;   // not owned; set once by the ctor
};
/* virtual */ char const*
S::do_get () const
{
return this->value;
}
extern "C" {

// Thread entry point. pv points to the S object shared by all threads.
// Each of the nloops rounds fetches the string through S::get(), adds
// its length to the running total, and then adds the value of every
// character. The total is returned converted to a void*.
// NOTE(review): the return value presumably exists only to keep the
// loop's result observable; main() does not read it.
static void*
f (void* pv)
{
    S& s = *static_cast< S* > (pv);

    unsigned long n = 0;

    char const* p = 0;

    // Busy-wait until main() clears the start flag so that all threads
    // begin the measured loop at roughly the same time.
    while (pwait) ;

    // nloops is a long, so the index is a long as well: an int index
    // would overflow for loop counts above INT_MAX.
    for (long i = 0; i < nloops; ++i) {
        p = s.get ();
        n += strlen (p);

        // Walk the string up to its terminating NUL.
        for (; p [0]; ++p)
            n += p [0];
    }

    return reinterpret_cast< void* > (n);
}

} // extern "C"
// Benchmark driver.
//
// Usage: prog [nthreads [nloops]]
// With one argument only the thread count is overridden; with two, both
// the thread count and the loop count are. Any other argument count
// leaves the defaults in place.
int
main (int argc, char** argv)
{
    if (argc == 3)
        nloops = atol (argv [2]);

    if (argc == 2 || argc == 3)
        nthreads = atol (argv [1]);

    pthread_t tid [MAX_THREADS] = { 0 };

    // The thread table has a fixed size; clamp the request to fit it.
    if (MAX_THREADS < nthreads)
        nthreads = MAX_THREADS;

    // No trailing newline is printed here.
    printf ("%ld, %ld", nthreads, nloops);

    pthread_setconcurrency (nthreads);

    S s ("01234567890123456789");

    // Start all workers; they spin on pwait until released below.
    for (int started = 0; started < nthreads; ++started) {
        const int rc = pthread_create (&tid [started], 0, f, &s);
        if (rc != 0)
            exit (-1);
    }

    // Give every thread time to reach the spin loop, then let them go.
    sleep (1);
    pwait = false;

    for (int k = 0; k < nthreads; ++k) {
        if (!tid [k])
            continue;
        pthread_join (tid [k], 0);
    }

    return 0;
}