Forwarding with the attachment.

-------- Original Message --------
Subject: Re: STDCXX-1071 numpunct facet defect
Date: Sun, 30 Sep 2012 12:09:10 -0600
From: Martin Sebor <>
To: Liviu Nicoara <>

On 09/27/2012 06:36 PM, Liviu Nicoara wrote:
On 9/27/12 8:27 PM, Martin Sebor wrote:
On 09/27/2012 06:41 AM, Liviu Nicoara wrote:
On 09/26/12 20:12, Liviu Nicoara wrote:
I have created STDCXX-1071 and linked to STDCXX-1056. [...]

I am open to all questions, the more the better. Most of my opinions
have been expressed earlier, but please ask if you want to know more.

I am attaching here the proposed (4.3.x) patch and the timings results
(after re-verifying the correctness of the timing program and the
results). The 4.2.x patch, the 4.3.x patch, the test program and the
results file are also attached to the incident.

The patch isn't binary compatible. We can't remove data members
in a minor release. We could only do it in a major release.

There are two of them, one for 4.2.x, one for 4.3.x.

I'm still curious about the performance, though. It doesn't make
sense to me that a call to a virtual function is faster than one
to an almost trivial inline function.

I scratched my head long on that. I wager that it depends on the
machine, too.

Here are my timings for library-reduction.cpp when compiled
GCC 4.5.3 on Solaris 10 (4 SPARCV9 CPUs). I had to make a small
number of trivial changes to get it to compile:

         With cache   No cache
real    1m38.332s     8m58.568s
user    6m30.244s    34m25.942s
sys     0m0.060s      0m3.922s

I also experimented with the program on Linux (CEL 4 with 16
CPUs). Initially, I saw no differences between the two versions.
So I modified it a bit to make it closer to the library (the
modified program is attached). With those changes the timings
are below:

         With cache   No cache
real    0m 1.107s    0m 5.669s
user    0m17.204s    0m 5.669s
sys     0m 0.000s    0m22.347s

I also recompiled and re-ran the test on Solaris. To speed
things along, I set the number threads and loops to 8 and
1000000. The numbers are as follows:

         With cache   No cache
real    0m3.341s     0m26.333s
user    0m13.052s    1m37.470s
sys     0m0.009s     0m0.132s

The numbers match my expectation. The overhead without the
"numpunct cache" is considerable.

Somewhat unexpectedly, the test with the cache didn't crash.

I also tried timing the numpunct patch attached to the issue
with the test program. The test program runs to completion
without the patch but it crashes with a SIGSEGV after the
patch is applied. That suggests there's a bug somewhere in
do_thousands_sep() (in addition to the caching).


Let me know if there is anything I could help with.


#include <iostream>
#include <locale>

#include <cstdio>
#include <cstdlib>
#include <cstring>

#include <pthread.h>
#include <unistd.h>

#define MAX_THREADS 128

static long nloops = 10000000, nthreads = 16;
static bool volatile pwait = true;


namespace X {

struct guard 
    guard (pthread_mutex_t* ptr) : lock_ (ptr) { 
        pthread_mutex_lock (lock_);

    ~guard () { 
        pthread_mutex_unlock (lock_); 


    guard (guard const&);
    guard& operator= (guard const&);


    pthread_mutex_t* lock_;

struct facet
    facet () {
        pthread_mutex_init (&_C_mutex, 0);

    void* _C_data () {
        return _C_impsize ? _C_impdata : _C_get_data ();

    void* _C_get_data ();

    size_t _C_impsize;
    void*  _C_impdata;

    pthread_mutex_t _C_mutex;

void* facet::_C_get_data ()
    if (_C_impsize)
        return _C_impdata;

    guard g (&_C_mutex);

    if (_C_impsize)
        return _C_impdata;

#if !defined (NO_USE_STDCXX_LOCALES)
    _C_impdata = (void*)"\3\3";
    _C_impsize = (size_t)(-1);

    return _C_impdata;

void* get_data (facet*);

struct numpunct : facet
    numpunct () : _C_flags () { }

    virtual std::string do_grouping () const;

    std::string grouping () const {
        numpunct* const __self = (numpunct*)this;

#if !defined (NO_USE_NUMPUNCT_CACHE)
        if (!(_C_flags & 1)) {
            __self->_C_flags  |= 1;
            __self->_C_grouping = do_grouping ();

        return _C_grouping;
        return do_grouping ();


    int          _C_flags;
    std::string  _C_grouping;

std::string numpunct::do_grouping () const
    return (const char*)get_data (const_cast<numpunct*>(this));

void* get_data (facet* pfacet)
    void* pdata = pfacet->_C_data ();

    if (pdata) 
        return pdata;

    // __rw_setlocale loc (...); <- other mutex

    if (pfacet->_C_data ())
        return get_data (pfacet);

    pfacet->_C_impdata = const_cast<char*>("\4\4");
    pfacet->_C_impsize = (size_t)(-1);

    return 0;

} // namespace X


typedef X::numpunct Numpunct;

extern "C" {

static void* 
f (void* pv)
    Numpunct const& fac = *reinterpret_cast< Numpunct* > (pv);

    unsigned long n = 0;

    while (pwait) ;
    for (int i = 0; i < nloops; ++i) {
        const std::string grouping = fac.grouping ();
        n += strlen (grouping.c_str ());

    return (void*)n;

} // extern "C"

main (int argc, char** argv)
    switch (argc) {
    case 3:
        nloops = atol (argv [2]);
    case 2:
        nthreads = atol (argv [1]);

    pthread_t tid [MAX_THREADS] = { 0 };

    if (nthreads > MAX_THREADS)
        nthreads = MAX_THREADS;

    printf ("%i, %i\n", nthreads, nloops);

    pthread_setconcurrency (nthreads);

    Numpunct fac;
    for (int i = 0; i < nthreads; ++i) {
        if (pthread_create (tid + i, 0, f, const_cast<Numpunct*> (&fac)))
            exit (-1);

    puts ("threads created...");

    sleep (1);
    pwait = false;

    for (int i = 0; i < nthreads; ++i) {
        if (tid [i])
            pthread_join (tid [i], 0);

    return 0;

Reply via email to