Hi, Martin

n 2021/07/10 16:55, Martin Pieuchot wrote:
> Hello Yuichiro, thanks for your work !

Thanks for the response.

>> On 2021/06/16 16:34, Yuichiro NAITO wrote:
>>> When I compile a multi-threaded application with '-pg' option, I always get 
>>> no
>>> results in gmon.out. With bad luck, my application gets SIGSEGV while 
>>> running.
>>> Because the buffer that holds number of caller/callee count is the only one
>>> in the process and will be broken by multi-threaded access.
>>>
>>> I get the idea to solve this problem from NetBSD. NetBSD has individual 
>>> buffers
>>> for each thread and merges them at the end of profiling.
> 
> Note that the kernel use a similar approach but doesn't merge the buffer,
> instead it generates a file for each CPU.

Yes, so the number of output files are limited by the number of CPUs in case of 
the kernel profiling. I think number of application threads can be increased 
more
casually. I don't want to see dozens of 'gmon.out' files.

>>> NetBSD stores the reference to the individual buffer by 
>>> pthread_setspecific(3).
>>> I think it causes infinite recursive call if whole libc library (except
>>> mcount.c) is compiled with -pg.
>>>
>>> The compiler generates '_mcount' function call at the beginning of every
>>> functions. If '_mcount' calls pthread_getspecific(3) to get the individual
>>> buffer, pthread_getspecific() calls '_mcount' again and causes infinite
>>> recursion.
>>>
>>> NetBSD prevents from infinite recursive call by checking a global variable. 
>>> But
>>> this approach also prevents simultaneously call of '_mcount' on a 
>>> multi-threaded
>>> application. It makes a little bit wrong results of profiling.
>>>
>>> So I added a pointer to the buffer in `struct pthread` that can be 
>>> accessible
>>> via macro call as same as pthread_self(3). This approach can prevent of
>>> infinite recursive call of '_mcount'.
> 
> Not calling a libc function for this makes sense.  However I'm not
> convinced that accessing `tib_thread' before _rthread_init() has been
> called is safe.

Before '_rthread_init’ is called, '__isthreaded' global variable is kept to be 
0.
My patch doesn't access tib_thread in this case.
After calling `_rthread_init`, `pthread_create()` changes `__isthreaded` to 1.
Tib_thread will be accessed by all threads that are newly created and the 
initial one.

I believe tib of the initial thread has been initialized in `_libc_preinit' 
function
in 'lib/libc/dlfcn/init.c'.

> I'm not sure where is the cleanest way to place the per-thread buffer,
> I'd suggest you ask guenther@ about this.

I added guenther@ in CC of this mail.
I hope I can get an advise about per-thread buffer.

> Maybe the initialization can be done outside of _mcount()?

Yes, I think tib is initialized in `pthread_create()` and `_libc_preinit()`.

>>> I obtained merging function from NetBSD that is called in '_mcleanup' 
>>> function.
>>> Merging function needs to walk through all the individual buffers,
>>> I added SLIST_ENTRY member in 'struct gmonparam' to make a list of the 
>>> buffers.
>>> And also added '#ifndef _KERNEL' for the SLIST_ENTRY member not to be used 
>>> for
>>> the kernel.
>>>
>>> But I still use pthread_getspecific(3) for that can call destructor when
>>> a thread is terminated. The individual buffer is moved to free list to reuse
>>> for a new thread.
>>
>> Here is a patch for this approach.
> 
> I have various comments:
> 
> We choose not to use C++ style comment (// comment) in the tree, could you
> fix yours?
> 
> There's two copies of mcount.c, the other one lies in sys/lib/libkern could
> you keep them in sync?
> 
> gmon.c is only compiled in userland and don't need #ifndef _KERNEL
> 
> In libc there's also the use of _MUTEX_LOCK/UNLOCK() macro instead of
> calling pthread_mutex* directly.  This might help reduce the differences
> between ST and MT paths.

Thanks for letting me know them.
I updated my patch as following according to the advice.

diff --git a/lib/libc/gmon/gmon.c b/lib/libc/gmon/gmon.c
index 1ce0a1c289e..5169fa70449 100644
--- a/lib/libc/gmon/gmon.c
+++ b/lib/libc/gmon/gmon.c
@@ -42,6 +42,15 @@
 
 struct gmonparam _gmonparam = { GMON_PROF_OFF };
 
+#include <pthread.h>
+#include <thread_private.h>
+
+SLIST_HEAD(, gmonparam) _gmonfree = SLIST_HEAD_INITIALIZER(_gmonfree);
+SLIST_HEAD(, gmonparam) _gmoninuse = SLIST_HEAD_INITIALIZER(_gmoninuse);
+void* _gmonlock = NULL;
+pthread_key_t _gmonkey;
+struct gmonparam _gmondummy;
+
 static int     s_scale;
 /* see profil(2) where this is describe (incorrectly) */
 #define                SCALE_1_TO_1    0x10000L
@@ -52,6 +61,11 @@ PROTO_NORMAL(moncontrol);
 PROTO_DEPRECATED(monstartup);
 static int hertz(void);
 
+static void _gmon_destructor(void *);
+struct gmonparam *_gmon_alloc(void);
+static void _gmon_merge(void);
+static void _gmon_merge_two(struct gmonparam *, struct gmonparam *);
+
 void
 monstartup(u_long lowpc, u_long highpc)
 {
@@ -114,6 +128,9 @@ monstartup(u_long lowpc, u_long highpc)
        } else
                s_scale = SCALE_1_TO_1;
 
+       _gmondummy.state = GMON_PROF_BUSY;
+       pthread_key_create(&_gmonkey, _gmon_destructor);
+
        moncontrol(1);
        return;
 
@@ -134,6 +151,192 @@ mapfailed:
 }
 __strong_alias(_monstartup,monstartup);
 
+static void
+_gmon_destructor(void *arg)
+{
+       struct gmonparam *p = arg, *q, **prev;
+
+       if (p == &_gmondummy)
+               return;
+
+       pthread_setspecific(_gmonkey, &_gmondummy);
+
+       _MUTEX_LOCK(&_gmonlock);
+       SLIST_REMOVE(&_gmoninuse, p, gmonparam, next);
+       SLIST_INSERT_HEAD(&_gmonfree, p, next);
+       _MUTEX_UNLOCK(&_gmonlock);
+
+       pthread_setspecific(_gmonkey, NULL);
+}
+
+struct gmonparam *
+_gmon_alloc(void)
+{
+       void *addr;
+       struct gmonparam *p;
+
+       _MUTEX_LOCK(&_gmonlock);
+       p = SLIST_FIRST(&_gmonfree);
+       if (p != NULL) {
+               SLIST_REMOVE_HEAD(&_gmonfree, next);
+               SLIST_INSERT_HEAD(&_gmoninuse, p ,next);
+       } else {
+               _MUTEX_UNLOCK(&_gmonlock);
+               p = mmap(NULL, sizeof (struct gmonparam),
+                        PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, -1, 0);
+               if (p == MAP_FAILED)
+                       goto mapfailed_2;
+               *p = _gmonparam;
+               p->kcount = NULL;
+               p->kcountsize = 0;
+               p->froms = NULL;
+               p->tos = NULL;
+               addr = mmap(NULL, p->fromssize, PROT_READ|PROT_WRITE,
+                           MAP_ANON|MAP_PRIVATE, -1, 0);
+               if (addr == MAP_FAILED)
+                       goto mapfailed;
+               p->froms = addr;
+
+               addr = mmap(NULL, p->tossize, PROT_READ|PROT_WRITE,
+                           MAP_ANON|MAP_PRIVATE, -1, 0);
+               if (addr == MAP_FAILED)
+                       goto mapfailed;
+               p->tos = addr;
+               _MUTEX_LOCK(&_gmonlock);
+               SLIST_INSERT_HEAD(&_gmoninuse, p ,next);
+       }
+       _MUTEX_UNLOCK(&_gmonlock);
+       pthread_setspecific(_gmonkey, p);
+
+       return p;
+
+mapfailed:
+       if (p->froms != NULL) {
+               munmap(p->froms, p->fromssize);
+               p->froms = NULL;
+       }
+       if (p->tos != NULL) {
+               munmap(p->tos, p->tossize);
+               p->tos = NULL;
+       }
+mapfailed_2:
+       pthread_setspecific(_gmonkey, NULL);
+       ERR("_gmon_alloc: out of memory\n");
+       return NULL;
+}
+
+static void
+_gmon_merge_two(struct gmonparam *p, struct gmonparam *q)
+{
+       u_long fromindex;
+       u_short *frompcindex, qtoindex, toindex;
+       u_long selfpc;
+       u_long endfrom;
+       long count;
+       struct tostruct *top;
+
+       endfrom = (q->fromssize / sizeof(*q->froms));
+       for (fromindex = 0; fromindex < endfrom; fromindex++) {
+               if (q->froms[fromindex] == 0)
+                       continue;
+               for (qtoindex = q->froms[fromindex]; qtoindex != 0;
+                    qtoindex = q->tos[qtoindex].link) {
+                       selfpc = q->tos[qtoindex].selfpc;
+                       count = q->tos[qtoindex].count;
+                       /* cribbed from mcount */
+                       frompcindex = &p->froms[fromindex];
+                       toindex = *frompcindex;
+                       if (toindex == 0) {
+                               /*
+                                *      first time traversing this arc
+                                */
+                               toindex = ++p->tos[0].link;
+                               if (toindex >= p->tolimit)
+                                       /* halt further profiling */
+                                       goto overflow;
+
+                               *frompcindex = (u_short)toindex;
+                               top = &p->tos[(size_t)toindex];
+                               top->selfpc = selfpc;
+                               top->count = count;
+                               top->link = 0;
+                               goto done;
+                       }
+                       top = &p->tos[(size_t)toindex];
+                       if (top->selfpc == selfpc) {
+                               /*
+                                * arc at front of chain; usual case.
+                                */
+                               top->count+= count;
+                               goto done;
+                       }
+                       /*
+                        * have to go looking down chain for it.
+                        * top points to what we are looking at,
+                        * we know it is not at the head of the chain.
+                        */
+                       for (; /* goto done */; ) {
+                               if (top->link == 0) {
+                                       /*
+                                        * top is end of the chain and
+                                        * none of the chain had
+                                        * top->selfpc == selfpc.  so
+                                        * we allocate a new tostruct
+                                        * and link it to the head of
+                                        * the chain.
+                                        */
+                                       toindex = ++p->tos[0].link;
+                                       if (toindex >= p->tolimit)
+                                               goto overflow;
+                                       top = &p->tos[(size_t)toindex];
+                                       top->selfpc = selfpc;
+                                       top->count = count;
+                                       top->link = *frompcindex;
+                                       *frompcindex = (u_short)toindex;
+                                       goto done;
+                               }
+                               /*
+                                * otherwise, check the next arc on the chain.
+                                */
+                               top = &p->tos[top->link];
+                               if (top->selfpc == selfpc) {
+                                       /*
+                                        * there it is.
+                                        * add to its count.
+                                        */
+                                       top->count += count;
+                                       goto done;
+                               }
+
+                       }
+
+               done: ;
+               }
+
+       }
+overflow: ;
+
+}
+
+static void
+_gmon_merge(void)
+{
+       struct gmonparam *q;
+
+       _MUTEX_LOCK(&_gmonlock);
+
+       SLIST_FOREACH(q, &_gmonfree, next)
+               _gmon_merge_two(&_gmonparam, q);
+
+       SLIST_FOREACH(q, &_gmoninuse, next) {
+               q->state = GMON_PROF_OFF;
+               _gmon_merge_two(&_gmonparam, q);
+       }
+
+       _MUTEX_UNLOCK(&_gmonlock);
+}
+
+
 void
 _mcleanup(void)
 {
@@ -234,6 +437,9 @@ _mcleanup(void)
            p->kcount, p->kcountsize);
        write(log, dbuf, strlen(dbuf));
 #endif
+
+       _gmon_merge();
+
        hdr = (struct gmonhdr *)&gmonhdr;
        bzero(hdr, sizeof(*hdr));
        hdr->lpc = p->lowpc;
diff --git a/lib/libc/gmon/mcount.c b/lib/libc/gmon/mcount.c
index f0ce70dd6ae..21ca16ab9e7 100644
--- a/lib/libc/gmon/mcount.c
+++ b/lib/libc/gmon/mcount.c
@@ -31,6 +31,15 @@
 #include <sys/types.h>
 #include <sys/gmon.h>
 
+#ifndef _KERNEL
+#include <stdio.h>             /* for the use of '__isthreaded'. */
+#include <pthread.h>
+#include <thread_private.h>
+#include <tib.h>
+extern struct gmonparam _gmondummy;
+struct gmonparam *_gmon_alloc(void);
+#endif
+
 /*
  * mcount is called on entry to each function compiled with the profiling
  * switch set.  _mcount(), which is declared in a machine-dependent way
@@ -65,7 +74,22 @@ _MCOUNT_DECL(u_long frompc, u_long selfpc)
        if ((p = curcpu()->ci_gmon) == NULL)
                return;
 #else
-       p = &_gmonparam;
+       if (__isthreaded) {
+               if (_gmonparam.state != GMON_PROF_ON)
+                       return;
+               pthread_t t = TIB_GET()->tib_thread;
+               p = t->gmonparam;
+               if (p == &_gmondummy)
+                       return;
+               if (p == NULL) {
+                       /* prevent recursive call of _gmon_alloc(). */
+                       t->gmonparam = &_gmondummy;
+                       if ((t->gmonparam = _gmon_alloc()) == NULL)
+                               return;
+                       p = t->gmonparam;
+               }
+       } else
+               p = &_gmonparam;
 #endif
        /*
         * check that we are profiling
diff --git a/lib/libc/include/thread_private.h 
b/lib/libc/include/thread_private.h
index 237c3fbd034..d6c89c8334b 100644
--- a/lib/libc/include/thread_private.h
+++ b/lib/libc/include/thread_private.h
@@ -5,6 +5,8 @@
 #ifndef _THREAD_PRIVATE_H_
 #define _THREAD_PRIVATE_H_
 
+#include <sys/types.h>
+#include <sys/gmon.h>
 #include <stdio.h>             /* for FILE and __isthreaded */
 
 #define _MALLOC_MUTEXES 32
@@ -389,6 +391,7 @@ struct pthread {
 
        /* cancel received in a delayed cancel block? */
        int delayed_cancel;
+       struct gmonparam *gmonparam;
 };
 /* flags in pthread->flags */
 #define        THREAD_DONE             0x001
diff --git a/sys/lib/libkern/mcount.c b/sys/lib/libkern/mcount.c
index 684ddadb9aa..e30c9554ff5 100644
--- a/sys/lib/libkern/mcount.c
+++ b/sys/lib/libkern/mcount.c
@@ -33,6 +33,15 @@
 #include <sys/param.h>
 #include <sys/gmon.h>
 
+#ifndef _KERNEL
+#include <stdio.h>             /* for the use of '__isthreaded'. */
+#include <pthread.h>
+#include <thread_private.h>
+#include <tib.h>
+extern struct gmonparam _gmondummy;
+struct gmonparam *_gmon_alloc(void);
+#endif
+
 /*
  * mcount is called on entry to each function compiled with the profiling
  * switch set.  _mcount(), which is declared in a machine-dependent way
@@ -67,7 +76,22 @@ _MCOUNT_DECL(u_long frompc, u_long selfpc)
        if ((p = curcpu()->ci_gmon) == NULL)
                return;
 #else
-       p = &_gmonparam;
+       if (__isthreaded) {
+               if (_gmonparam.state != GMON_PROF_ON)
+                       return;
+               pthread_t t = TIB_GET()->tib_thread;
+               p = t->gmonparam;
+               if (p == &_gmondummy)
+                       return;
+               if (p == NULL) {
+                       /* prevent recursive call of _gmon_alloc(). */
+                       t->gmonparam = &_gmondummy;
+                       if ((t->gmonparam = _gmon_alloc()) == NULL)
+                               return;
+                       p = t->gmonparam;
+               }
+       } else
+               p = &_gmonparam;
 #endif
        /*
         * check that we are profiling
diff --git a/sys/sys/gmon.h b/sys/sys/gmon.h
index 1a63cb52b1b..cce98752346 100644
--- a/sys/sys/gmon.h
+++ b/sys/sys/gmon.h
@@ -35,6 +35,7 @@
 #ifndef _SYS_GMON_H_
 #define _SYS_GMON_H_
 
+#include <sys/queue.h>
 #include <machine/profile.h>
 
 /*
@@ -136,6 +137,7 @@ struct gmonparam {
        u_long          highpc;
        u_long          textsize;
        u_long          hashfraction;
+       SLIST_ENTRY(gmonparam)  next;
 };
 
 /*


-- 
Yuichiro NAITO ([email protected])

Reply via email to