Hello Yuichiro, thanks for your work!

> On 2021/06/16 16:34, Yuichiro NAITO wrote:
> > When I compile a multi-threaded application with the '-pg' option, I always
> > get no results in gmon.out.  With bad luck, my application gets SIGSEGV
> > while running, because the buffer that holds the caller/callee counts is
> > the only one in the process and gets corrupted by multi-threaded access.
> > 
> > I got the idea for solving this problem from NetBSD: it keeps an individual
> > buffer for each thread and merges them at the end of profiling.

Note that the kernel uses a similar approach but doesn't merge the buffers;
instead it generates a file for each CPU.
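
For reference, the kernel side of _MCOUNT_DECL() just picks the buffer
hanging off the current CPU (the check is visible as context in your
mcount.c hunk below), and the dump step later writes one output file per
CPU instead of merging:

        /* kernel: one gmonparam per CPU, no locking and no merge step */
        if ((p = curcpu()->ci_gmon) == NULL)
                return;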

> > 
> > NetBSD stores the reference to the individual buffer with
> > pthread_setspecific(3).  I think this causes infinite recursion if the
> > whole libc library (except mcount.c) is compiled with -pg.
> > 
> > The compiler generates a '_mcount' call at the beginning of every
> > function.  If '_mcount' calls pthread_getspecific(3) to get the individual
> > buffer, pthread_getspecific() calls '_mcount' again and causes infinite
> > recursion.
> > 
> > NetBSD prevents the infinite recursion by checking a global variable, but
> > this approach also prevents simultaneous calls of '_mcount' in a
> > multi-threaded application, which makes the profiling results slightly
> > wrong.
> > 
> > So I added a pointer to the buffer in `struct pthread` that is accessible
> > via a macro, the same way as pthread_self(3).  This approach prevents the
> > infinite recursion in '_mcount'.

Not calling a libc function for this makes sense.  However, I'm not
convinced that accessing `tib_thread' before _rthread_init() has been
called is safe.

I'm not sure what the cleanest place for the per-thread buffer is;
I'd suggest you ask guenther@ about this.

Maybe the initialization can be done outside of _mcount()?
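
For instance the per-thread buffer could be allocated from the thread
start-up path (untested sketch, the function name below is made up), so
that _mcount() only ever reads the pointer:

        static void
        _gmon_thread_init(pthread_t thread)
        {
                /* block recursion while the buffer is being allocated */
                thread->gmonparam = &_gmondummy;
                thread->gmonparam = _gmon_alloc();      /* may stay NULL */
        }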

> > I took the merging function from NetBSD; it is called from the '_mcleanup'
> > function.  The merging function needs to walk through all the individual
> > buffers, so I added an SLIST_ENTRY member to 'struct gmonparam' to build a
> > list of the buffers.  I also added '#ifndef _KERNEL' so that the
> > SLIST_ENTRY member is not used in the kernel.
> > 
> > But I still use pthread_getspecific(3) because it can call a destructor
> > when a thread is terminated.  The individual buffer is then moved to a
> > free list to be reused by a new thread.
> 
> Here is a patch for this approach.

I have various comments:

We choose not to use C++-style comments (// comment) in the tree; could you
fix yours?

There are two copies of mcount.c; the other one lives in sys/lib/libkern.
Could you keep them in sync?

gmon.c is only compiled in userland and doesn't need #ifndef _KERNEL.

In libc we also use the _MUTEX_LOCK()/_MUTEX_UNLOCK() macros instead of
calling pthread_mutex_*() directly.  This might help reduce the differences
between the single-threaded and multi-threaded paths.
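
Something like the following (untested; the mutex variable name is made up,
and the macros are no-ops until __isthreaded is set, so the single-threaded
path doesn't pay for the lock):

        static void *_gmonmutex;        /* made-up name */

        struct gmonparam *p;

        _MUTEX_LOCK(&_gmonmutex);
        p = SLIST_FIRST(&_gmonfree);
        if (p != NULL) {
                SLIST_REMOVE_HEAD(&_gmonfree, next);
                SLIST_INSERT_HEAD(&_gmoninuse, p, next);
        }
        _MUTEX_UNLOCK(&_gmonmutex);
        /* mmap() fallback as in your patch, outside the lock */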

> diff --git a/lib/libc/gmon/gmon.c b/lib/libc/gmon/gmon.c
> index 1ce0a1c289e..6887f4f5987 100644
> --- a/lib/libc/gmon/gmon.c
> +++ b/lib/libc/gmon/gmon.c
> @@ -42,6 +42,16 @@
>  
>  struct gmonparam _gmonparam = { GMON_PROF_OFF };
>  
> +#ifndef _KERNEL
> +#include <pthread.h>
> +
> +SLIST_HEAD(, gmonparam) _gmonfree = SLIST_HEAD_INITIALIZER(_gmonfree);
> +SLIST_HEAD(, gmonparam) _gmoninuse = SLIST_HEAD_INITIALIZER(_gmoninuse);
> +pthread_mutex_t _gmonlock = PTHREAD_MUTEX_INITIALIZER;
> +pthread_key_t _gmonkey;
> +struct gmonparam _gmondummy;
> +#endif
> +
>  static int   s_scale;
>  /* see profil(2) where this is describe (incorrectly) */
>  #define              SCALE_1_TO_1    0x10000L
> @@ -52,6 +62,13 @@ PROTO_NORMAL(moncontrol);
>  PROTO_DEPRECATED(monstartup);
>  static int hertz(void);
>  
> +#ifndef _KERNEL
> +static void _gmon_destructor(void *);
> +struct gmonparam *_gmon_alloc(void);
> +static void _gmon_merge(void);
> +static void _gmon_merge_two(struct gmonparam *, struct gmonparam *);
> +#endif
> +
>  void
>  monstartup(u_long lowpc, u_long highpc)
>  {
> @@ -114,6 +131,11 @@ monstartup(u_long lowpc, u_long highpc)
>       } else
>               s_scale = SCALE_1_TO_1;
>  
> +#ifndef _KERNEL
> +     _gmondummy.state = GMON_PROF_BUSY;
> +     pthread_key_create(&_gmonkey, _gmon_destructor);
> +#endif
> +
>       moncontrol(1);
>       return;
>  
> @@ -134,6 +156,194 @@ mapfailed:
>  }
>  __strong_alias(_monstartup,monstartup);
>  
> +#ifndef _KERNEL
> +static void
> +_gmon_destructor(void *arg)
> +{
> +     struct gmonparam *p = arg, *q, **prev;
> +
> +     if (p == &_gmondummy)
> +             return;
> +
> +     pthread_setspecific(_gmonkey, &_gmondummy);
> +
> +     pthread_mutex_lock(&_gmonlock);
> +     SLIST_REMOVE(&_gmoninuse, p, gmonparam, next);
> +     SLIST_INSERT_HEAD(&_gmonfree, p, next);
> +     pthread_mutex_unlock(&_gmonlock);
> +
> +     pthread_setspecific(_gmonkey, NULL);
> +}
> +
> +struct gmonparam *
> +_gmon_alloc(void)
> +{
> +     void *addr;
> +     struct gmonparam *p;
> +
> +     pthread_mutex_lock(&_gmonlock);
> +     p = SLIST_FIRST(&_gmonfree);
> +     if (p != NULL) {
> +             SLIST_REMOVE_HEAD(&_gmonfree, next);
> +             SLIST_INSERT_HEAD(&_gmoninuse, p ,next);
> +     } else {
> +             pthread_mutex_unlock(&_gmonlock);
> +             p = mmap(NULL, sizeof (struct gmonparam),
> +                      PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, -1, 0);
> +             if (p == MAP_FAILED)
> +                     goto mapfailed_2;
> +             *p = _gmonparam;
> +             p->kcount = NULL;
> +             p->kcountsize = 0;
> +             p->froms = NULL;
> +             p->tos = NULL;
> +             addr = mmap(NULL, p->fromssize, PROT_READ|PROT_WRITE,
> +                         MAP_ANON|MAP_PRIVATE, -1, 0);
> +             if (addr == MAP_FAILED)
> +                     goto mapfailed;
> +             p->froms = addr;
> +
> +             addr = mmap(NULL, p->tossize, PROT_READ|PROT_WRITE,
> +                         MAP_ANON|MAP_PRIVATE, -1, 0);
> +             if (addr == MAP_FAILED)
> +                     goto mapfailed;
> +             p->tos = addr;
> +             pthread_mutex_lock(&_gmonlock);
> +             SLIST_INSERT_HEAD(&_gmoninuse, p ,next);
> +     }
> +     pthread_mutex_unlock(&_gmonlock);
> +     pthread_setspecific(_gmonkey, p);
> +
> +     return p;
> +
> +mapfailed:
> +     if (p->froms != NULL) {
> +             munmap(p->froms, p->fromssize);
> +             p->froms = NULL;
> +     }
> +     if (p->tos != NULL) {
> +             munmap(p->tos, p->tossize);
> +             p->tos = NULL;
> +     }
> +mapfailed_2:
> +     pthread_setspecific(_gmonkey, NULL);
> +     ERR("_gmon_alloc: out of memory\n");
> +     return NULL;
> +}
> +
> +static void
> +_gmon_merge_two(struct gmonparam *p, struct gmonparam *q)
> +{
> +     u_long fromindex;
> +     u_short *frompcindex, qtoindex, toindex;
> +     u_long selfpc;
> +     u_long endfrom;
> +     long count;
> +     struct tostruct *top;
> +
> +     endfrom = (q->fromssize / sizeof(*q->froms));
> +     for (fromindex = 0; fromindex < endfrom; fromindex++) {
> +             if (q->froms[fromindex] == 0)
> +                     continue;
> +             for (qtoindex = q->froms[fromindex]; qtoindex != 0;
> +                  qtoindex = q->tos[qtoindex].link) {
> +                     selfpc = q->tos[qtoindex].selfpc;
> +                     count = q->tos[qtoindex].count;
> +                     /* cribbed from mcount */
> +                     frompcindex = &p->froms[fromindex];
> +                     toindex = *frompcindex;
> +                     if (toindex == 0) {
> +                             /*
> +                              *      first time traversing this arc
> +                              */
> +                             toindex = ++p->tos[0].link;
> +                             if (toindex >= p->tolimit)
> +                                     /* halt further profiling */
> +                                     goto overflow;
> +
> +                             *frompcindex = (u_short)toindex;
> +                             top = &p->tos[(size_t)toindex];
> +                             top->selfpc = selfpc;
> +                             top->count = count;
> +                             top->link = 0;
> +                             goto done;
> +                     }
> +                     top = &p->tos[(size_t)toindex];
> +                     if (top->selfpc == selfpc) {
> +                             /*
> +                              * arc at front of chain; usual case.
> +                              */
> +                             top->count+= count;
> +                             goto done;
> +                     }
> +                     /*
> +                      * have to go looking down chain for it.
> +                      * top points to what we are looking at,
> +                      * we know it is not at the head of the chain.
> +                      */
> +                     for (; /* goto done */; ) {
> +                             if (top->link == 0) {
> +                                     /*
> +                                      * top is end of the chain and
> +                                      * none of the chain had
> +                                      * top->selfpc == selfpc.  so
> +                                      * we allocate a new tostruct
> +                                      * and link it to the head of
> +                                      * the chain.
> +                                      */
> +                                     toindex = ++p->tos[0].link;
> +                                     if (toindex >= p->tolimit)
> +                                             goto overflow;
> +                                     top = &p->tos[(size_t)toindex];
> +                                     top->selfpc = selfpc;
> +                                     top->count = count;
> +                                     top->link = *frompcindex;
> +                                     *frompcindex = (u_short)toindex;
> +                                     goto done;
> +                             }
> +                             /*
> +                              * otherwise, check the next arc on the chain.
> +                              */
> +                             top = &p->tos[top->link];
> +                             if (top->selfpc == selfpc) {
> +                                     /*
> +                                      * there it is.
> +                                      * add to its count.
> +                                      */
> +                                     top->count += count;
> +                                     goto done;
> +                             }
> +
> +                     }
> +
> +             done: ;
> +             }
> +
> +     }
> +overflow: ;
> +
> +}
> +
> +static void
> +_gmon_merge(void)
> +{
> +     struct gmonparam *q;
> +
> +     pthread_mutex_lock(&_gmonlock);
> +
> +     SLIST_FOREACH(q, &_gmonfree, next)
> +             _gmon_merge_two(&_gmonparam, q);
> +
> +     SLIST_FOREACH(q, &_gmoninuse, next) {
> +             q->state = GMON_PROF_OFF;
> +             _gmon_merge_two(&_gmonparam, q);
> +     }
> +
> +     pthread_mutex_unlock(&_gmonlock);
> +}
> +#endif
> +
> +
>  void
>  _mcleanup(void)
>  {
> @@ -233,6 +443,9 @@ _mcleanup(void)
>       snprintf(dbuf, sizeof dbuf, "[mcleanup1] kcount 0x%x ssiz %d\n",
>           p->kcount, p->kcountsize);
>       write(log, dbuf, strlen(dbuf));
> +#endif
> +#ifndef _KERNEL
> +     _gmon_merge();
>  #endif
>       hdr = (struct gmonhdr *)&gmonhdr;
>       bzero(hdr, sizeof(*hdr));
> diff --git a/lib/libc/gmon/mcount.c b/lib/libc/gmon/mcount.c
> index f0ce70dd6ae..2394e539337 100644
> --- a/lib/libc/gmon/mcount.c
> +++ b/lib/libc/gmon/mcount.c
> @@ -31,6 +31,15 @@
>  #include <sys/types.h>
>  #include <sys/gmon.h>
>  
> +#ifndef _KERNEL
> +#include <stdio.h>           // for the use of '__isthreaded'.
> +#include <pthread.h>
> +#include <thread_private.h>
> +#include <tib.h>
> +extern struct gmonparam _gmondummy;
> +struct gmonparam *_gmon_alloc(void);
> +#endif
> +
>  /*
>   * mcount is called on entry to each function compiled with the profiling
>   * switch set.  _mcount(), which is declared in a machine-dependent way
> @@ -65,7 +74,22 @@ _MCOUNT_DECL(u_long frompc, u_long selfpc)
>       if ((p = curcpu()->ci_gmon) == NULL)
>               return;
>  #else
> -     p = &_gmonparam;
> +     if (__isthreaded) {
> +             if (_gmonparam.state != GMON_PROF_ON)
> +                     return;
> +             pthread_t t = TIB_GET()->tib_thread;
> +             p = t->gmonparam;
> +             if (p == &_gmondummy)
> +                     return;
> +             if (p == NULL) {
> +                     // prevent recursive call of _gmon_alloc().
> +                     t->gmonparam = &_gmondummy;
> +                     if ((t->gmonparam = _gmon_alloc()) == NULL)
> +                             return;
> +                     p = t->gmonparam;
> +             }
> +     } else
> +             p = &_gmonparam;
>  #endif
>       /*
>        * check that we are profiling
> diff --git a/lib/libc/include/thread_private.h 
> b/lib/libc/include/thread_private.h
> index 237c3fbd034..d6c89c8334b 100644
> --- a/lib/libc/include/thread_private.h
> +++ b/lib/libc/include/thread_private.h
> @@ -5,6 +5,8 @@
>  #ifndef _THREAD_PRIVATE_H_
>  #define _THREAD_PRIVATE_H_
>  
> +#include <sys/types.h>
> +#include <sys/gmon.h>
>  #include <stdio.h>           /* for FILE and __isthreaded */
>  
>  #define _MALLOC_MUTEXES 32
> @@ -389,6 +391,7 @@ struct pthread {
>  
>       /* cancel received in a delayed cancel block? */
>       int delayed_cancel;
> +     struct gmonparam *gmonparam;
>  };
>  /* flags in pthread->flags */
>  #define      THREAD_DONE             0x001
> diff --git a/sys/sys/gmon.h b/sys/sys/gmon.h
> index 1a63cb52b1b..cce98752346 100644
> --- a/sys/sys/gmon.h
> +++ b/sys/sys/gmon.h
> @@ -35,6 +35,7 @@
>  #ifndef _SYS_GMON_H_
>  #define _SYS_GMON_H_
>  
> +#include <sys/queue.h>
>  #include <machine/profile.h>
>  
>  /*
> @@ -136,6 +137,7 @@ struct gmonparam {
>       u_long          highpc;
>       u_long          textsize;
>       u_long          hashfraction;
> +     SLIST_ENTRY(gmonparam)  next;
>  };
>  
>  /*
