This sort of reminds me of another little proglet I have lying about
that might need a home..


---
/*
 * Library to hook into code compiled with -finstrument-functions it will
 * record function arcs (call_fn, this_fn) as well as the sum of whatever event
 * is being measured over that function.
 *
 * Copyright (C) 2011 Red Hat, Inc., Peter Zijlstra <pzijl...@redhat.com>
 *
 * This file is copyrighted under the GPLv2 License (and not any later version).
 *
 * SuperFastHash under LGPLv2.1 (http://www.azillionmonkeys.com/qed/hash.html)
 *
 * Compilation example:
 * gcc -shared -fPIC profviz.c -o profviz.so -lpthread -ldl -lelf
 *
 * Usage example:
 * LD_PRELOAD=./profviz.so your_program 
 */

#define _GNU_SOURCE

#include "util/util.h"
#include "perf.h"
#include "util/parse-events.h"

#include <sys/time.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <stdio.h>
#include <stdlib.h>
#include <dlfcn.h>
#include <pthread.h>
#include <stdint.h>
#include <unistd.h>
#include <fcntl.h>
#include <libelf.h>
#include <gelf.h>
#include <link.h>
#include <math.h>

#define barrier() asm volatile("" ::: "memory")

static u64 rdpmc(unsigned int counter)
{
        unsigned int low, high;

        asm volatile("rdpmc" : "=a" (low), "=d" (high) : "c" (counter));

        return low | ((u64)high) << 32;
}

static u64 rdtsc(void)
{
        unsigned int low, high;

        asm volatile("rdtsc" : "=a" (low), "=d" (high));

        return low | ((u64)high) << 32;
}

static u64 mmap_read_self(void *addr)
{
        struct perf_event_mmap_page *pc = addr;
        u32 seq, idx, time_mult = 0, time_shift = 0, width = 0;
        u64 count, cyc = 0, time_offset = 0, enabled, running, delta;
        s64 pmc = 0;

        do {
                seq = pc->lock;
                barrier();

                enabled = pc->time_enabled;
                running = pc->time_running;

                if (pc->cap_usr_time && enabled != running) {
                        cyc = rdtsc();
                        time_mult = pc->time_mult;
                        time_shift = pc->time_shift;
                        time_offset = pc->time_offset;
                }

                idx = pc->index;
                count = pc->offset;
                if (pc->cap_usr_rdpmc && idx) {
                        width = pc->pmc_width;
                        pmc = rdpmc(idx - 1);
                }

                barrier();
        } while (pc->lock != seq);

        if (idx) {
                pmc <<= 64 - width;
                pmc >>= 64 - width; /* shift right signed */
                count += pmc;
        }

        if (enabled != running) {
                u64 quot, rem;

                quot = (cyc >> time_shift);
                rem = cyc & ((1 << time_shift) - 1);
                delta = time_offset + quot * time_mult +
                        ((rem * time_mult) >> time_shift);

                enabled += delta;
                if (idx)
                        running += delta;

                quot = count / running;
                rem = count % running;
                count = quot * enabled + (rem * enabled) / running;
        }

        return count;
}

static int (*pthread_create_orig)(pthread_t *__restrict,
                __const pthread_attr_t *__restrict,
                void *(*)(void *),
                void *__restrict) = NULL;

static struct perf_event_attr perf_attr = {
        .type = PERF_TYPE_HARDWARE,
        .config = PERF_COUNT_HW_CPU_CYCLES,
        .exclude_kernel = 1,
};

struct prof_arc {
        void *call_fn;
        void *this_fn;
        uint64_t count;
};

struct prof_fn {
        void *this_fn;
        uint64_t count;
};

#define get16bits(d) ((((uint32_t)(((const uint8_t *)(d))[1])) << 8) \
                                       +(uint32_t)(((const uint8_t *)(d))[0]) )

static uint32_t SuperFastHash (const char * data, int len, uint32_t hash) {
        uint32_t tmp;
        int rem;

        if (len <= 0 || data == NULL) return 0;

        rem = len & 3;
        len >>= 2;

        /* Main loop */
        for (;len > 0; len--) {
                hash  += get16bits (data);
                tmp    = (get16bits (data+2) << 11) ^ hash;
                hash   = (hash << 16) ^ tmp;
                data  += 2*sizeof (uint16_t);
                hash  += hash >> 11;
        }

        /* Handle end cases */
        switch (rem) {
                case 3: hash += get16bits (data);
                        hash ^= hash << 16;
                        hash ^= data[sizeof (uint16_t)] << 18;
                        hash += hash >> 11;
                        break;
                case 2: hash += get16bits (data);
                        hash ^= hash << 11;
                        hash += hash >> 17;
                        break;
                case 1: hash += *data;
                        hash ^= hash << 10;
                        hash += hash >> 1;
        }

        /* Force "avalanching" of final 127 bits */
        hash ^= hash << 3;
        hash += hash >> 5;
        hash ^= hash << 4;
        hash += hash >> 17;
        hash ^= hash << 25;
        hash += hash >> 6;

        return hash;
}

void die(const char *err, ...)
{
        va_list params;

        va_start(params, err);
        vfprintf(stderr, err, params);
        va_end(params);

        exit(-1);
}


#define HASH_TABLE_SIZE         16384
#define AVG_STACK_DEPTH         8

#define ARC_HASH_SIZE   (HASH_TABLE_SIZE * AVG_STACK_DEPTH)

static struct prof_arc prof_arc_hash[ARC_HASH_SIZE];
static pthread_mutex_t prof_arc_lock;

#define FN_HASH_SIZE    (HASH_TABLE_SIZE)

static struct prof_fn  prof_fn_hash[FN_HASH_SIZE];
static pthread_mutex_t prof_fn_lock;

#define HASH_INIT               0x9e370001UL    
#define HASH_CHAIN              16

static inline void *
prof_hash_find(void *hash_base, const size_t hash_size, 
                const void *key, const size_t key_size,
                const size_t entry_size, pthread_mutex_t *lock)

{
        uint32_t hash = HASH_INIT;
        void *entry = NULL;
        int i, j;

        for (i = 0; i < HASH_CHAIN; i++) {
                hash = SuperFastHash(key, key_size, hash);
                entry = hash_base + entry_size * (hash % FN_HASH_SIZE);
                if (!memcmp(entry, key, key_size))
                        return entry;

                for (j = 0; j < key_size; j++) {
                        if (*((char *)entry + j))
                                goto next_1;
                }

                goto found_empty;
next_1:
                continue;
        }
        die("fn_hash too full");

found_empty:
        hash = HASH_INIT;
        pthread_mutex_lock(lock);
        for (i = 0; i < HASH_CHAIN; i++) {
                hash = SuperFastHash(key, key_size, hash);
                entry = hash_base + entry_size * (hash % FN_HASH_SIZE);

                if (!memcmp(entry, key, key_size))
                        goto unlock;

                for (j = 0; j < key_size; j++) {
                        if (*((char *)entry + j))
                                goto next_2;
                }

                memcpy(entry, key, key_size);
                goto unlock;
next_2:
                continue;
        }
        die("fn_hash too full (locked)");
unlock:
        pthread_mutex_unlock(lock);

        return entry;
}

static struct prof_fn *prof_fn_find(void *this_fn)
{
        return prof_hash_find(prof_fn_hash, FN_HASH_SIZE,
                        &this_fn, sizeof(this_fn), sizeof(struct prof_fn),
                        &prof_fn_lock);
}

static struct prof_arc *prof_arc_find(void *this_fn, void *call_fn)
{
        struct prof_arc arc = { .this_fn = this_fn, .call_fn = call_fn, };

        return prof_hash_find(prof_arc_hash, ARC_HASH_SIZE,
                        &arc, 2*sizeof(void *), sizeof(struct prof_arc),
                        &prof_arc_lock);
}

static unsigned long page_size;

struct prof_stack {
        struct prof_arc *arc;
        struct prof_fn *fn;
        uint64_t stamp;
};

static __thread struct prof_stack prof_stack[128];
static __thread int prof_stack_idx = 0;

static __thread void *perf_event;

static void thread_init(void)
{
        int fd;

        fd = sys_perf_event_open(&perf_attr, 0, -1, -1, 0);
        if (fd < 0)
                die("failed to create perf_event");

        perf_event = mmap(NULL, page_size, PROT_READ, MAP_SHARED, fd, 0);
        if (perf_event == (void *)(-1))
                die("failed to mmap perf_event");

        close(fd);

        prof_stack_idx = 0;
}

static u64 first_count;

void prof_init(void) __attribute__((constructor));
void prof_init(void)
{
        char *event_str;

        page_size = sysconf(_SC_PAGESIZE);

        pthread_create_orig = dlsym(RTLD_NEXT, "pthread_create");
        if (!pthread_create_orig) {
                char *error = dlerror();
                if (!error)
                        error = "pthread_create is NULL";
                die("%s\n", error);
        }

        memset(prof_arc_hash, 0, sizeof(prof_arc_hash));
        pthread_mutex_init(&prof_arc_lock, NULL);

        memset(prof_fn_hash, 0, sizeof(prof_fn_hash));
        pthread_mutex_init(&prof_fn_lock, NULL);

        event_str = getenv("PROF_EVENT");
        if (event_str)
                /* perf_attr = parse_attr_crap(event_str); */
                ;

        thread_init(); /* main thread */

        first_count = mmap_read_self(perf_event);
}

struct prof_symbol {
        void *addr;
        unsigned long size;
        const char *name;
};

static struct prof_symbol *prof_symbols;
static unsigned long prof_nr_symbols;

static void prof_gelf(const char *name, void (*func)(Elf *elf, Elf_Scn *scn, 
GElf_Shdr *shdr, void *data), void *data)
{
        Elf         *elf;
        Elf_Scn     *scn = NULL;
        GElf_Shdr   shdr;
        int         fd;

        elf_version(EV_CURRENT);

        fd = open(name, O_RDONLY);
        elf = elf_begin(fd, ELF_C_READ, NULL);

        while ((scn = elf_nextscn(elf, scn)) != NULL) {
                gelf_getshdr(scn, &shdr);
                if (shdr.sh_type == SHT_SYMTAB) {
                        func(elf, scn, &shdr, data);
                }
        }

        elf_end(elf);
        close(fd);
}

static void prof_gelf_nr_symbols(Elf *elf, Elf_Scn *scn, GElf_Shdr *shdr, void 
*data)
{
        unsigned long *nr_symbols = data;
        unsigned long count;

        count = shdr->sh_size / shdr->sh_entsize;
        *nr_symbols += count;
}

static int prof_count_symbols(struct dl_phdr_info *info, size_t size, void 
*data)
{
        prof_gelf(info->dlpi_name, prof_gelf_nr_symbols, data);
        return 0;
}

static void prof_gelf_load_symbols(Elf *elf, Elf_Scn *scn, GElf_Shdr *shdr, 
void *_info)
{
        struct dl_phdr_info *info = _info;
        unsigned long count = shdr->sh_size / shdr->sh_entsize;
        unsigned long i;
        Elf_Data *data;

        data = elf_getdata(scn, NULL);

        for (i = 0; i < count; i++) {
                GElf_Sym sym;
                char *name;
                struct prof_symbol *symbol = &prof_symbols[prof_nr_symbols];

                gelf_getsym(data, i, &sym);

                if (GELF_ST_TYPE(sym.st_info) != STT_FUNC)
                        continue;

                if (!sym.st_size)
                        continue;

                name = elf_strptr(elf, shdr->sh_link, sym.st_name);
                if (!name)
                        continue;

                symbol->name = strdup(name);
                symbol->addr = (void *)(sym.st_value + info->dlpi_addr);
                symbol->size = sym.st_size;

                prof_nr_symbols++;
        }
}

static int prof_load_symbols(struct dl_phdr_info *info, size_t size, void *data)
{
        prof_gelf(info->dlpi_name, prof_gelf_load_symbols, info);
        return 0;
}

static int prof_cmp_symbol(const void *_a, const void *_b)
{
        const struct prof_symbol *a = _a, *b = _b;

        if (a->addr < b->addr)
                return -1;

        if (a->addr > b->addr)
                return 1;

        return 0;
}

static void load_symbols(void)
{
        unsigned long nr_symbols = 0;
        struct dl_phdr_info dl_info = {
                .dlpi_name = "/proc/self/exe",
                .dlpi_addr = 0,
        };

        prof_count_symbols(&dl_info, sizeof(dl_info), &nr_symbols);
        dl_iterate_phdr(prof_count_symbols, &nr_symbols);

        prof_symbols = calloc(nr_symbols, sizeof(struct prof_symbol));

        prof_load_symbols(&dl_info, sizeof(dl_info), NULL);
        dl_iterate_phdr(prof_load_symbols, NULL);

        qsort(prof_symbols, prof_nr_symbols, 
                        sizeof(struct prof_symbol), prof_cmp_symbol);
}

static struct prof_symbol *find_symbol(void *addr)
{
        struct prof_symbol *sym;
        unsigned long l, u, i;

        l = 0;
        u = prof_nr_symbols;

        while (l < u) {
                i = (l + u) / 2;
                sym = &prof_symbols[i];

                if (addr >= sym->addr && addr < sym->addr + sym->size)
                        return sym;

                if (addr < sym->addr)
                        u = i;
                else
                        l = i + 1;
        }

        return NULL;
}

void prof_exit(void) __attribute__((destructor));
void prof_exit(void)
{
        FILE *file;
        int i;
        int64_t max_fn_count = 0;

        load_symbols();

        file = fopen("prof.dot", "w");
        if (!file)
                die("failed to create output file");

        fprintf(file, "#\n# first count: %lu\n#\n", first_count);

        /*
         * Maybe replace "profile" with the argv
         */
        fprintf(file, "digraph profile {\n");

        for (i = 0; i < ARC_HASH_SIZE; i++) {
                struct prof_arc *arc = &prof_arc_hash[i];
                struct prof_symbol *c_sym, *t_sym;
                struct prof_fn *c;
                double p;

                if (!(arc->call_fn && arc->this_fn))
                        continue;

                c_sym = find_symbol(arc->call_fn);
                t_sym = find_symbol(arc->this_fn);

                if (!c_sym || !t_sym)
                        die("symbols missing");

                c = prof_fn_find(c_sym->addr);
                if (!c)
                        die("fn_hash|symtab borken");

                p = (double)arc->count / (double)c->count;
                fprintf(file, 
                "  \"%s\" -> \"%s\" [label=\"%f\", color=\"%.3f %.3f %.3f\"]\n",
                        c_sym->name, t_sym->name, 100.0 * p,
                        0.33, 1.0, p);
        }

        for (i = 0; i < ARC_HASH_SIZE; i++) {
                struct prof_arc *arc = &prof_arc_hash[i];
                struct prof_symbol *c_sym;
                struct prof_fn *c;
                double p;

                if (!(arc->call_fn && arc->this_fn))
                        continue;

                c_sym = find_symbol(arc->call_fn);
                c = prof_fn_find(c_sym->addr);

                c->count -= arc->count;
        }

        for (i = 0; i < FN_HASH_SIZE; i++) {
                struct prof_fn *fn = &prof_fn_hash[i];
                struct prof_symbol *s;

                if (!fn->this_fn)
                        continue;

                if ((int64_t)fn->count > max_fn_count)
                        max_fn_count = fn->count;

                s = find_symbol(fn->this_fn);
//              fprintf(file, "#  %s %ld\n", s->name, fn->count);
        }

        for (i = 0; i < FN_HASH_SIZE; i++) {
                struct prof_fn *fn = &prof_fn_hash[i];
                struct prof_symbol *s;
                double p;

                if (!fn->this_fn)
                        continue;

                s = find_symbol(fn->this_fn);
                p = (double)fn->count / (double)max_fn_count;
                fprintf(file, "  \"%s\" [color=\"%.3f %.3f %.3f\"]\n", 
                                s->name, 0.0, 1.0, p);
        }

        fprintf(file, "}\n");
        fflush(file);
        fclose(file);
}

struct tramp_data {

        void *(*func)(void *);
        void *arg;

        pthread_mutex_t lock;
        pthread_cond_t  wait;
};

static void *tramp_func(void *data)
{
        struct tramp_data *tramp_data = data;
        void *(*func)(void *) = tramp_data->func;
        void *arg = tramp_data->arg;
        void *ret;

        thread_init();

        pthread_mutex_lock(&tramp_data->lock);
        pthread_cond_signal(&tramp_data->wait);
        pthread_mutex_unlock(&tramp_data->lock);

        ret = func(arg);

        munmap(perf_event, page_size);

        return ret;
}

/* hijack pthread_create() */
int pthread_create(pthread_t *__restrict thread,
                __const pthread_attr_t *__restrict attr,
                void *(*func)(void *),
                void *__restrict arg)
{
        struct tramp_data tramp_data = {
                .func = func,
                .arg = arg,
        };
        int ret;

        pthread_cond_init(&tramp_data.wait, NULL);
        pthread_mutex_init(&tramp_data.lock, NULL);

        pthread_mutex_lock(&tramp_data.lock);

        ret = pthread_create_orig(thread, attr, &tramp_func, &tramp_data);
        if (!ret)
                pthread_cond_wait(&tramp_data.wait, &tramp_data.lock);

        pthread_mutex_unlock(&tramp_data.lock);

        pthread_mutex_destroy(&tramp_data.lock);
        pthread_cond_destroy(&tramp_data.wait);

        return ret;
} 

void __cyg_profile_func_enter(void *this_fn, void *call_fn)
{
        struct prof_stack *st = &prof_stack[prof_stack_idx++];

        st->arc = prof_arc_find(this_fn, call_fn);
        st->fn = prof_fn_find(this_fn);
        st->stamp = mmap_read_self(perf_event);
}

void __cyg_profile_func_exit(void *this_fn, void *call_fn)
{
        struct prof_stack *st = &prof_stack[--prof_stack_idx];
        uint64_t now, delta;

        now = mmap_read_self(perf_event);
        delta = now - st->stamp;
        (void)__sync_fetch_and_add(&st->fn->count, delta);
        (void)__sync_fetch_and_add(&st->arc->count, delta);
}
/*
 * Library to hook into code compiled with -finstrument-functions it will
 * record function arcs (call_fn, this_fn) as well as the sum of whatever event
 * is being measured over that function.
 *
 * Copyright (C) 2011 Red Hat, Inc., Peter Zijlstra <pzijl...@redhat.com>
 *
 * This file is copyrighted under the GPLv2 License (and not any later version).
 *
 * SuperFastHash under LGPLv2.1 (http://www.azillionmonkeys.com/qed/hash.html)
 *
 * Compilation example:
 * gcc -shared -fPIC profviz.c -o profviz.so -lpthread -ldl -lelf
 *
 * Usage example:
 * LD_PRELOAD=./profviz.so your_program 
 */

#define _GNU_SOURCE

#include "util/util.h"
#include "perf.h"
#include "util/parse-events.h"

#include <sys/time.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <stdio.h>
#include <stdlib.h>
#include <dlfcn.h>
#include <pthread.h>
#include <stdint.h>
#include <unistd.h>
#include <fcntl.h>
#include <libelf.h>
#include <gelf.h>
#include <link.h>
#include <math.h>

#define barrier() asm volatile("" ::: "memory")

static u64 rdpmc(unsigned int counter)
{
        unsigned int low, high;

        asm volatile("rdpmc" : "=a" (low), "=d" (high) : "c" (counter));

        return low | ((u64)high) << 32;
}

static u64 rdtsc(void)
{
        unsigned int low, high;

        asm volatile("rdtsc" : "=a" (low), "=d" (high));

        return low | ((u64)high) << 32;
}

static u64 mmap_read_self(void *addr)
{
        struct perf_event_mmap_page *pc = addr;
        u32 seq, idx, time_mult = 0, time_shift = 0, width = 0;
        u64 count, cyc = 0, time_offset = 0, enabled, running, delta;
        s64 pmc = 0;

        do {
                seq = pc->lock;
                barrier();

                enabled = pc->time_enabled;
                running = pc->time_running;

                if (pc->cap_usr_time && enabled != running) {
                        cyc = rdtsc();
                        time_mult = pc->time_mult;
                        time_shift = pc->time_shift;
                        time_offset = pc->time_offset;
                }

                idx = pc->index;
                count = pc->offset;
                if (pc->cap_usr_rdpmc && idx) {
                        width = pc->pmc_width;
                        pmc = rdpmc(idx - 1);
                }

                barrier();
        } while (pc->lock != seq);

        if (idx) {
                pmc <<= 64 - width;
                pmc >>= 64 - width; /* shift right signed */
                count += pmc;
        }

        if (enabled != running) {
                u64 quot, rem;

                quot = (cyc >> time_shift);
                rem = cyc & ((1 << time_shift) - 1);
                delta = time_offset + quot * time_mult +
                        ((rem * time_mult) >> time_shift);

                enabled += delta;
                if (idx)
                        running += delta;

                quot = count / running;
                rem = count % running;
                count = quot * enabled + (rem * enabled) / running;
        }

        return count;
}

static int (*pthread_create_orig)(pthread_t *__restrict,
                __const pthread_attr_t *__restrict,
                void *(*)(void *),
                void *__restrict) = NULL;

static struct perf_event_attr perf_attr = {
        .type = PERF_TYPE_HARDWARE,
        .config = PERF_COUNT_HW_CPU_CYCLES,
        .exclude_kernel = 1,
};

struct prof_arc {
        void *call_fn;
        void *this_fn;
        uint64_t count;
};

struct prof_fn {
        void *this_fn;
        uint64_t count;
};

#define get16bits(d) ((((uint32_t)(((const uint8_t *)(d))[1])) << 8) \
                                       +(uint32_t)(((const uint8_t *)(d))[0]) )

static uint32_t SuperFastHash (const char * data, int len, uint32_t hash) {
        uint32_t tmp;
        int rem;

        if (len <= 0 || data == NULL) return 0;

        rem = len & 3;
        len >>= 2;

        /* Main loop */
        for (;len > 0; len--) {
                hash  += get16bits (data);
                tmp    = (get16bits (data+2) << 11) ^ hash;
                hash   = (hash << 16) ^ tmp;
                data  += 2*sizeof (uint16_t);
                hash  += hash >> 11;
        }

        /* Handle end cases */
        switch (rem) {
                case 3: hash += get16bits (data);
                        hash ^= hash << 16;
                        hash ^= data[sizeof (uint16_t)] << 18;
                        hash += hash >> 11;
                        break;
                case 2: hash += get16bits (data);
                        hash ^= hash << 11;
                        hash += hash >> 17;
                        break;
                case 1: hash += *data;
                        hash ^= hash << 10;
                        hash += hash >> 1;
        }

        /* Force "avalanching" of final 127 bits */
        hash ^= hash << 3;
        hash += hash >> 5;
        hash ^= hash << 4;
        hash += hash >> 17;
        hash ^= hash << 25;
        hash += hash >> 6;

        return hash;
}

void die(const char *err, ...)
{
        va_list params;

        va_start(params, err);
        vfprintf(stderr, err, params);
        va_end(params);

        exit(-1);
}


#define HASH_TABLE_SIZE         16384
#define AVG_STACK_DEPTH         8

#define ARC_HASH_SIZE   (HASH_TABLE_SIZE * AVG_STACK_DEPTH)

static struct prof_arc prof_arc_hash[ARC_HASH_SIZE];
static pthread_mutex_t prof_arc_lock;

#define FN_HASH_SIZE    (HASH_TABLE_SIZE)

static struct prof_fn  prof_fn_hash[FN_HASH_SIZE];
static pthread_mutex_t prof_fn_lock;

#define HASH_INIT               0x9e370001UL    
#define HASH_CHAIN              16

static inline void *
prof_hash_find(void *hash_base, const size_t hash_size, 
                const void *key, const size_t key_size,
                const size_t entry_size, pthread_mutex_t *lock)

{
        uint32_t hash = HASH_INIT;
        void *entry = NULL;
        int i, j;

        for (i = 0; i < HASH_CHAIN; i++) {
                hash = SuperFastHash(key, key_size, hash);
                entry = hash_base + entry_size * (hash % FN_HASH_SIZE);
                if (!memcmp(entry, key, key_size))
                        return entry;

                for (j = 0; j < key_size; j++) {
                        if (*((char *)entry + j))
                                goto next_1;
                }

                goto found_empty;
next_1:
                continue;
        }
        die("fn_hash too full");

found_empty:
        hash = HASH_INIT;
        pthread_mutex_lock(lock);
        for (i = 0; i < HASH_CHAIN; i++) {
                hash = SuperFastHash(key, key_size, hash);
                entry = hash_base + entry_size * (hash % FN_HASH_SIZE);

                if (!memcmp(entry, key, key_size))
                        goto unlock;

                for (j = 0; j < key_size; j++) {
                        if (*((char *)entry + j))
                                goto next_2;
                }

                memcpy(entry, key, key_size);
                goto unlock;
next_2:
                continue;
        }
        die("fn_hash too full (locked)");
unlock:
        pthread_mutex_unlock(lock);

        return entry;
}

static struct prof_fn *prof_fn_find(void *this_fn)
{
        return prof_hash_find(prof_fn_hash, FN_HASH_SIZE,
                        &this_fn, sizeof(this_fn), sizeof(struct prof_fn),
                        &prof_fn_lock);
}

static struct prof_arc *prof_arc_find(void *this_fn, void *call_fn)
{
        struct prof_arc arc = { .this_fn = this_fn, .call_fn = call_fn, };

        return prof_hash_find(prof_arc_hash, ARC_HASH_SIZE,
                        &arc, 2*sizeof(void *), sizeof(struct prof_arc),
                        &prof_arc_lock);
}

static unsigned long page_size;

struct prof_stack {
        struct prof_arc *arc;
        struct prof_fn *fn;
        uint64_t stamp;
};

static __thread struct prof_stack prof_stack[128];
static __thread int prof_stack_idx = 0;

static __thread void *perf_event;

static void thread_init(void)
{
        int fd;

        fd = sys_perf_event_open(&perf_attr, 0, -1, -1, 0);
        if (fd < 0)
                die("failed to create perf_event");

        perf_event = mmap(NULL, page_size, PROT_READ, MAP_SHARED, fd, 0);
        if (perf_event == (void *)(-1))
                die("failed to mmap perf_event");

        close(fd);

        prof_stack_idx = 0;
}

static u64 first_count;

void prof_init(void) __attribute__((constructor));
void prof_init(void)
{
        char *event_str;

        page_size = sysconf(_SC_PAGESIZE);

        pthread_create_orig = dlsym(RTLD_NEXT, "pthread_create");
        if (!pthread_create_orig) {
                char *error = dlerror();
                if (!error)
                        error = "pthread_create is NULL";
                die("%s\n", error);
        }

        memset(prof_arc_hash, 0, sizeof(prof_arc_hash));
        pthread_mutex_init(&prof_arc_lock, NULL);

        memset(prof_fn_hash, 0, sizeof(prof_fn_hash));
        pthread_mutex_init(&prof_fn_lock, NULL);

        event_str = getenv("PROF_EVENT");
        if (event_str)
                /* perf_attr = parse_attr_crap(event_str); */
                ;

        thread_init(); /* main thread */

        first_count = mmap_read_self(perf_event);
}

struct prof_symbol {
        void *addr;
        unsigned long size;
        const char *name;
};

static struct prof_symbol *prof_symbols;
static unsigned long prof_nr_symbols;

static void prof_gelf(const char *name, void (*func)(Elf *elf, Elf_Scn *scn, 
GElf_Shdr *shdr, void *data), void *data)
{
        Elf         *elf;
        Elf_Scn     *scn = NULL;
        GElf_Shdr   shdr;
        int         fd;

        elf_version(EV_CURRENT);

        fd = open(name, O_RDONLY);
        elf = elf_begin(fd, ELF_C_READ, NULL);

        while ((scn = elf_nextscn(elf, scn)) != NULL) {
                gelf_getshdr(scn, &shdr);
                if (shdr.sh_type == SHT_SYMTAB) {
                        func(elf, scn, &shdr, data);
                }
        }

        elf_end(elf);
        close(fd);
}

static void prof_gelf_nr_symbols(Elf *elf, Elf_Scn *scn, GElf_Shdr *shdr, void 
*data)
{
        unsigned long *nr_symbols = data;
        unsigned long count;

        count = shdr->sh_size / shdr->sh_entsize;
        *nr_symbols += count;
}

static int prof_count_symbols(struct dl_phdr_info *info, size_t size, void 
*data)
{
        prof_gelf(info->dlpi_name, prof_gelf_nr_symbols, data);
        return 0;
}

static void prof_gelf_load_symbols(Elf *elf, Elf_Scn *scn, GElf_Shdr *shdr, 
void *_info)
{
        struct dl_phdr_info *info = _info;
        unsigned long count = shdr->sh_size / shdr->sh_entsize;
        unsigned long i;
        Elf_Data *data;

        data = elf_getdata(scn, NULL);

        for (i = 0; i < count; i++) {
                GElf_Sym sym;
                char *name;
                struct prof_symbol *symbol = &prof_symbols[prof_nr_symbols];

                gelf_getsym(data, i, &sym);

                if (GELF_ST_TYPE(sym.st_info) != STT_FUNC)
                        continue;

                if (!sym.st_size)
                        continue;

                name = elf_strptr(elf, shdr->sh_link, sym.st_name);
                if (!name)
                        continue;

                symbol->name = strdup(name);
                symbol->addr = (void *)(sym.st_value + info->dlpi_addr);
                symbol->size = sym.st_size;

                prof_nr_symbols++;
        }
}

static int prof_load_symbols(struct dl_phdr_info *info, size_t size, void *data)
{
        prof_gelf(info->dlpi_name, prof_gelf_load_symbols, info);
        return 0;
}

static int prof_cmp_symbol(const void *_a, const void *_b)
{
        const struct prof_symbol *a = _a, *b = _b;

        if (a->addr < b->addr)
                return -1;

        if (a->addr > b->addr)
                return 1;

        return 0;
}

static void load_symbols(void)
{
        unsigned long nr_symbols = 0;
        struct dl_phdr_info dl_info = {
                .dlpi_name = "/proc/self/exe",
                .dlpi_addr = 0,
        };

        prof_count_symbols(&dl_info, sizeof(dl_info), &nr_symbols);
        dl_iterate_phdr(prof_count_symbols, &nr_symbols);

        prof_symbols = calloc(nr_symbols, sizeof(struct prof_symbol));

        prof_load_symbols(&dl_info, sizeof(dl_info), NULL);
        dl_iterate_phdr(prof_load_symbols, NULL);

        qsort(prof_symbols, prof_nr_symbols, 
                        sizeof(struct prof_symbol), prof_cmp_symbol);
}

static struct prof_symbol *find_symbol(void *addr)
{
        struct prof_symbol *sym;
        unsigned long l, u, i;

        l = 0;
        u = prof_nr_symbols;

        while (l < u) {
                i = (l + u) / 2;
                sym = &prof_symbols[i];

                if (addr >= sym->addr && addr < sym->addr + sym->size)
                        return sym;

                if (addr < sym->addr)
                        u = i;
                else
                        l = i + 1;
        }

        return NULL;
}

void prof_exit(void) __attribute__((destructor));
void prof_exit(void)
{
        FILE *file;
        int i;
        int64_t max_fn_count = 0;

        load_symbols();

        file = fopen("prof.dot", "w");
        if (!file)
                die("failed to create output file");

        fprintf(file, "#\n# first count: %lu\n#\n", first_count);

        /*
         * Maybe replace "profile" with the argv
         */
        fprintf(file, "digraph profile {\n");

        for (i = 0; i < ARC_HASH_SIZE; i++) {
                struct prof_arc *arc = &prof_arc_hash[i];
                struct prof_symbol *c_sym, *t_sym;
                struct prof_fn *c;
                double p;

                if (!(arc->call_fn && arc->this_fn))
                        continue;

                c_sym = find_symbol(arc->call_fn);
                t_sym = find_symbol(arc->this_fn);

                if (!c_sym || !t_sym)
                        die("symbols missing");

                c = prof_fn_find(c_sym->addr);
                if (!c)
                        die("fn_hash|symtab borken");

                p = (double)arc->count / (double)c->count;
                fprintf(file, 
                "  \"%s\" -> \"%s\" [label=\"%f\", color=\"%.3f %.3f %.3f\"]\n",
                        c_sym->name, t_sym->name, 100.0 * p,
                        0.33, 1.0, p);
        }

        for (i = 0; i < ARC_HASH_SIZE; i++) {
                struct prof_arc *arc = &prof_arc_hash[i];
                struct prof_symbol *c_sym;
                struct prof_fn *c;
                double p;

                if (!(arc->call_fn && arc->this_fn))
                        continue;

                c_sym = find_symbol(arc->call_fn);
                c = prof_fn_find(c_sym->addr);

                c->count -= arc->count;
        }

        for (i = 0; i < FN_HASH_SIZE; i++) {
                struct prof_fn *fn = &prof_fn_hash[i];
                struct prof_symbol *s;

                if (!fn->this_fn)
                        continue;

                if ((int64_t)fn->count > max_fn_count)
                        max_fn_count = fn->count;

                s = find_symbol(fn->this_fn);
//              fprintf(file, "#  %s %ld\n", s->name, fn->count);
        }

        for (i = 0; i < FN_HASH_SIZE; i++) {
                struct prof_fn *fn = &prof_fn_hash[i];
                struct prof_symbol *s;
                double p;

                if (!fn->this_fn)
                        continue;

                s = find_symbol(fn->this_fn);
                p = (double)fn->count / (double)max_fn_count;
                fprintf(file, "  \"%s\" [color=\"%.3f %.3f %.3f\"]\n", 
                                s->name, 0.0, 1.0, p);
        }

        fprintf(file, "}\n");
        fflush(file);
        fclose(file);
}

struct tramp_data {

        void *(*func)(void *);
        void *arg;

        pthread_mutex_t lock;
        pthread_cond_t  wait;
};

static void *tramp_func(void *data)
{
        struct tramp_data *tramp_data = data;
        void *(*func)(void *) = tramp_data->func;
        void *arg = tramp_data->arg;
        void *ret;

        thread_init();

        pthread_mutex_lock(&tramp_data->lock);
        pthread_cond_signal(&tramp_data->wait);
        pthread_mutex_unlock(&tramp_data->lock);

        ret = func(arg);

        munmap(perf_event, page_size);

        return ret;
}

/* hijack pthread_create() */
int pthread_create(pthread_t *__restrict thread,
                __const pthread_attr_t *__restrict attr,
                void *(*func)(void *),
                void *__restrict arg)
{
        struct tramp_data tramp_data = {
                .func = func,
                .arg = arg,
        };
        int ret;

        pthread_cond_init(&tramp_data.wait, NULL);
        pthread_mutex_init(&tramp_data.lock, NULL);

        pthread_mutex_lock(&tramp_data.lock);

        ret = pthread_create_orig(thread, attr, &tramp_func, &tramp_data);
        if (!ret)
                pthread_cond_wait(&tramp_data.wait, &tramp_data.lock);

        pthread_mutex_unlock(&tramp_data.lock);

        pthread_mutex_destroy(&tramp_data.lock);
        pthread_cond_destroy(&tramp_data.wait);

        return ret;
} 

void __cyg_profile_func_enter(void *this_fn, void *call_fn)
{
        struct prof_stack *st = &prof_stack[prof_stack_idx++];

        st->arc = prof_arc_find(this_fn, call_fn);
        st->fn = prof_fn_find(this_fn);
        st->stamp = mmap_read_self(perf_event);
}

void __cyg_profile_func_exit(void *this_fn, void *call_fn)
{
        struct prof_stack *st = &prof_stack[--prof_stack_idx];
        uint64_t now, delta;

        now = mmap_read_self(perf_event);
        delta = now - st->stamp;
        (void)__sync_fetch_and_add(&st->fn->count, delta);
        (void)__sync_fetch_and_add(&st->arc->count, delta);
}


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to