vandersonmr <vanderson...@gmail.com> writes:
> We add some of the statistics collected in the TCGProfiler > into the TBStats, having the statistics not only for the whole > emulation but for each TB. Then, we removed these stats > from TCGProfiler and reconstruct the information for the > "info jit" using the sum of all TBStats statistics. > > The goal is to have one unique and better way of collecting > emulation statistics. Moreover, checking dynamiclly if the > profiling is enabled showed to have an insignificant impact > on the performance: > https://wiki.qemu.org/Internships/ProjectIdeas/TCGCodeQuality#Overheads. > > Signed-off-by: Vanderson M. do Rosario <vanderson...@gmail.com> > --- > accel/tcg/Makefile.objs | 2 +- > accel/tcg/tb-stats.c | 107 ++++++++++++++++++++++++++++++++++++++ > accel/tcg/translate-all.c | 10 ++-- > include/exec/tb-stats.h | 9 ++++ > tcg/tcg.c | 93 ++++----------------------------- > tcg/tcg.h | 10 ---- > 6 files changed, 131 insertions(+), 100 deletions(-) > create mode 100644 accel/tcg/tb-stats.c > > diff --git a/accel/tcg/Makefile.objs b/accel/tcg/Makefile.objs > index d381a02f34..49ffe81b5d 100644 > --- a/accel/tcg/Makefile.objs > +++ b/accel/tcg/Makefile.objs > @@ -2,7 +2,7 @@ obj-$(CONFIG_SOFTMMU) += tcg-all.o > obj-$(CONFIG_SOFTMMU) += cputlb.o > obj-y += tcg-runtime.o tcg-runtime-gvec.o > obj-y += cpu-exec.o cpu-exec-common.o translate-all.o > -obj-y += translator.o > +obj-y += translator.o tb-stats.o > > obj-$(CONFIG_USER_ONLY) += user-exec.o > obj-$(call lnot,$(CONFIG_SOFTMMU)) += user-exec-stub.o > diff --git a/accel/tcg/tb-stats.c b/accel/tcg/tb-stats.c > new file mode 100644 > index 0000000000..44497d4f9b > --- /dev/null > +++ b/accel/tcg/tb-stats.c > @@ -0,0 +1,107 @@ > +#include "qemu/osdep.h" > + > +#include "disas/disas.h" > +#include "exec/exec-all.h" > +#include "tcg.h" > + > +#include "qemu/qemu-print.h" > + > +struct jit_profile_info { > + uint64_t translations; > + uint64_t aborted; > + uint64_t ops; > + unsigned ops_max; > + uint64_t del_ops; > + uint64_t temps; > + unsigned temps_max; > + uint64_t host; > + uint64_t guest; > + uint64_t host_ins; > + uint64_t search_data; > +}; > + > +/* accumulate the statistics from all TBs */ > +static void collect_jit_profile_info(void *p, uint32_t hash, void *userp) > +{ > + struct jit_profile_info *jpi = userp; > + TBStatistics *tbs = p; > + > + jpi->translations += tbs->translations.total; > + jpi->ops += tbs->code.num_tcg_ops; > + if (tbs->translations.total && tbs->code.num_tcg_ops / > tbs->translations.total > + > jpi->ops_max) { > + jpi->ops_max = tbs->code.num_tcg_ops / tbs->translations.total; > + } > + jpi->del_ops += tbs->code.deleted_ops; > + jpi->temps += tbs->code.temps; > + if (tbs->translations.total && tbs->code.temps / tbs->translations.total > > > + jpi->temps_max) { > + jpi->temps_max = tbs->code.temps / tbs->translations.total; > + } > + jpi->host += tbs->code.out_len; > + jpi->guest += tbs->code.in_len; > + jpi->host_ins += tbs->code.num_host_inst; > + jpi->search_data += tbs->code.search_out_len; > +} > + > +/* dump JIT statisticis using TCGProfile and TBStats */ > +void dump_jit_profile_info(TCGProfile *s) > +{ > + if (!tb_stats_collection_enabled()) { > + return; > + } > + > + struct jit_profile_info *jpi = g_new0(struct jit_profile_info, > 1); This leaks and needs to be freed after we are done. > + > + qht_iter(&tb_ctx.tb_stats, collect_jit_profile_info, jpi); > + > + if (jpi->translations) { > + qemu_printf("translated TBs %" PRId64 "\n", jpi->translations); > + qemu_printf("avg ops/TB %0.1f max=%d\n", > + jpi->ops / (double) jpi->translations, jpi->ops_max); > + qemu_printf("deleted ops/TB %0.2f\n", > + jpi->del_ops / (double) jpi->translations); > + qemu_printf("avg temps/TB %0.2f max=%d\n", > + jpi->temps / (double) jpi->translations, jpi->temps_max); > + qemu_printf("avg host code/TB %0.1f\n", > + jpi->host / (double) jpi->translations); > + qemu_printf("avg host ins/TB %0.1f\n", > + jpi->host_ins / (double) jpi->translations); > + qemu_printf("avg search data/TB %0.1f\n", > + jpi->search_data / (double) jpi->translations); > + > + if (s) { > + int64_t tot = s->interm_time + s->code_time; > + qemu_printf("JIT cycles %" PRId64 " (%0.3f s at 2.4 > GHz)\n", > + tot, tot / 2.4e9); > + qemu_printf("cycles/op %0.1f\n", > + jpi->ops ? (double)tot / jpi->ops : 0); > + qemu_printf("cycles/in byte %0.1f\n", > + jpi->guest ? (double)tot / jpi->guest : 0); > + qemu_printf("cycles/out byte %0.1f\n", > + jpi->host ? (double)tot / jpi->host : 0); > + qemu_printf("cycles/out inst %0.1f\n", > + jpi->host_ins ? (double)tot / jpi->host_ins : 0); > + qemu_printf("cycles/search byte %0.1f\n", > + jpi->search_data ? (double)tot / jpi->search_data : > 0); > + if (tot == 0) { > + tot = 1; > + } > + qemu_printf(" gen_interm time %0.1f%%\n", > + (double)s->interm_time / tot * 100.0); > + qemu_printf(" gen_code time %0.1f%%\n", > + (double)s->code_time / tot * 100.0); > + qemu_printf("optim./code time %0.1f%%\n", > + (double)s->opt_time / (s->code_time ? s->code_time : > 1) > + * 100.0); > + qemu_printf("liveness/code time %0.1f%%\n", > + (double)s->la_time / (s->code_time ? s->code_time : 1) * > 100.0); > + qemu_printf("cpu_restore count %" PRId64 "\n", > + s->restore_count); > + qemu_printf(" avg cycles %0.1f\n", > + s->restore_count ? (double)s->restore_time / > s->restore_count : 0); > + } > + } > +} > + > + > diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c > index 3a47ac6f2c..bbdb0f23a4 100644 > --- a/accel/tcg/translate-all.c > +++ b/accel/tcg/translate-all.c > @@ -1773,8 +1773,6 @@ TranslationBlock *tb_gen_code(CPUState *cpu, > tb_overflow: > > #ifdef CONFIG_PROFILER > - /* includes aborted translations because of exceptions */ > - atomic_set(&prof->tb_count1, prof->tb_count1 + 1); > ti = profile_getclock(); > #endif I assume from the conversation on IRC we can add the profile clocks here as well. I wonder if execution time could also be added (under a new tbstats flag)? The data might be too noisy given the time to collect timestamps vs the relatively short block length. > > @@ -1822,7 +1820,6 @@ TranslationBlock *tb_gen_code(CPUState *cpu, > } > > #ifdef CONFIG_PROFILER > - atomic_set(&prof->tb_count, prof->tb_count + 1); > atomic_set(&prof->interm_time, prof->interm_time + profile_getclock() - > ti); > ti = profile_getclock(); > #endif > @@ -1869,12 +1866,13 @@ TranslationBlock *tb_gen_code(CPUState *cpu, > > #ifdef CONFIG_PROFILER > atomic_set(&prof->code_time, prof->code_time + profile_getclock() - ti); > - atomic_set(&prof->code_in_len, prof->code_in_len + tb->size); > - atomic_set(&prof->code_out_len, prof->code_out_len + gen_code_size); > - atomic_set(&prof->search_out_len, prof->search_out_len + search_size); > #endif > > if (tb_stats_enabled(tb, TB_JIT_STATS)) { > + atomic_add(&tb->tb_stats->code.in_len, tb->size); > + atomic_add(&tb->tb_stats->code.out_len, gen_code_size); If we are switching num_host_insns to num_host_bytes we don't need a duplicate here. > + atomic_add(&tb->tb_stats->code.search_out_len, search_size); > + > size_t code_size = gen_code_size; > if (tcg_ctx->data_gen_ptr) { > code_size = tcg_ctx->data_gen_ptr - tb->tc.ptr; > diff --git a/include/exec/tb-stats.h b/include/exec/tb-stats.h > index 6584b78c4a..7d775f2c0d 100644 > --- a/include/exec/tb-stats.h > +++ b/include/exec/tb-stats.h > @@ -40,6 +40,13 @@ struct TBStatistics { > unsigned num_tcg_ops; > unsigned num_tcg_ops_opt; > unsigned spills; > + > + /* CONFIG_PROFILE */ > + unsigned temps; > + unsigned deleted_ops; > + unsigned in_len; > + unsigned out_len; > + unsigned search_out_len; > } code; > > struct { > @@ -51,4 +58,6 @@ struct TBStatistics { > > bool tb_stats_cmp(const void *ap, const void *bp); > > +void dump_jit_profile_info(TCGProfile *s); > + > #endif > diff --git a/tcg/tcg.c b/tcg/tcg.c > index 446e3d1708..46b31a2f68 100644 > --- a/tcg/tcg.c > +++ b/tcg/tcg.c > @@ -2295,9 +2295,9 @@ void tcg_op_remove(TCGContext *s, TCGOp *op) > QTAILQ_INSERT_TAIL(&s->free_ops, op, link); > s->nb_ops--; > > -#ifdef CONFIG_PROFILER > - atomic_set(&s->prof.del_op_count, s->prof.del_op_count + 1); > -#endif > + if (tb_stats_enabled(s->current_tb, TB_JIT_STATS)) { > + atomic_inc(&s->current_tb->tb_stats->code.deleted_ops); > + } > } > > static TCGOp *tcg_op_alloc(TCGOpcode opc) > @@ -3914,16 +3914,6 @@ void tcg_profile_snapshot(TCGProfile *prof, bool > counters, bool table) > > if (counters) { > PROF_ADD(prof, orig, cpu_exec_time); > - PROF_ADD(prof, orig, tb_count1); > - PROF_ADD(prof, orig, tb_count); > - PROF_ADD(prof, orig, op_count); > - PROF_MAX(prof, orig, op_count_max); > - PROF_ADD(prof, orig, temp_count); > - PROF_MAX(prof, orig, temp_count_max); > - PROF_ADD(prof, orig, del_op_count); > - PROF_ADD(prof, orig, code_in_len); > - PROF_ADD(prof, orig, code_out_len); > - PROF_ADD(prof, orig, search_out_len); > PROF_ADD(prof, orig, interm_time); > PROF_ADD(prof, orig, code_time); > PROF_ADD(prof, orig, la_time); > @@ -4003,26 +3993,17 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb) > TCGOp *op; > > s->current_tb = tb; > - > -#ifdef CONFIG_PROFILER > - { > + if (tb_stats_enabled(tb, TB_JIT_STATS)) { > int n = 0; > > QTAILQ_FOREACH(op, &s->ops, link) { > n++; > } > - atomic_set(&prof->op_count, prof->op_count + n); > - if (n > prof->op_count_max) { > - atomic_set(&prof->op_count_max, n); > - } > + atomic_add(&tb->tb_stats->code.num_tcg_ops, n); > > n = s->nb_temps; > - atomic_set(&prof->temp_count, prof->temp_count + n); > - if (n > prof->temp_count_max) { > - atomic_set(&prof->temp_count_max, n); > - } > + atomic_add(&tb->tb_stats->code.temps, n); > } > -#endif > > #ifdef DEBUG_DISAS > if (unlikely(qemu_loglevel_mask(CPU_LOG_TB_OP) > @@ -4229,70 +4210,16 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb) > return tcg_current_code_size(s); > } > > -#ifdef CONFIG_PROFILER > void tcg_dump_info(void) > { > + TCGProfile *s = NULL; > +#ifdef CONFIG_PROFILER > TCGProfile prof = {}; > - const TCGProfile *s; > - int64_t tb_count; > - int64_t tb_div_count; > - int64_t tot; > - > tcg_profile_snapshot_counters(&prof); > s = &prof; > - tb_count = s->tb_count; > - tb_div_count = tb_count ? tb_count : 1; > - tot = s->interm_time + s->code_time; > - > - qemu_printf("JIT cycles %" PRId64 " (%0.3f s at 2.4 GHz)\n", > - tot, tot / 2.4e9); > - qemu_printf("translated TBs %" PRId64 " (aborted=%" PRId64 > - " %0.1f%%)\n", > - tb_count, s->tb_count1 - tb_count, > - (double)(s->tb_count1 - s->tb_count) > - / (s->tb_count1 ? s->tb_count1 : 1) * 100.0); > - qemu_printf("avg ops/TB %0.1f max=%d\n", > - (double)s->op_count / tb_div_count, s->op_count_max); > - qemu_printf("deleted ops/TB %0.2f\n", > - (double)s->del_op_count / tb_div_count); > - qemu_printf("avg temps/TB %0.2f max=%d\n", > - (double)s->temp_count / tb_div_count, s->temp_count_max); > - qemu_printf("avg host code/TB %0.1f\n", > - (double)s->code_out_len / tb_div_count); > - qemu_printf("avg search data/TB %0.1f\n", > - (double)s->search_out_len / tb_div_count); > - > - qemu_printf("cycles/op %0.1f\n", > - s->op_count ? (double)tot / s->op_count : 0); > - qemu_printf("cycles/in byte %0.1f\n", > - s->code_in_len ? (double)tot / s->code_in_len : 0); > - qemu_printf("cycles/out byte %0.1f\n", > - s->code_out_len ? (double)tot / s->code_out_len : 0); > - qemu_printf("cycles/search byte %0.1f\n", > - s->search_out_len ? (double)tot / s->search_out_len : 0); > - if (tot == 0) { > - tot = 1; > - } > - qemu_printf(" gen_interm time %0.1f%%\n", > - (double)s->interm_time / tot * 100.0); > - qemu_printf(" gen_code time %0.1f%%\n", > - (double)s->code_time / tot * 100.0); > - qemu_printf("optim./code time %0.1f%%\n", > - (double)s->opt_time / (s->code_time ? s->code_time : 1) > - * 100.0); > - qemu_printf("liveness/code time %0.1f%%\n", > - (double)s->la_time / (s->code_time ? s->code_time : 1) * > 100.0); > - qemu_printf("cpu_restore count %" PRId64 "\n", > - s->restore_count); > - qemu_printf(" avg cycles %0.1f\n", > - s->restore_count ? (double)s->restore_time / > s->restore_count : 0); > -} > -#else > -void tcg_dump_info(void) > -{ > - qemu_printf("[TCG profiler not compiled]\n"); > -} > #endif > + dump_jit_profile_info(s); > +} > > #ifdef ELF_HOST_MACHINE > /* In order to use this feature, the backend needs to do three things: > diff --git a/tcg/tcg.h b/tcg/tcg.h > index bf6f3bcba3..026a066b9a 100644 > --- a/tcg/tcg.h > +++ b/tcg/tcg.h > @@ -644,16 +644,6 @@ QEMU_BUILD_BUG_ON(NB_OPS > (1 << 8)); > > typedef struct TCGProfile { > int64_t cpu_exec_time; > - int64_t tb_count1; > - int64_t tb_count; > - int64_t op_count; /* total insn count */ > - int op_count_max; /* max insn per TB */ > - int temp_count_max; > - int64_t temp_count; > - int64_t del_op_count; > - int64_t code_in_len; > - int64_t code_out_len; > - int64_t search_out_len; > int64_t interm_time; > int64_t code_time; > int64_t la_time; -- Alex Bennée