: update to C++20, add new stackprof tool

Aaron Merey Fri, 29 May 2026 12:07:33 -0700

Hi Serhei,

I've done another round of review on this eu-stackprof patch set.  My
comments from the first round still apply, and I've included some
additional ones below.


On Mon, Apr 13, 2026 at 5:31 PM Serhei Makarov <[email protected]> wrote:
>
> eu-stackprof is a new tool which profiles processes on a Linux system
> using perf_events and outputs gprof gmon.out format program counter
> histograms and callgraph-arc profiles; intended as an updated demo of
> libdwfl_stacktrace functionality and as a data-gathering tool for the
> profiledb initiative.
>
> * configure.ac: Add configure checks for C++20, eu-stackprof
>   perf/libpfm dependencies.
> * src/Makefile.am (bin_PROGRAMS): Add stackprof.
>   (stackprof_*): Add stackprof SOURCES, LDADD, and so forth.
> * src/stackprof.cxx: New file.

NEWS should be updated too.

>
> Co-authored-by: <[email protected]>
> Signed-off-by: <[email protected]>
> ---
>  configure.ac      |   23 +-
>  src/Makefile.am   |   11 +-
>  src/stackprof.cxx | 2083 +++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 2110 insertions(+), 7 deletions(-)
>  create mode 100644 src/stackprof.cxx
>
> diff --git a/configure.ac b/configure.ac
> index f22a3f90..e5be95b8 100644
> --- a/configure.ac
> +++ b/configure.ac
> @@ -888,10 +888,21 @@ fi
>  AC_CHECK_PROG(HAVE_ZSTD, zstd, yes, no)
>  AM_CONDITIONAL([HAVE_ZSTD],[test "x$HAVE_ZSTD" = "xyes"])
>
> -# For tests that need to use C++11
> -AX_CXX_COMPILE_STDCXX(11, noext, optional)
> -AS_IF([test "x$HAVE_CXX11" = "x1"], [HAVE_CXX11=yes], [HAVE_CXX11=no])
> -AM_CONDITIONAL([HAVE_CXX11],[test "x$HAVE_CXX11" = "xyes"])
> +# For tests that need to use C++20
> +AX_CXX_COMPILE_STDCXX(20, noext, optional)
> +AS_IF([test "x$HAVE_CXX20" = "x1"], [HAVE_CXX20=yes], [HAVE_CXX20=no])
> +AM_CONDITIONAL([HAVE_CXX20],[test "x$HAVE_CXX20" = "xyes"])
> +
> +
> +# For eu-stackprof
> +# optional:
> +AC_CHECK_HEADERS([perfmon/pfmlib_perf_event.h])
> +AM_CONDITIONAL([HAVE_LIBPFM], [test 
> "x${ac_cv_header_perfmon_pfmlib_perf_event_h}" = "xyes" ])
> +AC_CHECK_LIB(pfm, pfm_get_os_event_encoding, [AC_SUBST(libpfm_LIBS, 
> '-lpfm')])
> +# required:
> +AC_CHECK_HEADERS([linux/perf_event.h])
> +AM_CONDITIONAL([ENABLE_STACKPROF],[test 
> "x${ac_cv_header_linux_perf_event_h}x${HAVE_CXX20}" = "xyesxyes" ])
> +
>
>  AC_CHECK_HEADERS([execinfo.h])
>
> @@ -941,7 +952,7 @@ AS_IF([test "x$with_libarchive" = "xyes" -a 
> "x$have_libarchive" != "xyes"], [
>  # pronounce judgement on ability to build server, overridden by =yes/=no
>  if test "x$enable_debuginfod" = "xno"; then
>     true
> -elif test "x$have_jsonc$HAVE_CXX11$have_libarchive$have_sqlite3" = 
> "xyesyesyesyes"; then
> +elif test "x$have_jsonc$HAVE_CXX20$have_libarchive$have_sqlite3" = 
> "xyesyesyesyes"; then
>     enable_debuginfod=yes
>  elif test "x$enable_debuginfod" = "xyes"; then
>     AC_MSG_ERROR([unable to build debuginfod, missing libmicrohttpd, sqlite3 
> or libarchive])
> @@ -1098,7 +1109,7 @@ AC_MSG_NOTICE([
>    EXTRA TEST FEATURES (used with make check)
>      have bunzip2 installed (required)  : ${HAVE_BUNZIP2}
>      have zstd installed                : ${HAVE_ZSTD}
> -    C++11                              : ${HAVE_CXX11}
> +    C++20                              : ${HAVE_CXX20}
>      debug branch prediction            : ${use_debugpred}
>      gprof support                      : ${use_gprof}
>      gcov support                       : ${use_gcov}
> diff --git a/src/Makefile.am b/src/Makefile.am
> index f041d458..f753c70c 100644
> --- a/src/Makefile.am
> +++ b/src/Makefile.am
> @@ -35,6 +35,9 @@ bin_PROGRAMS = readelf nm size strip elflint findtextrel 
> addr2line \
>  if ENABLE_STACKTRACE
>  bin_PROGRAMS += stacktrace
>  endif
> +if ENABLE_STACKPROF
> +bin_PROGRAMS += stackprof
> +endif
>
>  noinst_LIBRARIES = libar.a
>
> @@ -127,7 +130,13 @@ endif
>  elfcompress_LDADD = $(libebl) $(libelf) $(libdw) $(libeu) $(argp_LDADD)
>  elfclassify_LDADD = $(libelf) $(libdw) $(libeu) $(argp_LDADD)
>  srcfiles_SOURCES = srcfiles.cxx
> -srcfiles_LDADD = $(libdw) $(libelf) $(libeu)  $(argp_LDADD) 
> $(libarchive_LIBS) $(libdebuginfod)
> +srcfiles_LDADD = $(libdw) $(libelf) $(libeu) $(argp_LDADD) 
> $(libarchive_LIBS) $(libdebuginfod)
> +if ENABLE_STACKPROF
> +stackprof_SOURCES = stackprof.cxx
> +stackprof_CPPFLAGS = $(AM_CPPFLAGS) $(jsonc_CXXFLAGS)
> +stackprof_CXXFLAGS = -Wall
> +stackprof_LDADD = $(libebl) $(libdw) $(libelf) $(libeu) $(argp_LDADD) 
> $(libpfm_LIBS) $(jsonc_LIBS)
> +endif
>
>  installcheck-binPROGRAMS: $(bin_PROGRAMS)
>         bad=0; pid=$$$$; list="$(bin_PROGRAMS)"; for p in $$list; do \
> diff --git a/src/stackprof.cxx b/src/stackprof.cxx
> new file mode 100644
> index 00000000..33720e80
> --- /dev/null
> +++ b/src/stackprof.cxx
> @@ -0,0 +1,2083 @@
> +/* Collect stack-trace profiles of running program(s).
> +   Copyright (C) 2025-2026 Red Hat, Inc.
> +   This file is part of elfutils.
> +
> +   This file is free software; you can redistribute it and/or modify
> +   it under the terms of the GNU General Public License as published by
> +   the Free Software Foundation; either version 3 of the License, or
> +   (at your option) any later version.
> +
> +   elfutils is distributed in the hope that it will be useful, but
> +   WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> +   GNU General Public License for more details.
> +
> +   You should have received a copy of the GNU General Public License
> +   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
> +
> +#ifdef HAVE_CONFIG_H
> +# include <config.h>
> +#endif
> +
> +#include "printversion.h"
> +
> +#include <string>
> +#include <memory>
> +#include <iomanip>
> +#include <map>
> +#include <unordered_map>
> +#include <vector>
> +#include <bitset>
> +#include <stdexcept>
> +#include <cstring>
> +#include <csignal>
> +#include <cassert>
> +#include <chrono>
> +#include <iostream>
> +#include <fstream>
> +#include <sstream>
> +#include <cinttypes>
> +#include <format>
> +#include <filesystem>
> +
> +#include <sys/utsname.h>
> +
> +#include <sys/syscall.h>
> +#include <sys/ioctl.h>
> +#include <sys/mman.h>
> +#include <sys/wait.h>
> +#include <poll.h>
> +#ifdef HAVE_LINUX_PERF_EVENT_H
> +#include <linux/perf_event.h>
> +#endif
> +#include <argp.h>
> +#include <fcntl.h>
> +#include <dirent.h>
> +
> +#include <system.h>
> +
> +#ifdef HAVE_PERFMON_PFMLIB_PERF_EVENT_H
> +#include <perfmon/pfmlib_perf_event.h>
> +#endif
> +
> +#include <json-c/json.h>
> +
> +#include <gelf.h>
> +#include <dwarf.h>
> +#include <libdwfl.h>
> +#include <libdw.h>
> +#include "../libebl/libebl.h"
> +#include "../libdwfl_stacktrace/libdwfl_stacktrace.h"
> +
> +using namespace std; // so we don't have to std:: prefix everything in here
> +
> +////////////////////////////////////////////////////////////////////////
> +// find_debuginfo callbacks
> +
> +#ifdef FIND_DEBUGINFO

FIND_DEBUGINFO isn't defined anywhere.  If the intention is to flesh this out
in a future patch, I'd prefer we drop this #ifdef branch for now and add the
FIND_DEBUGINFO handling in that later patch.

> +
> +static char *debuginfo_path = NULL;
> +
> +static const Dwfl_Callbacks dwfl_cfi_callbacks =
> +  {
> +    .find_elf = dwflst_tracker_linux_proc_find_elf,
> +    .find_debuginfo = dwfl_standard_find_debuginfo,
> +    .debuginfo_path = &debuginfo_path,
> +  };
> +
> +#else
> +
> +int
> +nop_find_debuginfo (Dwfl_Module *mod __attribute__((unused)),
> +                   void **userdata __attribute__((unused)),
> +                   const char *modname __attribute__((unused)),
> +                   GElf_Addr base __attribute__((unused)),
> +                   const char *file_name __attribute__((unused)),
> +                   const char *debuglink_file __attribute__((unused)),
> +                   GElf_Word debuglink_crc __attribute__((unused)),
> +                   char **debuginfo_file_name __attribute__((unused)))
> +{
> +#ifdef DEBUG_MODULES
> +  cerr << format("nop_find_debuginfo: modname={} file_name={} 
> debuglink_file={}\n", modname, file_name, debuglink_file);
> +#endif
> +  return -1;
> +}
> +
> +static const Dwfl_Callbacks dwfl_cfi_callbacks =
> +{
> +  .find_elf = dwflst_tracker_linux_proc_find_elf,
> +  .find_debuginfo = nop_find_debuginfo, /* work with CFI only */
> +};
> +
> +#endif /* FIND_DEBUGINFO */
> +
> +
> +////////////////////////////////////////////////////////////////////////
> +// class decls
> +
> +// Unwind statistics for a Dwfl and associated process.
> +struct UnwindDwflStats {
> +  Dwfl *dwfl;
> +  string comm;
> +  int max_frames; /* for diagnostic purposes */
> +  int total_samples; /* for diagnostic purposes */
> +  int lost_samples; /* for diagnostic purposes */
> +  int shown_errors; /* for diagnostic purposes */
> +  Dwfl_Unwound_Source last_unwound; /* track CFI source, for diagnostic 
> purposes */
> +  Dwfl_Unwound_Source worst_unwound; /* track CFI source, for diagnostic 
> purposes */
> +};
> +
> +struct hash_arc {
> +  template <class T1, class T2>
> +  size_t operator()(const pair<T1, T2> &p) const {
> +    return hash<T1>()(p.first) ^ hash<T2>()(p.second);
> +  }
> +};
> +
> +// Unwind statistics for a single module identified by build-id.
> +struct UnwindModuleStats {
> +  map<uint64_t, uint32_t> histogram; /* sorted by pc */
> +  unordered_map<pair<uint64_t, uint64_t>, uint32_t, hash_arc> callgraph;
> +
> +  void record_pc(Dwarf_Addr pc) {
> +    if (histogram.count(pc) == 0)
> +      histogram[pc]=1;
> +    else
> +      histogram[pc]++;
> +  }
> +  void record_callgraph_arc(Dwarf_Addr from, Dwarf_Addr to) {
> +    pair<uint64_t, uint64_t> arc(from, to);
> +    if (callgraph.count(arc) == 0)
> +      callgraph[arc]=1;
> +    else
> +      callgraph[arc]++;
> +  }
> +};
> +
> +struct UnwindStatsTable
> +{
> +  unordered_map<pid_t, UnwindDwflStats> dwfl_tab;
> +  unordered_map<string, UnwindModuleStats> buildid_tab;
> +  typedef map<string, UnwindModuleStats> buildid_map_t;
> +
> +  UnwindStatsTable () {}
> +  ~UnwindStatsTable () {}
> +
> +  UnwindDwflStats *pid_find_or_create(pid_t pid);
> +  string pid_find_comm(pid_t pid);
> +  Dwfl *pid_find_dwfl(pid_t pid);
> +  void pid_store_dwfl(pid_t pid, Dwfl *dwfl);
> +
> +  UnwindModuleStats *buildid_find(string buildid);
> +  UnwindModuleStats *buildid_find_or_create(string buildid, Dwfl_Module 
> *mod);
> +
> +  void print_summary() const;
> +};
> +
> +class PerfConsumer;
> +
> +// A PerfReader creates perf_events file descriptors, monitors the
> +// mmap'd ring buffers for events, and dispatches decoded forms to a
> +// PerfConsumer.
> +class PerfReader
> +{
> +private:
> +  /* Sized by number of CPUs or threads: */
> +  vector<int> perf_fds;
> +  vector<perf_event_mmap_page *> perf_headers;
> +  vector<pollfd> pollfds;
> +
> +  PerfConsumer* consumer; // pluralize!
> +  Ebl* default_ebl;
> +  uint64_t sample_regs_user;
> +  int sample_regs_count;
> +  bool enabled;
> +  int page_size;
> +  int page_count;
> +  int mmap_size;
> +  vector<uint8_t> event_wraparound_temp; // for events straddling ring 
> buffer end
> +
> +  void decode_event(const perf_event_header* ehdr);
> +
> +public:
> +  // PerfReader(perf_event_attr* attr, int pid, PerfConsumer* consumer); // 
> attach to process hierarchy; may modify *attr
> +  PerfReader(perf_event_attr* attr, PerfConsumer* consumer, int pid=-1);     
>      // systemwide; may modify *attr
> +
> +  ~PerfReader();
> +
> +  void process_some(); // run briefly, relay decoded perf_events to consumer
> +  uint64_t regs_mask() { return this->sample_regs_user; }
> +  Ebl *ebl() { return this->default_ebl; }
> +};
> +
> +// A PerfConsumer receives both raw and decoded (fields split out into 
> function parameters)
> +// perf event records from a PerfReader.  Pure interface.
> +class PerfConsumer
> +{
> +protected:
> +  PerfReader *reader; /* access sample_regs_user etc. metadata */
> +
> +public:
> +  PerfConsumer() {}
> +  PerfConsumer(PerfReader *reader) : reader(reader) {}
> +  void set_reader(PerfReader *reader) { this->reader = reader; }
> +
> +  virtual ~PerfConsumer() {}
> +  virtual void process(const perf_event_header* sample) {}
> +
> +  virtual void process_comm(const perf_event_header* sample,
> +                           uint32_t pid, uint32_t tid, bool exec, const 
> string& comm) {}
> +  virtual void process_exit(const perf_event_header* sample,
> +                           uint32_t pid, uint32_t ppid,
> +                           uint32_t tid, uint32_t ptid) {}
> +  virtual void process_fork(const perf_event_header* sample,
> +                           uint32_t pid, uint32_t ppid,
> +                           uint32_t tid, uint32_t ptid) {}
> +  virtual void process_sample(const perf_event_header* sample,
> +                             uint64_t ip,
> +                             uint32_t pid, uint32_t tid,
> +                             uint64_t time,
> +                             uint64_t abi,
> +                             uint32_t nregs, const uint64_t *regs,
> +                             uint64_t data_size, const uint8_t *data) {}
> +  virtual void process_mmap2(const perf_event_header* sample,
> +                            uint32_t pid, uint32_t tid,
> +                            uint64_t addr, uint64_t len, uint64_t pgoff,
> +                            uint8_t build_id_size, const uint8_t *build_id,
> +                            const char *filename) {}
> +};
> +
> +// A StatsPerfConsumer is a toy concrete object that accepts decoded
> +// perf events and logs and records basic stats about them.
> +class StatsPerfConsumer: public PerfConsumer
> +{
> +  unordered_map<int,unsigned> event_type_counts;
> +
> +public:
> +  StatsPerfConsumer() {}
> +  ~StatsPerfConsumer(); // report to stdout
> +  void process_comm(const perf_event_header* sample,
> +                   uint32_t pid, uint32_t tid, bool exec, const string& 
> comm);
> +  void process_exit(const perf_event_header* sample,
> +                           uint32_t pid, uint32_t ppid,
> +                   uint32_t tid, uint32_t ptid);
> +  void process_fork(const perf_event_header* sample,
> +                           uint32_t pid, uint32_t ppid,
> +                   uint32_t tid, uint32_t ptid);
> +  void process_sample(const perf_event_header* sample,
> +                             uint64_t ip,
> +                             uint32_t pid, uint32_t tid,
> +                             uint64_t time,
> +                             uint64_t abi,
> +                             uint32_t nregs, const uint64_t *regs,
> +                     uint64_t data_size, const uint8_t *data);
> +  void process_mmap2(const perf_event_header* sample,
> +                            uint32_t pid, uint32_t tid,
> +                            uint64_t addr, uint64_t len, uint64_t pgoff,
> +                            uint8_t build_id_size, const uint8_t *build_id,
> +                    const char *filename);
> +  void process(const perf_event_header* sample);
> +};
> +
> +// An UnwindSample records an unwound call stack from a perf-event
> +// sample.
> +struct UnwindSample
> +{
> +  const perf_event_header *event;
> +  Dwfl *dwfl;
> +  uint32_t pid, tid;
> +  vector<Dwarf_Addr> addrs;
> +  int elfclass;
> +
> +  Dwarf_Addr base; /* for diagnostic purposes */
> +  Dwarf_Addr sp; /* for diagnostic purposes */
> +};
> +
> +class UnwindSampleConsumer;
> +
> +// A PerfConsumerUnwinder accepts decoded perf events, and produces
> +// UnwindSample objects from them for relaying to an
> +// UnwindSampleConsumer.
> +class PerfConsumerUnwinder: public PerfConsumer
> +{
> +  UnwindSampleConsumer *consumer;
> +  UnwindSample last_us; // XXX: why & is this safe to hang onto?
> +  Dwflst_Process_Tracker *tracker;
> +  UnwindStatsTable *stats;
> +  unsigned maxframes;
> +
> +  int find_procfile(Dwfl *dwfl, pid_t *pid, Elf **elf, int *elf_fd);
> +  Dwfl *find_dwfl(pid_t pid, const uint64_t *regs, uint32_t nregs,
> +                 Elf **elf, bool *cached);
> +
> +  int get_sp_reg(bool is_abi32);
> +
> +public:
> +  PerfConsumerUnwinder(UnwindSampleConsumer* usc, UnwindStatsTable *ust);
> +  PerfConsumerUnwinder(UnwindSampleConsumer* usc, UnwindStatsTable *ust, 
> PerfReader *reader);
> +  ~PerfConsumerUnwinder();
> +
> +  /* libdwfl{st} callbacks */
> +  Dwfl *init_dwfl(pid_t pid);
> +  int unwind_frame_cb(Dwfl_Frame *state);
> +
> +  void process_comm(const perf_event_header* sample,
> +                   uint32_t pid, uint32_t tid, bool exec, const string& 
> comm);
> +  void process_exit(const perf_event_header* sample,
> +                   uint32_t pid, uint32_t ppid,
> +                   uint32_t tid, uint32_t ptid);
> +  void process_fork(const perf_event_header* sample,
> +                   uint32_t pid, uint32_t ppid,
> +                   uint32_t tid, uint32_t ptid);
> +  void process_sample(const perf_event_header* sample,
> +                     uint64_t ip,
> +                     uint32_t pid, uint32_t tid,
> +                     uint64_t time,
> +                     uint64_t abi,
> +                     uint32_t nregs, const uint64_t *regs,
> +                     uint64_t data_size, const uint8_t *data);
> +  void process_mmap2(const perf_event_header* sample,
> +                    uint32_t pid, uint32_t tid,
> +                    uint64_t addr, uint64_t len, uint64_t pgoff,
> +                    uint8_t build_id_size, const uint8_t *build_id,
> +                    const char *filename);
> +};
> +
> +// An UnwindSampleConsumer receives an UnwindSample from a 
> PerfConsumerUnwinder.
> +// Pure abstract.
> +class UnwindSampleConsumer
> +{
> +public:
> +  UnwindSampleConsumer() {}
> +  virtual ~UnwindSampleConsumer() {}
> +  virtual void process(const UnwindSample* sample) = 0;
> +  virtual int maxframes() = 0;
> +};
> +
> +
> +// An UnwindStatsConsumer is a toy that just collects statistics about
> +// a received stream of UnwindSamples.
> +class UnwindStatsConsumer: public UnwindSampleConsumer
> +{
> +  UnwindStatsTable *stats;
> +
> +public:
> +  UnwindStatsConsumer(UnwindStatsTable *usc) : stats(usc) {}
> +  ~UnwindStatsConsumer();
> +  void process(const UnwindSample* sample);
> +  int maxframes();
> +};
> +
> +
> +// An GprofUnwindSampleConsumer instance consumes UnwindSamples and tabulates
> +// them by buildid, for eventual writing out into gmon.out format files.
> +class GprofUnwindSampleConsumer: public UnwindSampleConsumer
> +{
> +  UnwindStatsTable *stats;
> +  unordered_map<string, string> buildid_to_mainfile;
> +  unordered_map<string, string> buildid_to_debugfile;
> +  void record_gmon_hist(ostream &of, map<uint64_t, uint32_t> &histogram, 
> uint64_t low_pc, uint64_t high_pc, uint64_t alignment);
> +
> +public:
> +  GprofUnwindSampleConsumer(UnwindStatsTable *usc) : stats(usc) {}
> +  ~GprofUnwindSampleConsumer(); // write out all the gmon.$BUILDID.out files
> +  void record_gmon_out(const string& buildid, UnwindModuleStats& m); // 
> write out one gmon.$BUILDID.out file
> +  void process(const UnwindSample* sample); // accumulate hits / callgraph 
> edges (need maxdepth=1 only)
> +  int maxframes();
> +};
> +
> +// hypothetical: FlamegraphUnwindSampleConsumer, taking in a bigger maxdepth
> +// hypothetical: PprofUnwindSampleConsumer, https://github.com/google/pprof
> +
> +
> +////////////////////////////////////////////////////////////////////////
> +// command line parsing and main()
> +
> +/* Name and version of program.  */
> +ARGP_PROGRAM_VERSION_HOOK_DEF = print_version;
> +
> +/* Bug report address.  */
> +ARGP_PROGRAM_BUG_ADDRESS_DEF = PACKAGE_BUGREPORT;
> +
> +#define HIST_SPLIT_OPTS "none/even/flex"
> +
> +/* Definitions of arguments for argp functions.  */
> +static const struct argp_option options[] =
> +{
> +  { NULL, 0, NULL, OPTION_DOC, N_("Output options:"), 1 },
> +  { "verbose", 'v', NULL, 0, N_ ("Increase verbosity of logging messages 
> (modules/samples/frames/more)."), 0 },
> +  /* TODO: Add "quiet" option suppressing summary table. */
> +  { "gmon", 'g', NULL, 0, N_("Generate gmon.BUILDID.out files for each 
> binary."), 0 },
> +  { "hist-split",'G', HIST_SPLIT_OPTS, 0, N_("Histogram splitting method for 
> gmon, default 'even'."), 0 },

It would be helpful to very briefly describe the difference between
even and flex here.

> +  { "maxframes", 'n', "MAXFRAMES", 0, N_("Maximum number of frames to 
> unwind, default 1 with --gmon, 256 otherwise."), 0 }, /* TODO */
> +  { "output", 'o', "DIR", 0, N_("Output directory for gmon files."), 0 },
> +  { "force", 'f', NULL, 0, N_("Unlink output files to force writing as 
> new."), 0 },
> +  { "pid", 'p', "PID", 0, N_("Profile given PID, and its future children."), 
> 0 },
> +#ifdef HAVE_PERFMON_PFMLIB_PERF_EVENT_H
> +  { "event", 'e', "EVENT", 0, N_("Sample given LIBPFM event 
> specification."), 0 },
> +#define ARGP_KEY_EVENT_LIST 0x1000
> +  { "event-list", ARGP_KEY_EVENT_LIST, NULL, 0, N_("Sample given LIBPFM 
> event specification."), 0 },
> +#endif
> +  { NULL, 0, NULL, 0, NULL, 0 }
> +};
> +
> +static error_t parse_opt (int key, char *arg, struct argp_state *state);
> +static const struct argp argp =
> +  {
> +    options, parse_opt, "[--] [CMD]...", N_("Collect systemwide stack-trace 
> profiles."),
> +    NULL, NULL, NULL
> +  };
> +
> +// How to divide the program counter histograms in gmon output:
> +enum hist_split_method {
> +  HIST_SPLIT_NONE = 0, /* one histogram for the entire executable */
> +  HIST_SPLIT_EVEN = 1, /* all histograms the same size */
> +  HIST_SPLIT_FLEX = 2, /* variable-size histograms */
> +};
> +
> +// Globals set based on command line options:
> +static unsigned verbose;
> +static bool gmon;
> +static hist_split_method gmon_hist_split = HIST_SPLIT_EVEN;
> +static string output_dir = ".";
> +static bool output_force = false; // overwrite preexisting output files?
> +static int pid;
> +static int opt_maxframes = -1; // set to >= 0 to override default maxframes 
> in consumer
> +static string libpfm_event;
> +static string libpfm_event_decoded;
> +static perf_event_attr attr;
> +static bool branch_record = false; // using accurate branch recording for 
> call-graph arcs rather than backtrace heuristics

branch_record is never set to true. Is this intended?

> +
> +// Verbosity categories:
> +static bool show_summary = true; /* XXX could suppress with --quiet */
> +static bool show_modules = false; /* -> first sample for each module */
> +static bool show_samples = false; /* -> every sample */
> +static bool show_frames = false;
> +static bool show_debugfile = false;
> +static bool show_tmi = false; /* -> perf, cfi details */
> +
> +static error_t
> +parse_opt (int key, char *arg, struct argp_state *state)
> +{
> +  (void)state;
> +
> +  switch (key)
> +    {
> +    case ARGP_KEY_INIT:
> +      break;
> +
> +    case 'v':
> +      verbose ++;
> +      break;
> +
> +    case 'g':
> +      gmon = true;
> +      break;
> +
> +    case 'G':
> +      gmon = true; /* Automatically enable gmon mode if they set a gmon 
> option. */
> +      if (std::string_view(arg) == "none")
> +       gmon_hist_split = HIST_SPLIT_NONE;
> +      else if (std::string_view(arg) == "even")
> +       gmon_hist_split = HIST_SPLIT_EVEN;
> +      else if (std::string_view(arg) == "flex")
> +       gmon_hist_split = HIST_SPLIT_FLEX;
> +      break;
> +
> +    case 'o':
> +      gmon = true;
> +      output_dir = arg;
> +      break;
> +
> +    case 'p':
> +      pid = atoi(arg);

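We should add error checking that pid > 0 in case arg isn't a valid pid.
atoi returns 0 when arg is not a number, and 0 is also the default value of
the static pid, so a mistyped -p argument is currently indistinguishable from
-p not being given at all.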

> +      break;
> +
> +    case 'n':
> +      opt_maxframes = atoi(arg);
> +      if (opt_maxframes < 0)
> +       {
> +         argp_error (state, N_("-n MAXFRAMES should be 0 or higher."));
> +         return EINVAL;
> +       }
> +      break;
> +
> +    case 'f':
> +      output_force = true;
> +      break;
> +
> +#ifdef HAVE_PERFMON_PFMLIB_PERF_EVENT_H
> +    case 'e':
> +      libpfm_event = arg;
> +      break;
> +
> +    case ARGP_KEY_EVENT_LIST:
> +      {
> +       pfm_pmu_info_t pinfo;
> +       pfm_event_info_t info;
> +
> +       pfm_err_t rc = pfm_initialize();
> +       if (rc != PFM_SUCCESS)
> +         {
> +           cerr << format("ERROR: pfm_initialized failed: {}\n", 
> pfm_strerror(rc));
> +           exit(1);
> +         }
> +
> +       memset(&pinfo, 0, sizeof(pinfo));
> +       memset(&info, 0, sizeof(info));
> +       pinfo.size = sizeof(pinfo);
> +       info.size = sizeof(info);
> +
> +       for(int j= PFM_PMU_NONE ; j< PFM_PMU_MAX; j++)
> +         {
> +           pfm_err_t ret = pfm_get_pmu_info((pfm_pmu_t) j, &pinfo);
> +           if (ret != PFM_SUCCESS)
> +             continue;
> +           if (! pinfo.is_present)
> +             continue;
> +           for (int i = pinfo.first_event; i != -1; i = 
> pfm_get_event_next(i))
> +             {
> +               ret = pfm_get_event_info(i, PFM_OS_PERF_EVENT_EXT, &info);
> +               if (ret == PFM_SUCCESS)
> +                 clog << format("{}::{}\n", pinfo.name, info.name);
> +             }
> +         }
> +      }
> +      exit(0);
> +#endif
> +
> +    default:
> +      return ARGP_ERR_UNKNOWN;
> +    }
> +  return 0;
> +}
> +
> +sig_atomic_t interrupted;
> +
> +void sigint_handler(int sig)
> +{
> +  interrupted ++;
> +  if (interrupted > 1)
> +    _exit(1);
> +}
> +
> +int
> +main (int argc, char *argv[])
> +{
> +  int remaining;
> +  int pipefd[2] = {-1, -1}; // for CMD child process post-fork sync
> +  bool has_cmd = false;
> +  (void) argp_parse (&argp, argc, argv, 0, &remaining, NULL);
> +
> +  /* show_summary is true by default */
> +  if (verbose > 0) show_modules = true;
> +  if (verbose > 1) show_samples = true;
> +  if (verbose > 2) show_frames = true;
> +  if (verbose > 3) show_debugfile = true;
> +  if (verbose > 4) show_tmi = true;
> +
> +  if (pid > 0 && remaining < argc) // got a pid AND a cmd? reject
> +    {
> +      cerr << format("ERROR: Must not specify both -p PID and CMD\n");
> +      exit(1);
> +    }
> +
> +  bool systemwide = (pid == 0) || (remaining == argc);
> +  (void) systemwide;
> +
> +  try
> +    {
> +      memset(&attr, 0, sizeof(attr));
> +      attr.size = sizeof(attr);
> +
> +      if (libpfm_event != "")
> +       {
> +#if HAVE_PERFMON_PFMLIB_PERF_EVENT_H
> +         pfm_err_t rc = pfm_initialize();
> +         if (rc != PFM_SUCCESS)
> +           {
> +             cerr << format("ERROR: pfm_initialized failed: {}\n", 
> pfm_strerror(rc));
> +             exit(1);
> +           }
> +         char* fstr = nullptr;
> +         pfm_perf_encode_arg_t arg = { .attr = &attr, .fstr=&fstr, .size = 
> sizeof(arg) };
> +         rc = pfm_get_os_event_encoding(libpfm_event.c_str(),
> +                                        PFM_PLM3, /* userspace, whether 
> systemwide or not */
> +                                        PFM_OS_PERF_EVENT_EXT, &arg);
> +         if (rc != PFM_SUCCESS)
> +           {
> +             cerr << format("ERROR: pfm_get_os_event_encoding failed: {}\n", 
> pfm_strerror(rc));
> +             exit(1);
> +           }
> +         if (verbose)
> +           {
> +             clog << format("libpfm expanded {} to {}\n", libpfm_event, 
> fstr);
> +           }
> +         libpfm_event_decoded = fstr; // overwrite
> +         free(fstr);
> +#endif
> +       }
> +      else
> +       {
> +         // same as: -e perf::CPU-CLOCK:freq=1000
> +         attr.type = PERF_TYPE_SOFTWARE;
> +         attr.config = PERF_COUNT_SW_CPU_CLOCK;
> +         attr.sample_freq = 1000;
> +         attr.freq = 1;
> +         attr.exclude_kernel = 1;
> +         attr.exclude_hv = 1;
> +         attr.exclude_guest = 1;
> +       }
> +
> +      if (show_summary)
> +       {
> +         clog << format("perf_event_attr configuration type={:x} config={:x} 
> {}{}\n",
> +                             attr.type, attr.config,
> +                             (attr.freq ? "sample_freq=" : "sample_period="),
> +                             (attr.freq ? attr.sample_freq : 
> attr.sample_period));
> +         clog << endl;
> +       }
> +
> +      if (remaining < argc) // got a CMD... suffix?  ok start it
> +       {
> +         has_cmd = true;
> +         int rc = pipe (pipefd); // will use pipefd[] >= 0 as flag for 
> synchronization just below
> +         if (rc < 0)
> +           {
> +             cerr << format("ERROR: pipe failed: {}\n", strerror(errno));
> +             exit(1);
> +           }
> +
> +         pid = fork();
> +         if (pid == 0) // in child
> +           {
> +             close (pipefd[1]); // close write end
> +             char dummy;
> +             int rc = read (pipefd[0], &dummy, 1); // block until parent is 
> ready
> +             if (rc != 1)
> +               {
> +                 cerr << format("ERROR: child sync read failed: {}\n", 
> strerror(errno));
> +                 exit(1);
> +               }
> +             close (pipefd[0]);
> +             execvp (argv[remaining], & argv[remaining] /* not +1: child 
> argv[0] included! */ );
> +             // notreached unless error
> +             cerr << format("ERROR: execvp failed: {}\n", strerror(errno));
> +             exit(1);
> +           }
> +         else if (pid > 0) // in parent
> +           {
> +             close (pipefd[0]); // close read end
> +             // will write to pipefd[1] after perfreader sicced at child
> +           }
> +         else // error
> +           {
> +             cerr << format("ERROR: fork failed: {}\n", strerror(errno));
> +             exit(1);
> +           }
> +       }
> +
> +      // Create the perf processing pipeline as per command line options
> +      PerfReader *pr = nullptr;
> +      UnwindStatsTable *tab = nullptr;
> +      UnwindSampleConsumer *usc = nullptr;
> +      PerfConsumerUnwinder *pcu = nullptr;
> +      StatsPerfConsumer *spc = nullptr;
> +
> +      if (gmon)
> +       {
> +         tab = new UnwindStatsTable();
> +         usc = new GprofUnwindSampleConsumer(tab);
> +         pcu = new PerfConsumerUnwinder(usc, tab);
> +         pr = new PerfReader(&attr, pcu, pid);
> +       }
> +      else
> +       {
> +         tab = new UnwindStatsTable();
> +         usc = new UnwindStatsConsumer(tab);
> +         pcu = new PerfConsumerUnwinder (usc, tab);
> +         pr = new PerfReader(&attr, pcu, pid);
> +#if 0
> +         spc = new StatsPerfConsumer();
> +         pr = new PerfReader(&attr, spc, pid);
> +#endif

Do we need this #if 0 and StatsPerfConsumer in the committed code?
A comment at the StatsPerfConsumer definition says it's a "toy concrete
object".  That suggests to me that it was just for manual testing during
the initial development of eu-stackprof.  As written, spc is only assigned
inside the #if 0 block, so the later "delete spc" is always a no-op.

If it's intended to remain for debug builds, I suggest defining a
STACKPROF_DEBUG macro (or similar name) near the top of stackprof.cxx for
easier toggling.

> +       }
> +
> +      signal(SIGINT, sigint_handler);
> +      signal(SIGTERM, sigint_handler);
> +
> +      if (pid > 0 && has_cmd) // need to release child CMD process?
> +       {
> +         int rc = write(pipefd[1], "x", 1); // unblock child
> +         assert (rc == 1);
> +         close(pipefd[1]);
> +       }
> +
> +      if (show_summary)
> +       {
> +         clog << "Starting stack profile collection ";
> +         if (pid) clog << format("pid {}", pid);
> +         else clog << "systemwide";
> +         clog << "\n";
> +       }
> +
> +      while (true) // main loop
> +       {
> +         if (interrupted) break;
> +         if (pid > 0) waitpid(pid, NULL, WNOHANG); // reap dead child to 
> allow kill(pid, 0) to signal death
> +         if (pid > 0 && kill(pid, 0) != 0) break; // exit if child or 
> targeted non-child process died
> +         pr->process_some();
> +       }
> +
> +      delete pr;
> +      delete usc;
> +      delete pcu;
> +      delete spc;
> +      delete tab;
> +
> +      // reporting done in various destructors
> +    }
> +  catch (const exception& e)
> +    {
> +      cerr << format("{}\n", e.what());
> +    }
> +
> +  return 0;
> +}
> +
> +
> +////////////////////////////////////////////////////////////////////////
> +// perf reader
> +
> +PerfReader::PerfReader(perf_event_attr* attr, PerfConsumer* consumer, int 
> pid)
> +{
> +  this->page_size = getpagesize();
> +  this->page_count = 64; /* XXX May want to verify if this is a large-enough 
> power-of-2.  */
> +  this->mmap_size = this->page_size * (this->page_count + 1); // total mmap 
> size, incl header page
> +  this->event_wraparound_temp.resize(this->mmap_size); // NB: never resize 
> this object again!
> +  this->consumer = consumer;
> +  this->consumer->set_reader(this);
> +  this->enabled = false;
> +
> +  struct utsname u;
> +  uname(&u);
> +  int em = EM_NONE;
> +  std::string_view machine = u.machine;
> +  if (machine == "x86_64") em = EM_X86_64;
> +  else if (machine == "i686" || machine == "i386") em = EM_386;
> +  else if (machine == "aarch64" || machine == "armv7l") em = EM_ARM;
> +  else {
> +    cerr << format("ERROR: Unsupported architecture: {}\n", u.machine);
> +    exit(1);
> +  }
> +  this->default_ebl = ebl_openbackend_machine(em);
> +  this->sample_regs_user = ebl_perf_frame_regs_mask (this->default_ebl);
> +  this->sample_regs_count = bitset<64>(this->sample_regs_user).count();
> +
> +  attr->sample_regs_user = this->sample_regs_user;
> +  attr->sample_stack_user = 8192; // enough?
> +  attr->sample_type = (PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_TIME);
> +  attr->sample_type |= PERF_SAMPLE_REGS_USER;
> +  attr->sample_type |= PERF_SAMPLE_STACK_USER;
> +  // XXX Maybe: ask for PERF_SAMPLE_CALLCHAIN, in case kernel can
> +  // unwind for us?  Would want an option to control this, to allow
> +  // eu-stackprof to exercise our own unwinding functionality when
> +  // testing.
> +  attr->mmap = 1;
> +  attr->mmap2 = 1;
> +  attr->exclude_kernel = 1; /* in-kernel unwinding not relevant for our 
> usecase */
> +  attr->disabled = 1; /* will get enabled soon */
> +  attr->task = 1; // catch FORK/EXIT
> +  attr->comm = 1; // catch EXEC
> +  attr->comm_exec = 1; // catch EXEC
> +  // attr->precise_ip = 2; // request 0 skid ... but that conflicts with 
> PERF_COUNT_HW_BRANCH_INSTRUCTIONS:freq=4000
> +  attr->build_id = 1; // request build ids in MMAP2 events
> +
> +  if (pid > 0) // actually only once, to allow break in case of error
> +    attr->inherit = 1; // propagate to child processes
> +
> +
> +  if (show_tmi)
> +    { // hexdump attr
> +      clog << "perf_event_attr hexdump: ";
> +      auto bytes = (unsigned char*) attr;
> +      for (size_t x = 0; x<sizeof(*attr); x++)
> +       clog << ((x % 8) ? "" : " ")
> +            << ((x % 32) ? "" : "\n")
> +            << format("{:02x}", (unsigned)bytes[x]);
> +      clog << "\n";
> +    }
> +
> +  // Iterate over all cpus, even if attaching to a single pid, because
> +  // we set ->inherit=1.  That requires possible concurrency, which is
> +  // enabled by per-cpu ring buffers.
> +  int ncpus = sysconf(_SC_NPROCESSORS_CONF);
> +  for (int cpu=0; cpu<ncpus; cpu++)
> +    {
> +      int fd = syscall(__NR_perf_event_open, attr,
> +                      (pid > 0 ? pid : -1), cpu, -1,
> +                      PERF_FLAG_FD_CLOEXEC);
> +      if (fd < 0)
> +       {
> +         cerr << format("WARNING: unable to open perf event for cpu {}: 
> {}\n", cpu, strerror(errno));
> +         continue;
> +       }
> +      void *buf = mmap(NULL, this->mmap_size, PROT_READ | PROT_WRITE, 
> MAP_SHARED, fd, 0);
> +      if (buf == MAP_FAILED)
> +       {
> +         cerr << format("ERROR: perf event mmap failed: {}\n", 
> strerror(errno));
> +         close(fd);
> +         continue;
> +       }
> +      this->perf_fds.push_back(fd);
> +      this->perf_headers.push_back((perf_event_mmap_page*) buf);
> +      struct pollfd pfd = {.fd = fd, .events=POLLIN};
> +      this->pollfds.push_back(pfd);
> +    }
> +
> +  if (this->perf_fds.size() == 0)
> +    throw runtime_error("ERROR: no perf events opened");
> +}
> +
> +PerfReader::~PerfReader()
> +{
> +  for (auto fd : this->perf_fds)
> +    close(fd);
> +  for (auto m : this->perf_headers)
> +    munmap((void*) m, this->mmap_size);
> +  ebl_closebackend (this->default_ebl);
> +}
> +
> +uint64_t millis_monotonic()
> +{
> +  return 
> chrono::duration_cast<chrono::milliseconds>(chrono::steady_clock::now().time_since_epoch()).count();
> +}
> +
> +static inline uint64_t
> +ring_buffer_read_head(volatile struct perf_event_mmap_page *base)
> +{
> +  uint64_t head = base->data_head;
> +  asm volatile("" ::: "memory"); // memory fence
> +  return head;
> +}
> +
> +static inline void
> +ring_buffer_write_tail(volatile struct perf_event_mmap_page *base,
> +                      uint64_t tail)
> +{
> +  asm volatile("" ::: "memory"); // memory fence
> +  base->data_tail = tail;
> +}
> +
> +void PerfReader::process_some()
> +{
> +  if (! this->enabled)
> +    {
> +      for (auto fd : this->perf_fds)
> +       ioctl(fd, PERF_EVENT_IOC_ENABLE, 0 /* value ignored */);
> +      this->enabled = true;
> +    }
> +
> +  uint64_t starttime = millis_monotonic();
> +  uint64_t endtime = starttime + 1000; // run at most one second
> +  uint64_t ring_buffer_size = this->page_size * this->page_count; // just 
> the ring buffer size
> +
> +  while (! interrupted)
> +    {
> +      uint64_t now = millis_monotonic();
> +      if (endtime < now)
> +       break;
> +      int ready = poll(this->pollfds.data(), this->pollfds.size(), 
> (int)(endtime-now)); // wait a little while
> +      if (ready < 0)
> +       break;
> +
> +      for (size_t i = 0; i < pollfds.size(); i++)
> +       if (this->pollfds[i].revents & POLLIN) // found an fd with fresh 
> yummy events
> +         {
> +           perf_event_mmap_page *header = perf_headers[i];
> +           uint64_t data_head = ring_buffer_read_head(header);
> +           uint64_t data_tail = header->data_tail;
> +           uint8_t *base = ((uint8_t *) header) + this->page_size;
> +           struct perf_event_header *ehdr;
> +           size_t ehdr_size;
> +
> +           while (data_head != data_tail) // consume all packets in ring 
> buffer XXX why?
> +             {
> +               ehdr = (perf_event_header*) (base + (data_tail & 
> (ring_buffer_size - 1)));
> +               ehdr_size = ehdr->size;
> +               if (show_tmi)
> +                 clog << format("perf head={:p} tail={:p} ehdr={:p} 
> size={:d}{:x}\n",
> +                                     (void*) data_head, (void*) data_tail, 
> (void*) ehdr, ehdr_size, 0);
> +
> +               if (((uint8_t *)ehdr) + ehdr_size > base + ring_buffer_size) 
> // mmap region wraparound?
> +                 {
> +                   // need to copy it to a contiguous temporary
> +                   uint8_t *copy_start = (uint8_t*) ehdr;
> +                   size_t len_first = base + ring_buffer_size - copy_start;
> +                   size_t len_secnd = ehdr_size - len_first;
> +                   uint8_t *event_temp = this->event_wraparound_temp.data();
> +                   memcpy(event_temp, copy_start, len_first);       // part 
> at end of mmap'd region
> +                   memcpy(event_temp + len_first, base, len_secnd); // part 
> at beginning of mmap'd region
> +                   ehdr = (perf_event_header*) event_temp;
> +                 }
> +
> +               this->decode_event(ehdr);
> +               data_tail += ehdr_size;
> +             }
> +
> +           ring_buffer_write_tail(header, data_tail);
> +         }
> +    }
> +}
> +
> +void PerfReader::decode_event(const perf_event_header* ehdr)
> +{
> +  consumer->process(ehdr); // allow general processing
> +
> +  // and decode into individual event types
> +  switch (ehdr->type)
> +    {
> +    case PERF_RECORD_SAMPLE:
> +      {
> +       const uint8_t* data = reinterpret_cast<const uint8_t*>(ehdr) + 
> sizeof(perf_event_header);
> +       uint64_t ip = *reinterpret_cast<const uint64_t*>(data); data += 
> sizeof(uint64_t);
> +       uint32_t pid = *reinterpret_cast<const uint32_t*>(data); data += 
> sizeof(uint32_t);
> +       uint32_t tid = *reinterpret_cast<const uint32_t*>(data); data += 
> sizeof(uint32_t);
> +       uint64_t time = *reinterpret_cast<const uint64_t*>(data); data += 
> sizeof(uint64_t);
> +       // PERF_SAMPLE_CALLCHAIN would be here if requested
> +       uint64_t abi = *reinterpret_cast<const uint64_t*>(data); data += 
> sizeof(uint64_t);
> +       uint32_t nregs = this->sample_regs_count;
> +       const uint64_t* regs = reinterpret_cast<const uint64_t*>(data); data 
> += nregs * sizeof(uint64_t);
> +       uint64_t data_size = *reinterpret_cast<const uint64_t*>(data); data 
> += sizeof(uint64_t);
> +       const uint8_t* stack_data = data;
> +       consumer->process_sample(ehdr, ip, pid, tid, time, abi, nregs, regs, 
> data_size, stack_data);
> +       break;
> +      }
> +    case PERF_RECORD_COMM:
> +      {
> +       const uint8_t* data = reinterpret_cast<const uint8_t*>(ehdr) + 
> sizeof(perf_event_header);
> +       uint32_t pid = *reinterpret_cast<const uint32_t*>(data); data += 
> sizeof(uint32_t);
> +       uint32_t tid = *reinterpret_cast<const uint32_t*>(data); data += 
> sizeof(uint32_t);
> +       const char* comm = reinterpret_cast<const char*>(data);
> +       consumer->process_comm(ehdr, pid, tid, (ehdr->misc & 
> PERF_RECORD_MISC_COMM_EXEC), comm);
> +       break;
> +      }
> +    case PERF_RECORD_EXIT:
> +      {
> +       const uint8_t* data = reinterpret_cast<const uint8_t*>(ehdr) + 
> sizeof(perf_event_header);
> +       uint32_t pid = *reinterpret_cast<const uint32_t*>(data); data += 
> sizeof(uint32_t);
> +       uint32_t ppid = *reinterpret_cast<const uint32_t*>(data); data += 
> sizeof(uint32_t);
> +       uint32_t tid = *reinterpret_cast<const uint32_t*>(data); data += 
> sizeof(uint32_t);
> +       uint32_t ptid = *reinterpret_cast<const uint32_t*>(data);
> +       consumer->process_exit(ehdr, pid, ppid, tid, ptid);
> +       break;
> +      }
> +    case PERF_RECORD_FORK:
> +      {
> +       const uint8_t* data = reinterpret_cast<const uint8_t*>(ehdr) + 
> sizeof(perf_event_header);
> +       uint32_t pid = *reinterpret_cast<const uint32_t*>(data); data += 
> sizeof(uint32_t);
> +       uint32_t ppid = *reinterpret_cast<const uint32_t*>(data); data += 
> sizeof(uint32_t);
> +       uint32_t tid = *reinterpret_cast<const uint32_t*>(data); data += 
> sizeof(uint32_t);
> +       uint32_t ptid = *reinterpret_cast<const uint32_t*>(data); data += 
> sizeof(uint32_t);
> +       consumer->process_fork(ehdr, pid, ppid, tid, ptid);
> +       break;
> +      }
> +    case PERF_RECORD_MMAP2:
> +      {
> +       const uint8_t* data = reinterpret_cast<const uint8_t*>(ehdr) + 
> sizeof(perf_event_header);
> +       uint32_t pid = *reinterpret_cast<const uint32_t*>(data); data += 
> sizeof(uint32_t);
> +       uint32_t tid = *reinterpret_cast<const uint32_t*>(data); data += 
> sizeof(uint32_t);
> +       uint64_t addr = *reinterpret_cast<const uint64_t*>(data); data += 
> sizeof(uint64_t);
> +       uint64_t len = *reinterpret_cast<const uint64_t*>(data); data += 
> sizeof(uint64_t);
> +       uint64_t pgoff = *reinterpret_cast<const uint64_t*>(data); data += 
> sizeof(uint64_t);
> +       uint8_t build_id_size = 0;
> +       const uint8_t* build_id = nullptr;
> +       if (ehdr->misc & PERF_RECORD_MISC_MMAP_BUILD_ID)
> +         {
> +           build_id_size = *reinterpret_cast<const uint8_t*>(data); data += 
> sizeof(uint8_t);
> +           data += sizeof(uint8_t) + sizeof(uint16_t); // skip padding
> +           build_id = reinterpret_cast<const uint8_t*>(data);
> +           data += build_id_size;
> +         }
> +       else
> +         {
> +           data += 4 + 4 + 8 + 8; // maj, min, ino, ino_generation
> +         }
> +       data += sizeof(uint32_t) + sizeof(uint32_t); // prot, flags
> +       const char* filename = reinterpret_cast<const char*>(data);
> +       consumer->process_mmap2(ehdr, pid, tid, addr, len, pgoff, 
> build_id_size, build_id, filename);
> +       break;
> +      }
> +    default:
> +      break;
> +    }
> +}
> +
> +
> +////////////////////////////////////////////////////////////////////////
> +// perf event consumers
> +
> +void StatsPerfConsumer::process_comm(const perf_event_header *sample,
> +                                    uint32_t pid, uint32_t tid, bool exec, 
> const string &comm)
> +{
> +  if (show_modules)
> +    {
> +      clog << format("process_comm: pid={} tid={} exec={} comm={}\n", pid, 
> tid, exec, comm);
> +    }
> +}
> +
> +void StatsPerfConsumer::process_exit(const perf_event_header *sample,
> +                                    uint32_t pid, uint32_t ppid,
> +                                    uint32_t tid, uint32_t ptid)
> +{
> +  if (show_modules)
> +    {
> +      clog << format("process_exit: pid={} ppid={} tid={} ptid={}\n", pid, 
> ppid, tid, ptid);
> +    }
> +}
> +
> +void StatsPerfConsumer::process_fork(const perf_event_header *sample,
> +                                    uint32_t pid, uint32_t ppid,
> +                                    uint32_t tid, uint32_t ptid)
> +{
> +  if (show_modules)
> +    {
> +      clog << format("process_fork: pid={} ppid={} tid={} ptid={}\n", pid, 
> ppid, tid, ptid);
> +    }
> +}
> +
> +void StatsPerfConsumer::process_sample(const perf_event_header *sample,
> +                                      uint64_t ip,
> +                                      uint32_t pid, uint32_t tid,
> +                                      uint64_t time,
> +                                      uint64_t abi,
> +                                      uint32_t nregs, const uint64_t *regs,
> +                                      uint64_t data_size, const uint8_t 
> *data)
> +{
> +  if (show_samples)
> +    {
> +      clog << format("process_sample: pid={:d} tid={:d} ip={:x} time={:d} 
> abi={:d} nregs={:d} data_size={:d}\n",
> +                         pid, tid, ip, time, abi, nregs, data_size);
> +    }
> +}
> +
> +void StatsPerfConsumer::process_mmap2(const perf_event_header *sample,
> +                                     uint32_t pid, uint32_t tid,
> +                                     uint64_t addr, uint64_t len, uint64_t 
> pgoff,
> +                                     uint8_t build_id_size, const uint8_t 
> *build_id,
> +                                     const char *filename)
> +{
> +  if (show_modules)
> +    {
> +      clog << format("process_mmap2: pid={:d} tid={:d} addr={:x} len={:x} 
> pgoff={:x} build_id_size={:d} filename={:s}\n",
> +                         pid, tid, addr, len, pgoff, 
> (unsigned)build_id_size, filename);
> +    }
> +}
> +
> +StatsPerfConsumer::~StatsPerfConsumer()
> +{
> +  for (const auto& kv : this->event_type_counts)
> +    {
> +      clog << format("event type {} count {}\n", kv.first, kv.second);
> +    }
> +}
> +
> +void StatsPerfConsumer::process(const perf_event_header* ehdr)
> +{
> +  this->event_type_counts[ehdr->type] ++;
> +}
> +
> +
> +//////////////////////////////////////////////////////////////////////
> +// unwind stats table for PerfConsumerUnwinder + downstream consumers
> +
> +UnwindDwflStats *UnwindStatsTable::pid_find_or_create (pid_t pid)
> +{
> +  if (this->dwfl_tab.count(pid) == 0)
> +    this->dwfl_tab.emplace(pid, UnwindDwflStats());
> +  return &this->dwfl_tab[pid];
> +}
> +
> +static const string unknown_comm = "<unknown>";
> +
> +string UnwindStatsTable::pid_find_comm (pid_t pid)
> +{
> +  UnwindDwflStats *entry = this->pid_find_or_create(pid);
> +  if (entry == NULL)
> +    return unknown_comm;
> +  if (!entry->comm.empty())
> +    return entry->comm;
> +  string name = format("/proc/{}/comm", pid);
> +  ifstream procfile(name);
> +  string buf;
> +  if (!procfile || !getline(procfile, buf))
> +    entry->comm = unknown_comm;
> +  else
> +    entry->comm = buf;
> +
> +  return entry->comm;
> +}
> +
> +Dwfl *UnwindStatsTable::pid_find_dwfl (pid_t pid)
> +{
> +  if (this->dwfl_tab.count(pid) == 0)
> +    return NULL;
> +  return this->dwfl_tab[pid].dwfl;
> +}
> +
> +void UnwindStatsTable::pid_store_dwfl (pid_t pid, Dwfl *dwfl)
> +{
> +  UnwindDwflStats *entry = this->pid_find_or_create(pid);
> +  if (entry == NULL)
> +    return;
> +  entry->dwfl = dwfl;
> +  if (show_summary)
> +    this->pid_find_comm(pid);
> +  return;
> +}
> +
> +UnwindModuleStats *UnwindStatsTable::buildid_find (string buildid)
> +{
> +  if (this->buildid_tab.count(buildid) == 0)
> +    return NULL;
> +  return &this->buildid_tab[buildid];
> +}
> +
> +UnwindModuleStats *UnwindStatsTable::buildid_find_or_create (string buildid, 
> Dwfl_Module *mod)
> +{
> +  if (this->buildid_tab.count(buildid) == 0)
> +    {
> +      this->buildid_tab.emplace(buildid, UnwindModuleStats());
> +      /* TODO: Guess text range for mod? */
> +      (void)mod;
> +    }
> +  return &this->buildid_tab[buildid];
> +}
> +
> +void UnwindStatsTable::print_summary () const
> +{
> +#define PERCENT(x,tot) ((x+tot == 0)?0.0:((double)x)/((double)tot)*100.0)
> +  int total_samples = 0;
> +  int total_lost_samples = 0;
> +  clog << "\n=== pid / sample counts ===\n";
> +  for (auto& p : this->dwfl_tab)
> +    {
> +      pid_t pid = p.first;
> +      const UnwindDwflStats& d = p.second;
> +      clog << format(N_("{} {} -- max {} frames, received {} samples, lost 
> {} samples ({:.1f}%) (last {}, worst {})\n"),
> +              pid, d.comm, d.max_frames,
> +              d.total_samples, d.lost_samples,
> +              PERCENT(d.lost_samples, d.total_samples),
> +              dwfl_unwound_source_str(d.last_unwound),
> +              dwfl_unwound_source_str(d.worst_unwound));
> +      total_samples += d.total_samples;
> +      total_lost_samples += d.lost_samples;
> +    }
> +  clog << "===\n";
> +  clog << format(N_("TOTAL -- received {} samples, lost {} samples, loaded 
> {} processes\n"),
> +         total_samples, total_lost_samples,
> +         this->dwfl_tab.size() /* TODO: If implementing eviction, need to 
> maintain a separate count of evicted pids. */);
> +  clog << "\n";
> +#undef PERCENT
> +}
> +
> +////////////////////////////////////////////////////////////////////////
> +// real perf consumer: unwind helpers
> +
> +PerfConsumerUnwinder::PerfConsumerUnwinder(UnwindSampleConsumer* usc, 
> UnwindStatsTable *ust)
> +    : consumer(usc), stats(ust) {
> +  maxframes = usc->maxframes();
> +  this->tracker = dwflst_tracker_begin (&dwfl_cfi_callbacks);
> +}
> +
> +PerfConsumerUnwinder::PerfConsumerUnwinder(UnwindSampleConsumer* usc, 
> UnwindStatsTable *ust, PerfReader *reader)
> +  : consumer(usc), stats(ust) {
> +  maxframes = usc->maxframes();
> +  this->reader = reader;
> +  this->tracker = dwflst_tracker_begin (&dwfl_cfi_callbacks);
> +}
> +
> +PerfConsumerUnwinder::~PerfConsumerUnwinder() {
> +  dwflst_tracker_end (this->tracker);
> +}
> +
> +/* TODO: Could be relocated to libdwfl/linux-pid-attach.c
> +   to remove some duplication of existing linux-pid-attach code. */
> +int PerfConsumerUnwinder::find_procfile (Dwfl *dwfl, pid_t *pid, Elf **elf, 
> int *elf_fd)
> +{
> +  int err = 0; /* The errno to return. XXX libdwfl would also set this for 
> dwfl->attacherr.  */
> +
> +  /* Make sure to report the actual PID (thread group leader) to
> +     dwfl_attach_state.  */
> +  string buffer = format("/proc/{}/status", *pid);
> +  ifstream procfile(buffer);
> +  if (!procfile)
> +    {
> +      err = errno;
> +    fail:
> +      return err;
> +    }
> +
> +  string line;
> +  while (getline (procfile, line))
> +    if (startswith (line.c_str(), "Tgid:"))
> +      {
> +       errno = 0;
> +       char *endptr;
> +       long val = strtol (&line.c_str()[5], &endptr, 10);
> +       if ((errno == ERANGE && val == LONG_MAX)
> +           || *endptr != '\n' || val < 0 || val != (pid_t) val)
> +         *pid = 0;
> +       else
> +         *pid = (pid_t) val;
> +       break;
> +      }
> +
> +  if (*pid == 0)
> +    {
> +      err = ESRCH;
> +      goto fail;
> +    }
> +
> +  {
> +    string name = format("/proc/{}/task", *pid);
> +    DIR *dir = opendir (name.c_str());
> +    if (dir == NULL)
> +      {
> +        err = errno;
> +        goto fail;
> +      }
> +    else
> +      closedir(dir);
> +  }
> +
> +  {
> +    string name = format("/proc/{}/exe", *pid);
> +    *elf_fd = open (name.c_str(), O_RDONLY);
> +  }
> +  if (*elf_fd >= 0)
> +    {
> +      *elf = elf_begin (*elf_fd, ELF_C_READ_MMAP, NULL);
> +      if (*elf == NULL)
> +       {
> +         /* Just ignore, dwfl_attach_state will fall back to trying
> +            to associate the Dwfl with one of the existing Dwfl_Module
> +            ELF images (to know the machine/class backend to use).  */
> +         if (verbose)
> +           cerr << format(N_("WARNING: find_procfile pid {}: elf not 
> found\n"), (long long)*pid);
> +         close (*elf_fd);
> +         *elf_fd = -1;
> +       }
> +    }
> +  else
> +    *elf = NULL;
> +  return 0;
> +}
> +
> +Dwfl *PerfConsumerUnwinder::init_dwfl(pid_t pid)
> +{
> +  Dwfl *dwfl = dwflst_tracker_dwfl_begin (this->tracker);
> +
> +  int err = dwfl_linux_proc_report (dwfl, pid);
> +  if (err < 0)
> +    {
> +      if (verbose)
> +       cerr << format("WARNING: dwfl_linux_proc_report pid {}: {}\n", (long 
> long) pid, dwfl_errmsg(-1));
> +      return NULL;
> +    }
> +  err = dwfl_report_end (dwfl, NULL, NULL);
> +  if (err != 0)
> +    {
> +      if (verbose)
> +       cerr << format("WARNING: dwfl_report_end pid {}: {}\n", (long long) 
> pid, dwfl_errmsg(-1));
> +      return NULL;
> +    }
> +
> +  return dwfl;
> +}
> +
> +Dwfl *pcu_init_dwfl_cb (Dwflst_Process_Tracker *cb_tracker __attribute__ 
> ((unused)),
> +                       pid_t pid,
> +                       void *arg)
> +{
> +  PerfConsumerUnwinder *pcu = (PerfConsumerUnwinder *)arg;
> +  return pcu->init_dwfl (pid);
> +}
> +
> +uint32_t expected_frame_nregs (Ebl *ebl)
> +{
> +  int m = ebl_get_elfmachine(ebl);
> +  /* For aarch64, we actually use fewer than ebl->frame_nregs to unwind. */
> +  if (m == EM_ARM)
> +    return 14; /* XXX 16 for 32-bit ARM */
> +  /* On x86, expect everything except FLAGS: */
> +  if (m == EM_X86_64 || m == EM_386)
> +    return ebl_frame_nregs(ebl);
> +  /* In general, it's better to be on the permissive side. */
> +  return 1;
> +}
> +
> +Dwfl *PerfConsumerUnwinder::find_dwfl(pid_t pid, const uint64_t *regs, 
> uint32_t nregs,
> +                                     Elf **out_elf, bool *cached)
> +{
> +  if (nregs < expected_frame_nregs(this->reader->ebl()))
> +    {
> +      if (verbose)
> +       cerr << format(N_("WARNING: find_dwfl: nregs={}, expected at least 
> {}\n"), nregs, ebl_frame_nregs(this->reader->ebl()));
> +      return NULL;
> +    }
> +
> +  Elf *elf = NULL;
> +  Dwfl *dwfl = dwflst_tracker_find_pid (this->tracker, pid, 
> pcu_init_dwfl_cb, this);
> +  int elf_fd = -1;
> +  int err;
> +  if (dwfl != NULL && dwfl_pid(dwfl) != -1 /* dwfl is attached */)
> +    {
> +      *cached = true;
> +      goto reuse;
> +    }
> +  err = this->find_procfile (dwfl, &pid, &elf, &elf_fd);
> +  if (err < 0)

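find_procfile returns a positive errno value (errno or ESRCH) when it
encounters an error, never a negative one.  So this `err < 0` check is never
true: the error handler will never run, and on failure we fall through to
`reuse` with elf/elf_fd still NULL/-1.  The check should be `err != 0`.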

> +    {
> +      if (verbose)
> +       cerr << format("WARNING: find_procfile pid {}: {}\n", (long long) 
> pid, dwfl_errmsg(-1));
> +      return NULL;
> +    }
> +
> + reuse:
> +  this->last_us.sp = regs[this->get_sp_reg(this->last_us.elfclass == 
> ELFCLASS32)];
> +  this->last_us.base = this->last_us.sp;
> +
> +  if (!*cached)
> +    this->stats->pid_store_dwfl (pid, dwfl);
> +  *out_elf = elf;
> +  return dwfl;
> +}
> +
> +/* Index of stack pointer within dwarf_regs order: */
> +int PerfConsumerUnwinder::get_sp_reg(bool is_abi32)
> +{
> +  int machine = ebl_get_elfmachine(this->reader->ebl());
> +  if (machine == EM_X86_64 || machine == EM_386) return is_abi32 ? 4 : 7;
> +  else if (machine == EM_ARM) return is_abi32 ? 13 : 31;
> +  else { assert(0); return 7; }
> +}
> +
> +int PerfConsumerUnwinder::unwind_frame_cb(Dwfl_Frame *state)
> +{
> +  Dwarf_Addr pc;
> +  bool isactivation;
> +  if (! dwfl_frame_pc (state, &pc, &isactivation))
> +    {
> +      if (verbose)
> +       cerr << format("WARNING: dwfl_frame_pc: {}\n", dwfl_errmsg(-1));
> +      return DWARF_CB_ABORT;
> +    }
> +
> +  Dwarf_Addr pc_adjusted = pc - (isactivation ? 0 : 1);
> +  Dwarf_Addr sp;
> +
> +  int is_abi32 = (this->last_us.elfclass == ELFCLASS32);
> +  int user_regs_sp = this->get_sp_reg(is_abi32);
> +  int rc = dwfl_frame_reg (state, user_regs_sp, &sp);
> +  if (rc < 0)
> +    {
> +      if (verbose)
> +       cerr << format("WARNING: dwfl_frame_reg: {}\n", dwfl_errmsg(-1));
> +      return DWARF_CB_ABORT;
> +    }
> +
> +  UnwindDwflStats *dwfl_ent = 
> this->stats->pid_find_or_create(this->last_us.pid);
> +  if (dwfl_ent != NULL)
> +    {
> +      Dwfl_Unwound_Source unwound_source = dwfl_frame_unwound_source(state);
> +      if (unwound_source > dwfl_ent->worst_unwound)
> +       dwfl_ent->worst_unwound = unwound_source;
> +      dwfl_ent->last_unwound = unwound_source;
> +      if (show_frames)
> +       {
> +         Dwfl_Module *m = dwfl_addrmodule(this->last_us.dwfl, pc);
> +         uint64_t rel_pc = pc_adjusted;
> +         int j = dwfl_module_relocate_address (m, &rel_pc);
> +         (void) j;
> +         clog << format("* frame {:d}: rel_pc={:x} raw_pc={:x} sp={:x}+{:x} 
> [{}]\n",
> +                        this->last_us.addrs.size(), rel_pc, pc_adjusted, 
> this->last_us.base, (sp - this->last_us.base), 
> dwfl_unwound_source_str(unwound_source));
> +       }
> +    }
> +  else
> +    {
> +      if (show_frames)
> +       {
> +         Dwfl_Module *m = dwfl_addrmodule(this->last_us.dwfl, pc);
> +         uint64_t rel_pc = pc_adjusted;
> +         int j = dwfl_module_relocate_address (m, &rel_pc);
> +         (void) j;
> +         clog << format(N_("* frame {:d}: rel_pc={:x} raw_pc={:x} 
> sp={:x}+{:x} [dwfl_ent not found]\n"),
> +                        this->last_us.addrs.size(), rel_pc, pc_adjusted, 
> this->last_us.base, (sp - this->last_us.base));
> +       }
> +    }
> +  if (show_debugfile)
> +    {
> +      Dwfl_Module *m = dwfl_addrmodule(this->last_us.dwfl, pc);
> +      if (m == NULL)
> +       {
> +         clog << format("* pid {:d} pc={:x} -> MODULE NOT FOUND\n",
> +                        this->last_us.pid, pc);
> +       }
> +      else
> +       {
> +         const unsigned char *desc;
> +         GElf_Addr vaddr;
> +         int build_id_len = dwfl_module_build_id (m, &desc, &vaddr);
> +         clog << format("* pid {:d} build_id=", this->last_us.pid);
> +         for (int i = 0; i < build_id_len; ++i)
> +           clog << format("{:02x}", static_cast<int>(desc[i]));
> +
> +         const char *mainfile;
> +         const char *debugfile;
> +         const char *modname = dwfl_module_info (m, NULL, NULL, NULL, NULL,
> +                                                 NULL, &mainfile, 
> &debugfile);
> +         clog << format("module={} mainfile={} debugfile={}\n",
> +                        modname,
> +                        mainfile ? mainfile : "<none>",
> +                        debugfile ? debugfile : "<none>");
> +         /* TODO: Also store this data to avoid repeated extraction for
> +            the final buildid summary?  */
> +#ifdef DEBUG_MODULES
> +         Dwarf_Addr bias;
> +         Dwarf_CFI *cfi_eh = dwfl_module_eh_cfi (m, &bias);
> +         if (cfi_eh == NULL)
> +           clog << format("* pc={:x} -> NO EH_CFI\n", pc);
> +#endif
> +       }
> +    }
> +
> +  this->last_us.sp = sp;
> +  this->last_us.addrs.push_back(pc);
> +
> +  /* e.g. gmon callgraphs only requires maxframes=1
> +     (initial pc + one frame for caller ID only) */
> +  if (this->last_us.addrs.size() > this->maxframes)
> +    {
> +      /* XXX without maxframes, very rarely, the unwinder can loop
> +        infinitely; worth investigating? */
> +      return DWARF_CB_ABORT;
> +    }
> +  return DWARF_CB_OK;
> +}
> +
> +int pcu_unwind_frame_cb(Dwfl_Frame *state, void *arg)
> +{
> +  PerfConsumerUnwinder *pcu = (PerfConsumerUnwinder *)arg;
> +  return pcu->unwind_frame_cb(state);
> +}
> +
> +
> +////////////////////////////////////////////////////////////////////////
> +// real perf consumer: event handler callbacks
> +
> +void PerfConsumerUnwinder::process_comm(const perf_event_header *sample,
> +                                       uint32_t pid, uint32_t tid, bool 
> exec, const string &comm)
> +{
> +  // NB: Could have dwflst ditch data for process and start anew, if EXEC.
> +  // XXX: is this needed to avoid gradual memory leaks or pid reuse?
> +}
> +
> +void PerfConsumerUnwinder::process_exit(const perf_event_header *sample,
> +                                       uint32_t pid, uint32_t ppid,
> +                                       uint32_t tid, uint32_t ptid)
> +{
> +  // NB: Could have dwflst ditch data for process.
> +  // XXX: is this needed to avoid gradual memory leaks or pid reuse?
> +}
> +
> +void PerfConsumerUnwinder::process_fork(const perf_event_header *sample,
> +                                       uint32_t pid, uint32_t ppid,
> +                                       uint32_t tid, uint32_t ptid)
> +{
> +  // NB: Could have dwflst begin tracking a new process, but
> +  // this will likely happen automatically when a packet is received
> +  // from it.  The short duration between fork/exec typically means
> +  // elfutils will pick up on the post-exec process -- we would have
> +  // to work hard to replicate a situation where
> +  // process_fork/process_comm handling are needed.
> +}
> +
> +void PerfConsumerUnwinder::process_sample(const perf_event_header *sample,
> +                                         uint64_t ip,
> +                                         uint32_t pid, uint32_t tid,
> +                                         uint64_t time,
> +                                         uint64_t abi,
> +                                         uint32_t nregs, const uint64_t 
> *regs,
> +                                         uint64_t data_size, const uint8_t 
> *data)
> +{
> +  string comm;
> +  if (show_summary)
> +    comm = this->stats->pid_find_comm(pid);
> +
> +  if (show_frames)
> +    clog << "\n"; /* extra newline for padding */
> +
> +  Elf *elf = NULL; // Released during dwflst_tracker_end
> +  bool cached = false;
> +  Dwfl *dwfl = this->find_dwfl (pid, regs, nregs, &elf, &cached);
> +  UnwindDwflStats *dwfl_ent = NULL;
> +  bool first_load = false; /* -> for show_modules: pid is loaded first time 
> */
> +  if (verbose || show_summary || show_modules)
> +    {
> +      if (dwfl_ent == NULL)
> +       dwfl_ent = this->stats->pid_find_or_create(pid);
> +      if (dwfl_ent->total_samples == 0)
> +       first_load = true;
> +    }
> +  if (dwfl == NULL)
> +    {
> +      if (show_summary || show_modules)
> +       {
> +         /* dwfl_ent loaded above */
> +         dwfl_ent->total_samples++;
> +         dwfl_ent->lost_samples++;
> +       }
> +      if (verbose && show_summary)
> +       {
> +         cerr << format("WARNING: find_dwfl pid {} ({}) (failed)\n", (long 
> long)pid, comm);
> +       }
> +      else
> +       {
> +         cerr << format("WARNING: find_dwfl pid {} (failed)\n", (long 
> long)pid);
> +       }
> +      return;
> +    }
> +
> +  if (show_samples || (first_load && show_modules))
> +    {
> +      bool is_abi32 = (abi == PERF_SAMPLE_REGS_ABI_32);
> +      clog << format("find_dwfl {}pid {:d} {}({}): hdr_size={:d} size={:d}{} 
> pc={:x} sp={:x}+{:d}\n",
> +                    first_load ? "newly seen " : "", (long long)pid,
> +                    (cached ? "(cached) " : ""), comm,
> +                    sample->size, data_size,
> +                    (is_abi32 ? " (32-bit)" : ""), ip,
> +                    this->last_us.base, 0);
> +    }
> +
> +  this->last_us.addrs.clear();
> +  this->last_us.elfclass = (abi == PERF_SAMPLE_REGS_ABI_32 ? ELFCLASS32 : 
> ELFCLASS64);
> +  this->last_us.dwfl = dwfl;
> +  this->last_us.pid = pid;
> +  int rc = dwflst_perf_sample_getframes (dwfl, elf, pid, tid,
> +                                        data, data_size,
> +                                        regs, nregs,
> +                                        this->reader->regs_mask(), abi,
> +                                        pcu_unwind_frame_cb, this);
> +  if (rc < 0)
> +    {
> +      /* dwfl_ent loaded above */
> +      if (verbose && dwfl_ent->shown_errors < 10)
> +       {
> +         dwfl_ent->shown_errors ++;
> +         cerr << format("WARNING: dwflst_perf_sample_getframes pid {}: 
> {}{}\n",
> +                        (long long)pid, dwfl_errmsg(-1),
> +                        dwfl_ent->shown_errors >= 10 ?
> +                        " (...suppressing further warnings for this pid)" : 
> "");
> +       }
> +    }
> +  if (show_summary)
> +    {
> +      /* For final diagnostics.  dwfl_ent loaded above */
> +      if (this->last_us.addrs.size() > (unsigned long)dwfl_ent->max_frames)
> +       dwfl_ent->max_frames = this->last_us.addrs.size();
> +      dwfl_ent->total_samples++;
> +      if (this->maxframes > 2 && this->last_us.addrs.size() <= 2)
> +       dwfl_ent->lost_samples++;
> +    }
> +
> +  this->consumer->process (&this->last_us);
> +  return;
> +}
> +
> +void PerfConsumerUnwinder::process_mmap2(const perf_event_header *sample,
> +                                        uint32_t pid, uint32_t tid,
> +                                        uint64_t addr, uint64_t len, 
> uint64_t pgoff,
> +                                        uint8_t build_id_size, const uint8_t 
> *build_id,
> +                                        const char *filename)
> +{
> +  Dwfl *dwfl = this->stats->pid_find_dwfl(pid);
> +  if (dwfl != NULL)
> +    {
> +      dwfl_report_begin_add(dwfl);
> +      dwfl_report_module(dwfl, filename, /*start*/ addr, /*end*/ addr + len);
> +      dwfl_report_end(dwfl, NULL, NULL);
> +    }
> +}
> +
> +
> +////////////////////////////////////////////////////////////////////////
> +// unwind data consumers // basic statistics
> +
> +UnwindStatsConsumer::~UnwindStatsConsumer()
> +{
> +  this->stats->print_summary();
> +}
> +
> +void UnwindStatsConsumer::process(const UnwindSample* sample)
> +{
> +  /* Most of the logic is handled by UnwindStatsTable. */
> +}
> +
> +int UnwindStatsConsumer::maxframes()
> +{
> +  return opt_maxframes >= 0 ? opt_maxframes : 256;
> +}
> +
> +
> +////////////////////////////////////////////////////////////////////////
> +// unwind data consumers // gprof
> +
> +/* gmon.out file format bits */
> +#define GMON_MAGIC "gmon"
> +#define GMON_VERSION 1
> +
> +struct gmon_hdr {
> +  char cookie[4];
> +  char version[4];
> +  char spare[3 * 4];
> +};
> +
> +enum gmon_entry_tag {
> +  GMON_TAG_TIME_HIST = 0,
> +  GMON_TAG_CG_ARC = 1,
> +  GMON_TAG_BB_COUNT = 2,
> +};
> +
> +struct gmon_hist_hdr {
> +  uint8_t tag; /* GMON_TAG_TIME_HIST */
> +  uint8_t unused[3];
> +  uint64_t low_pc;
> +  uint64_t high_pc;
> +  uint32_t num_buckets;
> +  uint32_t prof_rate;
> +  char _dimension_string[16];
> +};
> +
> +
> +void GprofUnwindSampleConsumer::record_gmon_hist(ostream &of, map<uint64_t, 
> uint32_t> &histogram, uint64_t low_pc, uint64_t high_pc, uint64_t alignment)
> +{
> +  // write one histogram from low_pc ... high_pc
> +  uint32_t num_buckets = (high_pc-low_pc)/alignment + 1;
> +  double result_scale = 
> (double)((high_pc-low_pc)/sizeof(uint16_t))/num_buckets;
> +  if (verbose > 5)
> +    /* It's the @scale value that must be kept within 0.000001 of 0.5 to
> +       keep gprof from complaining. */
> +    clog << format("+histogram {:x}..{:x} (alignment {}) of {} buckets 
> @scale {}\n",
> +                  low_pc, high_pc, alignment, num_buckets, result_scale);
> +
> +  // write histogram record header
> +  unsigned char tag = GMON_TAG_TIME_HIST;
> +  of.write(reinterpret_cast<const char *>(&tag), sizeof(tag));
> +  int wordsize = (sizeof (void *) == 8) ? 8 : 4;
> +  if (wordsize == 4) {
> +    uint32_t addr = low_pc;
> +    of.write(reinterpret_cast<const char *>(&addr), sizeof(addr));
> +    addr = high_pc;
> +    of.write(reinterpret_cast<const char *>(&addr), sizeof(addr));
> +  } else {
> +    of.write(reinterpret_cast<const char *>(&low_pc), sizeof(low_pc));
> +    of.write(reinterpret_cast<const char *>(&high_pc), sizeof(high_pc));
> +  }
> +  of.write(reinterpret_cast<const char *>(&num_buckets), 
> sizeof(num_buckets));
> +  uint32_t prof_rate = attr.sample_freq;
> +  of.write(reinterpret_cast<const char *>(&prof_rate), sizeof(prof_rate));
> +  // dimension string is 15 chars long (not null terminated)
> +  std::string dimension_base = libpfm_event.empty() ? "ticks" :
> +    libpfm_event.substr(0, 15);
> +  dimension_base.resize(15, '\0');  // ensure exactly 15 bytes
> +  of.write(dimension_base.data(), 15);
> +  // dimension character abbreviation: just take the first char of above
> +  of.write(dimension_base.data(), 1);
> +
> +  // write histogram buckets
> +  uint64_t bucket_addr = low_pc;
> +  int n_overflows = 0, max_overflows = 5; // limit 'bucket overflow' spam
> +  for (uint32_t bucket = 0; bucket < num_buckets; bucket++)
> +    {
> +      uint16_t count = 0;
> +      for (auto it = histogram.lower_bound(bucket_addr);
> +              it != histogram.upper_bound(bucket_addr+alignment-1);
> +              it ++)
> +       {
> +         if (numeric_limits<uint16_t>::max() <= (int) count + (int) 
> it->second)
> +           {
> +             count = numeric_limits<uint16_t>::max();
> +             // XXX: a provisional error message to give a sense of
> +             // whether this happens often-enough to do something
> +             // more complex, such as adjusting the histogram
> +             // granularity:
> +             if (n_overflows >= max_overflows) break;
> +             n_overflows++;
> +             cerr << format("WARNING: histogram bucket overflow at {:x}{}",
> +                            bucket_addr,
> +                            n_overflows >= max_overflows ?
> +                            " (... suppressing further warnings for this 
> histogram)" : "")
> +                  << endl;
> +             break;
> +           }
> +         count += it->second;
> +       }
> +      bucket_addr += alignment;
> +      of.write(reinterpret_cast<const char *>(&count), sizeof(count));
> +    }
> +}
> +
> +void GprofUnwindSampleConsumer::record_gmon_out(const string& buildid, 
> UnwindModuleStats& m)
> +{
> +  string filename = output_dir + "/" + "gmon." + buildid + ".out";
> +  string exe_symlink_path = output_dir + "/" + "gmon." + buildid + ".exe";
> +  string json_path = output_dir + "/" + "gmon." + buildid + ".json";
> +
> +  if (output_force) {
> +    filesystem::remove(filename);
> +    filesystem::remove(exe_symlink_path);
> +    filesystem::remove(json_path);
> +  }
> +
> +  string target_path = buildid_to_mainfile[buildid];
> +  if (target_path != unknown_comm) // skip .exe symlink if there's no path
> +    if (symlink(target_path.c_str(), exe_symlink_path.c_str()) == -1) {
> +      // Handle error, e.g., print errno or throw exception
> +      cerr << format("WARNING: symlink failed: {}\n", strerror(errno));
> +      // NB: no return needed here; proceed to write out other bits.
> +      // A smart enough consumer will make do with buildid based executable 
> lookup.
> +    }
> +
> +  json_object *metadata = json_object_new_object();
> +  if (!metadata) {
> +  json_fail:
> +    cerr << format("ERROR: json allocation failed: {}\n", strerror(errno));
> +    return;
> +  }
> +  json_object *buildid_js = json_object_new_string(buildid.c_str());
> +  if (NULL == buildid_js) goto json_fail;
> +  json_object_object_add(metadata, "buildid", buildid_js);
> +  if (buildid_to_mainfile.count(buildid) != 0) {
> +    const string &mainfile = buildid_to_mainfile[buildid];
> +    json_object *mainfile_js = json_object_new_string(mainfile.c_str());
> +    if (NULL == mainfile_js) goto json_fail;
> +    json_object_object_add(metadata, "mainfile", mainfile_js);
> +  }
> +  if (buildid_to_debugfile.count(buildid) != 0) {
> +    const string &debugfile = buildid_to_debugfile[buildid];
> +    json_object *debugfile_js = json_object_new_string(debugfile.c_str());
> +    if (NULL == debugfile_js) goto json_fail;
> +    json_object_object_add(metadata, "debugfile", debugfile_js);
> +  }
> +  if (libpfm_event != "") {
> +    json_object *event_js = json_object_new_string(libpfm_event.c_str());
> +    if (NULL == event_js) goto json_fail;
> +    json_object_object_add(metadata, "libpfm-event", event_js);
> +  }
> +  if (libpfm_event_decoded != "") {
> +    json_object *event_js = 
> json_object_new_string(libpfm_event_decoded.c_str());
> +    if (NULL == event_js) goto json_fail;
> +    json_object_object_add(metadata, "libpfm-event-decoded", event_js);
> +  }
> +  {
> +    json_object *br_js = json_object_new_boolean(branch_record);
> +    if (NULL == br_js) goto json_fail;
> +    json_object_object_add(metadata, "branch-record", br_js);
> +  }
> +
> +  const char *metadata_str = json_object_to_json_string(metadata);
> +  if (!metadata_str) goto json_fail;
> +  ofstream of_js (json_path);
> +  of_js << metadata_str;
> +  of_js.close();
> +  json_object_put (metadata);
> +
> +  ofstream of (filename, ios::binary);
> +  if (!of)
> +    {
> +      cerr << format(N_("ERROR: buildid {} -- could not open '{}' for 
> writing\n"), buildid, filename);
> +    }
> +
> +  /* Write gmon header.  It and other headers mostly hold
> +     native-endian and fixed (or native) bitwidth values.  In
> +     principle, we should get the bitness/endianness from the
> +     particular executable associated with the buildid.  But, being a
> +     live profiler, we don't really have to deal with CROSS
> +     architecture work, and for now can just hard-code the bitness to
> +     match this host program. XXX
> +   */
> +  int wordsize = (sizeof (void *) == 8) ? 8 : 4;
> +  struct gmon_hdr ghdr;
> +  memcpy (&ghdr.cookie[0], GMON_MAGIC, 4);
> +  uint32_t version = GMON_VERSION;
> +  memcpy (&ghdr.version[0], reinterpret_cast<const char *>(&version), 4);
> +  memset (&ghdr.spare[0], 0, sizeof(ghdr.spare));
> +  of.write(reinterpret_cast<const char *>(&ghdr), sizeof(ghdr));
> +
> +  if (m.histogram.size() > 0)
> +    {
> +      uint64_t low_pc = m.histogram.begin()->first;
> +      uint64_t high_pc = m.histogram.rbegin()->first;
> +      uint64_t alignment = (high_pc - low_pc + 1) / UINT_MAX + 1;
> +
> +      if (gmon_hist_split == HIST_SPLIT_NONE)
> +       {
> +         /* Put everything into one histogram. */
> +         this->record_gmon_hist(of, m.histogram, low_pc, high_pc, alignment);
> +       }
> +      else if (gmon_hist_split == HIST_SPLIT_EVEN)
> +       {
> +         /* This option attempts to satisfy gprof's histogram scale
> +            consistency check, which requires all values
> +            '(double)(high_pc-low_pc)/num_buckets' to fall within
> +            EPSILON.  In practice, we can only be sure of this if we
> +            cover the address space with histograms all one size.  */
> +
> +         /* Keep the search for 'optimal' size simple -- we just need
> +            a plausible order of magnitude.  XXX Some rechecking of
> +            correctness needed.  */
> +         //uint64_t min_size = 1; // this is 'optimal' much of the time
> +         uint64_t min_size = 1024;
> +         uint64_t max_size = high_pc - low_pc;
> +         uint64_t opt_size = min_size;
> +         uint64_t opt_est = 0;
> +         uint64_t next_size = opt_size;
> +         while (next_size < max_size)
> +           {
> +             if (next_size > max_size)

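`next_size > max_size` is never true at this point: the enclosing loop only
runs while `next_size < max_size`, and next_size is not modified before this
check.  The check and the assignment below it are dead code.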

> +               next_size = max_size;
> +             uint64_t size_inc = sizeof(struct gmon_hdr) + next_size;
> +             uint64_t size_est = size_inc;
> +             uint64_t pc = low_pc;
> +             while (pc + size_est < high_pc)
> +               {
> +                 auto it = m.histogram.upper_bound(pc + size_est/alignment);
> +                 if (it == m.histogram.end())
> +                   break;
> +                 pc = it->first;
> +                 size_est += sizeof(struct gmon_hdr) + next_size;
> +               }
> +             if (opt_est == 0 || size_est < opt_est)
> +               {
> +                 opt_size = next_size;
> +                 opt_est = size_est;
> +               }
> +             // if (opt_est > prev_est) break; /* XXX: We've hit the lowest 
> point. */
> +             next_size = 2 * next_size;
> +           }
> +
> +         /* Partition into histograms of opt_size.
> +            XXX: May need to check if low_pc must be aligned.  */
> +         uint64_t prev_pc = low_pc;
> +         uint64_t pc = prev_pc;
> +         for (const auto& p : m.histogram)
> +           {
> +             pc = p.first;
> +             if (pc - low_pc > opt_size)
> +               {
> +                 /* Record a histogram from low_pc to low_pc+opt_size. */
> +                 this->record_gmon_hist(of, m.histogram,
> +                                        low_pc, low_pc+opt_size-1 /* >= 
> prev_pc */,
> +                                        alignment);
> +                 low_pc = pc;
> +               }
> +             prev_pc = pc;
> +           }
> +         /* Record a final histogram from low_pc to low_pc+opt_size.
> +            XXX: Edge case -- may want to adjust for overflow of
> +            low_pc+opt_size at end of address space.  */
> +         this->record_gmon_hist(of, m.histogram,
> +                                low_pc, low_pc+opt_size-1 /* >= prev_pc */,
> +                                alignment);
> +       }
> +      else if (gmon_hist_split == HIST_SPLIT_FLEX)
> +       {
> +         /* Allow variable-size histograms to save on storage space.
> +            Will fail gprof's input consistency checks, XXX but ok
> +            for profiledb purposes?  */
> +         uint64_t prev_pc = low_pc;
> +         uint64_t pc = prev_pc;
> +         /* XXX Iterate histogram ascending by key, faster than by addr
> +            when we just need to scan for gaps.  */
> +         for (const auto& p : m.histogram)
> +           {
> +             pc = p.first;
> +             uint64_t bin_dist = (pc - prev_pc) / alignment;
> +             if (bin_dist > sizeof(struct gmon_hist_hdr))
> +               /* XXX If we add '&& low_pc != prev_pc && pc != high_pc',
> +                  this avoids producing a histogram with only 1 entry,
> +                  but this is still not enough to satisfy gprof's
> +                  histogram scale calculation.  */
> +               {
> +                 /* Record a histogram from low_pc to prev_pc. */
> +                 this->record_gmon_hist(of, m.histogram, low_pc, prev_pc, 
> alignment);
> +                 low_pc = pc;
> +               }
> +             prev_pc = pc;
> +           }
> +         /* Record a final histogram from low_pc to pc. */
> +         this->record_gmon_hist(of, m.histogram, low_pc, pc, alignment);
> +       }
> +    }
> +
> +  /* Write call graph arcs. */
> +  for (auto& p : m.callgraph)
> +    {
> +      unsigned char tag = GMON_TAG_CG_ARC;
> +      of.write(reinterpret_cast<const char *>(&tag), sizeof(tag));
> +      if (wordsize == 4) {
> +       uint32_t addr = p.first.first;
> +       of.write(reinterpret_cast<const char *>(&addr), sizeof(addr));
> +       addr = p.first.second;
> +       of.write(reinterpret_cast<const char *>(&addr), sizeof(addr));
> +      } else {
> +       uint64_t addr = p.first.first;
> +       of.write(reinterpret_cast<const char *>(&addr), sizeof(addr));
> +       addr = p.first.second;
> +       of.write(reinterpret_cast<const char *>(&addr), sizeof(addr));
> +      }
> +      /* p is (from,to) -> count */
> +      uint32_t count = p.second;
> +      of.write(reinterpret_cast<const char *>(&count), sizeof(count));
> +    }
> +
> +  of.close();
> +}
> +
> +GprofUnwindSampleConsumer::~GprofUnwindSampleConsumer()
> +{
> +  if (show_summary)
> +    {
> +      this->stats->print_summary ();
> +      clog << "=== buildid / sample counts ===\n";
> +    }
> +
> +  UnwindStatsTable::buildid_map_t sorted_map 
> (this->stats->buildid_tab.begin(), this->stats->buildid_tab.end());
> +  for (auto& p : sorted_map) // traverse in sorted order
> +    {
> +      const string& buildid = p.first;
> +      UnwindModuleStats& module_stats = p.second;
> +      this->record_gmon_out(buildid, module_stats);
> +      if (show_summary)
> +        {
> +          /* In record_gmon_out we will write the buildid-->path mapping
> +             to a json metadata file.  That makes for a reasonable hint;
> +             debuginfod-find can be used as a mostly-functional fallback
> +             (for packaged rather than locally built executables) if the
> +             results are moved to another system.  */
> +          string mainfile = "<unknown>";
> +          if (buildid_to_mainfile.count(buildid) != 0)
> +            mainfile = buildid_to_mainfile[buildid];
> +          string debugfile = "";
> +          if (buildid_to_debugfile.count(buildid) != 0)
> +            debugfile = buildid_to_debugfile[buildid];
> +          clog << format(N_("buildid {} ({}{}{}) -- received {} distinct 
> pcs, {} callgraph arcs\n"), /* TODO also count samples / estimated histogram 
> size? */
> +                         buildid,
> +                         mainfile,
> +                         debugfile.empty() ? "" : " +debugfile ",
> +                         debugfile,
> +                         module_stats.histogram.size(),
> +                         module_stats.callgraph.size());
> +        }
> +    }
> +  if (show_summary)
> +    {
> +      clog << "===\n";
> +      clog << format(N_("TOTAL -- received {} buildids\n"), 
> this->stats->buildid_tab.size());
> +    }
> +  clog << "\n";
> +}
> +
> +
> +int
> +GprofUnwindSampleConsumer::maxframes()
> +{
> +  // gprof only needs one level of backtracing,
> +  // but user can override consumer's preference
> +  // with --maxframes option:
> +  return opt_maxframes >= 0 ? opt_maxframes : 1;
> +}
> +
> +
> +void GprofUnwindSampleConsumer::process(const UnwindSample *sample)
> +{
> +  if (sample->addrs.size() < 1)
> +    return; /* edge case -- no pc or callgraph arc */
> +
> +  Dwarf_Addr pc = sample->addrs[0];
> +  Dwarf_Addr pc2 = sample->addrs.size() < 2 ? 0 : sample->addrs[1];
> +
> +  Dwfl_Module *mod = dwfl_addrmodule(sample->dwfl, pc);
> +  if (mod == NULL)
> +    return;
> +#if 0
> +  Dwarf_Addr bias;
> +  Elf *elf = dwfl_module_getelf (mod, &bias);
> +  (void)elf;
> +#endif

Same comment as for the previous #if 0 block: if this is not intended for
debug builds going forward, let's get rid of it.  Otherwise, let's use a macro
defined near the top of the file to control whether it's enabled.  Both of the
existing #if 0 blocks could be controlled by a single macro if appropriate.

Aaron

> +
> +  Dwfl_Module *mod2 = dwfl_addrmodule(sample->dwfl, pc2);
> +  // XXX: allowing mod2 == NULL -- callgraph arc will be skipped
> +
> +  // extract buildid for pc (hit callee)
> +  const unsigned char *desc = nullptr;
> +  GElf_Addr vaddr;
> +  int build_id_len = dwfl_module_build_id(mod, &desc, &vaddr);
> +  if (build_id_len <= 0)
> +    return; // TODO: report/tabulate hit outside known modules
> +
> +  // possible optimization would be to use the unconverted build_id_desc as 
> hash key
> +  string buildid;
> +  for (int i = 0; i < build_id_len; ++i) {
> +    buildid += format("{:02x}", static_cast<int>(desc[i]));
> +  }
> +
> +  const char *mainfile_cstr;
> +  const char *debugfile_cstr;
> +  Dwarf_Addr low_addr;
> +  Dwarf_Addr high_addr;
> +  dwfl_module_info (mod, NULL, &low_addr, &high_addr, NULL,
> +                   NULL, &mainfile_cstr, &debugfile_cstr);
> +  string mainfile = mainfile_cstr ? mainfile_cstr : "<unknown>";
> +  string debugfile = debugfile_cstr ? debugfile_cstr : "";
> +  if (!buildid_to_mainfile.count(buildid))
> +    buildid_to_mainfile[buildid] = mainfile;
> +  if (!buildid_to_debugfile.count(buildid))
> +    buildid_to_debugfile[buildid] = debugfile;
> +  /* XXX: Also monitor for collisions here? */
> +
> +  UnwindModuleStats *buildid_ent = 
> this->stats->buildid_find_or_create(buildid, mod);
> +
> +  uint64_t last_pc = pc;
> +  int i = dwfl_module_relocate_address (mod, &pc);
> +  /* XXX: Out-of-range address seen with ld-linux.so, not useful for 
> profiledb purposes: */
> +  if ((last_pc < low_addr || last_pc > high_addr))
> +    {
> +      if (verbose)
> +       clog << format(N_("{}: Skipping pc={:x} raw_pc={:x} outside module 
> range start={:x}..end={:x}\n"),
> +                      mainfile, pc, last_pc, low_addr, high_addr);
> +      return;
> +    }
> +  (void) i;
> +  // XXX: could get dwfl_module_relocation_info (mod, i, NULL), but no need?
> +  buildid_ent->record_pc(pc);
> +
> +  // If caller & callee are in different modules, this is a 
> cross-shared-library
> +  // call, so we can't track it as a call-graph arc.  TODO: at least count 
> them
> +  if (sample->addrs.size() >= 2 && mod == mod2) // intra-module call
> +    {
> +      last_pc = pc2;
> +      int j = dwfl_module_relocate_address (mod, &pc2); // map pc2 also
> +      if (last_pc < low_addr || last_pc > high_addr)
> +       {
> +         if (verbose)
> +           clog << format(N_("{}: Skipping pc={:x} raw_pc={:x} outside 
> module range start={:x}..end={:x}\n"),
> +                          mainfile, pc2, last_pc, low_addr, high_addr);
> +         return;
> +       }
> +      (void) j;
> +      buildid_ent->record_callgraph_arc(pc2, pc);
> +    }
> +}
> +
> --
> 2.53.0
>

Re: [PATCH 1/4] configure.ac, src/: update to C++20, add new stackprof tool

Reply via email to