Hi Aaron,

On Tue, May 27, 2025 at 10:04:20AM -0400, Aaron Merey wrote:
> Implement concurrent execution of print_debug_* functions during handling
> of -w, --debug-dump using libthread.a.

Do you intend to also enable this for other sections?

> A new `-C, --concurrency=NUM` command line option controls the maximum
> number of threads that may be used. This value defaults to the number
> of CPUs.
>
> Job output is buffered and printed in the order that jobs were added to
> the queue. This helps preserve the existing order of stdout output. Full
> support for output buffering in print_debug_* functions is added in the
> next patch in this series.

I would have expected 3/3 "src/readelf.c: Add support for print_debug_*
output buffering" before this patch. Does this patch really work
correctly before that is done?

> 	* src/readelf.c (default_concurrency): Function estimating the
> 	maximum number of threads.
> 	(parse_opt): Handle -C, --concurrency=NUM.
> 	(do_job): Entry point function for threads.
> 	(schedule_job): If thread safety is enabled, add job to the
> 	job queue. Otherwise just run the job from the main thread.
> 	(print_debug): Pass print_debug_* function pointers and
> 	args to schedule_job. Also call run_jobs if thread safety
> 	is enabled.
>
> Signed-off-by: Aaron Merey <ame...@redhat.com>
>
> ---
> v3:
>   Replace assert before run_jobs with a conditional.
>
>  src/readelf.c | 153 ++++++++++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 149 insertions(+), 4 deletions(-)
>
> diff --git a/src/readelf.c b/src/readelf.c
> index b7dba390..e3aece8b 100644
> --- a/src/readelf.c
> +++ b/src/readelf.c
> @@ -57,6 +57,18 @@
>
>  #include "../libdw/known-dwarf.h"
>
> +#ifdef USE_LOCKS
> +#include "threadlib.h"
> +#endif
> +
> +#ifdef HAVE_SCHED_H
> +#include <sched.h>
> +#endif
> +
> +#ifdef HAVE_SYS_RESOURCE_H
> +#include <sys/resource.h>
> +#endif

OK, I see the following in configure.ac:

  dnl for debuginfod concurrency heuristics
  AC_CHECK_HEADERS([sched.h])
  AC_CHECK_FUNCS([sched_getaffinity])
  AC_CHECK_HEADERS([sys/resource.h])

>  #ifdef __linux__
>  #define CORE_SIGILL  SIGILL
>  #define CORE_SIGBUS  SIGBUS
> @@ -150,6 +162,10 @@ static const struct argp_option options[] =
>      N_("Ignored for compatibility (lines always wide)"), 0 },
>    { "decompress", 'z', NULL, 0,
>      N_("Show compression information for compressed sections (when used with -S); decompress section before dumping data (when used with -p or -x)"), 0 },
> +#ifdef USE_LOCKS
> +  { "concurrency", 'C', "NUM", 0,
> +    N_("Set maximum number of threads. Defaults to the number of CPUs."), 0 },
> +#endif
>    { NULL, 0, NULL, 0, NULL, 0 }
>  };

OK.

> @@ -249,6 +265,11 @@ static bool print_decompress = false;
>  /* True if we want to show split compile units for debug_info skeletons. */
>  static bool show_split_units = false;
>
> +#if USE_LOCKS
> +/* Maximum number of threads. */
> +static int max_threads = -1;
> +#endif
> +

OK. Could also have been zero since -C will set it to 1 or higher.

>  /* Select printing of debugging sections. */
>  static enum section_e
>  {
> @@ -380,6 +401,43 @@ cleanup_list (struct section_argument *list)
>      }
>  }
>
> +#ifdef USE_LOCKS
> +/* Estimate the maximum number of threads. This is normally
> +   #CPU. Return value is guaranteed to be at least 1. */
> +static int
> +default_concurrency (void)
> +{
> +  unsigned aff = 0;
> +#ifdef HAVE_SCHED_GETAFFINITY
> +  {
> +    int ret;
> +    cpu_set_t mask;
> +    CPU_ZERO (&mask);
> +    ret = sched_getaffinity (0, sizeof(mask), &mask);
> +    if (ret == 0)
> +      aff = CPU_COUNT (&mask);
> +  }
> +#endif

Is this better/worse than using get_nprocs or sysconf (_SC_NPROCESSORS_ONLN)?
https://www.gnu.org/software/libc/manual/html_node/Processor-Resources.html
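For comparison, the sysconf variant would be tiny. A rough, untested
sketch (nproc_fallback is just a made-up name):

  /* Untested sketch: count CPUs via sysconf; needs <unistd.h>.
     get_nprocs (<sys/sysinfo.h>) is similar but GNU-specific.  Note
     that both count all online CPUs and ignore the process affinity
     mask, so sched_getaffinity still gives the better answer under
     taskset or in a container restricted to a subset of CPUs.  */
  static unsigned
  nproc_fallback (void)
  {
    long n = sysconf (_SC_NPROCESSORS_ONLN);
    return n > 0 ? (unsigned) n : 1;
  }

The affinity point may actually be an argument for keeping
sched_getaffinity as the primary mechanism.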
Also from the sched_getaffinity man page it looks like cpu_set_t might
be too small and the call might fail on systems with > 1024 cores.
Maybe this is a non-issue though?
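If it ever does become an issue, the man page suggests retrying with a
dynamically allocated set via CPU_ALLOC. A rough, untested sketch
(affinity_count is a made-up name):

  /* Untested sketch: grow the cpu_set_t until the kernel's affinity
     mask fits, so systems with more than CPU_SETSIZE (1024) CPUs work
     too.  Needs _GNU_SOURCE, <sched.h> and <errno.h>.  */
  static unsigned
  affinity_count (void)
  {
    for (int ncpus = 1024; ncpus <= 1 << 16; ncpus *= 2)
      {
        cpu_set_t *set = CPU_ALLOC (ncpus);
        if (set == NULL)
          break;
        size_t size = CPU_ALLOC_SIZE (ncpus);
        if (sched_getaffinity (0, size, set) == 0)
          {
            unsigned n = CPU_COUNT_S (size, set);
            CPU_FREE (set);
            return n;
          }
        CPU_FREE (set);
        /* EINVAL means the mask was too small; retry with a bigger
           one.  Any other error is fatal for this heuristic.  */
        if (errno != EINVAL)
          break;
      }
    return 0;  /* Caller falls back to its other heuristics.  */
  }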
> +  unsigned fn = 0;
> +#ifdef HAVE_GETRLIMIT
> +  {
> +    struct rlimit rlim;
> +    int rc = getrlimit (RLIMIT_NOFILE, &rlim);
> +    if (rc == 0)
> +      fn = MAX ((rlim_t) 1, (rlim.rlim_cur - 100) / 2);
> +    /* Conservatively estimate that at least 2 fds are used
> +       by each thread. */
> +  }
> +#endif

Interesting. I wouldn't have thought about that. But makes sense.
If you check available file descriptors, would it also make sense to
check for available memory? And limit the number of jobs to mem/1024M?

> +  unsigned d = MIN (MAX (aff, 2U),
> +                    MAX (fn, 2U));
> +
> +  return d;
> +}
> +#endif
> +
>  int
>  main (int argc, char *argv[])
>  {
> @@ -403,6 +461,12 @@ main (int argc, char *argv[])
>    /* Before we start tell the ELF library which version we are using. */
>    elf_version (EV_CURRENT);
>
> +#ifdef USE_LOCKS
> +  /* If concurrency wasn't set by argp_parse, then set a default value. */
> +  if (max_threads == -1)
> +    max_threads = default_concurrency ();
> +#endif
> +

OK.

>    /* Now process all the files given at the command line. */
>    bool only_one = remaining + 1 == argc;
>    do
> @@ -527,6 +591,16 @@ parse_opt (int key, char *arg,
>      case 'c':
>        print_archive_index = true;
>        break;
> +#if USE_LOCKS
> +    case 'C':
> +      if (arg != NULL)
> +        {
> +          max_threads = atoi (arg);
> +          if (max_threads < 1)
> +            error (1, 0, _("-C NUM minimum 1"));
> +        }
> +      break;
> +#endif

I think you will get slightly nicer output doing:

  if (max_threads < 1)
    {
      argp_error (state, _("-C NUM minimum 1"));
      return EINVAL;
    }

>      case 'w':
>        if (arg == NULL)
>        {
> @@ -5492,7 +5566,7 @@ listptr_base (struct listptr *p)
>  }
>
>  /* To store the name used in compare_listptr */
> -static const char *sort_listptr_name;
> +_Thread_local const char *sort_listptr_name;

O, that is interesting and not in your ChangeLog. Is this the simplest
way to make this thread-safe? Could there be a lock around sort_listptr?
What about the static listptr_table known_xxxx[ptr|bases]? Should
building up/accessing those be made thread-safe?

>  static int
>  compare_listptr (const void *a, const void *b)
> @@ -11950,6 +12024,65 @@ getone_dwflmod (Dwfl_Module *dwflmod,
>    return DWARF_CB_OK;
>  }
>
> +typedef struct {
> +  Dwfl_Module *dwflmod;
> +  Ebl *ebl;
> +  GElf_Ehdr *ehdr;
> +  Elf_Scn scn;
> +  GElf_Shdr shdr;
> +  Dwarf *dbg;
> +  FILE *out;
> +  void (*fp) (Dwfl_Module *, Ebl *, GElf_Ehdr *,
> +              Elf_Scn *, GElf_Shdr *, Dwarf *, FILE *);
> +} job_data;
> +
> +#ifdef USE_LOCKS
> +
> +/* Thread entry point. */
> +static void *
> +do_job (void *data, FILE *out)
> +{
> +  job_data *d = (job_data *) data;
> +  d->fp (d->dwflmod, d->ebl, d->ehdr, &d->scn, &d->shdr, d->dbg, out);
> +  return NULL;
> +}
> +#endif

OK.

> +/* If readelf is built with thread safety, then set up JDATA at index IDX
> +   and add it to the job queue.
> +
> +   If thread safety is not supported or the maximum number of threads is set
> +   to 1, then immediately call START_ROUTINE with the given arguments. */
> +static void
> +schedule_job (job_data jdata[], size_t idx,
> +              void (*start_routine) (Dwfl_Module *, Ebl *, GElf_Ehdr *,
> +                                     Elf_Scn *, GElf_Shdr *, Dwarf *, FILE *),
> +              Dwfl_Module *dwflmod, Ebl *ebl, GElf_Ehdr *ehdr, Elf_Scn *scn,
> +              GElf_Shdr *shdr, Dwarf *dbg)
> +{
> +#ifdef USE_LOCKS
> +  if (max_threads > 1)
> +    {
> +      /* Add to the job queue. */
> +      jdata[idx].dwflmod = dwflmod;
> +      jdata[idx].ebl = ebl;
> +      jdata[idx].ehdr = ehdr;
> +      jdata[idx].scn = *scn;
> +      jdata[idx].shdr = *shdr;
> +      jdata[idx].dbg = dbg;
> +      jdata[idx].fp = start_routine;
> +
> +      add_job (do_job, (void *) &jdata[idx]);
> +    }
> +  else
> +    start_routine (dwflmod, ebl, ehdr, scn, shdr, dbg, stdout);
> +#else
> +  (void) jdata; (void) idx;
> +
> +  start_routine (dwflmod, ebl, ehdr, scn, shdr, dbg, stdout);
> +#endif
> +}

OK.

>  static void
>  print_debug (Dwfl_Module *dwflmod, Ebl *ebl, GElf_Ehdr *ehdr)
>  {
> @@ -12122,6 +12255,9 @@ print_debug (Dwfl_Module *dwflmod, Ebl *ebl, GElf_Ehdr *ehdr)
>    if (unlikely (elf_getshdrstrndx (ebl->elf, &shstrndx) < 0))
>      error_exit (0, _("cannot get section header string table index"));
>
> +  size_t num_jobs = 0;
> +  job_data jdata[shnum];
> +

shnum can be big, not sure you want to stack allocate it.

Also do you really need/want shnum? There are only that many .debug
sections. See enum section_e. Although in an ET_REL file there can be
multiple of the same. There is ndebug_sections. Maybe that can be used?

Anyway, point is shnum can be huge, so please don't stack allocate and
maybe somehow count how many job_data you need if shnum is really large.
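Concretely, I was thinking of something along these lines (untested
sketch, assuming the usual elfutils xcalloc helper from lib/system.h is
available here):

  /* Untested sketch: heap-allocate the job array instead of using a
     shnum-sized VLA on the stack.  shnum is still only an upper bound,
     but a file with a bogus, huge section count can no longer blow the
     stack.  Note the array has to stay alive until run_jobs returns,
     because the queued jobs keep pointers into it.  */
  size_t num_jobs = 0;
  job_data *jdata = xcalloc (shnum, sizeof *jdata);

  /* ... the existing loop calling schedule_job (jdata, num_jobs++, ...) ... */

  #ifdef USE_LOCKS
    if (max_threads > 1)
      run_jobs (max_threads - 1);
  #endif

  free (jdata);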
>  /* If the .debug_info section is listed as implicitly required then
>     we must make sure to handle it before handling any other debug
>     section. Various other sections depend on the CU DIEs being
> @@ -12239,15 +12375,24 @@ print_debug (Dwfl_Module *dwflmod, Ebl *ebl, GElf_Ehdr *ehdr)
>                        && strcmp (&name[14], debug_sections[n].name) == 0)
>                      )
>                    {
> -                    if ((print_debug_sections | implicit_debug_sections)
> -                        & debug_sections[n].bitmask)
> -                      debug_sections[n].fp (dwflmod, ebl, ehdr, scn, shdr, dbg);
> +                    if (((print_debug_sections | implicit_debug_sections)
> +                         & debug_sections[n].bitmask))
> +                      schedule_job (jdata, num_jobs++, debug_sections[n].fp,
> +                                    dwflmod, ebl, ehdr, scn, shdr, dbg);
> +
> +                    assert (num_jobs <= shnum);
>                      break;
>                    }
>              }
>          }
>      }

OK.

> +#ifdef USE_LOCKS
> +  /* If max_threads == 1, then jobs were immediately run in schedule_job. */
> +  if (max_threads > 1)
> +    run_jobs (max_threads - 1);
> +#endif
> +
>    dwfl_end (skel_dwfl);
>    free (skel_name);

Why max_threads - 1?

Cheers,

Mark