__libdw_dieabbrev uses the abbrev_lock rwlock to synchronize access to the Dwarf_Die abbrev field as well as its lazy loading. Calls to rwlock_wrlock and unlock incur significant performance overhead even in single-threaded cases.
This patch implements thread safety in __libdw_dieabbrev with GCC __atomic builtins instead of an rwlock, improving performance. abbrev_lock has been changed to a mutex and now synchronizes access to the Dwarf_CU field last_abbrev_offset in __libdw_findabbrev. libdw/ * dwarf_begin_elf.c (valid_p): Change type of abbrev_lock from rwlock to mutex. * dwarf_end.c (cu_free): Ditto. * dwarf_tag.c (__libdw_findabbrev): Protect cu->last_abbrev_offset with mutex. * libdwP.h (struct Dwarf_CU): Change type of abbrev_lock from rwlock to mutex. (__libdw_dieabbrev): Use __atomic builtins instead of rwlock. * libdw_findcu.c (__libdw_intern_next_unit): Change type of abbrev_lock from rwlock to mutex. Signed-off-by: Aaron Merey <ame...@redhat.com> --- v1: https://patchwork.sourceware.org/project/elfutils/patch/20250623000006.869936-1-ame...@redhat.com/ v1 replaced lazy loading of Dwarf_Die.abbrev with eager loading. This patch keeps lazy loading but synchronizes it with __atomic builtins instead of the current rwlock implementation. v2 is faster than v1 and the current rwlock-based approach on the main branch. Here are some performance numbers for comparison. The data was gathered with the following command: $ hyperfine --warmup 2 'eu-readelf -N -w --concurrency=1 /usr/local/bin/stap' Here are the results under different implementations of thread safe abbrev loading: thread safety enabled, this patch applied (__atomic builtins, lazy loading) Time (mean ± σ): 1.693 s ± 0.012 s [User: 1.675 s, System: 0.015 s] Range (min … max): 1.679 s … 1.719 s 10 runs thread safety enabled, v1 patch applied (eager loading) Time (mean ± σ): 1.714 s ± 0.011 s [User: 1.696 s, System: 0.015 s] Range (min … max): 1.700 s … 1.733 s 10 runs thread safety enabled, main branch (rwlock, lazy loading) Time (mean ± σ): 1.756 s ± 0.011 s [User: 1.739 s, System: 0.014 s] Range (min … max): 1.742 s … 1.774 s 10 runs thread safety disabled, main branch (lazy loading) Time (mean ± σ): 1.701 s ± 0.014 s [User: 1.683 s, System: 0.014 s] Range (min … max): 1.684 s … 1.728 s 10 runs With this patch, `eu-readelf -N -w` performance with thread safety enabled is on par with the performance of main branch with thread safety disabled. Patch v1 eager abbrev loading with thread safety enabled is 1.2% slower. And the current main branch rwlock implementation is 3.7% slower. One issue we need to address by the time we remove the "experimental" notice from --enable-thread-safety is communicating to the library caller that they should not attempt to read Dwarf_Die.abbrev before it is loaded. Since the Dwarf_Die struct is public and transparent to library callers, one thread attempting to read the abbrev field while another thread lazy loads the abbrev is undefined behavior. I don't think we can fully eliminate the possibility of UB under a lazy loading design given that we should not change the Dwarf_Die layout and its public accessibility. Lazy loading implementation cannot prevent library callers from reading partially written abbrev bytes no matter what internal synchronization methods are used. Eager loading of each Dwarf_Die abbrev prevents UB since the caller always sees the abbrev field fully initialized. But this approach is 1.2% slower. libdw/dwarf_begin_elf.c | 6 +++--- libdw/dwarf_end.c | 2 +- libdw/dwarf_tag.c | 4 ++++ libdw/libdwP.h | 46 ++++++++++++++++++++++------------------- libdw/libdw_findcu.c | 2 +- 5 files changed, 34 insertions(+), 26 deletions(-) diff --git a/libdw/dwarf_begin_elf.c b/libdw/dwarf_begin_elf.c index 63e2805d..f5cabde0 100644 --- a/libdw/dwarf_begin_elf.c +++ b/libdw/dwarf_begin_elf.c @@ -359,8 +359,8 @@ valid_p (Dwarf *result) result->fake_loc_cu->version = 4; result->fake_loc_cu->split = NULL; eu_search_tree_init (&result->fake_loc_cu->locs_tree); - rwlock_init (result->fake_loc_cu->abbrev_lock); rwlock_init (result->fake_loc_cu->split_lock); + mutex_init (result->fake_loc_cu->abbrev_lock); mutex_init (result->fake_loc_cu->src_lock); mutex_init (result->fake_loc_cu->str_off_base_lock); mutex_init (result->fake_loc_cu->intern_lock); @@ -392,8 +392,8 @@ valid_p (Dwarf *result) result->fake_loclists_cu->version = 5; result->fake_loclists_cu->split = NULL; eu_search_tree_init (&result->fake_loclists_cu->locs_tree); - rwlock_init (result->fake_loclists_cu->abbrev_lock); rwlock_init (result->fake_loclists_cu->split_lock); + mutex_init (result->fake_loclists_cu->abbrev_lock); mutex_init (result->fake_loclists_cu->src_lock); mutex_init (result->fake_loclists_cu->str_off_base_lock); mutex_init (result->fake_loclists_cu->intern_lock); @@ -430,8 +430,8 @@ valid_p (Dwarf *result) result->fake_addr_cu->version = 5; result->fake_addr_cu->split = NULL; eu_search_tree_init (&result->fake_addr_cu->locs_tree); - rwlock_init (result->fake_addr_cu->abbrev_lock); rwlock_init (result->fake_addr_cu->split_lock); + mutex_init (result->fake_addr_cu->abbrev_lock); mutex_init (result->fake_addr_cu->src_lock); mutex_init (result->fake_addr_cu->str_off_base_lock); mutex_init (result->fake_addr_cu->intern_lock); diff --git a/libdw/dwarf_end.c b/libdw/dwarf_end.c index 979b1168..fa27b19e 100644 --- a/libdw/dwarf_end.c +++ b/libdw/dwarf_end.c @@ -63,8 +63,8 @@ cu_free (void *arg) struct Dwarf_CU *p = (struct Dwarf_CU *) arg; eu_search_tree_fini (&p->locs_tree, noop_free); - rwlock_fini (p->abbrev_lock); rwlock_fini (p->split_lock); + mutex_fini (p->abbrev_lock); mutex_fini (p->src_lock); mutex_fini (p->str_off_base_lock); mutex_fini (p->intern_lock); diff --git a/libdw/dwarf_tag.c b/libdw/dwarf_tag.c index 218382a1..7daa4813 100644 --- a/libdw/dwarf_tag.c +++ b/libdw/dwarf_tag.c @@ -53,15 +53,19 @@ __libdw_findabbrev (struct Dwarf_CU *cu, unsigned int code) /* Find the next entry. It gets automatically added to the hash table. */ + mutex_lock (cu->abbrev_lock); abb = __libdw_getabbrev (cu->dbg, cu, cu->last_abbrev_offset, &length); + if (abb == NULL || abb == DWARF_END_ABBREV) { /* Make sure we do not try to search for it again. */ cu->last_abbrev_offset = (size_t) -1l; + mutex_unlock (cu->abbrev_lock); return DWARF_END_ABBREV; } cu->last_abbrev_offset += length; + mutex_unlock (cu->abbrev_lock); /* Is this the code we are looking for? */ if (abb->code == code) diff --git a/libdw/libdwP.h b/libdw/libdwP.h index b77db142..cfb1f206 100644 --- a/libdw/libdwP.h +++ b/libdw/libdwP.h @@ -455,14 +455,14 @@ struct Dwarf_CU Don't access directly, call __libdw_cu_locs_base. */ Dwarf_Off locs_base; - /* Synchronize access to the abbrev member of a Dwarf_Die that - refers to this Dwarf_CU. Covers __libdw_die_abbrev. */ - rwlock_define(, abbrev_lock); - /* Synchronize access to the split member of this Dwarf_CU. Covers __libdw_find_split_unit. */ rwlock_define(, split_lock); + /* Synchronize access to the last_abbrev_offset member of a Dwarf_Die + that refers to this Dwarf_CU. */ + mutex_define(, abbrev_lock); + /* Synchronize access to the lines and files members. Covers dwarf_getsrclines and dwarf_getsrcfiles. */ mutex_define(, src_lock); @@ -823,41 +823,45 @@ static inline Dwarf_Abbrev * __nonnull_attribute__ (1) __libdw_dieabbrev (Dwarf_Die *die, const unsigned char **readp) { + Dwarf_Abbrev *end_abbrev = DWARF_END_ABBREV; + Dwarf_Abbrev *expected = (Dwarf_Abbrev *) NULL; + if (unlikely (die->cu == NULL)) { - die->abbrev = DWARF_END_ABBREV; - return DWARF_END_ABBREV; + __atomic_compare_exchange_n (&die->abbrev, &expected, end_abbrev, false, + __ATOMIC_RELEASE, __ATOMIC_ACQUIRE); + return end_abbrev; } - rwlock_wrlock (die->cu->abbrev_lock); - - /* Do we need to get the abbreviation, or need to read after the code? */ - if (die->abbrev == NULL || readp != NULL) + Dwarf_Abbrev *abbrev = __atomic_load_n (&die->abbrev, __ATOMIC_ACQUIRE); + if (abbrev == NULL || readp != NULL) { - /* Get the abbreviation code. */ + /* We need to get the abbreviation or need to read after the code. */ unsigned int code; const unsigned char *addr = die->addr; - if (addr >= (const unsigned char *) die->cu->endp) { - die->abbrev = DWARF_END_ABBREV; - rwlock_unlock (die->cu->abbrev_lock); - return DWARF_END_ABBREV; + __atomic_compare_exchange_n (&die->abbrev, &expected, + end_abbrev, false, + __ATOMIC_RELEASE, __ATOMIC_ACQUIRE); + return end_abbrev; } + /* Get the abbreviation code. */ get_uleb128 (code, addr, die->cu->endp); if (readp != NULL) *readp = addr; /* Find the abbreviation. */ - if (die->abbrev == NULL) - die->abbrev = __libdw_findabbrev (die->cu, code); + if (abbrev == NULL) + { + abbrev = __libdw_findabbrev (die->cu, code); + __atomic_compare_exchange_n (&die->abbrev, &expected, abbrev, false, + __ATOMIC_RELEASE, __ATOMIC_ACQUIRE); + } } - Dwarf_Abbrev *result = die->abbrev; - rwlock_unlock (die->cu->abbrev_lock); - - return result; + return abbrev; } /* Helper functions for form handling. */ diff --git a/libdw/libdw_findcu.c b/libdw/libdw_findcu.c index 59267343..a1f700d3 100644 --- a/libdw/libdw_findcu.c +++ b/libdw/libdw_findcu.c @@ -177,8 +177,8 @@ __libdw_intern_next_unit (Dwarf *dbg, bool debug_types) newp->startp = data->d_buf + newp->start; newp->endp = data->d_buf + newp->end; eu_search_tree_init (&newp->locs_tree); - rwlock_init (newp->abbrev_lock); rwlock_init (newp->split_lock); + mutex_init (newp->abbrev_lock); mutex_init (newp->src_lock); mutex_init (newp->str_off_base_lock); mutex_init (newp->intern_lock); -- 2.51.0