__libdw_dieabbrev uses the abbrev_lock rwlock to synchronize access to the
Dwarf_Die abbrev field as well as its lazy loading.  Calls to rwlock_wrlock
and unlock incur significant performance overhead even in single-threaded
cases.

This patch implements thread safety in __libdw_dieabbrev with GCC __atomic
builtins instead of an rwlock, improving performance.  abbrev_lock has been
changed to a mutex and now synchronizes access to the Dwarf_CU field
last_abbrev_offset in __libdw_findabbrev.

libdw/
        * dwarf_begin_elf.c (valid_p): Change type of abbrev_lock from
        rwlock to mutex.
        * dwarf_end.c (cu_free): Ditto.
        * dwarf_tag.c (__libdw_findabbrev): Protect
        cu->last_abbrev_offset with mutex.
        * libdwP.h (struct Dwarf_CU): Change type of abbrev_lock from
        rwlock to mutex.
        (__libdw_dieabbrev): Use __atomic builtins instead of rwlock.
        * libdw_findcu.c (__libdw_intern_next_unit): Change type of
        abbrev_lock from rwlock to mutex.

Signed-off-by: Aaron Merey <ame...@redhat.com>

---
v1:
https://patchwork.sourceware.org/project/elfutils/patch/20250623000006.869936-1-ame...@redhat.com/

v1 replaced lazy loading of Dwarf_Die.abbrev with eager loading.  This
patch keeps lazy loading but synchronizes it with __atomic builtins
instead of the current rwlock implementation.

v2 is faster than v1 and the current rwlock-based approach on the main branch.
Here are some performance numbers for comparison.  The data was gathered with
the following command:
$ hyperfine --warmup 2 'eu-readelf -N -w --concurrency=1 /usr/local/bin/stap'

Here are the results under different implementations of thread safe abbrev
loading:

thread safety enabled, this patch applied (__atomic builtins, lazy loading)
  Time (mean ± σ):      1.693 s ±  0.012 s    [User: 1.675 s, System: 0.015 s]
  Range (min … max):    1.679 s …  1.719 s    10 runs

thread safety enabled, v1 patch applied (eager loading)
  Time (mean ± σ):      1.714 s ±  0.011 s    [User: 1.696 s, System: 0.015 s]
  Range (min … max):    1.700 s …  1.733 s    10 runs

thread safety enabled, main branch (rwlock, lazy loading)
  Time (mean ± σ):      1.756 s ±  0.011 s    [User: 1.739 s, System: 0.014 s]
  Range (min … max):    1.742 s …  1.774 s    10 runs

thread safety disabled, main branch (lazy loading)
  Time (mean ± σ):      1.701 s ±  0.014 s    [User: 1.683 s, System: 0.014 s]
  Range (min … max):    1.684 s …  1.728 s    10 runs

With this patch, `eu-readelf -N -w` performance with thread safety
enabled is on par with the performance of main branch with thread safety
disabled.  Patch v1 eager abbrev loading with thread safety enabled is
1.2% slower.  And the current main branch rwlock implementation is 3.7%
slower.

One issue we need to address by the time we remove the "experimental"
notice from --enable-thread-safety is communicating to the library
caller that they should not attempt to read Dwarf_Die.abbrev before
it is loaded. Since the Dwarf_Die struct is public and transparent to
library callers, one thread attempting to read the abbrev field while
another thread lazy loads the abbrev is undefined behavior.

I don't think we can fully eliminate the possibility of UB under a lazy
loading design given that we should not change the Dwarf_Die layout and
its public accessibility.  Lazy loading implementation cannot prevent
library callers from reading partially written abbrev bytes no matter
what internal synchronization methods are used.

Eager loading of each Dwarf_Die abbrev prevents UB since the caller always
sees the abbrev field fully initialized.  But this approach is 1.2% slower.

 libdw/dwarf_begin_elf.c |  6 +++---
 libdw/dwarf_end.c       |  2 +-
 libdw/dwarf_tag.c       |  4 ++++
 libdw/libdwP.h          | 46 ++++++++++++++++++++++-------------------
 libdw/libdw_findcu.c    |  2 +-
 5 files changed, 34 insertions(+), 26 deletions(-)

diff --git a/libdw/dwarf_begin_elf.c b/libdw/dwarf_begin_elf.c
index 63e2805d..f5cabde0 100644
--- a/libdw/dwarf_begin_elf.c
+++ b/libdw/dwarf_begin_elf.c
@@ -359,8 +359,8 @@ valid_p (Dwarf *result)
          result->fake_loc_cu->version = 4;
          result->fake_loc_cu->split = NULL;
          eu_search_tree_init (&result->fake_loc_cu->locs_tree);
-         rwlock_init (result->fake_loc_cu->abbrev_lock);
          rwlock_init (result->fake_loc_cu->split_lock);
+         mutex_init (result->fake_loc_cu->abbrev_lock);
          mutex_init (result->fake_loc_cu->src_lock);
          mutex_init (result->fake_loc_cu->str_off_base_lock);
          mutex_init (result->fake_loc_cu->intern_lock);
@@ -392,8 +392,8 @@ valid_p (Dwarf *result)
          result->fake_loclists_cu->version = 5;
          result->fake_loclists_cu->split = NULL;
          eu_search_tree_init (&result->fake_loclists_cu->locs_tree);
-         rwlock_init (result->fake_loclists_cu->abbrev_lock);
          rwlock_init (result->fake_loclists_cu->split_lock);
+         mutex_init (result->fake_loclists_cu->abbrev_lock);
          mutex_init (result->fake_loclists_cu->src_lock);
          mutex_init (result->fake_loclists_cu->str_off_base_lock);
          mutex_init (result->fake_loclists_cu->intern_lock);
@@ -430,8 +430,8 @@ valid_p (Dwarf *result)
          result->fake_addr_cu->version = 5;
          result->fake_addr_cu->split = NULL;
          eu_search_tree_init (&result->fake_addr_cu->locs_tree);
-         rwlock_init (result->fake_addr_cu->abbrev_lock);
          rwlock_init (result->fake_addr_cu->split_lock);
+         mutex_init (result->fake_addr_cu->abbrev_lock);
          mutex_init (result->fake_addr_cu->src_lock);
          mutex_init (result->fake_addr_cu->str_off_base_lock);
          mutex_init (result->fake_addr_cu->intern_lock);
diff --git a/libdw/dwarf_end.c b/libdw/dwarf_end.c
index 979b1168..fa27b19e 100644
--- a/libdw/dwarf_end.c
+++ b/libdw/dwarf_end.c
@@ -63,8 +63,8 @@ cu_free (void *arg)
   struct Dwarf_CU *p = (struct Dwarf_CU *) arg;
 
   eu_search_tree_fini (&p->locs_tree, noop_free);
-  rwlock_fini (p->abbrev_lock);
   rwlock_fini (p->split_lock);
+  mutex_fini (p->abbrev_lock);
   mutex_fini (p->src_lock);
   mutex_fini (p->str_off_base_lock);
   mutex_fini (p->intern_lock);
diff --git a/libdw/dwarf_tag.c b/libdw/dwarf_tag.c
index 218382a1..7daa4813 100644
--- a/libdw/dwarf_tag.c
+++ b/libdw/dwarf_tag.c
@@ -53,15 +53,19 @@ __libdw_findabbrev (struct Dwarf_CU *cu, unsigned int code)
 
        /* Find the next entry.  It gets automatically added to the
           hash table.  */
+       mutex_lock (cu->abbrev_lock);
        abb = __libdw_getabbrev (cu->dbg, cu, cu->last_abbrev_offset, &length);
+
        if (abb == NULL || abb == DWARF_END_ABBREV)
          {
            /* Make sure we do not try to search for it again.  */
            cu->last_abbrev_offset = (size_t) -1l;
+           mutex_unlock (cu->abbrev_lock);
            return DWARF_END_ABBREV;
          }
 
        cu->last_abbrev_offset += length;
+       mutex_unlock (cu->abbrev_lock);
 
        /* Is this the code we are looking for?  */
        if (abb->code == code)
diff --git a/libdw/libdwP.h b/libdw/libdwP.h
index b77db142..cfb1f206 100644
--- a/libdw/libdwP.h
+++ b/libdw/libdwP.h
@@ -455,14 +455,14 @@ struct Dwarf_CU
      Don't access directly, call __libdw_cu_locs_base.  */
   Dwarf_Off locs_base;
 
-  /* Synchronize access to the abbrev member of a Dwarf_Die that
-     refers to this Dwarf_CU.  Covers __libdw_die_abbrev. */
-  rwlock_define(, abbrev_lock);
-
   /* Synchronize access to the split member of this Dwarf_CU.
      Covers __libdw_find_split_unit.  */
   rwlock_define(, split_lock);
 
+  /* Synchronize access to the last_abbrev_offset member of a Dwarf_Die
+     that refers to this Dwarf_CU.  */
+  mutex_define(, abbrev_lock);
+
   /* Synchronize access to the lines and files members.
      Covers dwarf_getsrclines and dwarf_getsrcfiles.  */
   mutex_define(, src_lock);
@@ -823,41 +823,45 @@ static inline Dwarf_Abbrev *
 __nonnull_attribute__ (1)
 __libdw_dieabbrev (Dwarf_Die *die, const unsigned char **readp)
 {
+  Dwarf_Abbrev *end_abbrev = DWARF_END_ABBREV;
+  Dwarf_Abbrev *expected = (Dwarf_Abbrev *) NULL;
+
   if (unlikely (die->cu == NULL))
     {
-      die->abbrev = DWARF_END_ABBREV;
-      return DWARF_END_ABBREV;
+      __atomic_compare_exchange_n (&die->abbrev, &expected, end_abbrev, false,
+                                  __ATOMIC_RELEASE, __ATOMIC_ACQUIRE);
+      return end_abbrev;
     }
 
-  rwlock_wrlock (die->cu->abbrev_lock);
-
-  /* Do we need to get the abbreviation, or need to read after the code?  */
-  if (die->abbrev == NULL || readp != NULL)
+  Dwarf_Abbrev *abbrev = __atomic_load_n (&die->abbrev, __ATOMIC_ACQUIRE);
+  if (abbrev == NULL || readp != NULL)
     {
-      /* Get the abbreviation code.  */
+      /* We need to get the abbreviation or need to read after the code.  */
       unsigned int code;
       const unsigned char *addr = die->addr;
-
       if (addr >= (const unsigned char *) die->cu->endp)
        {
-         die->abbrev = DWARF_END_ABBREV;
-         rwlock_unlock (die->cu->abbrev_lock);
-         return DWARF_END_ABBREV;
+         __atomic_compare_exchange_n (&die->abbrev, &expected,
+                                      end_abbrev, false,
+                                      __ATOMIC_RELEASE, __ATOMIC_ACQUIRE);
+         return end_abbrev;
        }
 
+      /* Get the abbreviation code.  */
       get_uleb128 (code, addr, die->cu->endp);
       if (readp != NULL)
        *readp = addr;
 
       /* Find the abbreviation.  */
-      if (die->abbrev == NULL)
-       die->abbrev = __libdw_findabbrev (die->cu, code);
+      if (abbrev == NULL)
+       {
+         abbrev = __libdw_findabbrev (die->cu, code);
+         __atomic_compare_exchange_n (&die->abbrev, &expected, abbrev, false,
+                                      __ATOMIC_RELEASE, __ATOMIC_ACQUIRE);
+       }
     }
 
-  Dwarf_Abbrev *result = die->abbrev;
-  rwlock_unlock (die->cu->abbrev_lock);
-
-  return result;
+  return abbrev;
 }
 
 /* Helper functions for form handling.  */
diff --git a/libdw/libdw_findcu.c b/libdw/libdw_findcu.c
index 59267343..a1f700d3 100644
--- a/libdw/libdw_findcu.c
+++ b/libdw/libdw_findcu.c
@@ -177,8 +177,8 @@ __libdw_intern_next_unit (Dwarf *dbg, bool debug_types)
   newp->startp = data->d_buf + newp->start;
   newp->endp = data->d_buf + newp->end;
   eu_search_tree_init (&newp->locs_tree);
-  rwlock_init (newp->abbrev_lock);
   rwlock_init (newp->split_lock);
+  mutex_init (newp->abbrev_lock);
   mutex_init (newp->src_lock);
   mutex_init (newp->str_off_base_lock);
   mutex_init (newp->intern_lock);
-- 
2.51.0

Reply via email to