Greetings,

Attached patch almost entirely mechanically copies x86_64/Gtrace.c and
x86_64/Gstash_frame.c to x86.

Performance result from Gperf-trace on my machine, using recent GCC SVN
(gcc (GCC) 4.7.0 20111031 (experimental)) in 32-bit mode:

before:

unw_getcontext : cold avg=   69.758 nsec, warm avg=   68.856 nsec
unw_init_local : cold avg=  149.264 nsec, warm avg=   58.820 nsec
no cache        : unw_step : 1st= 1132.628 min=  579.818 avg=  614.949 nsec
global cache    : unw_step : 1st=  619.381 min=  579.819 avg=  600.354 nsec
per-thread cache: unw_step : 1st=  638.485 min=  579.818 avg=  598.752 nsec

after:

unw_getcontext : cold avg=   69.141 nsec, warm avg=   28.610 nsec
unw_init_local : cold avg=  159.740 nsec, warm avg=   28.610 nsec
no cache        : unw_step : 1st= 2840.780 min=    8.997 avg=   18.182 nsec
global cache    : unw_step : 1st=   47.234 min=    8.997 avg=   14.752 nsec
per-thread cache: unw_step : 1st=   47.234 min=    8.997 avg=   14.698 nsec

I observe two new failures: Ltest-nocalloc (expected) and run-ptrace-misc.

The second failure I can't quite explain: AFAICT the fast trace is not
used by that test.

Thanks,

P.S. I did not apply this fix:
  http://lists.nongnu.org/archive/html/libunwind-devel/2011-11/msg00018.html
as I didn't get any comments on it yet.

If it is to be applied, it would need to go into both places (I think).

-- 
Paul Pluzhnikov
diff --git a/include/tdep-x86/libunwind_i.h b/include/tdep-x86/libunwind_i.h
index 1f2e144..efb0035 100644
--- a/include/tdep-x86/libunwind_i.h
+++ b/include/tdep-x86/libunwind_i.h
@@ -36,9 +36,24 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 
SOFTWARE.  */
 #include "mempool.h"
 #include "dwarf.h"
 
+typedef enum
+  {
+    UNW_X86_FRAME_STANDARD = -2,     /* regular ebp, esp +/- offset */
+    UNW_X86_FRAME_SIGRETURN = -1,    /* special sigreturn frame */
+    UNW_X86_FRAME_OTHER = 0,         /* not cacheable (special or 
unrecognised) */
+    UNW_X86_FRAME_GUESSED = 1        /* guessed it was regular, but not known 
*/
+  }
+unw_tdep_frame_type_t;
+
 typedef struct
   {
-    /* no x86-specific fast trace */
+    uint32_t virtual_address;
+    int32_t frame_type     : 2;  /* unw_tdep_frame_type_t classification */
+    int32_t last_frame     : 1;  /* non-zero if last frame in chain */
+    int32_t cfa_reg_esp    : 1;  /* cfa dwarf base register is esp vs. ebp */
+    int32_t cfa_reg_offset : 30; /* cfa is at this offset from base register 
value */
+    int32_t ebp_cfa_offset : 15; /* ebp saved at this offset from cfa (-1 = 
not saved) */
+    int32_t esp_cfa_offset : 15; /* esp saved at this offset from cfa (-1 = 
not saved) */
   }
 unw_tdep_frame_t;
 
@@ -61,6 +76,8 @@ struct cursor
   {
     struct dwarf_cursor dwarf;         /* must be first */
 
+    unw_tdep_frame_t frame_info;       /* quick tracing assist info */
+
     /* Format of sigcontext structure and address at which it is
        stored: */
     enum
@@ -250,8 +267,8 @@ dwarf_put (struct dwarf_cursor *c, dwarf_loc_t loc, 
unw_word_t val)
 #define tdep_fetch_frame(c,ip,n)       do {} while(0)
 #define tdep_cache_frame(c,rs)         do {} while(0)
 #define tdep_reuse_frame(c,rs)         do {} while(0)
-#define tdep_stash_frame(c,rs)         do {} while(0)
-#define tdep_trace(cur,addr,n)         (-UNW_ENOINFO)
+#define tdep_stash_frame               UNW_OBJ(stash_frame)
+#define tdep_trace                     UNW_OBJ(tdep_trace)
 
 #ifdef UNW_LOCAL_ONLY
 # define tdep_find_proc_info(c,ip,n)                           \
diff --git a/src/Makefile.am b/src/Makefile.am
index 26b1895..31e90b3 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -236,7 +236,7 @@ libunwind_la_SOURCES_x86 = 
$(libunwind_la_SOURCES_x86_common)               \
        x86/Lcreate_addr_space.c x86/Lget_save_loc.c x86/Lglobal.c      \
        x86/Linit.c x86/Linit_local.c x86/Linit_remote.c                \
        x86/Lget_proc_info.c x86/Lregs.c                                \
-       x86/Lresume.c x86/Lstep.c
+       x86/Lresume.c x86/Lstash_frame.c x86/Lstep.c x86/Ltrace.c
 
 # The list of files that go into libunwind-x86:
 libunwind_x86_la_SOURCES_x86 = $(libunwind_la_SOURCES_x86_common)      \
@@ -245,7 +245,7 @@ libunwind_x86_la_SOURCES_x86 = 
$(libunwind_la_SOURCES_x86_common)   \
        x86/Gcreate_addr_space.c x86/Gget_save_loc.c x86/Gglobal.c      \
        x86/Ginit.c x86/Ginit_local.c x86/Ginit_remote.c                \
        x86/Gget_proc_info.c x86/Gregs.c                                \
-       x86/Gresume.c x86/Gstep.c
+       x86/Gresume.c x86/Gstash_frame.c x86/Gstep.c x86/Gtrace.c
 
 # The list of files that go both into libunwind and libunwind-x86_64:
 noinst_HEADERS += x86_64/offsets.h                                     \
diff --git a/src/x86/Gstash_frame.c b/src/x86/Gstash_frame.c
new file mode 100644
index 0000000..7a8f840
--- /dev/null
+++ b/src/x86/Gstash_frame.c
@@ -0,0 +1,95 @@
+/* libunwind - a platform-independent unwind library
+   Copyright (C) 2011 by FERMI NATIONAL ACCELERATOR LABORATORY
+     Adjusted for x86 from x86_64 by Paul Pluzhnikov <[email protected]>
+
+This file is part of libunwind.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */
+
+#include "unwind_i.h"
+
+HIDDEN void
+tdep_stash_frame (struct dwarf_cursor *d, struct dwarf_reg_state *rs)
+{
+  struct cursor *c = (struct cursor *) dwarf_to_cursor (d);
+  unw_tdep_frame_t *f = &c->frame_info;
+
+  Debug (4, "ip=0x%lx cfa=0x%lx type %d cfa [where=%d val=%ld] cfaoff=%ld"
+        " ra=0x%lx ebp [where=%d val=%ld @0x%lx] esp [where=%d val=%ld 
@0x%lx]\n",
+        d->ip, d->cfa, f->frame_type,
+        rs->reg[DWARF_CFA_REG_COLUMN].where,
+        rs->reg[DWARF_CFA_REG_COLUMN].val,
+        rs->reg[DWARF_CFA_OFF_COLUMN].val,
+        DWARF_GET_LOC(d->loc[d->ret_addr_column]),
+        rs->reg[EBP].where, rs->reg[EBP].val, DWARF_GET_LOC(d->loc[EBP]),
+        rs->reg[ESP].where, rs->reg[ESP].val, DWARF_GET_LOC(d->loc[ESP]));
+
+  /* A standard frame is defined as:
+      - CFA is register-relative offset off EBP or ESP;
+      - Return address is saved at CFA-8;
+      - EBP is unsaved or saved at CFA+offset, offset != -1;
+      - ESP is unsaved or saved at CFA+offset, offset != -1.  */
+  if (f->frame_type == UNW_X86_FRAME_OTHER
+      && (rs->reg[DWARF_CFA_REG_COLUMN].where == DWARF_WHERE_REG)
+      && (rs->reg[DWARF_CFA_REG_COLUMN].val == EBP
+         || rs->reg[DWARF_CFA_REG_COLUMN].val == ESP)
+      && labs(rs->reg[DWARF_CFA_OFF_COLUMN].val) < (1 << 29)
+      && DWARF_GET_LOC(d->loc[d->ret_addr_column]) == d->cfa-8
+      && (rs->reg[EBP].where == DWARF_WHERE_UNDEF
+         || rs->reg[EBP].where == DWARF_WHERE_SAME
+         || (rs->reg[EBP].where == DWARF_WHERE_CFAREL
+             && labs(rs->reg[EBP].val) < (1 << 14)
+             && rs->reg[EBP].val+1 != 0))
+      && (rs->reg[ESP].where == DWARF_WHERE_UNDEF
+         || rs->reg[ESP].where == DWARF_WHERE_SAME
+         || (rs->reg[ESP].where == DWARF_WHERE_CFAREL
+             && labs(rs->reg[ESP].val) < (1 << 14)
+             && rs->reg[ESP].val+1 != 0)))
+  {
+    /* Save information for a standard frame. */
+    f->frame_type = UNW_X86_FRAME_STANDARD;
+    f->cfa_reg_esp = (rs->reg[DWARF_CFA_REG_COLUMN].val == ESP);
+    f->cfa_reg_offset = rs->reg[DWARF_CFA_OFF_COLUMN].val;
+    if (rs->reg[EBP].where == DWARF_WHERE_CFAREL)
+      f->ebp_cfa_offset = rs->reg[EBP].val;
+    if (rs->reg[ESP].where == DWARF_WHERE_CFAREL)
+      f->esp_cfa_offset = rs->reg[ESP].val;
+    Debug (4, " standard frame\n");
+  }
+
+  /* Signal frame was detected via augmentation in tdep_fetch_frame()
+     and partially filled in tdep_reuse_frame().  Now that we have
+     the delta between inner and outer CFAs available to use, fill in
+     the offsets for CFA and stored registers.  We don't have space
+     for EIP, its location is calculated relative to EBP location. */
+  else if (f->frame_type == UNW_X86_FRAME_SIGRETURN)
+  {
+    assert (f->cfa_reg_offset == -1);
+    f->cfa_reg_offset = d->cfa - c->sigcontext_addr;
+    f->ebp_cfa_offset = DWARF_GET_LOC(d->loc[EBP]) - d->cfa;
+    f->esp_cfa_offset = DWARF_GET_LOC(d->loc[ESP]) - d->cfa;
+    Debug (4, " sigreturn frame ebpoff %d espoff %d\n",
+          f->ebp_cfa_offset, f->esp_cfa_offset);
+  }
+
+  /* PLT and guessed EBP-walked frames are handled in unw_step(). */
+  else
+    Debug (4, " unusual frame\n");
+}
diff --git a/src/x86/Gtrace.c b/src/x86/Gtrace.c
new file mode 100644
index 0000000..6d5ad45
--- /dev/null
+++ b/src/x86/Gtrace.c
@@ -0,0 +1,543 @@
+/* libunwind - a platform-independent unwind library
+   Copyright (C) 2010, 2011 by FERMI NATIONAL ACCELERATOR LABORATORY
+     Adjusted for x86 from x86_64 by Paul Pluzhnikov <[email protected]>
+
+This file is part of libunwind.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */
+
+#include "unwind_i.h"
+#include "offsets.h"
+#include <signal.h>
+#include <limits.h>
+
+#pragma weak pthread_once
+#pragma weak pthread_key_create
+#pragma weak pthread_getspecific
+#pragma weak pthread_setspecific
+
+/* Initial hash table size. Table expands by 2 bits (times four). */
+#define HASH_MIN_BITS 14
+
+/* There's not enough space to store EIP's location in a signal
+   frame, but we can calculate it relative to EBP's (or ESP's)
+   position in mcontext structure.  Note we don't want to use
+   the UC_MCONTEXT_GREGS_* directly since we rely on DWARF info. */
+#define dEIP (LINUX_SC_EIP_OFF - LINUX_SC_EBP_OFF)
+
+typedef struct
+{
+  unw_tdep_frame_t *frames;
+  size_t log_size;
+  size_t used;
+  size_t dtor_count;  /* Counts how many times our destructor has already
+                        been called. */
+} unw_trace_cache_t;
+
+static const unw_tdep_frame_t empty_frame = { 0, UNW_X86_FRAME_OTHER, -1, -1, 
0, -1, -1 };
+static pthread_mutex_t trace_init_lock = PTHREAD_MUTEX_INITIALIZER;
+static pthread_once_t trace_cache_once = PTHREAD_ONCE_INIT;
+static sig_atomic_t trace_cache_once_happen;
+static pthread_key_t trace_cache_key;
+static struct mempool trace_cache_pool;
+static __thread  unw_trace_cache_t *tls_cache;
+static __thread  int tls_cache_destroyed;
+
+/* Free memory for a thread's trace cache. */
+static void
+trace_cache_free (void *arg)
+{
+  unw_trace_cache_t *cache = arg;
+  if (++cache->dtor_count < PTHREAD_DESTRUCTOR_ITERATIONS)
+  {
+    /* Not yet our turn to get destroyed. Re-install ourselves into the key. */
+    pthread_setspecific(trace_cache_key, cache);
+    Debug(5, "delayed freeing cache %p (%zx to go)\n", cache,
+         PTHREAD_DESTRUCTOR_ITERATIONS - cache->dtor_count);
+    return;
+  }
+  munmap (cache->frames, (1u << cache->log_size) * sizeof(unw_tdep_frame_t));
+  mempool_free (&trace_cache_pool, cache);
+  tls_cache = NULL;
+  tls_cache_destroyed = 1;
+  Debug(5, "freed cache %p\n", cache);
+}
+
+/* Initialise frame tracing for threaded use. */
+static void
+trace_cache_init_once (void)
+{
+  pthread_key_create (&trace_cache_key, &trace_cache_free);
+  mempool_init (&trace_cache_pool, sizeof (unw_trace_cache_t), 0);
+  trace_cache_once_happen = 1;
+}
+
+static unw_tdep_frame_t *
+trace_cache_buckets (size_t n)
+{
+  unw_tdep_frame_t *frames;
+  size_t i;
+
+  GET_MEMORY(frames, n * sizeof (unw_tdep_frame_t));
+  if (likely(frames != 0))
+    for (i = 0; i < n; ++i)
+      frames[i] = empty_frame;
+
+  return frames;
+}
+
+/* Allocate and initialise hash table for frame cache lookups.
+   Returns the cache initialised with (1u << HASH_LOW_BITS) hash
+   buckets, or NULL if there was a memory allocation problem. */
+static unw_trace_cache_t *
+trace_cache_create (void)
+{
+  unw_trace_cache_t *cache;
+
+  if (tls_cache_destroyed)
+  {
+    /* The current thread is in the process of exiting. Don't recreate
+       cache, as we wouldn't have another chance to free it. */
+    Debug(5, "refusing to reallocate cache: "
+            "thread-locals are being deallocated\n");
+    return NULL;
+  }
+
+  if (! (cache = mempool_alloc(&trace_cache_pool)))
+  {
+    Debug(5, "failed to allocate cache\n");
+    return NULL;
+  }
+
+  if (! (cache->frames = trace_cache_buckets(1u << HASH_MIN_BITS)))
+  {
+    Debug(5, "failed to allocate buckets\n");
+    mempool_free(&trace_cache_pool, cache);
+    return NULL;
+  }
+
+  cache->log_size = HASH_MIN_BITS;
+  cache->used = 0;
+  cache->dtor_count = 0;
+  tls_cache_destroyed = 0;  /* Paranoia: should already be 0. */
+  Debug(5, "allocated cache %p\n", cache);
+  return cache;
+}
+
+/* Expand the hash table in the frame cache if possible. This always
+   quadruples the hash size, and clears all previous frame entries. */
+static int
+trace_cache_expand (unw_trace_cache_t *cache)
+{
+  size_t old_size = (1u << cache->log_size);
+  size_t new_log_size = cache->log_size + 2;
+  unw_tdep_frame_t *new_frames = trace_cache_buckets (1u << new_log_size);
+
+  if (unlikely(! new_frames))
+  {
+    Debug(5, "failed to expand cache to 2^%lu buckets\n", new_log_size);
+    return -UNW_ENOMEM;
+  }
+
+  Debug(5, "expanded cache from 2^%lu to 2^%lu buckets\n", cache->log_size, 
new_log_size);
+  munmap(cache->frames, old_size * sizeof(unw_tdep_frame_t));
+  cache->frames = new_frames;
+  cache->log_size = new_log_size;
+  cache->used = 0;
+  return 0;
+}
+
+static unw_trace_cache_t *
+trace_cache_get_unthreaded (void)
+{
+  unw_trace_cache_t *cache;
+  intrmask_t saved_mask;
+  static unw_trace_cache_t *global_cache = 0;
+  lock_acquire (&trace_init_lock, saved_mask);
+  if (! global_cache)
+  {
+    mempool_init (&trace_cache_pool, sizeof (unw_trace_cache_t), 0);
+    global_cache = trace_cache_create ();
+  }
+  cache = global_cache;
+  lock_release (&trace_init_lock, saved_mask);
+  Debug(5, "using cache %p\n", cache);
+  return cache;
+}
+
+/* Get the frame cache for the current thread. Create it if there is none. */
+static unw_trace_cache_t *
+trace_cache_get (void)
+{
+  unw_trace_cache_t *cache;
+  if (likely (pthread_once != 0))
+  {
+    pthread_once(&trace_cache_once, &trace_cache_init_once);
+    if (!trace_cache_once_happen)
+    {
+      return trace_cache_get_unthreaded();
+    }
+    if (! (cache = tls_cache))
+    {
+      cache = trace_cache_create();
+      pthread_setspecific(trace_cache_key, cache);
+      tls_cache = cache;
+    }
+    Debug(5, "using cache %p\n", cache);
+    return cache;
+  }
+  else
+  {
+    return trace_cache_get_unthreaded();
+  }
+}
+
+/* Initialise frame properties for address cache slot F at address
+   EIP using current CFA, EBP and ESP values.  Modifies CURSOR to
+   that location, performs one unw_step(), and fills F with what
+   was discovered about the location.  Returns F.
+
+   FIXME: This probably should tell DWARF handling to never evaluate
+   or use registers other than EBP, ESP and EIP in case there is
+   highly unusual unwind info which uses these creatively. */
+static unw_tdep_frame_t *
+trace_init_addr (unw_tdep_frame_t *f,
+                unw_cursor_t *cursor,
+                unw_word_t cfa,
+                unw_word_t eip,
+                unw_word_t ebp,
+                unw_word_t esp)
+{
+  struct cursor *c = (struct cursor *) cursor;
+  struct dwarf_cursor *d = &c->dwarf;
+  int ret = -UNW_EINVAL;
+
+  /* Initialise frame properties: unknown, not last. */
+  f->virtual_address = eip;
+  f->frame_type = UNW_X86_FRAME_OTHER;
+  f->last_frame = 0;
+  f->cfa_reg_esp = -1;
+  f->cfa_reg_offset = 0;
+  f->ebp_cfa_offset = -1;
+  f->esp_cfa_offset = -1;
+
+  /* Reinitialise cursor to this instruction - but undo next/prev EIP
+     adjustment because unw_step will redo it - and force EIP, EBP
+     ESP into register locations (=~ ucontext we keep), then set
+     their desired values. Then perform the step. */
+  d->ip = eip + d->use_prev_instr;
+  d->cfa = cfa;
+  d->loc[UNW_X86_EIP] = DWARF_REG_LOC (d, UNW_X86_EIP);
+  d->loc[UNW_X86_EBP] = DWARF_REG_LOC (d, UNW_X86_EBP);
+  d->loc[UNW_X86_ESP] = DWARF_REG_LOC (d, UNW_X86_ESP);
+  c->frame_info = *f;
+
+  if (likely(dwarf_put (d, d->loc[UNW_X86_EIP], eip) >= 0)
+      && likely(dwarf_put (d, d->loc[UNW_X86_EBP], ebp) >= 0)
+      && likely(dwarf_put (d, d->loc[UNW_X86_ESP], esp) >= 0)
+      && likely((ret = unw_step (cursor)) >= 0))
+    *f = c->frame_info;
+
+  /* If unw_step() stopped voluntarily, remember that, even if it
+     otherwise could not determine anything useful.  This avoids
+     failing trace if we hit frames without unwind info, which is
+     common for the outermost frame (CRT stuff) on many systems.
+     This avoids failing trace in very common circumstances; failing
+     to unw_step() loop wouldn't produce any better result. */
+  if (ret == 0)
+    f->last_frame = -1;
+
+  Debug (3, "frame va %lx type %d last %d cfa %s+%d ebp @ cfa%+d esp @ 
cfa%+d\n",
+        f->virtual_address, f->frame_type, f->last_frame,
+        f->cfa_reg_esp ? "esp" : "ebp", f->cfa_reg_offset,
+        f->ebp_cfa_offset, f->esp_cfa_offset);
+
+  return f;
+}
+
+/* Look up and if necessary fill in frame attributes for address EIP
+   in CACHE using current CFA, EBP and ESP values.  Uses CURSOR to
+   perform any unwind steps necessary to fill the cache.  Returns the
+   frame cache slot which describes EIP. */
+static unw_tdep_frame_t *
+trace_lookup (unw_cursor_t *cursor,
+             unw_trace_cache_t *cache,
+             unw_word_t cfa,
+             unw_word_t eip,
+             unw_word_t ebp,
+             unw_word_t esp)
+{
+  /* First look up for previously cached information using cache as
+     linear probing hash table with probe step of 1.  Majority of
+     lookups should be completed within few steps, but it is very
+     important the hash table does not fill up, or performance falls
+     off the cliff. */
+  uint64_t i, addr;
+  uint64_t cache_size = 1u << cache->log_size;
+  uint64_t slot = ((eip * 0x9e3779b97f4a7c16) >> 43) & (cache_size-1);
+  unw_tdep_frame_t *frame;
+
+  for (i = 0; i < 16; ++i)
+  {
+    frame = &cache->frames[slot];
+    addr = frame->virtual_address;
+
+    /* Return if we found the address. */
+    if (likely(addr == eip))
+    {
+      Debug (4, "found address after %ld steps\n", i);
+      return frame;
+    }
+
+    /* If slot is empty, reuse it. */
+    if (likely(! addr))
+      break;
+
+    /* Linear probe to next slot candidate, step = 1. */
+    if (++slot >= cache_size)
+      slot -= cache_size;
+  }
+
+  /* If we collided after 16 steps, or if the hash is more than half
+     full, force the hash to expand. Fill the selected slot, whether
+     it's free or collides. Note that hash expansion drops previous
+     contents; further lookups will refill the hash. */
+  Debug (4, "updating slot %lu after %ld steps, replacing 0x%lx\n", slot, i, 
addr);
+  if (unlikely(addr || cache->used >= cache_size / 2))
+  {
+    if (unlikely(trace_cache_expand (cache) < 0))
+      return NULL;
+
+    cache_size = 1u << cache->log_size;
+    slot = ((eip * 0x9e3779b97f4a7c16) >> 43) & (cache_size-1);
+    frame = &cache->frames[slot];
+    addr = frame->virtual_address;
+  }
+
+  if (! addr)
+    ++cache->used;
+
+  return trace_init_addr (frame, cursor, cfa, eip, ebp, esp);
+}
+
+/* Fast stack backtrace for x86.
+
+   This is used by backtrace() implementation to accelerate frequent
+   queries for current stack, without any desire to unwind. It fills
+   BUFFER with the call tree from CURSOR upwards for at most SIZE
+   stack levels. The first frame, backtrace itself, is omitted. When
+   called, SIZE should give the maximum number of entries that can be
+   stored into BUFFER. Uses an internal thread-specific cache to
+   accelerate queries.
+
+   The caller should fall back to a unw_step() loop if this function
+   fails by returning -UNW_ESTOPUNWIND, meaning the routine hit a
+   stack frame that is too complex to be traced in the fast path.
+
+   This function is tuned for clients which only need to walk the
+   stack to get the call tree as fast as possible but without any
+   other details, for example profilers sampling the stack thousands
+   to millions of times per second.  The routine handles the most
+   common x86 ABI stack layouts: CFA is EBP or ESP plus/minus
+   constant offset, return address is at CFA-8, and EBP and ESP are
+   either unchanged or saved on stack at constant offset from the CFA;
+   the signal return frame; and frames without unwind info provided
+   they are at the outermost (final) frame or can conservatively be
+   assumed to be frame-pointer based.
+
+   Any other stack layout will cause the routine to give up. There
+   are only a handful of relatively rarely used functions which do
+   not have a stack in the standard form: vfork, longjmp, setcontext
+   and _dl_runtime_profile on common linux systems for example.
+
+   On success BUFFER and *SIZE reflect the trace progress up to *SIZE
+   stack levels or the outermost frame, which ever is less.  It may
+   stop short of outermost frame if unw_step() loop would also do so,
+   e.g. if there is no more unwind information; this is not reported
+   as an error.
+
+   The function returns a negative value for errors, -UNW_ESTOPUNWIND
+   if tracing stopped because of an unusual frame unwind info.  The
+   BUFFER and *SIZE reflect tracing progress up to the error frame.
+
+   Callers of this function would normally look like this:
+
+     unw_cursor_t     cur;
+     unw_context_t    ctx;
+     void             addrs[128];
+     int              depth = 128;
+     int              ret;
+
+     unw_getcontext(&ctx);
+     unw_init_local(&cur, &ctx);
+     if ((ret = unw_tdep_trace(&cur, addrs, &depth)) < 0)
+     {
+       depth = 0;
+       unw_getcontext(&ctx);
+       unw_init_local(&cur, &ctx);
+       while ((ret = unw_step(&cur)) > 0 && depth < 128)
+       {
+         unw_word_t ip;
+         unw_get_reg(&cur, UNW_REG_IP, &ip);
+         addresses[depth++] = (void *) ip;
+       }
+     }
+*/
+HIDDEN int
+tdep_trace (unw_cursor_t *cursor, void **buffer, int *size)
+{
+  struct cursor *c = (struct cursor *) cursor;
+  struct dwarf_cursor *d = &c->dwarf;
+  unw_trace_cache_t *cache;
+  unw_word_t ebp, esp, eip, cfa;
+  int maxdepth = 0;
+  int depth = 0;
+  int ret;
+
+  /* Check input parameters. */
+  if (unlikely(! cursor || ! buffer || ! size || (maxdepth = *size) <= 0))
+    return -UNW_EINVAL;
+
+  Debug (1, "begin ip 0x%lx cfa 0x%lx\n", d->ip, d->cfa);
+
+  /* Tell core dwarf routines to call back to us. */
+  d->stash_frames = 1;
+
+  /* Determine initial register values. These are direct access safe
+     because we know they come from the initial machine context. */
+  eip = d->ip;
+  esp = cfa = d->cfa;
+  ACCESS_MEM_FAST(ret, 0, d, DWARF_GET_LOC(d->loc[UNW_X86_EBP]), ebp);
+  assert(ret == 0);
+
+  /* Get frame cache. */
+  if (unlikely(! (cache = trace_cache_get())))
+  {
+    Debug (1, "returning %d, cannot get trace cache\n", -UNW_ENOMEM);
+    *size = 0;
+    d->stash_frames = 0;
+    return -UNW_ENOMEM;
+  }
+
+  /* Trace the stack upwards, starting from current EIP.  Adjust
+     the EIP address for previous/next instruction as the main
+     unwinding logic would also do.  We undo this before calling
+     back into unw_step(). */
+  while (depth < maxdepth)
+  {
+    eip -= d->use_prev_instr;
+    Debug (2, "depth %d cfa 0x%lx eip 0x%lx esp 0x%lx ebp 0x%lx\n",
+          depth, cfa, eip, esp, ebp);
+
+    /* See if we have this address cached.  If not, evaluate enough of
+       the dwarf unwind information to fill the cache line data, or to
+       decide this frame cannot be handled in fast trace mode.  We
+       cache negative results too to prevent unnecessary dwarf parsing
+       for common failures. */
+    unw_tdep_frame_t *f = trace_lookup (cursor, cache, cfa, eip, ebp, esp);
+
+    /* If we don't have information for this frame, give up. */
+    if (unlikely(! f))
+    {
+      ret = -UNW_ENOINFO;
+      break;
+    }
+
+    Debug (3, "frame va %lx type %d last %d cfa %s+%d ebp @ cfa%+d esp @ 
cfa%+d\n",
+           f->virtual_address, f->frame_type, f->last_frame,
+           f->cfa_reg_esp ? "esp" : "ebp", f->cfa_reg_offset,
+           f->ebp_cfa_offset, f->esp_cfa_offset);
+
+    assert (f->virtual_address == eip);
+
+    /* Stop if this was the last frame.  In particular don't evaluate
+       new register values as it may not be safe - we don't normally
+       run with full validation on, and do not want to - and there's
+       enough bad unwind info floating around that we need to trust
+       what unw_step() previously said, in potentially bogus frames. */
+    if (f->last_frame)
+      break;
+
+    /* Evaluate CFA and registers for the next frame. */
+    switch (f->frame_type)
+    {
+    case UNW_X86_FRAME_GUESSED:
+      /* Fall thru to standard processing after forcing validation. */
+      c->validate = 1;
+
+    case UNW_X86_FRAME_STANDARD:
+      /* Advance standard traceable frame. */
+      cfa = (f->cfa_reg_esp ? esp : ebp) + f->cfa_reg_offset;
+      ACCESS_MEM_FAST(ret, c->validate, d, cfa - 8, eip);
+      if (likely(ret >= 0) && likely(f->ebp_cfa_offset != -1))
+       ACCESS_MEM_FAST(ret, c->validate, d, cfa + f->ebp_cfa_offset, ebp);
+
+      /* Don't bother reading ESP from DWARF, CFA becomes new ESP. */
+      esp = cfa;
+
+      /* Next frame needs to back up for unwind info lookup. */
+      d->use_prev_instr = 1;
+      break;
+
+    case UNW_X86_FRAME_SIGRETURN:
+      /* Advance standard signal frame, whose CFA points above saved
+         registers (ucontext) among other things.  We know the info
+        is stored at some unknown constant offset off inner frame's
+        CFA.  We determine the actual offset from DWARF unwind info. */
+      cfa = cfa + f->cfa_reg_offset;
+      ACCESS_MEM_FAST(ret, c->validate, d, cfa + f->ebp_cfa_offset + dEIP, 
eip);
+      if (likely(ret >= 0))
+        ACCESS_MEM_FAST(ret, c->validate, d, cfa + f->ebp_cfa_offset, ebp);
+      if (likely(ret >= 0))
+        ACCESS_MEM_FAST(ret, c->validate, d, cfa + f->esp_cfa_offset, esp);
+
+      /* Resume stack at signal restoration point. The stack is not
+         necessarily continuous here, especially with sigaltstack(). */
+      cfa = esp;
+
+      /* Next frame should not back up. */
+      d->use_prev_instr = 0;
+      break;
+
+    default:
+      /* We cannot trace through this frame, give up and tell the
+        caller we had to stop.  Data collected so far may still be
+        useful to the caller, so let it know how far we got.  */
+      ret = -UNW_ESTOPUNWIND;
+      break;
+    }
+
+    Debug (4, "new cfa 0x%lx eip 0x%lx esp 0x%lx ebp 0x%lx\n",
+          cfa, eip, esp, ebp);
+
+    /* If we failed or ended up somewhere bogus, stop. */
+    if (unlikely(ret < 0 || eip < 0x4000))
+      break;
+
+    /* Record this address in stack trace. We skipped the first address. */
+    buffer[depth++] = (void *) (eip - d->use_prev_instr);
+  }
+
+#if UNW_DEBUG
+  Debug (1, "returning %d, depth %d\n", ret, depth);
+#endif
+  *size = depth;
+  return ret;
+}
_______________________________________________
Libunwind-devel mailing list
[email protected]
https://lists.nongnu.org/mailman/listinfo/libunwind-devel

Reply via email to