Author: Lassi Tuura <lat@cern.ch>
Date:   2011-03-24 10:13:44 +0100

    Performance optimisations for fast trace.

Insert static branch prediction predicates in useful places and avoid
unnecessary code in the hottest paths. Bypass unnecessary indirect
calls, in particular to access_mem(), when known to be safe.
---
 include/libunwind_i.h             |    7 +++++
 include/tdep-x86_64/libunwind_i.h |    8 +++---
 src/mi/backtrace.c                |    8 +++---
 src/x86_64/Ginit.c                |   14 ++---------
 src/x86_64/Ginit_local.c          |    2 +-
 src/x86_64/Gos-freebsd.c          |    1 +
 src/x86_64/Gos-linux.c            |    1 +
 src/x86_64/Gtrace.c               |   49 ++++++++++++++++++-------------------
 src/x86_64/init.h                 |   48 +++++++++++++++++++++++-------------
 src/x86_64/unwind_i.h             |   12 +++++++++
 10 files changed, 87 insertions(+), 63 deletions(-)

diff --git a/include/libunwind_i.h b/include/libunwind_i.h
index 9b91a12..81fc3d6 100644
--- a/include/libunwind_i.h
+++ b/include/libunwind_i.h
@@ -301,6 +301,13 @@ struct elf_image
     size_t size;		/* (file-) size of the image */
   };
 
+/* Default ACCESS_MEM_FAST: fetch the value at memory location ADDR into
+   TO via dwarf_get() and store the result code in RET; VALIDATE is
+   unused here.  An architecture may override this with a faster local
+   read when no validation is needed, which helps unw_tdep_trace(). */
+#define ACCESS_MEM_FAST(ret,validate,cur,addr,to) \
+  do { (ret) = dwarf_get ((cur), DWARF_MEM_LOC ((cur), (addr)), &(to)); } while (0)
+
 #include "tdep/libunwind_i.h"
 
 #ifndef tdep_get_func_addr
diff --git a/include/tdep-x86_64/libunwind_i.h b/include/tdep-x86_64/libunwind_i.h
index b076bd1..a427af3 100644
--- a/include/tdep-x86_64/libunwind_i.h
+++ b/include/tdep-x86_64/libunwind_i.h
@@ -89,10 +89,10 @@ dwarf_get_uc(const struct dwarf_cursor *cursor)
 # define DWARF_LOC(r, t)	((dwarf_loc_t) { .val = (r) })
 # define DWARF_IS_REG_LOC(l)	0
 # define DWARF_REG_LOC(c,r)	(DWARF_LOC((unw_word_t)			     \
-				 tdep_uc_addr(dwarf_get_uc(c), (r)), 0))
+				 x86_64_r_uc_addr(dwarf_get_uc(c), (r)), 0))
 # define DWARF_MEM_LOC(c,m)	DWARF_LOC ((m), 0)
 # define DWARF_FPREG_LOC(c,r)	(DWARF_LOC((unw_word_t)			     \
-				 tdep_uc_addr(dwarf_get_uc(c), (r)), 0))
+				 x86_64_r_uc_addr(dwarf_get_uc(c), (r)), 0))
 #else /* !UNW_LOCAL_ONLY */
 
 # define DWARF_LOC_TYPE_FP	(1 << 0)
@@ -162,7 +162,6 @@ dwarf_put (struct dwarf_cursor *c, dwarf_loc_t loc, unw_word_t val)
 /* Platforms that support UNW_INFO_FORMAT_TABLE need to define
    tdep_search_unwind_table.  */
 #define tdep_search_unwind_table	dwarf_search_unwind_table
-#define tdep_uc_addr			UNW_ARCH_OBJ(uc_addr)
 #define tdep_get_elf_image		UNW_ARCH_OBJ(get_elf_image)
 #define tdep_access_reg			UNW_OBJ(access_reg)
 #define tdep_access_fpreg		UNW_OBJ(access_fpreg)
@@ -176,6 +175,7 @@ dwarf_put (struct dwarf_cursor *c, dwarf_loc_t loc, unw_word_t val)
 # define tdep_reuse_frame(c,rs)		do {} while(0)
 #endif
 #define tdep_stash_frame		UNW_OBJ(stash_frame)
+#define x86_64_r_uc_addr                UNW_OBJ(r_uc_addr)
 
 #ifdef UNW_LOCAL_ONLY
 # define tdep_find_proc_info(c,ip,n)				\
@@ -203,7 +203,7 @@ extern void tdep_init_mem_validate (void);
 extern int tdep_search_unwind_table (unw_addr_space_t as, unw_word_t ip,
 				     unw_dyn_info_t *di, unw_proc_info_t *pi,
 				     int need_unwind_info, void *arg);
-extern void *tdep_uc_addr (ucontext_t *uc, int reg);
+extern void *x86_64_r_uc_addr (ucontext_t *uc, int reg);
 extern int tdep_get_elf_image (struct elf_image *ei, pid_t pid, unw_word_t ip,
 			       unsigned long *segbase, unsigned long *mapoff,
 			       char *path, size_t pathlen);
diff --git a/src/mi/backtrace.c b/src/mi/backtrace.c
index a126b1a..309783e 100644
--- a/src/mi/backtrace.c
+++ b/src/mi/backtrace.c
@@ -26,7 +26,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */
 #ifndef UNW_REMOTE_ONLY
 
 #define UNW_LOCAL_ONLY
-#include <libunwind.h>
+#include "libunwind_i.h"
 #include <string.h>
 
 /* See glibc manual for a description of this function.  */
@@ -40,7 +40,7 @@ slow_backtrace (void **buffer, int size)
   int n = 0;
 
   unw_getcontext (&uc);
-  if (unw_init_local (&cursor, &uc) < 0)
+  if (unlikely (unw_init_local (&cursor, &uc) < 0))
     return 0;
 
   while (unw_step (&cursor) > 0)
@@ -65,7 +65,7 @@ backtrace (void **buffer, int size)
 
   unw_getcontext (&uc);
 
-  if (unw_init_local (&cursor, &uc) < 0)
+  if (unlikely (unw_init_local (&cursor, &uc) < 0))
     return 0;
 
   /* We don't need backtrace() to show up in buffer */
@@ -73,7 +73,7 @@ backtrace (void **buffer, int size)
   if (ret < 0)
     return ret;
 
-  if (unw_tdep_trace (&cursor, buffer, &n) < 0)
+  if (unlikely (unw_tdep_trace (&cursor, buffer, &n) < 0))
     {
       return slow_backtrace(buffer, size);
     }
diff --git a/src/x86_64/Ginit.c b/src/x86_64/Ginit.c
index f49e4ba..e779d93 100644
--- a/src/x86_64/Ginit.c
+++ b/src/x86_64/Ginit.c
@@ -47,16 +47,6 @@ static struct unw_addr_space local_addr_space;
 
 PROTECTED unw_addr_space_t unw_local_addr_space = &local_addr_space;
 
-# ifdef UNW_LOCAL_ONLY
-
-HIDDEN void *
-tdep_uc_addr (ucontext_t *uc, int reg)
-{
-  return x86_64_r_uc_addr (uc, reg);
-}
-
-# endif /* UNW_LOCAL_ONLY */
-
 HIDDEN unw_dyn_info_list_t _U_dyn_info_list;
 
 /* XXX fix me: there is currently no way to locate the dyn-info list
@@ -168,7 +158,7 @@ static int
 access_mem (unw_addr_space_t as, unw_word_t addr, unw_word_t *val, int write,
 	    void *arg)
 {
-  if (write)
+  if (unlikely (write))
     {
       Debug (16, "mem[%016lx] <- %lx\n", addr, *val);
       *(unw_word_t *) addr = *val;
@@ -177,7 +167,7 @@ access_mem (unw_addr_space_t as, unw_word_t addr, unw_word_t *val, int write,
     {
       /* validate address */
       const struct cursor *c = (const struct cursor *)arg;
-      if (c && c->validate && validate_mem(addr))
+      if (likely (c != 0) && unlikely (c->validate) && unlikely (validate_mem (addr)))
         return -1;
       *val = *(unw_word_t *) addr;
       Debug (16, "mem[%016lx] -> %lx\n", addr, *val);
diff --git a/src/x86_64/Ginit_local.c b/src/x86_64/Ginit_local.c
index 70bef3e..54b4fcd 100644
--- a/src/x86_64/Ginit_local.c
+++ b/src/x86_64/Ginit_local.c
@@ -43,7 +43,7 @@ unw_init_local (unw_cursor_t *cursor, ucontext_t *uc)
 {
   struct cursor *c = (struct cursor *) cursor;
 
-  if (tdep_needs_initialization)
+  if (unlikely (tdep_needs_initialization))
     tdep_init ();
 
   Debug (1, "(cursor=%p)\n", c);
diff --git a/src/x86_64/Gos-freebsd.c b/src/x86_64/Gos-freebsd.c
index 50ee60b..3ef9926 100644
--- a/src/x86_64/Gos-freebsd.c
+++ b/src/x86_64/Gos-freebsd.c
@@ -154,6 +154,7 @@ unw_handle_signal_frame (unw_cursor_t *cursor)
 HIDDEN void *
 x86_64_r_uc_addr (ucontext_t *uc, int reg)
 {
+  /* NOTE: common_init() in init.h inlines these for fast path access. */
   void *addr;
 
   switch (reg)
diff --git a/src/x86_64/Gos-linux.c b/src/x86_64/Gos-linux.c
index c027888..a315ea1 100644
--- a/src/x86_64/Gos-linux.c
+++ b/src/x86_64/Gos-linux.c
@@ -106,6 +106,7 @@ unw_handle_signal_frame (unw_cursor_t *cursor)
 HIDDEN void *
 x86_64_r_uc_addr (ucontext_t *uc, int reg)
 {
+  /* NOTE: common_init() in init.h inlines these for fast path access. */
   void *addr;
 
   switch (reg)
diff --git a/src/x86_64/Gtrace.c b/src/x86_64/Gtrace.c
index fb6e84a..11669bb 100644
--- a/src/x86_64/Gtrace.c
+++ b/src/x86_64/Gtrace.c
@@ -92,7 +92,7 @@ trace_cache_buckets (void)
   unw_tdep_frame_t *frames = mempool_alloc(&trace_frame_pool);
   size_t i;
 
-  if (likely (frames != 0))
+  if (likely(frames != 0))
     for (i = 0; i < (1u << HASH_LOW_BITS); ++i)
       frames[i] = empty_frame;
 
@@ -142,7 +142,7 @@ trace_cache_expand (unw_trace_cache_t *cache)
   old_size = (1u << cache->log_frame_vecs);
   new_size = cache->log_frame_vecs + 2;
   for (i = old_size; i < (1u << new_size); ++i)
-    if (unlikely (! (cache->frames[i] = trace_cache_buckets())))
+    if (unlikely(! (cache->frames[i] = trace_cache_buckets())))
     {
       Debug(5, "failed to expand cache to 2^%lu hash bucket sets\n", new_size);
       for (j = old_size; j < i; ++j)
@@ -234,10 +234,10 @@ trace_init_addr (unw_tdep_frame_t *f,
   d->loc[UNW_X86_64_RSP] = DWARF_REG_LOC (d, UNW_X86_64_RSP);
   c->frame_info = *f;
 
-  if (dwarf_put (d, d->loc[UNW_X86_64_RIP], rip) >= 0
-      && dwarf_put (d, d->loc[UNW_X86_64_RBP], rbp) >= 0
-      && dwarf_put (d, d->loc[UNW_X86_64_RSP], rsp) >= 0
-      && (ret = unw_step (cursor)) >= 0)
+  if (likely(dwarf_put (d, d->loc[UNW_X86_64_RIP], rip) >= 0)
+      && likely(dwarf_put (d, d->loc[UNW_X86_64_RBP], rbp) >= 0)
+      && likely(dwarf_put (d, d->loc[UNW_X86_64_RSP], rsp) >= 0)
+      && likely((ret = unw_step (cursor)) >= 0))
     *f = c->frame_info;
 
   /* If unw_step() stopped voluntarily, remember that, even if it
@@ -287,14 +287,14 @@ trace_lookup (unw_cursor_t *cursor,
     addr = frame->virtual_address;
 
     /* Return if we found the address. */
-    if (addr == rip)
+    if (likely(addr == rip))
     {
       Debug (4, "found address after %ld steps\n", i);
       return frame;
     }
 
     /* If slot is empty, reuse it. */
-    if (! addr)
+    if (likely(! addr))
       break;
 
     /* Linear probe to next slot candidate, step = 1. */
@@ -307,9 +307,9 @@ trace_lookup (unw_cursor_t *cursor,
      it's free or collides. Note that hash expansion drops previous
      contents; further lookups will refill the hash. */
   Debug (4, "updating slot %lu after %ld steps, replacing 0x%lx\n", slot, i, addr);
-  if (unlikely (addr || cache->used >= cache_size / 2))
+  if (unlikely(addr || cache->used >= cache_size / 2))
   {
-    if (unlikely (trace_cache_expand (cache) < 0))
+    if (unlikely(trace_cache_expand (cache) < 0))
       return 0;
 
     cache_size = 1u << (HASH_LOW_BITS + cache->log_frame_vecs);
@@ -404,7 +404,7 @@ unw_tdep_trace (unw_cursor_t *cursor, void **buffer, int *size)
   int ret;
 
   /* Check input parametres. */
-  if (! cursor || ! buffer || ! size || (maxdepth = *size) <= 0)
+  if (unlikely(! cursor || ! buffer || ! size || (maxdepth = *size) <= 0))
     return -UNW_EINVAL;
 
   Debug (1, "begin ip 0x%lx cfa 0x%lx\n", d->ip, d->cfa);
@@ -415,7 +415,7 @@ unw_tdep_trace (unw_cursor_t *cursor, void **buffer, int *size)
   /* Determine initial register values. */
   rip = d->ip;
   rsp = cfa = d->cfa;
-  if ((ret = dwarf_get (d, d->loc[UNW_X86_64_RBP], &rbp)) < 0)
+  if (unlikely((ret = dwarf_get (d, d->loc[UNW_X86_64_RBP], &rbp)) < 0))
   {
     Debug (1, "returning %d, rbp value not found\n", ret);
     *size = 0;
@@ -424,7 +424,7 @@ unw_tdep_trace (unw_cursor_t *cursor, void **buffer, int *size)
   }
 
   /* Get frame cache. */
-  if (! (cache = trace_cache_get()))
+  if (unlikely(! (cache = trace_cache_get())))
   {
     Debug (1, "returning %d, cannot get trace cache\n", -UNW_ENOMEM);
     *size = 0;
@@ -453,7 +453,7 @@ unw_tdep_trace (unw_cursor_t *cursor, void **buffer, int *size)
     buffer[depth++] = (void *) rip;
 
     /* If we don't have information for this frame, give up. */
-    if (! f)
+    if (unlikely(! f))
     {
       ret = -UNW_ENOINFO;
       break;
@@ -484,9 +484,9 @@ unw_tdep_trace (unw_cursor_t *cursor, void **buffer, int *size)
     case UNW_X86_64_FRAME_STANDARD:
       /* Advance standard traceable frame. */
       cfa = (f->cfa_reg_rsp ? rsp : rbp) + f->cfa_reg_offset;
-      ret = dwarf_get (d, DWARF_MEM_LOC (d, cfa - 8), &rip);
-      if (ret >= 0 && f->rbp_cfa_offset != -1)
-	ret = dwarf_get (d, DWARF_MEM_LOC (d, cfa + f->rbp_cfa_offset), &rbp);
+      ACCESS_MEM_FAST(ret, c->validate, d, cfa - 8, rip);
+      if (likely(ret >= 0) && likely(f->rbp_cfa_offset != -1))
+	ACCESS_MEM_FAST(ret, c->validate, d, cfa + f->rbp_cfa_offset, rbp);
 
       /* Don't bother reading RSP from DWARF, CFA becomes new RSP. */
       rsp = cfa;
@@ -500,13 +500,12 @@ unw_tdep_trace (unw_cursor_t *cursor, void **buffer, int *size)
          registers (ucontext) among other things.  We know the info
 	 is stored at some unknown constant offset off inner frame's
 	 CFA.  We determine the actual offset from DWARF unwind info. */
-      d->use_prev_instr = 0;
       cfa = cfa + f->cfa_reg_offset;
-      ret = dwarf_get (d, DWARF_MEM_LOC (d, cfa + f->rbp_cfa_offset + dRIP), &rip);
-      if (ret >= 0)
-	ret = dwarf_get (d, DWARF_MEM_LOC (d, cfa + f->rbp_cfa_offset), &rbp);
-      if (ret >= 0)
-	ret = dwarf_get (d, DWARF_MEM_LOC (d, cfa + f->rsp_cfa_offset), &rsp);
+      ACCESS_MEM_FAST(ret, c->validate, d, cfa + f->rbp_cfa_offset + dRIP, rip);
+      if (likely(ret >= 0))
+        ACCESS_MEM_FAST(ret, c->validate, d, cfa + f->rbp_cfa_offset, rbp);
+      if (likely(ret >= 0))
+        ACCESS_MEM_FAST(ret, c->validate, d, cfa + f->rsp_cfa_offset, rsp);
 
       /* Resume stack at signal restoration point. The stack is not
          necessarily continuous here, especially with sigaltstack(). */
@@ -527,8 +526,8 @@ unw_tdep_trace (unw_cursor_t *cursor, void **buffer, int *size)
     Debug (4, "new cfa 0x%lx rip 0x%lx rsp 0x%lx rbp 0x%lx\n",
 	   cfa, rip, rsp, rbp);
 
-    /* If we failed on ended up somewhere bogus, stop. */
-    if (ret < 0 || rip < 0x4000)
+    /* If we failed or ended up somewhere bogus, stop. */
+    if (unlikely(ret < 0 || rip < 0x4000))
       break;
   }
 
diff --git a/src/x86_64/init.h b/src/x86_64/init.h
index f04ecda..e80e553 100644
--- a/src/x86_64/init.h
+++ b/src/x86_64/init.h
@@ -27,28 +27,42 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */
 
 #include "unwind_i.h"
 
+/* Initial register location; local Linux/FreeBSD skip x86_64_r_uc_addr(). */
+#if defined UNW_LOCAL_ONLY && defined __linux /* gregs[] slot REG_<ruc> */
+# define REG_INIT_LOC(c, rlc, ruc) \
+    DWARF_LOC ((unw_word_t) &c->uc->uc_mcontext.gregs[REG_ ## ruc], 0)
+
+#elif defined UNW_LOCAL_ONLY && defined __FreeBSD__ /* member mc_<rlc> */
+# define REG_INIT_LOC(c, rlc, ruc) \
+    DWARF_LOC ((unw_word_t) &c->uc->uc_mcontext.mc_ ## rlc, 0)
+
+#else /* fall back to the generic DWARF_REG_LOC */
+# define REG_INIT_LOC(c, rlc, ruc) \
+    DWARF_REG_LOC (&c->dwarf, UNW_X86_64_ ## ruc)
+#endif /* REG_INIT_LOC variants */
+
 static inline int
 common_init (struct cursor *c, unsigned use_prev_instr)
 {
   int ret;
 
-  c->dwarf.loc[RAX] = DWARF_REG_LOC (&c->dwarf, UNW_X86_64_RAX);
-  c->dwarf.loc[RDX] = DWARF_REG_LOC (&c->dwarf, UNW_X86_64_RDX);
-  c->dwarf.loc[RCX] = DWARF_REG_LOC (&c->dwarf, UNW_X86_64_RCX);
-  c->dwarf.loc[RBX] = DWARF_REG_LOC (&c->dwarf, UNW_X86_64_RBX);
-  c->dwarf.loc[RSI] = DWARF_REG_LOC (&c->dwarf, UNW_X86_64_RSI);
-  c->dwarf.loc[RDI] = DWARF_REG_LOC (&c->dwarf, UNW_X86_64_RDI);
-  c->dwarf.loc[RBP] = DWARF_REG_LOC (&c->dwarf, UNW_X86_64_RBP);
-  c->dwarf.loc[RSP] = DWARF_REG_LOC (&c->dwarf, UNW_X86_64_RSP);
-  c->dwarf.loc[R8]  = DWARF_REG_LOC (&c->dwarf, UNW_X86_64_R8);
-  c->dwarf.loc[R9]  = DWARF_REG_LOC (&c->dwarf, UNW_X86_64_R9);
-  c->dwarf.loc[R10] = DWARF_REG_LOC (&c->dwarf, UNW_X86_64_R10);
-  c->dwarf.loc[R11] = DWARF_REG_LOC (&c->dwarf, UNW_X86_64_R11);
-  c->dwarf.loc[R12] = DWARF_REG_LOC (&c->dwarf, UNW_X86_64_R12);
-  c->dwarf.loc[R13] = DWARF_REG_LOC (&c->dwarf, UNW_X86_64_R13);
-  c->dwarf.loc[R14] = DWARF_REG_LOC (&c->dwarf, UNW_X86_64_R14);
-  c->dwarf.loc[R15] = DWARF_REG_LOC (&c->dwarf, UNW_X86_64_R15);
-  c->dwarf.loc[RIP] = DWARF_REG_LOC (&c->dwarf, UNW_X86_64_RIP);
+  c->dwarf.loc[RAX] = REG_INIT_LOC(c, rax, RAX);
+  c->dwarf.loc[RDX] = REG_INIT_LOC(c, rdx, RDX);
+  c->dwarf.loc[RCX] = REG_INIT_LOC(c, rcx, RCX);
+  c->dwarf.loc[RBX] = REG_INIT_LOC(c, rbx, RBX);
+  c->dwarf.loc[RSI] = REG_INIT_LOC(c, rsi, RSI);
+  c->dwarf.loc[RDI] = REG_INIT_LOC(c, rdi, RDI);
+  c->dwarf.loc[RBP] = REG_INIT_LOC(c, rbp, RBP);
+  c->dwarf.loc[RSP] = REG_INIT_LOC(c, rsp, RSP);
+  c->dwarf.loc[R8]  = REG_INIT_LOC(c, r8,  R8);
+  c->dwarf.loc[R9]  = REG_INIT_LOC(c, r9,  R9);
+  c->dwarf.loc[R10] = REG_INIT_LOC(c, r10, R10);
+  c->dwarf.loc[R11] = REG_INIT_LOC(c, r11, R11);
+  c->dwarf.loc[R12] = REG_INIT_LOC(c, r12, R12);
+  c->dwarf.loc[R13] = REG_INIT_LOC(c, r13, R13);
+  c->dwarf.loc[R14] = REG_INIT_LOC(c, r14, R14);
+  c->dwarf.loc[R15] = REG_INIT_LOC(c, r15, R15);
+  c->dwarf.loc[RIP] = REG_INIT_LOC(c, rip, RIP);
 
   ret = dwarf_get (&c->dwarf, c->dwarf.loc[RIP], &c->dwarf.ip);
   if (ret < 0)
diff --git a/src/x86_64/unwind_i.h b/src/x86_64/unwind_i.h
index 699a6b3..1e55a76 100644
--- a/src/x86_64/unwind_i.h
+++ b/src/x86_64/unwind_i.h
@@ -65,6 +65,18 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */
 #define x86_64_r_uc_addr		UNW_OBJ(r_uc_addr)
 #define x86_64_sigreturn		UNW_OBJ(sigreturn)
 
+/* Bypass calls to access_mem() when known to be safe (local, unvalidated). */
+#ifdef UNW_LOCAL_ONLY
+# undef ACCESS_MEM_FAST
+# define ACCESS_MEM_FAST(ret,validate,cur,addr,to)                     \
+  do {                                                                 \
+    if (unlikely(validate))        /* validation on: keep slow path */ \
+      (ret) = dwarf_get ((cur), DWARF_MEM_LOC ((cur), (addr)), &(to)); \
+    else                      /* no validation: plain load, ret = 0 */ \
+      (ret) = 0, (to) = *(unw_word_t *)(addr);                         \
+  } while (0)
+#endif /* UNW_LOCAL_ONLY */
+
 extern void x86_64_local_addr_space_init (void);
 extern int x86_64_local_resume (unw_addr_space_t as, unw_cursor_t *cursor,
 			     void *arg);
