Author: Carl Friedrich Bolz <cfb...@gmx.de>
Branch: value-profiling
Changeset: r79065:bbcbd47d4cec
Date: 2015-08-19 18:25 +0200
http://bitbucket.org/pypy/pypy/changeset/bbcbd47d4cec/

Log:    merge default

diff too long, truncating to 2000 out of 2204 lines

diff --git a/_pytest/assertion/rewrite.py b/_pytest/assertion/rewrite.py
--- a/_pytest/assertion/rewrite.py
+++ b/_pytest/assertion/rewrite.py
@@ -308,7 +308,10 @@
         if (len(data) != 8 or data[:4] != imp.get_magic() or
                 struct.unpack("<l", data[4:])[0] != mtime):
             return None
-        co = marshal.load(fp)
+        try:
+            co = marshal.load(fp)
+        except ValueError:
+            return None # e.g. bad marshal data because of pypy/cpython mix
         if not isinstance(co, types.CodeType):
             # That's interesting....
             return None
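
[Editor's note] The hunk above adds a ValueError guard around marshal.load(), so a .pyc written by one interpreter (CPython vs. PyPy) is simply treated as a missing cache by the other. A minimal standalone sketch of the same defensive pattern (the helper name is hypothetical, not _pytest's):

    import imp, marshal, struct, types

    def read_cached_code(pyc_path, source_mtime):   # hypothetical helper, not _pytest's
        try:
            fp = open(pyc_path, "rb")
        except IOError:
            return None
        try:
            data = fp.read(8)
            if (len(data) != 8 or data[:4] != imp.get_magic() or
                    struct.unpack("<l", data[4:])[0] != source_mtime):
                return None                 # wrong magic or stale timestamp
            try:
                co = marshal.load(fp)       # may be data written by the other interpreter
            except ValueError:
                return None                 # bad marshal data: treat the cache as missing
            if not isinstance(co, types.CodeType):
                return None
            return co
        finally:
            fp.close()
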
diff --git a/pypy/doc/embedding.rst b/pypy/doc/embedding.rst
--- a/pypy/doc/embedding.rst
+++ b/pypy/doc/embedding.rst
@@ -46,7 +46,11 @@
    source. It'll acquire the GIL.
 
    Note: this is meant to be called *only once* or a few times at most.  See
-   the `more complete example`_ below.
+   the `more complete example`_ below.  In PyPy <= 2.6.0, the globals
+   dictionary is *reused* across multiple calls, giving potentially
+   strange results (e.g. objects dying too early).  In PyPy >= 2.6.1,
+   you get a new globals dictionary for every call (but then, all globals
+   dictionaries are kept alive forever, in ``sys._pypy_execute_source``).
 
 .. function:: int pypy_execute_source_ptr(char* source, void* ptr);
 
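[Editor's note] The behaviour documented above for PyPy >= 2.6.1 can be observed from the embedded code itself. A minimal sketch of a source string one might pass to pypy_execute_source(), relying only on what the paragraph above describes:

    import sys

    # On the implementation in this changeset, the list already includes the
    # current call's globals dict at the time the source runs.
    calls = getattr(sys, '_pypy_execute_source', [])
    print 'globals dictionaries kept alive so far:', len(calls)
    # Names defined here stay reachable through sys._pypy_execute_source, but
    # they are not visible to the next pypy_execute_source() call.
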
diff --git a/pypy/goal/targetpypystandalone.py b/pypy/goal/targetpypystandalone.py
--- a/pypy/goal/targetpypystandalone.py
+++ b/pypy/goal/targetpypystandalone.py
@@ -128,13 +128,7 @@
 
     @entrypoint('main', [rffi.CCHARP], c_name='pypy_execute_source')
     def pypy_execute_source(ll_source):
-        after = rffi.aroundstate.after
-        if after: after()
-        source = rffi.charp2str(ll_source)
-        res = _pypy_execute_source(source)
-        before = rffi.aroundstate.before
-        if before: before()
-        return rffi.cast(rffi.INT, res)
+        return pypy_execute_source_ptr(ll_source, 0)
 
     @entrypoint('main', [rffi.CCHARP, lltype.Signed],
                 c_name='pypy_execute_source_ptr')
@@ -142,9 +136,7 @@
         after = rffi.aroundstate.after
         if after: after()
         source = rffi.charp2str(ll_source)
-        space.setitem(w_globals, space.wrap('c_argument'),
-                      space.wrap(ll_ptr))
-        res = _pypy_execute_source(source)
+        res = _pypy_execute_source(source, ll_ptr)
         before = rffi.aroundstate.before
         if before: before()
         return rffi.cast(rffi.INT, res)
@@ -169,15 +161,21 @@
         before = rffi.aroundstate.before
         if before: before()
 
-    w_globals = space.newdict()
-    space.setitem(w_globals, space.wrap('__builtins__'),
-                  space.builtin_modules['__builtin__'])
-
-    def _pypy_execute_source(source):
+    def _pypy_execute_source(source, c_argument):
         try:
-            compiler = space.createcompiler()
-            stmt = compiler.compile(source, 'c callback', 'exec', 0)
-            stmt.exec_code(space, w_globals, w_globals)
+            w_globals = space.newdict(module=True)
+            space.setitem(w_globals, space.wrap('__builtins__'),
+                          space.builtin_modules['__builtin__'])
+            space.setitem(w_globals, space.wrap('c_argument'),
+                          space.wrap(c_argument))
+            space.appexec([space.wrap(source), w_globals], """(src, glob):
+                import sys
+                stmt = compile(src, 'c callback', 'exec')
+                if not hasattr(sys, '_pypy_execute_source'):
+                    sys._pypy_execute_source = []
+                sys._pypy_execute_source.append(glob)
+                exec stmt in glob
+            """)
         except OperationError, e:
             debug("OperationError:")
             debug(" operror-type: " + e.w_type.getname(space))
diff --git a/pypy/module/_vmprof/test/test__vmprof.py b/pypy/module/_vmprof/test/test__vmprof.py
--- a/pypy/module/_vmprof/test/test__vmprof.py
+++ b/pypy/module/_vmprof/test/test__vmprof.py
@@ -21,11 +21,12 @@
             i = 0
             count = 0
             i += 5 * WORD # header
-            assert s[i] == '\x04'
-            i += 1 # marker
-            assert s[i] == '\x04'
-            i += 1 # length
-            i += len('pypy')
+            assert s[i    ] == '\x05'    # MARKER_HEADER
+            assert s[i + 1] == '\x00'    # 0
+            assert s[i + 2] == '\x01'    # VERSION_THREAD_ID
+            assert s[i + 3] == chr(4)    # len('pypy')
+            assert s[i + 4: i + 8] == 'pypy'
+            i += 8
             while i < len(s):
                 if s[i] == '\x03':
                     break
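[Editor's note] The updated assertions reflect the new file header written by opened_profile() further down in this diff: five words, then MARKER_HEADER ('\x05'), a zero byte, VERSION_THREAD_ID ('\x01'), the interpreter-name length, and the name itself. A small parsing sketch, assuming a little-endian 64-bit build (WORD == 8) and Python 2 byte strings; this is not the real vmprof tooling:

    import struct

    WORD = 8   # 64-bit build assumed, as in the test above

    def split_vmprof_header(s):
        hdr = struct.unpack("<5q", s[:5 * WORD])        # 0, 3, 0, interval_usec, 0
        i = 5 * WORD
        assert s[i] == '\x05'                           # MARKER_HEADER
        assert s[i + 1] == '\x00'
        assert s[i + 2] == '\x01'                       # VERSION_THREAD_ID
        namelen = ord(s[i + 3])
        interp_name = s[i + 4 : i + 4 + namelen]        # e.g. 'pypy'
        return hdr[3], interp_name, i + 4 + namelen     # interval_usec, name, body offset
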
diff --git a/pypy/module/_vmprof/test/test_direct.py b/pypy/module/_vmprof/test/test_direct.py
--- a/pypy/module/_vmprof/test/test_direct.py
+++ b/pypy/module/_vmprof/test/test_direct.py
@@ -42,7 +42,7 @@
 }
 
 
-""" + open(str(srcdir.join("rvmprof_get_custom_offset.h"))).read())
+""" + open(str(srcdir.join("vmprof_get_custom_offset.h"))).read())
 
 class TestDirect(object):
     def test_infrastructure(self):
diff --git a/pypy/module/struct/formatiterator.py b/pypy/module/struct/formatiterator.py
--- a/pypy/module/struct/formatiterator.py
+++ b/pypy/module/struct/formatiterator.py
@@ -82,7 +82,13 @@
                 w_index = space.int(w_obj)   # wrapped float -> wrapped int or long
             if w_index is None:
                 raise StructError("cannot convert argument to integer")
-        return getattr(space, meth)(w_index)
+        method = getattr(space, meth)
+        try:
+            return method(w_index)
+        except OperationError as e:
+            if e.match(self.space, self.space.w_OverflowError):
+                raise StructError("argument out of range")
+            raise
 
     def accept_bool_arg(self):
         w_obj = self.accept_obj_arg()
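[Editor's note] At application level, the effect of this hunk is that an out-of-range argument surfaces as struct.error instead of an unexpected OverflowError. The same translate-and-reraise pattern in plain Python (the wrapper name is hypothetical, not PyPy code):

    import struct

    def pack_checked(fmt, value):
        try:
            return struct.pack(fmt, value)
        except OverflowError:
            raise struct.error("argument out of range")

    # On PyPy with this fix, struct.pack('i', 1 << 65) itself already raises
    # struct.error, which is what the new test below checks.
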
diff --git a/pypy/module/struct/test/test_struct.py b/pypy/module/struct/test/test_struct.py
--- a/pypy/module/struct/test/test_struct.py
+++ b/pypy/module/struct/test/test_struct.py
@@ -428,6 +428,9 @@
         assert s.unpack(s.pack(42)) == (42,)
         assert s.unpack_from(memoryview(s.pack(42))) == (42,)
 
+    def test_overflow(self):
+        raises(self.struct.error, self.struct.pack, 'i', 1<<65)
+
 
 class AppTestStructBuffer(object):
     spaceconfig = dict(usemodules=['struct', '__pypy__'])
diff --git a/rpython/flowspace/objspace.py b/rpython/flowspace/objspace.py
--- a/rpython/flowspace/objspace.py
+++ b/rpython/flowspace/objspace.py
@@ -13,6 +13,11 @@
 
 def _assert_rpythonic(func):
     """Raise ValueError if ``func`` is obviously not RPython"""
+    try:
+        func.func_code.co_cellvars
+    except AttributeError:
+        raise ValueError("%r is not RPython: it is likely an unexpected "
+                         "built-in function or type" % (func,))
     if func.func_doc and func.func_doc.lstrip().startswith('NOT_RPYTHON'):
         raise ValueError("%r is tagged as NOT_RPYTHON" % (func,))
     if func.func_code.co_cellvars:
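[Editor's note] The new probe works because only plain Python functions carry a func_code with co_cellvars; built-in functions and slot wrappers raise AttributeError instead, which gives a much clearer message than failing later in the flow space. A short illustration (Python 2 attribute names, as in the hunk; not the RPython check itself):

    import itertools

    def is_plain_python_function(func):
        try:
            func.func_code.co_cellvars     # builtins and slot wrappers have no func_code
        except AttributeError:
            return False
        return True

    assert is_plain_python_function(lambda x: x)
    assert not is_plain_python_function(itertools.tee)
    assert not is_plain_python_function(Exception.__init__)
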
diff --git a/rpython/flowspace/test/test_objspace.py b/rpython/flowspace/test/test_objspace.py
--- a/rpython/flowspace/test/test_objspace.py
+++ b/rpython/flowspace/test/test_objspace.py
@@ -1363,6 +1363,15 @@
         simplify_graph(graph)
         assert self.all_operations(graph) == {'bool': 1, 'inplace_add': 1}
 
+    def test_unexpected_builtin_function(self):
+        import itertools
+        e = py.test.raises(ValueError, build_flow, itertools.permutations)
+        assert ' is not RPython:' in str(e.value)
+        e = py.test.raises(ValueError, build_flow, itertools.tee)
+        assert ' is not RPython:' in str(e.value)
+        e = py.test.raises(ValueError, build_flow, Exception.__init__)
+        assert ' is not RPython:' in str(e.value)
+
 
 DATA = {'x': 5,
         'y': 6}
diff --git a/rpython/jit/backend/detect_cpu.py b/rpython/jit/backend/detect_cpu.py
--- a/rpython/jit/backend/detect_cpu.py
+++ b/rpython/jit/backend/detect_cpu.py
@@ -63,6 +63,7 @@
             'AMD64': MODEL_X86,    # win64
             'armv7l': MODEL_ARM,
             'armv6l': MODEL_ARM,
+            'arm': MODEL_ARM,      # freebsd
             }.get(mach)
 
     if result is None:
diff --git a/rpython/jit/backend/llsupport/src/codemap.c b/rpython/jit/backend/llsupport/src/codemap.c
--- a/rpython/jit/backend/llsupport/src/codemap.c
+++ b/rpython/jit/backend/llsupport/src/codemap.c
@@ -6,9 +6,9 @@
 #endif
 
 #ifdef RPYTHON_VMPROF
-RPY_EXTERN void rpython_vmprof_ignore_signals(int ignored);
+RPY_EXTERN void vmprof_ignore_signals(int ignored);
 static void pypy_codemap_invalid_set(int ignored) {
-    rpython_vmprof_ignore_signals(ignored);
+    vmprof_ignore_signals(ignored);
 }
 #else
 static void pypy_codemap_invalid_set(int ignored) {
diff --git a/rpython/jit/backend/tool/viewcode.py b/rpython/jit/backend/tool/viewcode.py
--- a/rpython/jit/backend/tool/viewcode.py
+++ b/rpython/jit/backend/tool/viewcode.py
@@ -17,18 +17,6 @@
 import subprocess
 from bisect import bisect_left
 
-# don't use rpython.tool.udir here to avoid removing old usessions which
-# might still contain interesting executables
-udir = py.path.local.make_numbered_dir(prefix='viewcode-', keep=2)
-tmpfile = str(udir.join('dump.tmp'))
-
-# hack hack
-import rpython.tool
-mod = new.module('rpython.tool.udir')
-mod.udir = udir
-sys.modules['rpython.tool.udir'] = mod
-rpython.tool.udir = mod
-
 # ____________________________________________________________
 # Some support code from Psyco.  There is more over there,
 # I am porting it in a lazy fashion...  See py-utils/xam.py
@@ -438,6 +426,18 @@
 # ____________________________________________________________
 
 if __name__ == '__main__':
+    # don't use rpython.tool.udir here to avoid removing old usessions which
+    # might still contain interesting executables
+    udir = py.path.local.make_numbered_dir(prefix='viewcode-', keep=2)
+    tmpfile = str(udir.join('dump.tmp'))
+
+    # hack hack
+    import rpython.tool
+    mod = new.module('rpython.tool.udir')
+    mod.udir = udir
+    sys.modules['rpython.tool.udir'] = mod
+    rpython.tool.udir = mod
+
     if '--text' in sys.argv:
         sys.argv.remove('--text')
         showgraph = False
@@ -463,3 +463,7 @@
         world.show(showtext=True)
     else:
         world.showtextonly()
+else:
+    from rpython.tool.udir import udir
+    tmpfile = str(udir.join('dump.tmp'))
+    
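[Editor's note] The viewcode change above follows a common pattern: defer side effects (creating a numbered scratch directory, patching sys.modules) from import time to the `if __name__ == '__main__':` block, and fall back to the regular rpython.tool.udir when imported as a library. A generic sketch of that pattern, with hypothetical names and none of the sys.modules patching:

    import os, sys, tempfile

    if __name__ == '__main__':
        # side effects only when run as a script
        workdir = tempfile.mkdtemp(prefix='viewcode-')
        tmpfile = os.path.join(workdir, 'dump.tmp')
    else:
        # importing the module must stay side-effect free
        tmpfile = None
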
diff --git a/rpython/rlib/jit.py b/rpython/rlib/jit.py
--- a/rpython/rlib/jit.py
+++ b/rpython/rlib/jit.py
@@ -1087,6 +1087,16 @@
     """
     assert value is not None and type(value) is cls
 
+def ll_record_exact_class(ll_value, ll_cls):
+    from rpython.rlib.debug import ll_assert
+    from rpython.rtyper.lltypesystem.lloperation import llop
+    from rpython.rtyper.lltypesystem import lltype
+    from rpython.rtyper.rclass import ll_type
+    ll_assert(ll_value == lltype.nullptr(lltype.typeOf(ll_value).TO), "record_exact_class called with None argument")
+    ll_assert(ll_type(ll_value) is ll_cls, "record_exact_class called with invalid arguments")
+    llop.jit_record_exact_class(lltype.Void, ll_value, ll_cls)
+
+
 class Entry(ExtRegistryEntry):
     _about_ = record_exact_class
 
@@ -1099,12 +1109,10 @@
         from rpython.rtyper import rclass
 
         classrepr = rclass.get_type_repr(hop.rtyper)
-
-        hop.exception_cannot_occur()
         v_inst = hop.inputarg(hop.args_r[0], arg=0)
         v_cls = hop.inputarg(classrepr, arg=1)
-        return hop.genop('jit_record_exact_class', [v_inst, v_cls],
-                         resulttype=lltype.Void)
+        hop.exception_is_here()
+        return hop.gendirectcall(ll_record_exact_class, v_inst, v_cls)
 
 def _jit_conditional_call(condition, function, *args):
     pass
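[Editor's note] record_exact_class(value, cls) is the hint being reworked here: it promises the JIT that value is non-None and of exactly class cls. Untranslated it is just the assert shown above; after this change the lowered form also ll_asserts its arguments before emitting jit_record_exact_class. A hedged usage sketch (the interpreter-level class names are made up, and an RPython checkout is assumed on the path):

    from rpython.rlib import jit

    class W_Root(object):          # hypothetical interpreter-level classes
        pass

    class W_IntObject(W_Root):
        def __init__(self, intval):
            self.intval = intval

    def add_ints(w_a, w_b):
        # tell the JIT the exact class of both arguments; untranslated this is
        # merely the `assert value is not None and type(value) is cls` above
        jit.record_exact_class(w_a, W_IntObject)
        jit.record_exact_class(w_b, W_IntObject)
        return W_IntObject(w_a.intval + w_b.intval)
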
diff --git a/rpython/rlib/rvmprof/cintf.py b/rpython/rlib/rvmprof/cintf.py
--- a/rpython/rlib/rvmprof/cintf.py
+++ b/rpython/rlib/rvmprof/cintf.py
@@ -40,24 +40,20 @@
         **eci_kwds))
 
 
-    vmprof_init = rffi.llexternal("rpython_vmprof_init", [rffi.INT], rffi.CCHARP,
-                                  compilation_info=eci)
-    vmprof_enable = rffi.llexternal("rpython_vmprof_enable", [rffi.LONG], rffi.INT,
+    vmprof_init = rffi.llexternal("vmprof_init",
+                                  [rffi.INT, rffi.DOUBLE, rffi.CCHARP],
+                                  rffi.CCHARP, compilation_info=eci)
+    vmprof_enable = rffi.llexternal("vmprof_enable", [], rffi.INT,
                                     compilation_info=eci,
                                     save_err=rffi.RFFI_SAVE_ERRNO)
-    vmprof_disable = rffi.llexternal("rpython_vmprof_disable", [], rffi.INT,
+    vmprof_disable = rffi.llexternal("vmprof_disable", [], rffi.INT,
                                      compilation_info=eci,
                                      save_err=rffi.RFFI_SAVE_ERRNO)
-    vmprof_write_buf = rffi.llexternal("rpython_vmprof_write_buf",
-                                       [rffi.CCHARP, rffi.LONG],
-                                       lltype.Void, compilation_info=eci)
-
-    ## vmprof_register_virtual_function = rffi.llexternal(
-    ##     "vmprof_register_virtual_function",
-    ##     [rffi.CCHARP, rffi.VOIDP, rffi.VOIDP], lltype.Void,
-    ##     compilation_info=eci, _nowrapper=True)
-
-    vmprof_ignore_signals = rffi.llexternal("rpython_vmprof_ignore_signals",
+    vmprof_register_virtual_function = rffi.llexternal(
+                                           "vmprof_register_virtual_function",
+                                           [rffi.CCHARP, rffi.LONG, rffi.INT],
+                                           rffi.INT, compilation_info=eci)
+    vmprof_ignore_signals = rffi.llexternal("vmprof_ignore_signals",
                                             [rffi.INT], lltype.Void,
                                             compilation_info=eci)
     return CInterface(locals())
diff --git a/rpython/rlib/rvmprof/rvmprof.py b/rpython/rlib/rvmprof/rvmprof.py
--- a/rpython/rlib/rvmprof/rvmprof.py
+++ b/rpython/rlib/rvmprof/rvmprof.py
@@ -1,14 +1,12 @@
 import sys, os
 from rpython.rlib.objectmodel import specialize, we_are_translated
-from rpython.rlib.rstring import StringBuilder
 from rpython.rlib import jit, rgc, rposix
 from rpython.rlib.rvmprof import cintf
 from rpython.rtyper.annlowlevel import cast_instance_to_gcref
 from rpython.rtyper.annlowlevel import cast_base_ptr_to_instance
 from rpython.rtyper.lltypesystem import rffi
 
-MAX_CODES = 8000 - 255
-MAX_FUNC_NAME = 255
+MAX_FUNC_NAME = 1023
 
 # ____________________________________________________________
 
@@ -34,8 +32,6 @@
 
     def _cleanup_(self):
         self.is_enabled = False
-        self.fileno = -1
-        self._current_codes = None
 
     @specialize.argtype(1)
     def register_code(self, code, full_name_func):
@@ -102,18 +98,13 @@
         assert fileno >= 0
         if self.is_enabled:
             raise VMProfError("vmprof is already enabled")
-        if not (1e-6 <= interval < 1.0):
-            raise VMProfError("bad value for 'interval'")
-        interval_usec = int(interval * 1000000.0)
 
-        p_error = self.cintf.vmprof_init(fileno)
+        p_error = self.cintf.vmprof_init(fileno, interval, "pypy")
         if p_error:
             raise VMProfError(rffi.charp2str(p_error))
 
-        self.fileno = fileno
-        self._write_header(interval_usec)
         self._gather_all_code_objs()
-        res = self.cintf.vmprof_enable(interval_usec)
+        res = self.cintf.vmprof_enable()
         if res < 0:
             raise VMProfError(os.strerror(rposix.get_saved_errno()))
         self.is_enabled = True
@@ -125,9 +116,6 @@
         if not self.is_enabled:
             raise VMProfError("vmprof is not enabled")
         self.is_enabled = False
-        if self._current_codes is not None:
-            self._flush_codes()
-        self.fileno = -1
         res = self.cintf.vmprof_disable()
         if res < 0:
             raise VMProfError(os.strerror(rposix.get_saved_errno()))
@@ -136,48 +124,8 @@
         assert name.count(':') == 3 and len(name) <= MAX_FUNC_NAME, (
             "the name must be 'class:func_name:func_line:filename' "
             "and at most %d characters; got '%s'" % (MAX_FUNC_NAME, name))
-        b = self._current_codes
-        if b is None:
-            b = self._current_codes = StringBuilder()
-        b.append('\x02')
-        _write_long_to_string_builder(uid, b)
-        _write_long_to_string_builder(len(name), b)
-        b.append(name)
-        if b.getlength() >= MAX_CODES:
-            self._flush_codes()
-
-    def _flush_codes(self):
-        buf = self._current_codes.build()
-        self._current_codes = None
-        self.cintf.vmprof_write_buf(buf, len(buf))
-        # NOTE: keep in mind that vmprof_write_buf() can only write
-        # a maximum of 8184 bytes.  This should be guaranteed here because:
-        assert MAX_CODES + 17 + MAX_FUNC_NAME <= 8184
-
-    def _write_header(self, interval_usec):
-        b = StringBuilder()
-        _write_long_to_string_builder(0, b)
-        _write_long_to_string_builder(3, b)
-        _write_long_to_string_builder(0, b)
-        _write_long_to_string_builder(interval_usec, b)
-        _write_long_to_string_builder(0, b)
-        b.append('\x04') # interp name
-        b.append(chr(len('pypy')))
-        b.append('pypy')
-        buf = b.build()
-        self.cintf.vmprof_write_buf(buf, len(buf))
-
-
-def _write_long_to_string_builder(l, b):
-    b.append(chr(l & 0xff))
-    b.append(chr((l >> 8) & 0xff))
-    b.append(chr((l >> 16) & 0xff))
-    b.append(chr((l >> 24) & 0xff))
-    if sys.maxint > 2147483647:
-        b.append(chr((l >> 32) & 0xff))
-        b.append(chr((l >> 40) & 0xff))
-        b.append(chr((l >> 48) & 0xff))
-        b.append(chr((l >> 56) & 0xff))
+        if self.cintf.vmprof_register_virtual_function(name, uid, 500000) < 0:
+            raise VMProfError("vmprof buffers full!  disk full or too slow")
 
 
 def vmprof_execute_code(name, get_code_fn, result_class=None):
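[Editor's note] With this change rvmprof no longer buffers code names on the RPython side: each registered code object is pushed straight to vmprof_register_virtual_function(), and the name must keep the 'class:func_name:func_line:filename' shape with at most MAX_FUNC_NAME (now 1023) characters. A small sketch that builds such a name and checks the same constraints as the assert above (the helper name is hypothetical):

    MAX_FUNC_NAME = 1023      # value set at the top of rvmprof.py in this changeset

    def make_vmprof_name(kind, func_name, func_line, filename):
        # the pieces themselves must not contain ':' for the count below to hold
        name = '%s:%s:%d:%s' % (kind, func_name, func_line, filename)
        assert name.count(':') == 3 and len(name) <= MAX_FUNC_NAME
        return name

    name = make_vmprof_name('py', 'fibonacci', 12, 'example.py')
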
diff --git a/rpython/rlib/rvmprof/src/rvmprof.c b/rpython/rlib/rvmprof/src/rvmprof.c
--- a/rpython/rlib/rvmprof/src/rvmprof.c
+++ b/rpython/rlib/rvmprof/src/rvmprof.c
@@ -1,22 +1,3 @@
-/* VMPROF
- *
- * statistical sampling profiler specifically designed to profile programs
- * which run on a Virtual Machine and/or bytecode interpreter, such as Python,
- * etc.
- *
- * The logic to dump the C stack traces is partly stolen from the code in
- * gperftools.
- * The file "getpc.h" has been entirely copied from gperftools.
- *
- * Tested only on gcc, linux, x86_64.
- *
- * Copyright (C) 2014-2015
- *   Antonio Cuni - anto.c...@gmail.com
- *   Maciej Fijalkowski - fij...@gmail.com
- *   Armin Rigo - ar...@tunes.org
- *
- */
-
 #define _GNU_SOURCE 1
 
 
@@ -39,431 +20,4 @@
 #endif
 
 
-#include <dlfcn.h>
-#include <assert.h>
-#include <pthread.h>
-#include <sys/time.h>
-#include <errno.h>
-#include <unistd.h>
-#include <stdio.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include "rvmprof_getpc.h"
-#include "rvmprof_unwind.h"
-#include "rvmprof_mt.h"
-
-
-/************************************************************/
-
-// functions copied from libunwind using dlopen
-
-static int (*unw_get_reg)(unw_cursor_t*, int, unw_word_t*) = NULL;
-static int (*unw_step)(unw_cursor_t*) = NULL;
-static int (*unw_init_local)(unw_cursor_t *, unw_context_t *) = NULL;
-static int (*unw_get_proc_info)(unw_cursor_t *, unw_proc_info_t *) = NULL;
-
-static int profile_file = -1;
-
-
-RPY_EXTERN
-char *rpython_vmprof_init(int fd)
-{
-    if (!unw_get_reg) {
-        void *libhandle;
-
-        if (!(libhandle = dlopen("libunwind.so", RTLD_LAZY | RTLD_LOCAL)))
-            goto error;
-        if (!(unw_get_reg = dlsym(libhandle, "_ULx86_64_get_reg")))
-            goto error;
-        if (!(unw_get_proc_info = dlsym(libhandle, "_ULx86_64_get_proc_info")))
-            goto error;
-        if (!(unw_init_local = dlsym(libhandle, "_ULx86_64_init_local")))
-            goto error;
-        if (!(unw_step = dlsym(libhandle, "_ULx86_64_step")))
-            goto error;
-    }
-    if (prepare_concurrent_bufs() < 0)
-        return "out of memory";
-
-    assert(fd >= 0);
-    profile_file = fd;
-    return NULL;
-
- error:
-    return dlerror();
-}
-
-/************************************************************/
-
-/* value: last bit is 1 if signals must be ignored; all other bits
-   are a counter for how many threads are currently in a signal handler */
-static long volatile signal_handler_value = 1;
-
-RPY_EXTERN
-void rpython_vmprof_ignore_signals(int ignored)
-{
-    if (!ignored) {
-        __sync_fetch_and_and(&signal_handler_value, ~1L);
-    }
-    else {
-        /* set the last bit, and wait until concurrently-running signal
-           handlers finish */
-        while (__sync_or_and_fetch(&signal_handler_value, 1L) != 1L) {
-            usleep(1);
-        }
-    }
-}
-
-
-/* *************************************************************
- * functions to write a profile file compatible with gperftools
- * *************************************************************
- */
-
-#define MAX_FUNC_NAME 128
-#define MAX_STACK_DEPTH   \
-    ((SINGLE_BUF_SIZE - sizeof(struct prof_stacktrace_s)) / sizeof(void *))
-
-#define MARKER_STACKTRACE '\x01'
-#define MARKER_VIRTUAL_IP '\x02'
-#define MARKER_TRAILER '\x03'
-
-struct prof_stacktrace_s {
-    char padding[sizeof(long) - 1];
-    char marker;
-    long count, depth;
-    void *stack[];
-};
-
-static long profile_interval_usec = 0;
-static char atfork_hook_installed = 0;
-
-
-/* ******************************************************
- * libunwind workaround for process JIT frames correctly
- * ******************************************************
- */
-
-#include "rvmprof_get_custom_offset.h"
-
-typedef struct {
-    void* _unused1;
-    void* _unused2;
-    void* sp;
-    void* ip;
-    void* _unused3[sizeof(unw_cursor_t)/sizeof(void*) - 4];
-} vmprof_hacked_unw_cursor_t;
-
-static int vmprof_unw_step(unw_cursor_t *cp, int first_run)
-{
-    void* ip;
-    void* sp;
-    ptrdiff_t sp_offset;
-    unw_get_reg (cp, UNW_REG_IP, (unw_word_t*)&ip);
-    unw_get_reg (cp, UNW_REG_SP, (unw_word_t*)&sp);
-    if (!first_run) {
-        // make sure we're pointing to the CALL and not to the first
-        // instruction after. If the callee adjusts the stack for us
-        // it's not safe to be at the instruction after
-        ip -= 1;
-    }
-    sp_offset = vmprof_unw_get_custom_offset(ip, cp);
-
-    if (sp_offset == -1) {
-        // it means that the ip is NOT in JITted code, so we can use the
-        // stardard unw_step
-        return unw_step(cp);
-    }
-    else {
-        // this is a horrible hack to manually walk the stack frame, by
-        // setting the IP and SP in the cursor
-        vmprof_hacked_unw_cursor_t *cp2 = (vmprof_hacked_unw_cursor_t*)cp;
-        void* bp = (void*)sp + sp_offset;
-        cp2->sp = bp;
-        bp -= sizeof(void*);
-        cp2->ip = ((void**)bp)[0];
-        // the ret is on the top of the stack minus WORD
-        return 1;
-    }
-}
-
-
-/* *************************************************************
- * functions to dump the stack trace
- * *************************************************************
- */
-
-static int get_stack_trace(void** result, int max_depth, ucontext_t *ucontext)
-{
-    void *ip;
-    int n = 0;
-    unw_cursor_t cursor;
-    unw_context_t uc = *ucontext;
-
-    int ret = unw_init_local(&cursor, &uc);
-    assert(ret >= 0);
-    (void)ret;
-
-    while (n < max_depth) {
-        if (unw_get_reg(&cursor, UNW_REG_IP, (unw_word_t *) &ip) < 0) {
-            break;
-        }
-
-        unw_proc_info_t pip;
-        unw_get_proc_info(&cursor, &pip);
-
-        /* if n==0, it means that the signal handler interrupted us while we
-           were in the trampoline, so we are not executing (yet) the real main
-           loop function; just skip it */
-        if (VMPROF_ADDR_OF_TRAMPOLINE((void*)pip.start_ip) && n > 0) {
-            // found main loop stack frame
-            void* sp;
-            unw_get_reg(&cursor, UNW_REG_SP, (unw_word_t *) &sp);
-            void *arg_addr = (char*)sp /* + mainloop_sp_offset */;
-            void **arg_ptr = (void**)arg_addr;
-            /* if (mainloop_get_virtual_ip) {
-               ip = mainloop_get_virtual_ip(*arg_ptr);
-               } else { */
-            ip = *arg_ptr;
-        }
-
-        int first_run = (n == 0);
-        result[n++] = ip;
-        n = vmprof_write_header_for_jit_addr(result, n, ip, max_depth);
-        if (vmprof_unw_step(&cursor, first_run) <= 0)
-            break;
-    }
-    return n;
-}
-
-
-/* *************************************************************
- * the signal handler
- * *************************************************************
- */
-
-static void sigprof_handler(int sig_nr, siginfo_t* info, void *ucontext)
-{
-    long val = __sync_fetch_and_add(&signal_handler_value, 2L);
-
-    if ((val & 1) == 0) {
-        int saved_errno = errno;
-        int fd = profile_file;
-        assert(fd >= 0);
-
-        struct profbuf_s *p = reserve_buffer(fd);
-        if (p == NULL) {
-            /* ignore this signal: there are no free buffers right now */
-        }
-        else {
-            int depth;
-            struct prof_stacktrace_s *st = (struct prof_stacktrace_s *)p->data;
-            st->marker = MARKER_STACKTRACE;
-            st->count = 1;
-            st->stack[0] = GetPC((ucontext_t*)ucontext);
-            depth = get_stack_trace(st->stack+1, MAX_STACK_DEPTH-1, ucontext);
-            depth++;  // To account for pc value in stack[0];
-            st->depth = depth;
-            p->data_offset = offsetof(struct prof_stacktrace_s, marker);
-            p->data_size = (depth * sizeof(void *) +
-                            sizeof(struct prof_stacktrace_s) -
-                            offsetof(struct prof_stacktrace_s, marker));
-            commit_buffer(fd, p);
-        }
-
-        errno = saved_errno;
-    }
-
-    __sync_sub_and_fetch(&signal_handler_value, 2L);
-}
-
-
-/* *************************************************************
- * the setup and teardown functions
- * *************************************************************
- */
-
-static int install_sigprof_handler(void)
-{
-    struct sigaction sa;
-    memset(&sa, 0, sizeof(sa));
-    sa.sa_sigaction = sigprof_handler;
-    sa.sa_flags = SA_RESTART | SA_SIGINFO;
-    if (sigemptyset(&sa.sa_mask) == -1 ||
-        sigaction(SIGPROF, &sa, NULL) == -1)
-        return -1;
-    return 0;
-}
-
-static int remove_sigprof_handler(void)
-{
-    if (signal(SIGPROF, SIG_DFL) == SIG_ERR)
-        return -1;
-    return 0;
-}
-
-static int install_sigprof_timer(void)
-{
-    static struct itimerval timer;
-    timer.it_interval.tv_sec = 0;
-    timer.it_interval.tv_usec = profile_interval_usec;
-    timer.it_value = timer.it_interval;
-    if (setitimer(ITIMER_PROF, &timer, NULL) != 0)
-        return -1;
-    return 0;
-}
-
-static int remove_sigprof_timer(void) {
-    static struct itimerval timer;
-    timer.it_interval.tv_sec = 0;
-    timer.it_interval.tv_usec = 0;
-    timer.it_value.tv_sec = 0;
-    timer.it_value.tv_usec = 0;
-    if (setitimer(ITIMER_PROF, &timer, NULL) != 0)
-        return -1;
-    return 0;
-}
-
-static void atfork_disable_timer(void) {
-    if (profile_interval_usec > 0) {
-        remove_sigprof_timer();
-    }
-}
-
-static void atfork_enable_timer(void) {
-    if (profile_interval_usec > 0) {
-        install_sigprof_timer();
-    }
-}
-
-static int install_pthread_atfork_hooks(void) {
-    /* this is needed to prevent the problems described there:
-         - http://code.google.com/p/gperftools/issues/detail?id=278
-         - http://lists.debian.org/debian-glibc/2010/03/msg00161.html
-
-        TL;DR: if the RSS of the process is large enough, the clone() syscall
-        will be interrupted by the SIGPROF before it can complete, then
-        retried, interrupted again and so on, in an endless loop.  The
-        solution is to disable the timer around the fork, and re-enable it
-        only inside the parent.
-    */
-    if (atfork_hook_installed)
-        return 0;
-    int ret = pthread_atfork(atfork_disable_timer, atfork_enable_timer, NULL);
-    if (ret != 0)
-        return -1;
-    atfork_hook_installed = 1;
-    return 0;
-}
-
-RPY_EXTERN
-int rpython_vmprof_enable(long interval_usec)
-{
-    assert(profile_file >= 0);
-    assert(interval_usec > 0);
-    profile_interval_usec = interval_usec;
-
-    if (install_pthread_atfork_hooks() == -1)
-        goto error;
-    if (install_sigprof_handler() == -1)
-        goto error;
-    if (install_sigprof_timer() == -1)
-        goto error;
-    rpython_vmprof_ignore_signals(0);
-    return 0;
-
- error:
-    profile_file = -1;
-    profile_interval_usec = 0;
-    return -1;
-}
-
-static int _write_all(const void *buf, size_t bufsize)
-{
-    while (bufsize > 0) {
-        ssize_t count = write(profile_file, buf, bufsize);
-        if (count <= 0)
-            return -1;   /* failed */
-        buf += count;
-        bufsize -= count;
-    }
-    return 0;
-}
-
-static int close_profile(void)
-{
-    char buf[4096];
-    ssize_t size;
-    unsigned char marker = MARKER_TRAILER;
-
-    if (_write_all(&marker, 1) < 0)
-        return -1;
-
-#ifdef __linux__
-    // copy /proc/self/maps to the end of the profile file
-    int srcfd = open("/proc/self/maps", O_RDONLY);
-    if (srcfd < 0)
-        return -1;
-
-    while ((size = read(srcfd, buf, sizeof buf)) > 0) {
-        if (_write_all(buf, size) < 0) {
-            close(srcfd);
-            return -1;
-        }
-    }
-    close(srcfd);
-#else
-    // freebsd and mac
-    sprintf(buf, "procstat -v %d", getpid());
-    FILE *srcf = popen(buf, "r");
-    if (!srcf)
-        return -1;
-
-    while ((size = fread(buf, 1, sizeof buf, src))) {
-        if (_write_all(buf, size) < 0) {
-            pclose(srcf);
-            return -1;
-        }
-    }
-    pclose(srcf);
-#endif
-
-    /* don't close() the file descriptor from here */
-    profile_file = -1;
-    return 0;
-}
-
-RPY_EXTERN
-int rpython_vmprof_disable(void)
-{
-    rpython_vmprof_ignore_signals(1);
-    profile_interval_usec = 0;
-
-    if (remove_sigprof_timer() == -1)
-        return -1;
-    if (remove_sigprof_handler() == -1)
-        return -1;
-    if (shutdown_concurrent_bufs(profile_file) < 0)
-        return -1;
-    return close_profile();
-}
-
-RPY_EXTERN
-void rpython_vmprof_write_buf(char *buf, long size)
-{
-    struct profbuf_s *p;
-
-    while ((p = reserve_buffer(profile_file)) == NULL) {
-        /* spin loop waiting for a buffer to be ready; should almost never
-           be the case */
-        usleep(1);
-    }
-
-    if (size > SINGLE_BUF_SIZE)
-        size = SINGLE_BUF_SIZE;
-    memcpy(p->data, buf, size);
-    p->data_size = size;
-
-    commit_buffer(profile_file, p);
-}
+#include "vmprof_main.h"
diff --git a/rpython/rlib/rvmprof/src/rvmprof.h b/rpython/rlib/rvmprof/src/rvmprof.h
--- a/rpython/rlib/rvmprof/src/rvmprof.h
+++ b/rpython/rlib/rvmprof/src/rvmprof.h
@@ -1,6 +1,6 @@
 
-RPY_EXTERN char *rpython_vmprof_init(int);
-RPY_EXTERN void rpython_vmprof_ignore_signals(int);
-RPY_EXTERN int rpython_vmprof_enable(long);
-RPY_EXTERN int rpython_vmprof_disable(void);
-RPY_EXTERN void rpython_vmprof_write_buf(char *, long);
+RPY_EXTERN char *vmprof_init(int, double, char *);
+RPY_EXTERN void vmprof_ignore_signals(int);
+RPY_EXTERN int vmprof_enable(void);
+RPY_EXTERN int vmprof_disable(void);
+RPY_EXTERN int vmprof_register_virtual_function(char *, long, int);
diff --git a/rpython/rlib/rvmprof/src/rvmprof_get_custom_offset.h b/rpython/rlib/rvmprof/src/rvmprof_get_custom_offset.h
deleted file mode 100644
--- a/rpython/rlib/rvmprof/src/rvmprof_get_custom_offset.h
+++ /dev/null
@@ -1,63 +0,0 @@
-
-#ifdef PYPY_JIT_CODEMAP
-void *pypy_find_codemap_at_addr(long addr, long *start_addr);
-long pypy_yield_codemap_at_addr(void *codemap_raw, long addr,
-                                long *current_pos_addr);
-long pypy_jit_stack_depth_at_loc(long loc);
-#endif
-
-
-static ptrdiff_t vmprof_unw_get_custom_offset(void* ip, void *cp) {
-#ifdef PYPY_JIT_CODEMAP
-    intptr_t ip_l = (intptr_t)ip;
-    return pypy_jit_stack_depth_at_loc(ip_l);
-#else
-    return -1;
-#endif
-}
-
-static long vmprof_write_header_for_jit_addr(void **result, long n,
-                                             void *ip, int max_depth)
-{
-#ifdef PYPY_JIT_CODEMAP
-    void *codemap;
-    long current_pos = 0;
-    intptr_t id;
-    long start_addr = 0;
-    intptr_t addr = (intptr_t)ip;
-    int start, k;
-    void *tmp;
-
-    codemap = pypy_find_codemap_at_addr(addr, &start_addr);
-    if (codemap == NULL)
-        // not a jit code at all
-        return n;
-
-    // modify the last entry to point to start address and not the random one
-    // in the middle
-    result[n - 1] = (void*)start_addr;
-    result[n] = (void*)2;
-    n++;
-    start = n;
-    while (n < max_depth) {
-        id = pypy_yield_codemap_at_addr(codemap, addr, &current_pos);
-        if (id == -1)
-            // finish
-            break;
-        if (id == 0)
-            continue; // not main codemap
-        result[n++] = (void *)id;
-    }
-    k = 0;
-    while (k < (n - start) / 2) {
-        tmp = result[start + k];
-        result[start + k] = result[n - k - 1];
-        result[n - k - 1] = tmp;
-        k++;
-    }
-    if (n < max_depth) {
-        result[n++] = (void*)3;
-    }
-#endif
-    return n;
-}
diff --git a/rpython/rlib/rvmprof/src/rvmprof_mt.h b/rpython/rlib/rvmprof/src/rvmprof_mt.h
deleted file mode 100644
--- a/rpython/rlib/rvmprof/src/rvmprof_mt.h
+++ /dev/null
@@ -1,210 +0,0 @@
-/* Support for multithreaded write() operations */
-
-#include <sys/mman.h>
-#include <string.h>
-
-/* The idea is that we have MAX_NUM_BUFFERS available, all of size
-   SINGLE_BUF_SIZE.  Threads and signal handlers can ask to reserve a
-   buffer, fill it, and finally "commit" it, at which point its
-   content is written into the profile file.  There is no hard
-   guarantee about the order in which the committed blocks are
-   actually written.  We do this with two constrains:
-
-   - write() calls should not overlap; only one thread can be
-     currently calling it.
-
-   - the code needs to be multithread-safe *and* signal-handler-safe,
-     which means it must be written in a wait-free style: never have
-     spin loops waiting for some lock to be released, from any of
-     the functions that can be called from the signal handler!  The
-     code holding the lock could be running in the same thread,
-     currently interrupted by the signal handler.
-
-   The value of MAX_NUM_BUFFERS is a trade-off between too high
-   (lots of unnecessary memory, lots of checking all of them)
-   and too low (risk that there is none left).
-*/
-#define MAX_NUM_BUFFERS  20
-#define SINGLE_BUF_SIZE  (8192 - 2 * sizeof(unsigned int))
-
-#if defined(__i386__) || defined(__amd64__)
-  static inline void write_fence(void) { asm("" : : : "memory"); }
-#else
-  static inline void write_fence(void) { __sync_synchronize(); }
-#endif
-
-
-#define PROFBUF_UNUSED   0
-#define PROFBUF_FILLING  1
-#define PROFBUF_READY    2
-
-
-struct profbuf_s {
-    unsigned int data_size;
-    unsigned int data_offset;
-    char data[SINGLE_BUF_SIZE];
-};
-
-static char volatile profbuf_state[MAX_NUM_BUFFERS];
-static struct profbuf_s *profbuf_all_buffers = NULL;
-static int volatile profbuf_write_lock = 2;
-static long profbuf_pending_write;
-
-
-static void unprepare_concurrent_bufs(void)
-{
-    if (profbuf_all_buffers != NULL) {
-        munmap(profbuf_all_buffers, sizeof(struct profbuf_s) * MAX_NUM_BUFFERS);
-        profbuf_all_buffers = NULL;
-    }
-}
-
-static int prepare_concurrent_bufs(void)
-{
-    assert(sizeof(struct profbuf_s) == 8192);
-
-    unprepare_concurrent_bufs();
-    profbuf_all_buffers = mmap(NULL, sizeof(struct profbuf_s) * MAX_NUM_BUFFERS,
-                               PROT_READ | PROT_WRITE,
-                               MAP_PRIVATE | MAP_ANONYMOUS,
-                               -1, 0);
-    if (profbuf_all_buffers == MAP_FAILED) {
-        profbuf_all_buffers = NULL;
-        return -1;
-    }
-    memset((char *)profbuf_state, PROFBUF_UNUSED, sizeof(profbuf_state));
-    profbuf_write_lock = 0;
-    profbuf_pending_write = -1;
-    return 0;
-}
-
-static int _write_single_ready_buffer(int fd, long i)
-{
-    /* Try to write to disk the buffer number 'i'.  This function must
-       only be called while we hold the write lock. */
-    assert(profbuf_write_lock != 0);
-
-    if (profbuf_pending_write >= 0) {
-        /* A partially written buffer is waiting.  We'll write the
-           rest of this buffer now, instead of 'i'. */
-        i = profbuf_pending_write;
-        assert(profbuf_state[i] == PROFBUF_READY);
-    }
-
-    if (profbuf_state[i] != PROFBUF_READY) {
-        /* this used to be a race condition: the buffer was written by a
-           different thread already, nothing to do now */
-        return 0;
-    }
-
-    int err;
-    struct profbuf_s *p = &profbuf_all_buffers[i];
-    ssize_t count = write(fd, p->data + p->data_offset, p->data_size);
-    if (count == p->data_size) {
-        profbuf_state[i] = PROFBUF_UNUSED;
-        profbuf_pending_write = -1;
-    }
-    else {
-        if (count > 0) {
-            p->data_offset += count;
-            p->data_size -= count;
-        }
-        profbuf_pending_write = i;
-        if (count < 0)
-            return -1;
-    }
-    return 0;
-}
-
-static void _write_ready_buffers(int fd)
-{
-    long i;
-    int has_write_lock = 0;
-
-    for (i = 0; i < MAX_NUM_BUFFERS; i++) {
-        if (profbuf_state[i] == PROFBUF_READY) {
-            if (!has_write_lock) {
-                if (!__sync_bool_compare_and_swap(&profbuf_write_lock, 0, 1))
-                    return;   /* can't acquire the write lock, give up */
-                has_write_lock = 1;
-            }
-            if (_write_single_ready_buffer(fd, i) < 0)
-                break;
-        }
-    }
-    if (has_write_lock)
-        profbuf_write_lock = 0;
-}
-
-static struct profbuf_s *reserve_buffer(int fd)
-{
-    /* Tries to enter a region of code that fills one buffer.  If
-       successful, returns the profbuf_s.  It fails only if the
-       concurrent buffers are all busy (extreme multithreaded usage).
-
-       This might call write() to emit the data sitting in
-       previously-prepared buffers.  In case of write() error, the
-       error is ignored but unwritten data stays in the buffers.
-    */
-    long i;
-
-    _write_ready_buffers(fd);
-
-    for (i = 0; i < MAX_NUM_BUFFERS; i++) {
-        if (profbuf_state[i] == PROFBUF_UNUSED &&
-            __sync_bool_compare_and_swap(&profbuf_state[i], PROFBUF_UNUSED,
-                                         PROFBUF_FILLING)) {
-            struct profbuf_s *p = &profbuf_all_buffers[i];
-            p->data_size = 0;
-            p->data_offset = 0;
-            return p;
-        }
-    }
-    /* no unused buffer found */
-    return NULL;
-}
-
-static void commit_buffer(int fd, struct profbuf_s *buf)
-{
-    /* Leaves a region of code that filled 'buf'.
-
-       This might call write() to emit the data now ready.  In case of
-       write() error, the error is ignored but unwritten data stays in
-       the buffers.
-    */
-
-    /* Make sure every thread sees the full content of 'buf' */
-    write_fence();
-
-    /* Then set the 'ready' flag */
-    long i = buf - profbuf_all_buffers;
-    assert(profbuf_state[i] == PROFBUF_FILLING);
-    profbuf_state[i] = PROFBUF_READY;
-
-    if (!__sync_bool_compare_and_swap(&profbuf_write_lock, 0, 1)) {
-        /* can't acquire the write lock, ignore */
-    }
-    else {
-        _write_single_ready_buffer(fd, i);
-        profbuf_write_lock = 0;
-    }
-}
-
-static int shutdown_concurrent_bufs(int fd)
-{
-    /* no signal handler can be running concurrently here, because we
-       already did rpython_vmprof_ignore_signals(1) */
-    assert(profbuf_write_lock == 0);
-    profbuf_write_lock = 2;
-
-    /* last attempt to flush buffers */
-    int i;
-    for (i = 0; i < MAX_NUM_BUFFERS; i++) {
-        while (profbuf_state[i] == PROFBUF_READY) {
-            if (_write_single_ready_buffer(fd, i) < 0)
-                return -1;
-        }
-    }
-    unprepare_concurrent_bufs();
-    return 0;
-}
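[Editor's note] The buffer machinery deleted here (and, judging from the #include in vmprof_main.h below, re-added as vmprof_mt.h outside the truncated part of this diff) hands out fixed-size buffers through three states: UNUSED, claimed as FILLING via compare-and-swap, then committed as READY and flushed by whichever thread holds the write lock. A toy Python model of that state machine, with a lock standing in for the real atomics and none of the wait-free or signal-safety concerns:

    import threading

    UNUSED, FILLING, READY = 0, 1, 2
    N_BUFFERS = 4

    state = [UNUSED] * N_BUFFERS
    data = [None] * N_BUFFERS
    lock = threading.Lock()               # stands in for __sync_bool_compare_and_swap

    def reserve_buffer():
        with lock:
            for i in range(N_BUFFERS):
                if state[i] == UNUSED:
                    state[i] = FILLING    # claimed; the caller now fills data[i]
                    return i
        return None                       # all buffers busy: the sample is dropped

    def commit_buffer(i, payload, write):
        data[i] = payload
        state[i] = READY
        with lock:                        # only one flusher at a time, like the write lock
            for j in range(N_BUFFERS):
                if state[j] == READY:
                    write(data[j])
                    state[j] = UNUSED
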
diff --git a/rpython/rlib/rvmprof/src/rvmprof_config.h b/rpython/rlib/rvmprof/src/vmprof_config.h
rename from rpython/rlib/rvmprof/src/rvmprof_config.h
rename to rpython/rlib/rvmprof/src/vmprof_config.h
diff --git a/rpython/rlib/rvmprof/src/vmprof_get_custom_offset.h b/rpython/rlib/rvmprof/src/vmprof_get_custom_offset.h
new file mode 100644
--- /dev/null
+++ b/rpython/rlib/rvmprof/src/vmprof_get_custom_offset.h
@@ -0,0 +1,120 @@
+
+#ifdef PYPY_JIT_CODEMAP
+void *pypy_find_codemap_at_addr(long addr, long *start_addr);
+long pypy_yield_codemap_at_addr(void *codemap_raw, long addr,
+                                long *current_pos_addr);
+long pypy_jit_stack_depth_at_loc(long loc);
+#endif
+
+
+#ifdef CPYTHON_GET_CUSTOM_OFFSET
+static void *tramp_start, *tramp_end;
+#endif
+
+
+static ptrdiff_t vmprof_unw_get_custom_offset(void* ip, void *cp) {
+
+#if defined(PYPY_JIT_CODEMAP)
+
+    intptr_t ip_l = (intptr_t)ip;
+    return pypy_jit_stack_depth_at_loc(ip_l);
+
+#elif defined(CPYTHON_GET_CUSTOM_OFFSET)
+
+    if (ip >= tramp_start && ip <= tramp_end) {
+        // XXX the return value is wrong for all the places before push and
+        //     after pop, fix
+        void *bp;
+        void *sp;
+
+        /* This is a stage2 trampoline created by hotpatch:
+
+               push   %rbx
+               push   %rbp
+               mov    %rsp,%rbp
+               and    $0xfffffffffffffff0,%rsp   // make sure the stack is aligned
+               movabs $0x7ffff687bb10,%rbx
+               callq  *%rbx
+               leaveq 
+               pop    %rbx
+               retq   
+
+           the stack layout is like this:
+
+               +-----------+                      high addresses
+               | ret addr  |
+               +-----------+
+               | saved rbx |   start of the function frame
+               +-----------+
+               | saved rbp |
+               +-----------+
+               | ........  |   <-- rbp
+               +-----------+                      low addresses
+
+           So, the trampoline frame starts at rbp+16, and the return address,
+           is at rbp+24.  The vmprof API requires us to return the offset of
+           the frame relative to sp, hence we have this weird computation.
+
+           XXX (antocuni): I think we could change the API to return directly
+           the frame address instead of the offset; however, this require a
+           change in the PyPy code too
+        */
+
+        unw_get_reg (cp, UNW_REG_SP, (unw_word_t*)&sp);
+        unw_get_reg (cp, UNW_X86_64_RBP, (unw_word_t*)&bp);
+        return bp+16+8-sp;
+    }
+    return -1;
+
+#else
+
+    return -1;
+
+#endif
+}
+
+static long vmprof_write_header_for_jit_addr(void **result, long n,
+                                             void *ip, int max_depth)
+{
+#ifdef PYPY_JIT_CODEMAP
+    void *codemap;
+    long current_pos = 0;
+    intptr_t id;
+    long start_addr = 0;
+    intptr_t addr = (intptr_t)ip;
+    int start, k;
+    void *tmp;
+
+    codemap = pypy_find_codemap_at_addr(addr, &start_addr);
+    if (codemap == NULL)
+        // not a jit code at all
+        return n;
+
+    // modify the last entry to point to start address and not the random one
+    // in the middle
+    result[n - 1] = (void*)start_addr;
+    result[n] = (void*)2;
+    n++;
+    start = n;
+    while (n < max_depth) {
+        id = pypy_yield_codemap_at_addr(codemap, addr, &current_pos);
+        if (id == -1)
+            // finish
+            break;
+        if (id == 0)
+            continue; // not main codemap
+        result[n++] = (void *)id;
+    }
+    k = 0;
+    while (k < (n - start) / 2) {
+        tmp = result[start + k];
+        result[start + k] = result[n - k - 1];
+        result[n - k - 1] = tmp;
+        k++;
+    }
+    if (n < max_depth) {
+        result[n++] = (void*)3;
+    }
+#endif
+    return n;
+}
diff --git a/rpython/rlib/rvmprof/src/rvmprof_getpc.h b/rpython/rlib/rvmprof/src/vmprof_getpc.h
rename from rpython/rlib/rvmprof/src/rvmprof_getpc.h
rename to rpython/rlib/rvmprof/src/vmprof_getpc.h
--- a/rpython/rlib/rvmprof/src/rvmprof_getpc.h
+++ b/rpython/rlib/rvmprof/src/vmprof_getpc.h
@@ -44,7 +44,7 @@
 #ifndef BASE_GETPC_H_
 #define BASE_GETPC_H_
 
-#include "rvmprof_config.h"
+#include "vmprof_config.h"
 
 // On many linux systems, we may need _GNU_SOURCE to get access to
 // the defined constants that define the register we want to see (eg
diff --git a/rpython/rlib/rvmprof/src/vmprof_main.h b/rpython/rlib/rvmprof/src/vmprof_main.h
new file mode 100644
--- /dev/null
+++ b/rpython/rlib/rvmprof/src/vmprof_main.h
@@ -0,0 +1,556 @@
+/* VMPROF
+ *
+ * statistical sampling profiler specifically designed to profile programs
+ * which run on a Virtual Machine and/or bytecode interpreter, such as Python,
+ * etc.
+ *
+ * The logic to dump the C stack traces is partly stolen from the code in
+ * gperftools.
+ * The file "getpc.h" has been entirely copied from gperftools.
+ *
+ * Tested only on gcc, linux, x86_64.
+ *
+ * Copyright (C) 2014-2015
+ *   Antonio Cuni - anto.c...@gmail.com
+ *   Maciej Fijalkowski - fij...@gmail.com
+ *   Armin Rigo - ar...@tunes.org
+ *
+ */
+
+#define _GNU_SOURCE 1
+
+#include <dlfcn.h>
+#include <assert.h>
+#include <pthread.h>
+#include <sys/time.h>
+#include <errno.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include "vmprof_getpc.h"
+#include "vmprof_unwind.h"
+#include "vmprof_mt.h"
+
+
+/************************************************************/
+
+// functions copied from libunwind using dlopen
+
+static int (*unw_get_reg)(unw_cursor_t*, int, unw_word_t*) = NULL;
+static int (*unw_step)(unw_cursor_t*) = NULL;
+static int (*unw_init_local)(unw_cursor_t *, unw_context_t *) = NULL;
+static int (*unw_get_proc_info)(unw_cursor_t *, unw_proc_info_t *) = NULL;
+
+static int profile_file = -1;
+static long prepare_interval_usec;
+static struct profbuf_s *volatile current_codes;
+static void *(*mainloop_get_virtual_ip)(char *) = 0;
+
+static int opened_profile(char *interp_name);
+static void flush_codes(void);
+
+RPY_EXTERN
+char *vmprof_init(int fd, double interval, char *interp_name)
+{
+    if (interval < 1e-6 || interval >= 1.0)
+        return "bad value for 'interval'";
+    prepare_interval_usec = (int)(interval * 1000000.0);
+
+    if (!unw_get_reg) {
+        void *libhandle;
+
+        if (!(libhandle = dlopen("libunwind.so", RTLD_LAZY | RTLD_LOCAL)))
+            goto error;
+        if (!(unw_get_reg = dlsym(libhandle, "_ULx86_64_get_reg")))
+            goto error;
+        if (!(unw_get_proc_info = dlsym(libhandle, "_ULx86_64_get_proc_info")))
+            goto error;
+        if (!(unw_init_local = dlsym(libhandle, "_ULx86_64_init_local")))
+            goto error;
+        if (!(unw_step = dlsym(libhandle, "_ULx86_64_step")))
+            goto error;
+    }
+    if (prepare_concurrent_bufs() < 0)
+        return "out of memory";
+
+    assert(fd >= 0);
+    profile_file = fd;
+    if (opened_profile(interp_name) < 0) {
+        profile_file = -1;
+        return strerror(errno);
+    }
+    return NULL;
+
+ error:
+    return dlerror();
+}
+
+/************************************************************/
+
+/* value: last bit is 1 if signals must be ignored; all other bits
+   are a counter for how many threads are currently in a signal handler */
+static long volatile signal_handler_value = 1;
+
+RPY_EXTERN
+void vmprof_ignore_signals(int ignored)
+{
+    if (!ignored) {
+        __sync_fetch_and_and(&signal_handler_value, ~1L);
+    }
+    else {
+        /* set the last bit, and wait until concurrently-running signal
+           handlers finish */
+        while (__sync_or_and_fetch(&signal_handler_value, 1L) != 1L) {
+            usleep(1);
+        }
+    }
+}
+
+
+/* *************************************************************
+ * functions to write a profile file compatible with gperftools
+ * *************************************************************
+ */
+
+#define MAX_FUNC_NAME 128
+#define MAX_STACK_DEPTH   \
+    ((SINGLE_BUF_SIZE - sizeof(struct prof_stacktrace_s)) / sizeof(void *))
+
+#define MARKER_STACKTRACE '\x01'
+#define MARKER_VIRTUAL_IP '\x02'
+#define MARKER_TRAILER '\x03'
+#define MARKER_INTERP_NAME '\x04'   /* deprecated */
+#define MARKER_HEADER '\x05'
+
+#define VERSION_BASE '\x00'
+#define VERSION_THREAD_ID '\x01'
+
+struct prof_stacktrace_s {
+    char padding[sizeof(long) - 1];
+    char marker;
+    long count, depth;
+    void *stack[];
+};
+
+static long profile_interval_usec = 0;
+static char atfork_hook_installed = 0;
+
+
+/* ******************************************************
+ * libunwind workaround for process JIT frames correctly
+ * ******************************************************
+ */
+
+#include "vmprof_get_custom_offset.h"
+
+typedef struct {
+    void* _unused1;
+    void* _unused2;
+    void* sp;
+    void* ip;
+    void* _unused3[sizeof(unw_cursor_t)/sizeof(void*) - 4];
+} vmprof_hacked_unw_cursor_t;
+
+static int vmprof_unw_step(unw_cursor_t *cp, int first_run)
+{
+    void* ip;
+    void* sp;
+    ptrdiff_t sp_offset;
+    unw_get_reg (cp, UNW_REG_IP, (unw_word_t*)&ip);
+    unw_get_reg (cp, UNW_REG_SP, (unw_word_t*)&sp);
+    if (!first_run) {
+        // make sure we're pointing to the CALL and not to the first
+        // instruction after. If the callee adjusts the stack for us
+        // it's not safe to be at the instruction after
+        ip -= 1;
+    }
+    sp_offset = vmprof_unw_get_custom_offset(ip, cp);
+
+    if (sp_offset == -1) {
+        // it means that the ip is NOT in JITted code, so we can use the
+        // stardard unw_step
+        return unw_step(cp);
+    }
+    else {
+        // this is a horrible hack to manually walk the stack frame, by
+        // setting the IP and SP in the cursor
+        vmprof_hacked_unw_cursor_t *cp2 = (vmprof_hacked_unw_cursor_t*)cp;
+        void* bp = (void*)sp + sp_offset;
+        cp2->sp = bp;
+        bp -= sizeof(void*);
+        cp2->ip = ((void**)bp)[0];
+        // the ret is on the top of the stack minus WORD
+        return 1;
+    }
+}
+
+
+/* *************************************************************
+ * functions to dump the stack trace
+ * *************************************************************
+ */
+
+static int get_stack_trace(void** result, int max_depth, ucontext_t *ucontext)
+{
+    void *ip;
+    int n = 0;
+    unw_cursor_t cursor;
+    unw_context_t uc = *ucontext;
+
+    int ret = unw_init_local(&cursor, &uc);
+    assert(ret >= 0);
+    (void)ret;
+
+    while (n < max_depth) {
+        if (unw_get_reg(&cursor, UNW_REG_IP, (unw_word_t *) &ip) < 0) {
+            break;
+        }
+
+        unw_proc_info_t pip;
+        unw_get_proc_info(&cursor, &pip);
+
+        /* if n==0, it means that the signal handler interrupted us while we
+           were in the trampoline, so we are not executing (yet) the real main
+           loop function; just skip it */
+        if (VMPROF_ADDR_OF_TRAMPOLINE((void*)pip.start_ip) && n > 0) {
+            // found main loop stack frame
+            void* sp;
+            unw_get_reg(&cursor, UNW_REG_SP, (unw_word_t *) &sp);
+            if (mainloop_get_virtual_ip)
+                ip = mainloop_get_virtual_ip((char *)sp);
+            else
+                ip = *(void **)sp;
+        }
+
+        int first_run = (n == 0);
+        result[n++] = ip;
+        n = vmprof_write_header_for_jit_addr(result, n, ip, max_depth);
+        if (vmprof_unw_step(&cursor, first_run) <= 0)
+            break;
+    }
+    return n;
+}
+
+static void *get_current_thread_id(void)
+{
+    /* xxx This function is a hack on two fronts:
+
+       - It assumes that pthread_self() is async-signal-safe.  This
+         should be true on Linux.  I hope it is also true elsewhere.
+
+       - It abuses pthread_self() by assuming it just returns an
+         integer.  According to comments in CPython's source code, the
+         platforms where it is not the case are rare nowadays.
+
+       An alternative would be to try to look if the information is
+       available in the ucontext_t in the caller.
+    */
+    return (void *)pthread_self();
+}
+
+
+/* *************************************************************
+ * the signal handler
+ * *************************************************************
+ */
+
+static void sigprof_handler(int sig_nr, siginfo_t* info, void *ucontext)
+{
+    long val = __sync_fetch_and_add(&signal_handler_value, 2L);
+
+    if ((val & 1) == 0) {
+        int saved_errno = errno;
+        int fd = profile_file;
+        assert(fd >= 0);
+
+        struct profbuf_s *p = reserve_buffer(fd);
+        if (p == NULL) {
+            /* ignore this signal: there are no free buffers right now */
+        }
+        else {
+            int depth;
+            struct prof_stacktrace_s *st = (struct prof_stacktrace_s *)p->data;
+            st->marker = MARKER_STACKTRACE;
+            st->count = 1;
+            st->stack[0] = GetPC((ucontext_t*)ucontext);
+            depth = get_stack_trace(st->stack+1, MAX_STACK_DEPTH-2, ucontext);
+            depth++;  // To account for pc value in stack[0];
+            st->depth = depth;
+            st->stack[depth++] = get_current_thread_id();
+            p->data_offset = offsetof(struct prof_stacktrace_s, marker);
+            p->data_size = (depth * sizeof(void *) +
+                            sizeof(struct prof_stacktrace_s) -
+                            offsetof(struct prof_stacktrace_s, marker));
+            commit_buffer(fd, p);
+        }
+
+        errno = saved_errno;
+    }
+
+    __sync_sub_and_fetch(&signal_handler_value, 2L);
+}
+
+
+/* *************************************************************
+ * the setup and teardown functions
+ * *************************************************************
+ */
+
+static int install_sigprof_handler(void)
+{
+    struct sigaction sa;
+    memset(&sa, 0, sizeof(sa));
+    sa.sa_sigaction = sigprof_handler;
+    sa.sa_flags = SA_RESTART | SA_SIGINFO;
+    if (sigemptyset(&sa.sa_mask) == -1 ||
+        sigaction(SIGPROF, &sa, NULL) == -1)
+        return -1;
+    return 0;
+}
+
+static int remove_sigprof_handler(void)
+{
+    if (signal(SIGPROF, SIG_DFL) == SIG_ERR)
+        return -1;
+    return 0;
+}
+
+static int install_sigprof_timer(void)
+{
+    static struct itimerval timer;
+    timer.it_interval.tv_sec = 0;
+    timer.it_interval.tv_usec = profile_interval_usec;
+    timer.it_value = timer.it_interval;
+    if (setitimer(ITIMER_PROF, &timer, NULL) != 0)
+        return -1;
+    return 0;
+}
+
+static int remove_sigprof_timer(void) {
+    static struct itimerval timer;
+    timer.it_interval.tv_sec = 0;
+    timer.it_interval.tv_usec = 0;
+    timer.it_value.tv_sec = 0;
+    timer.it_value.tv_usec = 0;
+    if (setitimer(ITIMER_PROF, &timer, NULL) != 0)
+        return -1;
+    return 0;
+}
+
+static void atfork_disable_timer(void) {
+    if (profile_interval_usec > 0) {
+        remove_sigprof_timer();
+    }
+}
+
+static void atfork_enable_timer(void) {
+    if (profile_interval_usec > 0) {
+        install_sigprof_timer();
+    }
+}
+
+static int install_pthread_atfork_hooks(void) {
+    /* this is needed to prevent the problems described there:
+         - http://code.google.com/p/gperftools/issues/detail?id=278
+         - http://lists.debian.org/debian-glibc/2010/03/msg00161.html
+
+        TL;DR: if the RSS of the process is large enough, the clone() syscall
+        will be interrupted by the SIGPROF before it can complete, then
+        retried, interrupted again and so on, in an endless loop.  The
+        solution is to disable the timer around the fork, and re-enable it
+        only inside the parent.
+    */
+    if (atfork_hook_installed)
+        return 0;
+    int ret = pthread_atfork(atfork_disable_timer, atfork_enable_timer, NULL);
+    if (ret != 0)
+        return -1;
+    atfork_hook_installed = 1;
+    return 0;
+}
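+
+/* Note: only the prepare and parent callbacks are registered; the
+   child callback is NULL.  This should be fine because fork(2) does
+   not propagate interval timers to the child anyway, so a freshly
+   forked child starts with profiling effectively disabled until it
+   sets up its own timer. */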
+
+RPY_EXTERN
+int vmprof_enable(void)
+{
+    assert(profile_file >= 0);
+    assert(prepare_interval_usec > 0);
+    profile_interval_usec = prepare_interval_usec;
+
+    if (install_pthread_atfork_hooks() == -1)
+        goto error;
+    if (install_sigprof_handler() == -1)
+        goto error;
+    if (install_sigprof_timer() == -1)
+        goto error;
+    vmprof_ignore_signals(0);
+    return 0;
+
+ error:
+    profile_file = -1;
+    profile_interval_usec = 0;
+    return -1;
+}
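+
+/* Expected call order, as implied by the asserts above: the caller
+   first opens the output file descriptor into 'profile_file' and
+   stores the requested sampling period in 'prepare_interval_usec'
+   (both happen in code not shown in this truncated diff), and only
+   then calls vmprof_enable().  Sampling is stopped again with
+   vmprof_disable() below. */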
+
+static int _write_all(const void *buf, size_t bufsize)
+{
+    const char *p = buf;   /* avoid arithmetic on a 'void *' (a GNU extension) */
+    while (bufsize > 0) {
+        ssize_t count = write(profile_file, p, bufsize);
+        if (count <= 0)
+            return -1;   /* failed */
+        p += count;
+        bufsize -= count;
+    }
+    return 0;
+}
+
+static int opened_profile(char *interp_name)
+{
+    struct {
+        long hdr[5];
+        char interp_name[259];
+    } header;
+
+    size_t namelen = strnlen(interp_name, 255);
+    current_codes = NULL;
+
+    header.hdr[0] = 0;
+    header.hdr[1] = 3;
+    header.hdr[2] = 0;
+    header.hdr[3] = prepare_interval_usec;
+    header.hdr[4] = 0;
+    header.interp_name[0] = MARKER_HEADER;
+    header.interp_name[1] = '\x00';
+    header.interp_name[2] = VERSION_THREAD_ID;
+    header.interp_name[3] = namelen;
+    memcpy(&header.interp_name[4], interp_name, namelen);
+    return _write_all(&header, 5 * sizeof(long) + 4 + namelen);
+}
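+
+/* Resulting on-disk header layout, as written by the code above:
+
+       5 longs:   0, 3, 0, profile interval in usec, 0
+       1 byte:    MARKER_HEADER
+       1 byte:    '\x00'
+       1 byte:    VERSION_THREAD_ID
+       1 byte:    length of the interpreter name (at most 255)
+       n bytes:   interpreter name, not zero-terminated
+*/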
+
+static int close_profile(void)
+{
+    char buf[4096];
+    ssize_t size;
+    unsigned char marker = MARKER_TRAILER;
+
+    if (_write_all(&marker, 1) < 0)
+        return -1;
+
+#ifdef __linux__
+    // copy /proc/self/maps to the end of the profile file
+    int srcfd = open("/proc/self/maps", O_RDONLY);
+    if (srcfd < 0)
+        return -1;
+
+    while ((size = read(srcfd, buf, sizeof buf)) > 0) {
+        if (_write_all(buf, size) < 0) {
+            close(srcfd);
+            return -1;
+        }
+    }
+    close(srcfd);
+#else
+    // freebsd and mac
+    sprintf(buf, "procstat -v %d", getpid());
+    FILE *srcf = popen(buf, "r");
+    if (!srcf)
+        return -1;
+
+    while ((size = fread(buf, 1, sizeof buf, srcf))) {
+        if (_write_all(buf, size) < 0) {
+            pclose(srcf);
+            return -1;
+        }
+    }
+    pclose(srcf);
+#endif
+
+    /* don't close() the file descriptor from here */
+    profile_file = -1;
+    return 0;
+}
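+
+/* The memory mappings are appended so that the tool that later reads
+   the profile can map the raw return addresses recorded in the stack
+   samples back to the loaded libraries and their symbols; without
+   them the addresses would only be meaningful inside the
+   still-running process. */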
+
+RPY_EXTERN
+int vmprof_disable(void)
+{
+    vmprof_ignore_signals(1);
+    profile_interval_usec = 0;
+
+    if (remove_sigprof_timer() == -1)
+        return -1;
+    if (remove_sigprof_handler() == -1)
+        return -1;
+    flush_codes();
+    if (shutdown_concurrent_bufs(profile_file) < 0)
+        return -1;
+    return close_profile();
+}
+
+RPY_EXTERN
+int vmprof_register_virtual_function(char *code_name, long code_uid,
+                                     int auto_retry)
+{
+    long namelen = strnlen(code_name, 1023);
+    long blocklen = 1 + 2 * sizeof(long) + namelen;
+    struct profbuf_s *p;
+    char *t;
+
+ retry:
+    p = current_codes;
+    if (p != NULL) {
+        if (__sync_bool_compare_and_swap(&current_codes, p, NULL)) {
+            /* grabbed 'current_codes': we will append the current block
+               to it if it contains enough room */
+            size_t freesize = SINGLE_BUF_SIZE - p->data_size;
+            if (freesize < blocklen) {
+                /* full: flush it */
+                commit_buffer(profile_file, p);
+                p = NULL;
+            }
+        }
+        else {
+            /* compare-and-swap failed, don't try again */
+            p = NULL;
+        }
+    }
+
+    if (p == NULL) {
+        p = reserve_buffer(profile_file);
+        if (p == NULL) {
+            /* can't get a free block; should almost never be the
+               case.  Spin loop if allowed, or return a failure code
+               if not (e.g. we're in a signal handler) */
+            if (auto_retry > 0) {
+                auto_retry--;
+                usleep(1);
+                goto retry;
+            }
+            return -1;
+        }
+    }
+
+    t = p->data + p->data_size;
+    p->data_size += blocklen;
+    assert(p->data_size <= SINGLE_BUF_SIZE);
+    *t++ = MARKER_VIRTUAL_IP;
+    memcpy(t, &code_uid, sizeof(long)); t += sizeof(long);
+    memcpy(t, &namelen, sizeof(long)); t += sizeof(long);
+    memcpy(t, code_name, namelen);
+
+    /* try to reattach 'p' to 'current_codes' */
+    if (!__sync_bool_compare_and_swap(&current_codes, NULL, p)) {
+        /* failed, flush it */
+        commit_buffer(profile_file, p);
+    }
+    return 0;
+}
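+
+/* Each registered virtual function becomes a variable-length record
+   inside a code buffer, laid out by the code above as:
+
+       1 byte:              MARKER_VIRTUAL_IP
+       sizeof(long) bytes:  code_uid
+       sizeof(long) bytes:  namelen (at most 1023)
+       namelen bytes:       code_name, not zero-terminated
+*/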
+
+static void flush_codes(void)
+{
+    struct profbuf_s *p = current_codes;
+    if (p != NULL) {
+        current_codes = NULL;
+        commit_buffer(profile_file, p);
+    }
+}
diff --git a/rpython/rlib/rvmprof/src/vmprof_mt.h b/rpython/rlib/rvmprof/src/vmprof_mt.h
new file mode 100644
--- /dev/null
+++ b/rpython/rlib/rvmprof/src/vmprof_mt.h
@@ -0,0 +1,217 @@
+/* Support for multithreaded write() operations */
+
+#include <sys/mman.h>
+#include <string.h>
+
+/* The idea is that we have MAX_NUM_BUFFERS available, all of size
+   SINGLE_BUF_SIZE.  Threads and signal handlers can ask to reserve a
+   buffer, fill it, and finally "commit" it, at which point its
+   content is written into the profile file.  There is no hard
+   guarantee about the order in which the committed blocks are
+   actually written.  We do this with two constraints:
+
+   - write() calls should not overlap; only one thread can be
+     currently calling it.
+
+   - the code needs to be multithread-safe *and* signal-handler-safe,
+     which means it must be written in a wait-free style: never have
+     spin loops waiting for some lock to be released, from any of
+     the functions that can be called from the signal handler!  The
+     code holding the lock could be running in the same thread,
+     currently interrupted by the signal handler.
+
+   The value of MAX_NUM_BUFFERS is a trade-off between too high
+   (lots of unnecessary memory, lots of checking all of them)
+   and too low (risk that there is none left).
+*/
+#define MAX_NUM_BUFFERS  20
+#define SINGLE_BUF_SIZE  (8192 - 2 * sizeof(unsigned int))
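+
+/* Typical producer-side usage of this API, as a minimal sketch (the
+   real callers are sigprof_handler() and
+   vmprof_register_virtual_function() in vmprof_main.h):
+
+       struct profbuf_s *p = reserve_buffer(fd);
+       if (p != NULL) {
+           memcpy(p->data, bytes, nbytes);    // fill the buffer
+           p->data_offset = 0;                // payload starts at data[0]
+           p->data_size = nbytes;             // number of payload bytes
+           commit_buffer(fd, p);              // queue it for writing
+       }
+       // else: no free buffer right now; drop the sample instead of
+       // spinning, to stay signal-safe
+*/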
+
+#if defined(__i386__) || defined(__amd64__)
+  static inline void write_fence(void) { asm("" : : : "memory"); }
+#else
+  static inline void write_fence(void) { __sync_synchronize(); }
+#endif
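+
+/* On x86/amd64 a plain compiler barrier is enough here: the hardware
+   memory model already keeps stores ordered with respect to earlier
+   stores, so only compiler reordering has to be prevented.  Other
+   architectures fall back to a full __sync_synchronize() fence. */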
+
+
+#define PROFBUF_UNUSED   0
+#define PROFBUF_FILLING  1
+#define PROFBUF_READY    2
+
+
+struct profbuf_s {
+    unsigned int data_size;
+    unsigned int data_offset;
+    char data[SINGLE_BUF_SIZE];
+};
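+
+/* 'data_size' is the number of payload bytes and 'data_offset' is the
+   position inside 'data[]' where that payload starts (presumably
+   advanced as partial write()s complete; see the pending-write
+   handling in _write_single_ready_buffer() below).  SINGLE_BUF_SIZE
+   is chosen so that the whole struct is exactly 8192 bytes, which
+   prepare_concurrent_bufs() asserts. */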
+
+static char volatile profbuf_state[MAX_NUM_BUFFERS];
+static struct profbuf_s *profbuf_all_buffers = NULL;
+static int volatile profbuf_write_lock = 2;
+static long profbuf_pending_write;
+
+
+static void unprepare_concurrent_bufs(void)
+{
+    if (profbuf_all_buffers != NULL) {
+        munmap(profbuf_all_buffers, sizeof(struct profbuf_s) * MAX_NUM_BUFFERS);
+        profbuf_all_buffers = NULL;
+    }
+}
+
+static int prepare_concurrent_bufs(void)
+{
+    assert(sizeof(struct profbuf_s) == 8192);
+
+    unprepare_concurrent_bufs();
+    profbuf_all_buffers = mmap(NULL, sizeof(struct profbuf_s) * MAX_NUM_BUFFERS,
+                               PROT_READ | PROT_WRITE,
+                               MAP_PRIVATE | MAP_ANONYMOUS,
+                               -1, 0);
+    if (profbuf_all_buffers == MAP_FAILED) {
+        profbuf_all_buffers = NULL;
+        return -1;
+    }
+    memset((char *)profbuf_state, PROFBUF_UNUSED, sizeof(profbuf_state));
+    profbuf_write_lock = 0;
+    profbuf_pending_write = -1;
+    return 0;
+}
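+
+/* mmap() is used here instead of malloc(), presumably to obtain
+   page-aligned, zero-initialized storage for the fixed pool of
+   buffers without going through the C allocator; the pool is released
+   again with munmap() in unprepare_concurrent_bufs() above. */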
+
+static int _write_single_ready_buffer(int fd, long i)
+{
+    /* Try to write to disk the buffer number 'i'.  This function must
+       only be called while we hold the write lock. */
+    assert(profbuf_write_lock != 0);
+
+    if (profbuf_pending_write >= 0) {
+        /* A partially written buffer is waiting.  We'll write the
+           rest of this buffer now, instead of 'i'. */