[pypy-commit] pypy default: merge vmprof-newstack

fijal Thu, 04 Feb 2016 11:46:24 -0800

Author: fijal
Branch: 
Changeset: r82078:cc28605e84eb
Date: 2016-02-04 20:44 +0100
http://bitbucket.org/pypy/pypy/changeset/cc28605e84eb/


Log:    merge vmprof-newstack

diff --git a/pypy/module/_vmprof/__init__.py b/pypy/module/_vmprof/__init__.py
--- a/pypy/module/_vmprof/__init__.py
+++ b/pypy/module/_vmprof/__init__.py
@@ -11,6 +11,7 @@
     interpleveldefs = {
         'enable': 'interp_vmprof.enable',
         'disable': 'interp_vmprof.disable',
+        'write_all_code_objects': 'interp_vmprof.write_all_code_objects',
         'VMProfError': 'space.fromcache(interp_vmprof.Cache).w_VMProfError',
     }
 
diff --git a/pypy/module/_vmprof/interp_vmprof.py b/pypy/module/_vmprof/interp_vmprof.py
--- a/pypy/module/_vmprof/interp_vmprof.py
+++ b/pypy/module/_vmprof/interp_vmprof.py
@@ -59,11 +59,21 @@
     'interval' is a float representing the sampling interval, in seconds.
     Must be smaller than 1.0
     """
+    w_modules = space.sys.get('modules')
+    if space.is_true(space.contains(w_modules, space.wrap('_continuation'))):
+        space.warn(space.wrap("Using _continuation/greenlet/stacklet together "
+                              "with vmprof will crash"),
+                   space.w_RuntimeWarning)
     try:
         rvmprof.enable(fileno, period)
     except rvmprof.VMProfError, e:
         raise VMProfError(space, e)
 
+def write_all_code_objects(space):
+    """ Only needed on CPython; here it is just an empty function
+    """
+    pass
+
 def disable(space):
     """Disable vmprof.  Remember to close the file descriptor afterwards
     if necessary.
diff --git a/rpython/jit/backend/llsupport/test/zrpy_vmprof_test.py b/rpython/jit/backend/llsupport/test/zrpy_vmprof_test.py
new file mode 100644
--- /dev/null
+++ b/rpython/jit/backend/llsupport/test/zrpy_vmprof_test.py
@@ -0,0 +1,86 @@
+
+import os, py
+from rpython.jit.backend.test.support import CCompiledMixin
+from rpython.rlib.jit import JitDriver
+from rpython.tool.udir import udir
+from rpython.translator.translator import TranslationContext
+from rpython.jit.backend.detect_cpu import getcpuclass
+
+class CompiledVmprofTest(CCompiledMixin):
+    CPUClass = getcpuclass()
+
+    def _get_TranslationContext(self):
+        t = TranslationContext()
+        t.config.translation.gc = 'incminimark'
+        t.config.translation.list_comprehension_operations = True
+        return t
+
+    def test_vmprof(self):
+        from rpython.rlib import rvmprof
+
+        class MyCode:
+            _vmprof_unique_id = 0
+            def __init__(self, name):
+                self.name = name
+
+        def get_name(code):
+            return code.name
+
+        code2 = MyCode("py:y:foo:4")
+        rvmprof.register_code(code2, get_name)
+
+        try:
+            rvmprof.register_code_object_class(MyCode, get_name)
+        except rvmprof.VMProfPlatformUnsupported, e:
+            py.test.skip(str(e))
+
+        def get_unique_id(code):
+            return rvmprof.get_unique_id(code)
+
+        driver = JitDriver(greens = ['code'], reds = ['i', 's', 'num'],
+            is_recursive=True, get_unique_id=get_unique_id)
+
+        @rvmprof.vmprof_execute_code("xcode13", lambda code, num: code)
+        def main(code, num):
+            return main_jitted(code, num)
+
+        def main_jitted(code, num):
+            s = 0
+            i = 0
+            while i < num:
+                driver.jit_merge_point(code=code, i=i, s=s, num=num)
+                s += (i << 1)
+                if i % 3 == 0 and code is not code2:
+                    main(code2, 100)
+                i += 1
+            return s
+
+        tmpfilename = str(udir.join('test_rvmprof'))
+
+        def f(num):
+            code = MyCode("py:x:foo:3")
+            rvmprof.register_code(code, get_name)
+            fd = os.open(tmpfilename, os.O_WRONLY | os.O_CREAT, 0666)
+            period = 0.0001
+            rvmprof.enable(fd, period)
+            res = main(code, num)
+            #assert res == 499999500000
+            rvmprof.disable()
+            os.close(fd)
+            return 0
+        
+        def check_vmprof_output():
+            from vmprof import read_profile
+            tmpfile = str(udir.join('test_rvmprof'))
+            stats = read_profile(tmpfile)
+            t = stats.get_tree()
+            assert t.name == 'py:x:foo:3'
+            assert len(t.children) == 1 # jit
+
+        self.meta_interp(f, [1000000], inline=True)
+        try:
+            import vmprof
+        except ImportError:
+            pass
+        else:
+            check_vmprof_output()
\ No newline at end of file
diff --git a/rpython/jit/backend/test/test_rvmprof.py b/rpython/jit/backend/test/test_rvmprof.py
new file mode 100644
--- /dev/null
+++ b/rpython/jit/backend/test/test_rvmprof.py
@@ -0,0 +1,49 @@
+import py
+from rpython.rlib import jit
+from rpython.rtyper.annlowlevel import llhelper
+from rpython.rtyper.lltypesystem import lltype, rffi
+from rpython.rlib.rvmprof import cintf
+from rpython.jit.backend.x86.arch import WORD
+from rpython.jit.codewriter.policy import JitPolicy
+
+class BaseRVMProfTest(object):
+    def test_one(self):
+        py.test.skip("needs thread-locals in the JIT, which is only available "
+                     "after translation")
+        visited = []
+
+        def helper():
+            stack = cintf.vmprof_tl_stack.getraw()
+            if stack:
+                # not during tracing
+                visited.append(stack.c_value)
+            else:
+                visited.append(0)
+
+        llfn = llhelper(lltype.Ptr(lltype.FuncType([], lltype.Void)), helper)
+
+        driver = jit.JitDriver(greens=[], reds='auto')
+
+        def f(n):
+            i = 0
+            while i < n:
+                driver.jit_merge_point()
+                i += 1
+                llfn()
+
+        class Hooks(jit.JitHookInterface):
+            def after_compile(self, debug_info):
+                self.raw_start = debug_info.asminfo.rawstart
+
+        hooks = Hooks()
+
+        null = lltype.nullptr(cintf.VMPROFSTACK)
+        cintf.vmprof_tl_stack.setraw(null)   # make it empty
+        self.meta_interp(f, [10], policy=JitPolicy(hooks))
+        v = set(visited)
+        assert 0 in v
+        v.remove(0)
+        assert len(v) == 1
+        assert 0 <= list(v)[0] - hooks.raw_start <= 10*1024
+        assert cintf.vmprof_tl_stack.getraw() == null
+        # ^^^ make sure we didn't leave anything dangling
diff --git a/rpython/jit/backend/x86/arch.py b/rpython/jit/backend/x86/arch.py
--- a/rpython/jit/backend/x86/arch.py
+++ b/rpython/jit/backend/x86/arch.py
@@ -31,7 +31,7 @@
 
 if WORD == 4:
     # ebp + ebx + esi + edi + 15 extra words = 19 words
-    FRAME_FIXED_SIZE = 19
+    FRAME_FIXED_SIZE = 19 + 4 # 4 for vmprof, XXX make more compact!
     PASS_ON_MY_FRAME = 15
     JITFRAME_FIXED_SIZE = 6 + 8 * 2 # 6 GPR + 8 XMM * 2 WORDS/float
     # 'threadlocal_addr' is passed as 2nd argument on the stack,
@@ -41,7 +41,7 @@
     THREADLOCAL_OFS = (FRAME_FIXED_SIZE + 2) * WORD
 else:
     # rbp + rbx + r12 + r13 + r14 + r15 + threadlocal + 12 extra words = 19
-    FRAME_FIXED_SIZE = 19
+    FRAME_FIXED_SIZE = 19 + 4 # 4 for vmprof, XXX make more compact!
     PASS_ON_MY_FRAME = 12
     JITFRAME_FIXED_SIZE = 28 # 13 GPR + 15 XMM
     # 'threadlocal_addr' is passed as 2nd argument in %esi,
diff --git a/rpython/jit/backend/x86/assembler.py b/rpython/jit/backend/x86/assembler.py
--- a/rpython/jit/backend/x86/assembler.py
+++ b/rpython/jit/backend/x86/assembler.py
@@ -12,7 +12,7 @@
 from rpython.jit.metainterp.compile import ResumeGuardDescr
 from rpython.rtyper.lltypesystem import lltype, rffi, rstr, llmemory
 from rpython.rtyper.lltypesystem.lloperation import llop
-from rpython.rtyper.annlowlevel import llhelper, cast_instance_to_gcref
+from rpython.rtyper.annlowlevel import cast_instance_to_gcref
 from rpython.rtyper import rclass
 from rpython.rlib.jit import AsmInfo
 from rpython.jit.backend.model import CompiledLoopToken
@@ -837,11 +837,56 @@
             frame_depth = max(frame_depth, target_frame_depth)
         return frame_depth
 
+    def _call_header_vmprof(self):
+        from rpython.rlib.rvmprof.rvmprof import cintf, VMPROF_JITTED_TAG
+
+        # tloc = address of pypy_threadlocal_s
+        if IS_X86_32:
+            # Can't use esi here, its old value is not saved yet.
+            # But we can use eax, ecx and edx.
+            self.mc.MOV_rs(edx.value, THREADLOCAL_OFS)
+            tloc = edx
+            old = ecx
+        else:
+            # The thread-local value is already in esi.
+            # If possible, we should avoid using ecx or edx, because they
+            # would be used to pass arguments #3 and #4 (even though, so
+            # far, the assembler only receives two arguments).
+            tloc = esi
+            old = r11
+        # eax = address in the stack of a 3-word struct vmprof_stack_s
+        self.mc.LEA_rs(eax.value, (FRAME_FIXED_SIZE - 4) * WORD)
+        # old = current value of vmprof_tl_stack
+        offset = cintf.vmprof_tl_stack.getoffset()
+        self.mc.MOV_rm(old.value, (tloc.value, offset))
+        # eax->next = old
+        self.mc.MOV_mr((eax.value, 0), old.value)
+        # eax->value = my esp
+        self.mc.MOV_mr((eax.value, WORD), esp.value)
+        # eax->kind = VMPROF_JITTED_TAG
+        self.mc.MOV_mi((eax.value, WORD * 2), VMPROF_JITTED_TAG)
+        # save in vmprof_tl_stack the new eax
+        self.mc.MOV_mr((tloc.value, offset), eax.value)
+
+    def _call_footer_vmprof(self):
+        from rpython.rlib.rvmprof.rvmprof import cintf
+        # edx = address of pypy_threadlocal_s
+        self.mc.MOV_rs(edx.value, THREADLOCAL_OFS)
+        self.mc.AND_ri(edx.value, ~1)
+        # eax = (our local vmprof_tl_stack).next
+        self.mc.MOV_rs(eax.value, (FRAME_FIXED_SIZE - 4 + 0) * WORD)
+        # save in vmprof_tl_stack the value eax
+        offset = cintf.vmprof_tl_stack.getoffset()
+        self.mc.MOV_mr((edx.value, offset), eax.value)
+
     def _call_header(self):
         self.mc.SUB_ri(esp.value, FRAME_FIXED_SIZE * WORD)
         self.mc.MOV_sr(PASS_ON_MY_FRAME * WORD, ebp.value)
         if IS_X86_64:
             self.mc.MOV_sr(THREADLOCAL_OFS, esi.value)
+        if self.cpu.translate_support_code:
+            self._call_header_vmprof()     # on X86_64, this uses esi
+        if IS_X86_64:
             self.mc.MOV_rr(ebp.value, edi.value)
         else:
             self.mc.MOV_rs(ebp.value, (FRAME_FIXED_SIZE + 1) * WORD)
@@ -873,6 +918,8 @@
 
     def _call_footer(self):
         # the return value is the jitframe
+        if self.cpu.translate_support_code:
+            self._call_footer_vmprof()
         self.mc.MOV_rr(eax.value, ebp.value)
 
         gcrootmap = self.cpu.gc_ll_descr.gcrootmap
diff --git a/rpython/jit/backend/x86/test/test_rvmprof.py b/rpython/jit/backend/x86/test/test_rvmprof.py
new file mode 100644
--- /dev/null
+++ b/rpython/jit/backend/x86/test/test_rvmprof.py
@@ -0,0 +1,7 @@
+
+import py
+from rpython.jit.backend.test.test_rvmprof import BaseRVMProfTest
+from rpython.jit.backend.x86.test.test_basic import Jit386Mixin
+
+class TestFfiCall(Jit386Mixin, BaseRVMProfTest):
+    pass
\ No newline at end of file
diff --git a/rpython/jit/backend/x86/test/test_zrpy_vmprof.py b/rpython/jit/backend/x86/test/test_zrpy_vmprof.py
new file mode 100644
--- /dev/null
+++ b/rpython/jit/backend/x86/test/test_zrpy_vmprof.py
@@ -0,0 +1,7 @@
+
+from rpython.jit.backend.llsupport.test.zrpy_vmprof_test import CompiledVmprofTest
+
+class TestZVMprof(CompiledVmprofTest):
+    
+    gcrootfinder = "shadowstack"
+    gc = "incminimark"
\ No newline at end of file
diff --git a/rpython/jit/backend/x86/test/test_zvmprof.py b/rpython/jit/backend/x86/test/test_zvmprof.py
new file mode 100644
--- /dev/null
+++ b/rpython/jit/backend/x86/test/test_zvmprof.py
@@ -0,0 +1,7 @@
+
+from rpython.jit.backend.llsupport.test.zrpy_vmprof_test import CompiledVmprofTest
+
+class TestZVMprof(CompiledVmprofTest):
+    
+    gcrootfinder = "shadowstack"
+    gc = "incminimark"
\ No newline at end of file
diff --git a/rpython/jit/codewriter/test/test_jtransform.py b/rpython/jit/codewriter/test/test_jtransform.py
--- a/rpython/jit/codewriter/test/test_jtransform.py
+++ b/rpython/jit/codewriter/test/test_jtransform.py
@@ -1332,7 +1332,7 @@
     tlfield = ThreadLocalField(lltype.Signed, 'foobar_test_',
                                loop_invariant=loop_inv)
     OS_THREADLOCALREF_GET = effectinfo.EffectInfo.OS_THREADLOCALREF_GET
-    c = const(tlfield.offset)
+    c = const(tlfield.getoffset())
     v = varoftype(lltype.Signed)
     op = SpaceOperation('threadlocalref_get', [c], v)
     cc = FakeBuiltinCallControl()
diff --git a/rpython/jit/metainterp/quasiimmut.py b/rpython/jit/metainterp/quasiimmut.py
--- a/rpython/jit/metainterp/quasiimmut.py
+++ b/rpython/jit/metainterp/quasiimmut.py
@@ -51,6 +51,7 @@
 class QuasiImmut(object):
     llopaque = True
     compress_limit = 30
+    looptokens_wrefs = None
 
     def __init__(self, cpu):
         self.cpu = cpu
@@ -75,7 +76,7 @@
     def compress_looptokens_list(self):
         self.looptokens_wrefs = [wref for wref in self.looptokens_wrefs
                                       if wref() is not None]
-        # NB. we must keep around the looptoken_wrefs that are
+        # NB. we must keep around the looptokens_wrefs that are
         # already invalidated; see below
         self.compress_limit = (len(self.looptokens_wrefs) + 15) * 2
 
@@ -83,6 +84,9 @@
         # When this is called, all the loops that we record become
         # invalid: all GUARD_NOT_INVALIDATED in these loops (and
         # in attached bridges) must now fail.
+        if self.looptokens_wrefs is None:
+            # can't happen, but helps compiled tests
+            return
         wrefs = self.looptokens_wrefs
         self.looptokens_wrefs = []
         for wref in wrefs:
diff --git a/rpython/jit/metainterp/test/test_jitdriver.py b/rpython/jit/metainterp/test/test_jitdriver.py
--- a/rpython/jit/metainterp/test/test_jitdriver.py
+++ b/rpython/jit/metainterp/test/test_jitdriver.py
@@ -193,7 +193,7 @@
             return pc + 1
         
         driver = JitDriver(greens=["pc"], reds='auto',
-                           get_unique_id=get_unique_id)
+                           get_unique_id=get_unique_id, is_recursive=True)
 
         def f(arg):
             i = 0
diff --git a/rpython/jit/metainterp/test/test_recursive.py b/rpython/jit/metainterp/test/test_recursive.py
--- a/rpython/jit/metainterp/test/test_recursive.py
+++ b/rpython/jit/metainterp/test/test_recursive.py
@@ -1312,7 +1312,7 @@
                 return (code + 1) * 2
 
             driver = JitDriver(greens=["pc", "code"], reds='auto',
-                               get_unique_id=get_unique_id)
+                               get_unique_id=get_unique_id, is_recursive=True)
 
             def f(pc, code):
                 i = 0
diff --git a/rpython/rlib/jit.py b/rpython/rlib/jit.py
--- a/rpython/rlib/jit.py
+++ b/rpython/rlib/jit.py
@@ -623,6 +623,8 @@
             raise AttributeError("no 'greens' or 'reds' supplied")
         if virtualizables is not None:
             self.virtualizables = virtualizables
+        if get_unique_id is not None:
+            assert is_recursive, "get_unique_id and is_recursive must be specified at the same time"
         for v in self.virtualizables:
             assert v in self.reds
         # if reds are automatic, they won't be passed to jit_merge_point, so
diff --git a/rpython/rlib/rthread.py b/rpython/rlib/rthread.py
--- a/rpython/rlib/rthread.py
+++ b/rpython/rlib/rthread.py
@@ -308,7 +308,7 @@
         offset = CDefinedIntSymbolic('RPY_TLOFS_%s' % self.fieldname,
                                      default='?')
         offset.loop_invariant = loop_invariant
-        self.offset = offset
+        self._offset = offset
 
         def getraw():
             if we_are_translated():
@@ -364,7 +364,7 @@
         ThreadLocalField.__init__(self, lltype.Signed, 'tlref%d' % unique_id,
                                   loop_invariant=loop_invariant)
         setraw = self.setraw
-        offset = self.offset
+        offset = self._offset
 
         def get():
             if we_are_translated():
diff --git a/rpython/rlib/rvmprof/cintf.py b/rpython/rlib/rvmprof/cintf.py
--- a/rpython/rlib/rvmprof/cintf.py
+++ b/rpython/rlib/rvmprof/cintf.py
@@ -5,41 +5,41 @@
 from rpython.rtyper.lltypesystem import lltype, llmemory, rffi
 from rpython.translator.tool.cbuild import ExternalCompilationInfo
 from rpython.rtyper.tool import rffi_platform as platform
+from rpython.rlib import rthread
 
 from rpython.jit.backend import detect_cpu
 
 class VMProfPlatformUnsupported(Exception):
     pass
 
+ROOT = py.path.local(rpythonroot).join('rpython', 'rlib', 'rvmprof')
+SRC = ROOT.join('src')
+
+if sys.platform.startswith('linux'):
+    _libs = ['dl']
+else:
+    _libs = []
+eci_kwds = dict(
+    include_dirs = [SRC],
+    includes = ['rvmprof.h'],
+    libraries = _libs,
+    separate_module_files = [SRC.join('rvmprof.c')],
+    post_include_bits=['#define RPYTHON_VMPROF\n'],
+    )
+global_eci = ExternalCompilationInfo(**eci_kwds)
+
+
 def setup():
     if not detect_cpu.autodetect().startswith(detect_cpu.MODEL_X86_64):
         raise VMProfPlatformUnsupported("rvmprof only supports"
                                         " x86-64 CPUs for now")
 
-
-    ROOT = py.path.local(rpythonroot).join('rpython', 'rlib', 'rvmprof')
-    SRC = ROOT.join('src')
-
-
-    if sys.platform.startswith('linux'):
-        libs = ['dl']
-    else:
-        libs = []
-
-    eci_kwds = dict(
-        include_dirs = [SRC],
-        includes = ['rvmprof.h'],
-        libraries = libs,
-        separate_module_files = [SRC.join('rvmprof.c')],
-        post_include_bits=['#define RPYTHON_VMPROF\n'],
-        )
-    eci = ExternalCompilationInfo(**eci_kwds)
-
     platform.verify_eci(ExternalCompilationInfo(
         compile_extra=['-DRPYTHON_LL2CTYPES'],
         **eci_kwds))
 
 
+    eci = global_eci
     vmprof_init = rffi.llexternal("vmprof_init",
                                   [rffi.INT, rffi.DOUBLE, rffi.CCHARP],
                                   rffi.CCHARP, compilation_info=eci)
@@ -55,7 +55,8 @@
                                            rffi.INT, compilation_info=eci)
     vmprof_ignore_signals = rffi.llexternal("vmprof_ignore_signals",
                                             [rffi.INT], lltype.Void,
-                                            compilation_info=eci)
+                                            compilation_info=eci,
+                                            _nowrapper=True)
     return CInterface(locals())
 
 
@@ -67,112 +68,34 @@
     def _freeze_(self):
         return True
 
-def token2lltype(tok):
-    if tok == 'i':
-        return lltype.Signed
-    if tok == 'r':
-        return llmemory.GCREF
-    raise NotImplementedError(repr(tok))
 
-def make_trampoline_function(name, func, token, restok):
-    from rpython.jit.backend import detect_cpu
+# --- copy a few declarations from src/vmprof_stack.h ---
 
-    cont_name = 'rpyvmprof_f_%s_%s' % (name, token)
-    tramp_name = 'rpyvmprof_t_%s_%s' % (name, token)
-    orig_tramp_name = tramp_name
+VMPROF_CODE_TAG = 1
 
-    func.c_name = cont_name
-    func._dont_inline_ = True
+VMPROFSTACK = lltype.ForwardReference()
+PVMPROFSTACK = lltype.Ptr(VMPROFSTACK)
+VMPROFSTACK.become(rffi.CStruct("vmprof_stack_s",
+                                ('next', PVMPROFSTACK),
+                                ('value', lltype.Signed),
+                                ('kind', lltype.Signed)))
+# ----------
 
-    if sys.platform == 'darwin':
-        # according to internet "At the time UNIX was written in 1974...."
-        # "... all C functions are prefixed with _"
-        cont_name = '_' + cont_name
-        tramp_name = '_' + tramp_name
-        PLT = ""
-        size_decl = ""
-        type_decl = ""
-        extra_align = ""
-    else:
-        PLT = "@PLT"
-        type_decl = "\t.type\t%s, @function" % (tramp_name,)
-        size_decl = "\t.size\t%s, .-%s" % (
-            tramp_name, tramp_name)
-        extra_align = "\t.cfi_def_cfa_offset 8"
 
-    assert detect_cpu.autodetect().startswith(detect_cpu.MODEL_X86_64), (
-        "rvmprof only supports x86-64 CPUs for now")
+vmprof_tl_stack = rthread.ThreadLocalField(PVMPROFSTACK, "vmprof_tl_stack")
+do_use_eci = rffi.llexternal_use_eci(
+    ExternalCompilationInfo(includes=['vmprof_stack.h'],
+                            include_dirs = [SRC]))
 
-    # mapping of argument count (not counting the final uid argument) to
-    # the register that holds this uid argument
-    reg = {0: '%rdi',
-           1: '%rsi',
-           2: '%rdx',
-           3: '%rcx',
-           4: '%r8',
-           5: '%r9',
-           }
-    try:
-        reg = reg[len(token)]
-    except KeyError:
-        raise NotImplementedError(
-            "not supported: %r takes more than 5 arguments" % (func,))
+def enter_code(unique_id):
+    do_use_eci()
+    s = lltype.malloc(VMPROFSTACK, flavor='raw')
+    s.c_next = vmprof_tl_stack.get_or_make_raw()
+    s.c_value = unique_id
+    s.c_kind = VMPROF_CODE_TAG
+    vmprof_tl_stack.setraw(s)
+    return s
 
-    target = udir.join('module_cache')
-    target.ensure(dir=1)
-    target = target.join('trampoline_%s_%s.vmprof.s' % (name, token))
-    # NOTE! the tabs in this file are absolutely essential, things
-    #       that don't start with \t are silently ignored (<arigato>: WAT!?)
-    target.write("""\
-\t.text
-\t.globl\t%(tramp_name)s
-%(type_decl)s
-%(tramp_name)s:
-\t.cfi_startproc
-\tpushq\t%(reg)s
-\t.cfi_def_cfa_offset 16
-\tcall %(cont_name)s%(PLT)s
-\taddq\t$8, %%rsp
-%(extra_align)s
-\tret
-\t.cfi_endproc
-%(size_decl)s
-""" % locals())
-
-    def tok2cname(tok):
-        if tok == 'i':
-            return 'long'
-        if tok == 'r':
-            return 'void *'
-        raise NotImplementedError(repr(tok))
-
-    header = 'RPY_EXTERN %s %s(%s);\n' % (
-        tok2cname(restok),
-        orig_tramp_name,
-        ', '.join([tok2cname(tok) for tok in token] + ['long']))
-
-    header += """\
-static int cmp_%s(void *addr) {
-    if (addr == %s) return 1;
-#ifdef VMPROF_ADDR_OF_TRAMPOLINE
-    return VMPROF_ADDR_OF_TRAMPOLINE(addr);
-#undef VMPROF_ADDR_OF_TRAMPOLINE
-#else
-    return 0;
-#endif
-#define VMPROF_ADDR_OF_TRAMPOLINE cmp_%s
-}
-""" % (tramp_name, orig_tramp_name, tramp_name)
-
-    eci = ExternalCompilationInfo(
-        post_include_bits = [header],
-        separate_module_files = [str(target)],
-    )
-
-    return rffi.llexternal(
-        orig_tramp_name,
-        [token2lltype(tok) for tok in token] + [lltype.Signed],
-        token2lltype(restok),
-        compilation_info=eci,
-        _nowrapper=True, sandboxsafe=True,
-        random_effects_on_gcobjs=True)
+def leave_code(s):
+    vmprof_tl_stack.setraw(s.c_next)
+    lltype.free(s, flavor='raw')
diff --git a/rpython/rlib/rvmprof/rvmprof.py b/rpython/rlib/rvmprof/rvmprof.py
--- a/rpython/rlib/rvmprof/rvmprof.py
+++ b/rpython/rlib/rvmprof/rvmprof.py
@@ -4,12 +4,19 @@
 from rpython.rlib.rvmprof import cintf
 from rpython.rtyper.annlowlevel import cast_instance_to_gcref
 from rpython.rtyper.annlowlevel import cast_base_ptr_to_instance
-from rpython.rtyper.lltypesystem import rffi
+from rpython.rtyper.lltypesystem import rffi, llmemory
+from rpython.rtyper.lltypesystem.lloperation import llop
 
 MAX_FUNC_NAME = 1023
 
 # ____________________________________________________________
 
+# keep in sync with vmprof_stack.h
+VMPROF_CODE_TAG = 1
+VMPROF_BLACKHOLE_TAG = 2
+VMPROF_JITTED_TAG = 3
+VMPROF_JITTING_TAG = 4
+VMPROF_GC_TAG = 5
 
 class VMProfError(Exception):
     def __init__(self, msg):
@@ -19,17 +26,16 @@
 
 class VMProf(object):
 
+    _immutable_fields_ = ['is_enabled?']
+
     def __init__(self):
         "NOT_RPYTHON: use _get_vmprof()"
         self._code_classes = set()
         self._gather_all_code_objs = lambda: None
         self._cleanup_()
-        if sys.maxint == 2147483647:
-            self._code_unique_id = 0 # XXX this is wrong, it won't work on 32bit
-        else:
-            self._code_unique_id = 0x7000000000000000
+        self._code_unique_id = 4
         self.cintf = cintf.setup()
-
+        
     def _cleanup_(self):
         self.is_enabled = False
 
@@ -127,7 +133,6 @@
         if self.cintf.vmprof_register_virtual_function(name, uid, 500000) < 0:
             raise VMProfError("vmprof buffers full!  disk full or too slow")
 
-
 def vmprof_execute_code(name, get_code_fn, result_class=None):
     """Decorator to be used on the function that interprets a code object.
 
@@ -136,12 +141,7 @@
     'get_code_fn(*args)' is called to extract the code object from the
     arguments given to the decorated function.
 
-    The original function can return None, an integer, or an instance.
-    In the latter case (only), 'result_class' must be set.
-
-    NOTE: for now, this assumes that the decorated functions only takes
-    instances or plain integer arguments, and at most 5 of them
-    (including 'self' if applicable).
+    'result_class' is ignored (backward compatibility).
     """
     def decorate(func):
         try:
@@ -149,52 +149,19 @@
         except cintf.VMProfPlatformUnsupported:
             return func
 
-        if hasattr(func, 'im_self'):
-            assert func.im_self is None
-            func = func.im_func
-
-        def lower(*args):
-            if len(args) == 0:
-                return (), ""
-            ll_args, token = lower(*args[1:])
-            ll_arg = args[0]
-            if isinstance(ll_arg, int):
-                tok = "i"
-            else:
-                tok = "r"
-                ll_arg = cast_instance_to_gcref(ll_arg)
-            return (ll_arg,) + ll_args, tok + token
-
-        @specialize.memo()
-        def get_ll_trampoline(token):
-            if result_class is None:
-                restok = "i"
-            else:
-                restok = "r"
-            return cintf.make_trampoline_function(name, func, token, restok)
-
         def decorated_function(*args):
-            # go through the asm trampoline ONLY if we are translated but not
-            # being JITted.
-            #
-            # If we are not translated, we obviously don't want to go through
-            # the trampoline because there is no C function it can call.
-            #
             # If we are being JITted, we want to skip the trampoline, else the
             # JIT cannot see through it.
-            #
-            if we_are_translated() and not jit.we_are_jitted():
-                # if we are translated, call the trampoline
+            if not jit.we_are_jitted():
                 unique_id = get_code_fn(*args)._vmprof_unique_id
-                ll_args, token = lower(*args)
-                ll_trampoline = get_ll_trampoline(token)
-                ll_result = ll_trampoline(*ll_args + (unique_id,))
-                if result_class is not None:
-                    return cast_base_ptr_to_instance(result_class, ll_result)
-                else:
-                    return ll_result
+                x = cintf.enter_code(unique_id)
+                try:
+                    return func(*args)
+                finally:
+                    cintf.leave_code(x)
             else:
                 return func(*args)
+
         decorated_function.__name__ = func.__name__ + '_rvmprof'
         return decorated_function
 
diff --git a/rpython/rlib/rvmprof/src/rvmprof.c b/rpython/rlib/rvmprof/src/rvmprof.c
--- a/rpython/rlib/rvmprof/src/rvmprof.c
+++ b/rpython/rlib/rvmprof/src/rvmprof.c
@@ -12,10 +12,12 @@
 #else
 
 #  include "common_header.h"
+#  include "structdef.h"
+#  include "src/threadlocal.h"
 #  include "rvmprof.h"
-#  ifndef VMPROF_ADDR_OF_TRAMPOLINE
+/*#  ifndef VMPROF_ADDR_OF_TRAMPOLINE
 #   error "RPython program using rvmprof, but not calling vmprof_execute_code()"
-#  endif
+#  endif*/
 
 #endif
 
diff --git a/rpython/rlib/rvmprof/src/rvmprof.h b/rpython/rlib/rvmprof/src/rvmprof.h
--- a/rpython/rlib/rvmprof/src/rvmprof.h
+++ b/rpython/rlib/rvmprof/src/rvmprof.h
@@ -4,3 +4,7 @@
 RPY_EXTERN int vmprof_enable(void);
 RPY_EXTERN int vmprof_disable(void);
 RPY_EXTERN int vmprof_register_virtual_function(char *, long, int);
+RPY_EXTERN void* vmprof_stack_new(void);
+RPY_EXTERN int vmprof_stack_append(void*, long);
+RPY_EXTERN long vmprof_stack_pop(void*);
+RPY_EXTERN void vmprof_stack_free(void*);
diff --git a/rpython/rlib/rvmprof/src/vmprof_common.h b/rpython/rlib/rvmprof/src/vmprof_common.h
new file mode 100644
--- /dev/null
+++ b/rpython/rlib/rvmprof/src/vmprof_common.h
@@ -0,0 +1,72 @@
+#include <stddef.h>
+
+#define MAX_FUNC_NAME 1024
+
+static int profile_file = -1;
+static long prepare_interval_usec = 0;
+static long profile_interval_usec = 0;
+static int opened_profile(char *interp_name);
+
+#define MAX_STACK_DEPTH   \
+    ((SINGLE_BUF_SIZE - sizeof(struct prof_stacktrace_s)) / sizeof(void *))
+
+#define MARKER_STACKTRACE '\x01'
+#define MARKER_VIRTUAL_IP '\x02'
+#define MARKER_TRAILER '\x03'
+#define MARKER_INTERP_NAME '\x04'   /* deprecated */
+#define MARKER_HEADER '\x05'
+
+#define VERSION_BASE '\x00'
+#define VERSION_THREAD_ID '\x01'
+#define VERSION_TAG '\x02'
+
+typedef struct prof_stacktrace_s {
+    char padding[sizeof(long) - 1];
+    char marker;
+    long count, depth;
+    void *stack[];
+} prof_stacktrace_s;
+
+
+RPY_EXTERN
+char *vmprof_init(int fd, double interval, char *interp_name)
+{
+    if (interval < 1e-6 || interval >= 1.0)
+        return "bad value for 'interval'";
+    prepare_interval_usec = (int)(interval * 1000000.0);
+
+    if (prepare_concurrent_bufs() < 0)
+        return "out of memory";
+
+    assert(fd >= 0);
+    profile_file = fd;
+    if (opened_profile(interp_name) < 0) {
+        profile_file = -1;
+        return strerror(errno);
+    }
+    return NULL;
+}
+
+static int _write_all(const char *buf, size_t bufsize);
+
+static int opened_profile(char *interp_name)
+{
+    struct {
+        long hdr[5];
+        char interp_name[259];
+    } header;
+
+    size_t namelen = strnlen(interp_name, 255);
+
+    header.hdr[0] = 0;
+    header.hdr[1] = 3;
+    header.hdr[2] = 0;
+    header.hdr[3] = prepare_interval_usec;
+    header.hdr[4] = 0;
+    header.interp_name[0] = MARKER_HEADER;
+    header.interp_name[1] = '\x00';
+    header.interp_name[2] = VERSION_TAG;
+    header.interp_name[3] = namelen;
+    memcpy(&header.interp_name[4], interp_name, namelen);
+    return _write_all((char*)&header, 5 * sizeof(long) + 4 + namelen);
+}
diff --git a/rpython/rlib/rvmprof/src/vmprof_get_custom_offset.h b/rpython/rlib/rvmprof/src/vmprof_get_custom_offset.h
--- a/rpython/rlib/rvmprof/src/vmprof_get_custom_offset.h
+++ b/rpython/rlib/rvmprof/src/vmprof_get_custom_offset.h
@@ -1,119 +1,49 @@
 
-#ifdef PYPY_JIT_CODEMAP
 void *pypy_find_codemap_at_addr(long addr, long *start_addr);
 long pypy_yield_codemap_at_addr(void *codemap_raw, long addr,
                                 long *current_pos_addr);
 long pypy_jit_stack_depth_at_loc(long loc);
-#endif
 
 
-#ifdef CPYTHON_GET_CUSTOM_OFFSET
-static void *tramp_start, *tramp_end;
-#endif
-
-
-static ptrdiff_t vmprof_unw_get_custom_offset(void* ip, void *cp) {
-
-#if defined(PYPY_JIT_CODEMAP)
-
-    intptr_t ip_l = (intptr_t)ip;
-    return pypy_jit_stack_depth_at_loc(ip_l);
-
-#elif defined(CPYTHON_GET_CUSTOM_OFFSET)
-
-    if (ip >= tramp_start && ip <= tramp_end) {
-        // XXX the return value is wrong for all the places before push and
-        //     after pop, fix
-        void *bp;
-        void *sp;
-
-        /* This is a stage2 trampoline created by hotpatch:
-
-               push   %rbx
-               push   %rbp
-               mov    %rsp,%rbp
-               and    $0xfffffffffffffff0,%rsp   // make sure the stack is aligned
-               movabs $0x7ffff687bb10,%rbx
-               callq  *%rbx
-               leaveq 
-               pop    %rbx
-               retq   
-
-           the stack layout is like this:
-
-               +-----------+                      high addresses
-               | ret addr  |
-               +-----------+
-               | saved rbx |   start of the function frame
-               +-----------+
-               | saved rbp |
-               +-----------+
-               | ........  |   <-- rbp
-               +-----------+                      low addresses
-
-           So, the trampoline frame starts at rbp+16, and the return address,
-           is at rbp+24.  The vmprof API requires us to return the offset of
-           the frame relative to sp, hence we have this weird computation.
-
-           XXX (antocuni): I think we could change the API to return directly
-           the frame address instead of the offset; however, this require a
-           change in the PyPy code too
-        */
-
-        unw_get_reg (cp, UNW_REG_SP, (unw_word_t*)&sp);
-        unw_get_reg (cp, UNW_X86_64_RBP, (unw_word_t*)&bp);
-        return bp+16+8-sp;
-    }
-    return -1;
-
-#else
-
-    return -1;
-
-#endif
-}
-
-static long vmprof_write_header_for_jit_addr(void **result, long n,
-                                             void *ip, int max_depth)
+static long vmprof_write_header_for_jit_addr(intptr_t *result, long n,
+                                             intptr_t ip, int max_depth)
 {
 #ifdef PYPY_JIT_CODEMAP
     void *codemap;
     long current_pos = 0;
-    intptr_t id;
+    intptr_t ident;
     long start_addr = 0;
     intptr_t addr = (intptr_t)ip;
     int start, k;
-    void *tmp;
+    intptr_t tmp;
 
     codemap = pypy_find_codemap_at_addr(addr, &start_addr);
-    if (codemap == NULL)
-        // not a jit code at all
+    if (codemap == NULL || n >= max_depth - 2)
+        // not a jit code at all or almost max depth
         return n;
 
     // modify the last entry to point to start address and not the random one
     // in the middle
-    result[n - 1] = (void*)start_addr;
-    result[n] = (void*)2;
-    n++;
+    result[n++] = VMPROF_ASSEMBLER_TAG;
+    result[n++] = start_addr;
     start = n;
     while (n < max_depth) {
-        id = pypy_yield_codemap_at_addr(codemap, addr, &current_pos);
-        if (id == -1)
+        ident = pypy_yield_codemap_at_addr(codemap, addr, &current_pos);
+        if (ident == -1)
             // finish
             break;
-        if (id == 0)
+        if (ident == 0)
             continue; // not main codemap
-        result[n++] = (void *)id;
+        result[n++] = VMPROF_JITTED_TAG;
+        result[n++] = ident;
     }
-    k = 0;
+    k = 1;
+
     while (k < (n - start) / 2) {
         tmp = result[start + k];
-        result[start + k] = result[n - k - 1];
-        result[n - k - 1] = tmp;
-        k++;
-    }
-    if (n < max_depth) {
-        result[n++] = (void*)3;
+        result[start + k] = result[n - k];
+        result[n - k] = tmp;
+        k += 2;
     }
 #endif
     return n;
diff --git a/rpython/rlib/rvmprof/src/vmprof_getpc.h b/rpython/rlib/rvmprof/src/vmprof_getpc.h
--- a/rpython/rlib/rvmprof/src/vmprof_getpc.h
+++ b/rpython/rlib/rvmprof/src/vmprof_getpc.h
@@ -134,7 +134,7 @@
   }
 };
 
-void* GetPC(ucontext_t *signal_ucontext) {
+intptr_t GetPC(ucontext_t *signal_ucontext) {
   // See comment above struct CallUnrollInfo.  Only try instruction
   // flow matching if both eip and esp looks reasonable.
   const int eip = signal_ucontext->uc_mcontext.gregs[REG_EIP];
@@ -146,12 +146,12 @@
       if (!memcmp(eip_char + callunrollinfo[i].pc_offset,
                   callunrollinfo[i].ins, callunrollinfo[i].ins_size)) {
         // We have a match.
-        void **retaddr = (void**)(esp + callunrollinfo[i].return_sp_offset);
+        intptr_t *retaddr = (intptr_t*)(esp + callunrollinfo[i].return_sp_offset);
         return *retaddr;
       }
     }
   }
-  return (void*)eip;
+  return eip;
 }
 
 // Special case #2: Windows, which has to do something totally different.
@@ -170,7 +170,7 @@
 typedef int ucontext_t;
 #endif
 
-void* GetPC(ucontext_t *signal_ucontext) {
+intptr_t GetPC(ucontext_t *signal_ucontext) {
   RAW_LOG(ERROR, "GetPC is not yet implemented on Windows\n");
   return NULL;
 }
@@ -180,11 +180,11 @@
 // the right value for your system, and add it to the list in
 // configure.ac (or set it manually in your config.h).
 #else
-void* GetPC(ucontext_t *signal_ucontext) {
+intptr_t GetPC(ucontext_t *signal_ucontext) {
 #ifdef __APPLE__
-  return (void*)(signal_ucontext->uc_mcontext->__ss.__rip);
+  return (signal_ucontext->uc_mcontext->__ss.__rip);
 #else
-  return (void*)signal_ucontext->PC_FROM_UCONTEXT;   // defined in config.h
+  return signal_ucontext->PC_FROM_UCONTEXT;   // defined in config.h
 #endif
 }
 
diff --git a/rpython/rlib/rvmprof/src/vmprof_main.h b/rpython/rlib/rvmprof/src/vmprof_main.h
--- a/rpython/rlib/rvmprof/src/vmprof_main.h
+++ b/rpython/rlib/rvmprof/src/vmprof_main.h
@@ -25,84 +25,28 @@
 #include <sys/time.h>
 #include <errno.h>
 #include <unistd.h>
+#include <stddef.h>
 #include <stdio.h>
 #include <sys/types.h>
 #include <signal.h>
 #include <sys/stat.h>
+#include <unistd.h>
 #include <fcntl.h>
 #include "vmprof_getpc.h"
-#ifdef __APPLE__
-#include "libunwind.h"
-#else
-#include "vmprof_unwind.h"
-#endif
 #include "vmprof_mt.h"
-
+#include "vmprof_stack.h"
+#include "vmprof_common.h"
 
 /************************************************************/
 
-// functions copied from libunwind using dlopen
-
-#ifndef __APPLE__ // should be linux only probably
-static int (*unw_get_reg)(unw_cursor_t*, int, unw_word_t*) = NULL;
-static int (*unw_step)(unw_cursor_t*) = NULL;
-static int (*unw_init_local)(unw_cursor_t *, unw_context_t *) = NULL;
-static int (*unw_get_proc_info)(unw_cursor_t *, unw_proc_info_t *) = NULL;
-#endif
-
-static int profile_file = -1;
 static long prepare_interval_usec;
+static long saved_profile_file;
 static struct profbuf_s *volatile current_codes;
 static void *(*mainloop_get_virtual_ip)(char *) = 0;
 
 static int opened_profile(char *interp_name);
 static void flush_codes(void);
 
-#ifdef __APPLE__
-#define UNWIND_NAME "/usr/lib/system/libunwind.dylib"
-#define UNW_PREFIX "unw"
-#else
-#define UNWIND_NAME "libunwind.so"
-#define UNW_PREFIX "_ULx86_64"
-#endif
-
-RPY_EXTERN
-char *vmprof_init(int fd, double interval, char *interp_name)
-{
-    if (interval < 1e-6 || interval >= 1.0)
-        return "bad value for 'interval'";
-    prepare_interval_usec = (int)(interval * 1000000.0);
-
-#ifndef __APPLE__
-    if (!unw_get_reg) {
-        void *libhandle;
-
-        if (!(libhandle = dlopen(UNWIND_NAME, RTLD_LAZY | RTLD_LOCAL)))
-            goto error;
-        if (!(unw_get_reg = dlsym(libhandle, UNW_PREFIX "_get_reg")))
-            goto error;
-        if (!(unw_get_proc_info = dlsym(libhandle, UNW_PREFIX "_get_proc_info")))
-            goto error;
-        if (!(unw_init_local = dlsym(libhandle, UNW_PREFIX  "_init_local")))
-            goto error;
-        if (!(unw_step = dlsym(libhandle, UNW_PREFIX  "_step")))
-            goto error;
-    }
-#endif
-    if (prepare_concurrent_bufs() < 0)
-        return "out of memory";
-
-    assert(fd >= 0);
-    profile_file = fd;
-    if (opened_profile(interp_name) < 0) {
-        profile_file = -1;
-        return strerror(errno);
-    }
-    return NULL;
-
- error:
-    return dlerror();
-}
 
 /************************************************************/
 
@@ -131,131 +75,62 @@
  * *************************************************************
  */
 
-#define MAX_FUNC_NAME 128
-#define MAX_STACK_DEPTH   \
-    ((SINGLE_BUF_SIZE - sizeof(struct prof_stacktrace_s)) / sizeof(void *))
-
-#define MARKER_STACKTRACE '\x01'
-#define MARKER_VIRTUAL_IP '\x02'
-#define MARKER_TRAILER '\x03'
-#define MARKER_INTERP_NAME '\x04'   /* deprecated */
-#define MARKER_HEADER '\x05'
-
-#define VERSION_BASE '\x00'
-#define VERSION_THREAD_ID '\x01'
-
-struct prof_stacktrace_s {
-    char padding[sizeof(long) - 1];
-    char marker;
-    long count, depth;
-    void *stack[];
-};
-
-static long profile_interval_usec = 0;
 static char atfork_hook_installed = 0;
 
 
-/* ******************************************************
- * libunwind workaround for process JIT frames correctly
- * ******************************************************
- */
-
 #include "vmprof_get_custom_offset.h"
 
-typedef struct {
-    void* _unused1;
-    void* _unused2;
-    void* sp;
-    void* ip;
-    void* _unused3[sizeof(unw_cursor_t)/sizeof(void*) - 4];
-} vmprof_hacked_unw_cursor_t;
-
-static int vmprof_unw_step(unw_cursor_t *cp, int first_run)
-{
-    void* ip;
-    void* sp;
-    ptrdiff_t sp_offset;
-    unw_get_reg (cp, UNW_REG_IP, (unw_word_t*)&ip);
-    unw_get_reg (cp, UNW_REG_SP, (unw_word_t*)&sp);
-    if (!first_run) {
-        // make sure we're pointing to the CALL and not to the first
-        // instruction after. If the callee adjusts the stack for us
-        // it's not safe to be at the instruction after
-        ip -= 1;
-    }
-    sp_offset = vmprof_unw_get_custom_offset(ip, cp);
-
-    if (sp_offset == -1) {
-        // it means that the ip is NOT in JITted code, so we can use the
-        // stardard unw_step
-        return unw_step(cp);
-    }
-    else {
-        // this is a horrible hack to manually walk the stack frame, by
-        // setting the IP and SP in the cursor
-        vmprof_hacked_unw_cursor_t *cp2 = (vmprof_hacked_unw_cursor_t*)cp;
-        void* bp = (void*)sp + sp_offset;
-        cp2->sp = bp;
-        bp -= sizeof(void*);
-        cp2->ip = ((void**)bp)[0];
-        // the ret is on the top of the stack minus WORD
-        return 1;
-    }
-}
-
-
 /* *************************************************************
  * functions to dump the stack trace
  * *************************************************************
  */
 
-static int get_stack_trace(void** result, int max_depth, ucontext_t *ucontext)
+
+#ifndef RPYTHON_LL2CTYPES
+static vmprof_stack_t *get_vmprof_stack(void)
 {
-    void *ip;
-    int n = 0;
-    unw_cursor_t cursor;
-#ifdef __APPLE__
-    unw_context_t uc;
-    unw_getcontext(&uc);
+    return RPY_THREADLOCALREF_GET(vmprof_tl_stack);
+}
 #else
-    unw_context_t uc = *ucontext;
+static vmprof_stack_t *get_vmprof_stack(void)
+{
+    return 0;
+}
 #endif
 
-    int ret = unw_init_local(&cursor, &uc);
-    assert(ret >= 0);
-    (void)ret;
-
-    while (n < max_depth) {
-        if (unw_get_reg(&cursor, UNW_REG_IP, (unw_word_t *) &ip) < 0) {
-            break;
+static int get_stack_trace(intptr_t *result, int max_depth, intptr_t pc, ucontext_t *ucontext)
+{
+    vmprof_stack_t* stack = get_vmprof_stack();
+    int n = 0;
+    intptr_t addr = 0;
+    int bottom_jitted = 0;
+    // check if the pc is in JIT
+#ifdef PYPY_JIT_CODEMAP
+    if (pypy_find_codemap_at_addr((intptr_t)pc, &addr)) {
+        // the bottom part is jitted, means we can fill up the first part
+        // from the JIT
+        n = vmprof_write_header_for_jit_addr(result, n, pc, max_depth);
+        stack = stack->next; // skip the first item as it contains garbage
+    }
+#endif
+    while (n < max_depth - 1 && stack) {
+        if (stack->kind == VMPROF_CODE_TAG) {
+            result[n] = stack->kind;
+            result[n + 1] = stack->value;
+            n += 2;
         }
-
-        unw_proc_info_t pip;
-        unw_get_proc_info(&cursor, &pip);
-
-        /* if n==0, it means that the signal handler interrupted us while we
-           were in the trampoline, so we are not executing (yet) the real main
-           loop function; just skip it */
-        if (VMPROF_ADDR_OF_TRAMPOLINE((void*)pip.start_ip) && n > 0) {
-            // found main loop stack frame
-            void* sp;
-            unw_get_reg(&cursor, UNW_REG_SP, (unw_word_t *) &sp);
-            if (mainloop_get_virtual_ip)
-                ip = mainloop_get_virtual_ip((char *)sp);
-            else
-                ip = *(void **)sp;
+#ifdef PYPY_JIT_CODEMAP
+        else if (stack->kind == VMPROF_JITTED_TAG) {
+            pc = ((intptr_t*)(stack->value - sizeof(intptr_t)))[0];
+            n = vmprof_write_header_for_jit_addr(result, n, pc, max_depth);
         }
-
-        int first_run = (n == 0);
-        result[n++] = ip;
-        n = vmprof_write_header_for_jit_addr(result, n, ip, max_depth);
-        if (vmprof_unw_step(&cursor, first_run) <= 0)
-            break;
+#endif
+        stack = stack->next;
     }
     return n;
 }
 
-static void *get_current_thread_id(void)
+static intptr_t get_current_thread_id(void)
 {
     /* xxx This function is a hack on two fronts:
 
@@ -269,7 +144,7 @@
        An alternative would be to try to look if the information is
        available in the ucontext_t in the caller.
     */
-    return (void *)pthread_self();
+    return (intptr_t)pthread_self();
 }
 
 
@@ -278,8 +153,43 @@
  * *************************************************************
  */
 
+#include <setjmp.h>
+
+volatile int spinlock;
+jmp_buf restore_point;
+
+static void segfault_handler(int arg)  /* SIGSEGV handler that sigprof_handler installs temporarily under __APPLE__ */
+{
+    longjmp(restore_point, SIGSEGV);  /* jump back to setjmp(restore_point) in sigprof_handler, which then sees SIGSEGV */
+}
+
 static void sigprof_handler(int sig_nr, siginfo_t* info, void *ucontext)
 {
+#ifdef __APPLE__
+    // TERRIBLE HACK AHEAD
+    // on OS X, the thread local storage is sometimes uninitialized
+    // when the signal handler runs - it means it's impossible to read errno
+    // or call any syscall or read PyThread_Current or pthread_self. Additionally,
+    // it seems impossible to read the register gs.
+    // here we register segfault handler (all guarded by a spinlock) and call
+    // longjmp in case segfault happens while reading a thread local
+    while (__sync_lock_test_and_set(&spinlock, 1)) {
+    }
+    signal(SIGSEGV, &segfault_handler);
+    int fault_code = setjmp(restore_point);
+    if (fault_code == 0) {
+        pthread_self();
+        get_current_thread_id();
+    } else {
+        signal(SIGSEGV, SIG_DFL);
+        __sync_synchronize();
+        spinlock = 0;
+        return;    
+    }
+    signal(SIGSEGV, SIG_DFL);
+    __sync_synchronize();
+    spinlock = 0;
+#endif
     long val = __sync_fetch_and_add(&signal_handler_value, 2L);
 
     if ((val & 1) == 0) {
@@ -296,9 +206,8 @@
             struct prof_stacktrace_s *st = (struct prof_stacktrace_s *)p->data;
             st->marker = MARKER_STACKTRACE;
             st->count = 1;
-            st->stack[0] = GetPC((ucontext_t*)ucontext);
-            depth = get_stack_trace(st->stack+1, MAX_STACK_DEPTH-2, ucontext);
-            depth++;  // To account for pc value in stack[0];
+            depth = get_stack_trace(st->stack,
+                MAX_STACK_DEPTH-2, GetPC((ucontext_t*)ucontext), ucontext);
             st->depth = depth;
             st->stack[depth++] = get_current_thread_id();
             p->data_offset = offsetof(struct prof_stacktrace_s, marker);
@@ -363,12 +272,15 @@
 
 static void atfork_disable_timer(void) {
     if (profile_interval_usec > 0) {
+        saved_profile_file = profile_file;
+        profile_file = -1;
         remove_sigprof_timer();
     }
 }
 
 static void atfork_enable_timer(void) {
     if (profile_interval_usec > 0) {
+        profile_file = saved_profile_file;
         install_sigprof_timer();
     }
 }
@@ -415,7 +327,7 @@
     return -1;
 }
 
-static int _write_all(const void *buf, size_t bufsize)
+static int _write_all(const char *buf, size_t bufsize)
 {
     while (bufsize > 0) {
         ssize_t count = write(profile_file, buf, bufsize);
@@ -427,71 +339,13 @@
     return 0;
 }
 
-static int opened_profile(char *interp_name)
-{
-    struct {
-        long hdr[5];
-        char interp_name[259];
-    } header;
-
-    size_t namelen = strnlen(interp_name, 255);
-    current_codes = NULL;
-
-    header.hdr[0] = 0;
-    header.hdr[1] = 3;
-    header.hdr[2] = 0;
-    header.hdr[3] = prepare_interval_usec;
-    header.hdr[4] = 0;
-    header.interp_name[0] = MARKER_HEADER;
-    header.interp_name[1] = '\x00';
-    header.interp_name[2] = VERSION_THREAD_ID;
-    header.interp_name[3] = namelen;
-    memcpy(&header.interp_name[4], interp_name, namelen);
-    return _write_all(&header, 5 * sizeof(long) + 4 + namelen);
-}
-
 static int close_profile(void)
 {
-    char buf[4096];
-    ssize_t size;
     unsigned char marker = MARKER_TRAILER;
 
     if (_write_all(&marker, 1) < 0)
         return -1;
 
-#ifdef __linux__
-    // copy /proc/self/maps to the end of the profile file
-    int srcfd = open("/proc/self/maps", O_RDONLY);
-    if (srcfd < 0)
-        return -1;
-
-    while ((size = read(srcfd, buf, sizeof buf)) > 0) {
-        if (_write_all(buf, size) < 0) {
-            close(srcfd);
-            return -1;
-        }
-    }
-    close(srcfd);
-#else
-    // freebsd and mac
-#if defined(__APPLE__)
-       sprintf(buf, "vmmap %d", getpid());
-#else
-    sprintf(buf, "procstat -v %d", getpid());
-#endif
-    FILE *srcf = popen(buf, "r");
-    if (!srcf)
-        return -1;
-
-    while ((size = fread(buf, 1, sizeof buf, srcf))) {
-        if (_write_all(buf, size) < 0) {
-            pclose(srcf);
-            return -1;
-        }
-    }
-    pclose(srcf);
-#endif
-
     /* don't close() the file descriptor from here */
     profile_file = -1;
     return 0;
@@ -522,6 +376,9 @@
     struct profbuf_s *p;
     char *t;
 
+    if (profile_file == -1)
+        return 0; // silently don't write it
+
  retry:
     p = current_codes;
     if (p != NULL) {
@@ -529,7 +386,7 @@
             /* grabbed 'current_codes': we will append the current block
                to it if it contains enough room */
             size_t freesize = SINGLE_BUF_SIZE - p->data_size;
-            if (freesize < blocklen) {
+            if (freesize < (size_t)blocklen) {
                 /* full: flush it */
                 commit_buffer(profile_file, p);
                 p = NULL;
diff --git a/rpython/rlib/rvmprof/src/vmprof_stack.h b/rpython/rlib/rvmprof/src/vmprof_stack.h
new file mode 100644
--- /dev/null
+++ b/rpython/rlib/rvmprof/src/vmprof_stack.h
@@ -0,0 +1,25 @@
+#ifndef _VMPROF_STACK_H_
+#define _VMPROF_STACK_H_
+
+#include <unistd.h>
+
+#define VMPROF_CODE_TAG 1        /* <- also in cintf.py */
+#define VMPROF_BLACKHOLE_TAG 2
+#define VMPROF_JITTED_TAG 3
+#define VMPROF_JITTING_TAG 4
+#define VMPROF_GC_TAG 5
+#define VMPROF_ASSEMBLER_TAG 6
+// whatever we want here
+
+typedef struct vmprof_stack_s {  /* one node of the linked list that get_stack_trace() walks */
+    struct vmprof_stack_s* next;  /* following node; the walk stops at a NULL pointer */
+    intptr_t value;  /* CODE_TAG: copied into the trace as-is; JITTED_TAG: address, the pc is read from the word just below it */
+    intptr_t kind;  /* one of the VMPROF_*_TAG constants */
+} vmprof_stack_t;
+
+// the kind is a WORD so we consume exactly 3 WORDs and we don't have
+// to worry too much. There is a potential for squeezing it with bit
+// patterns into one WORD, but I don't want to care RIGHT NOW; this is
+// a possible future optimization
+
+#endif
diff --git a/rpython/rlib/rvmprof/test/test_ztranslation.py b/rpython/rlib/rvmprof/test/test_ztranslation.py
--- a/rpython/rlib/rvmprof/test/test_ztranslation.py
+++ b/rpython/rlib/rvmprof/test/test_ztranslation.py
@@ -64,8 +64,14 @@
 def test_interpreted():
     # takes forever if the Python process is already big...
     import subprocess
-    subprocess.check_call([sys.executable, os.path.basename(__file__)],
-                          cwd=(os.path.dirname(__file__) or '.'))
+    me = os.path.basename(__file__)
+    if me.endswith('pyc') or me.endswith('pyo'):
+        me = me[:-1]
+    env = os.environ.copy()
+    env['PYTHONPATH'] = ''
+    subprocess.check_call([sys.executable, me],
+                          cwd=(os.path.dirname(__file__) or '.'),
+                          env=env)
 
 def test_compiled():
     fn = compile(main, [], gcpolicy="minimark")
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit

[pypy-commit] pypy default: merge vmprof-newstack

Reply via email to