Author: Carl Friedrich Bolz <cfb...@gmx.de> Branch: value-profiling Changeset: r79065:bbcbd47d4cec Date: 2015-08-19 18:25 +0200 http://bitbucket.org/pypy/pypy/changeset/bbcbd47d4cec/
Log: merge default diff too long, truncating to 2000 out of 2204 lines diff --git a/_pytest/assertion/rewrite.py b/_pytest/assertion/rewrite.py --- a/_pytest/assertion/rewrite.py +++ b/_pytest/assertion/rewrite.py @@ -308,7 +308,10 @@ if (len(data) != 8 or data[:4] != imp.get_magic() or struct.unpack("<l", data[4:])[0] != mtime): return None - co = marshal.load(fp) + try: + co = marshal.load(fp) + except ValueError: + return None # e.g. bad marshal data because of pypy/cpython mix if not isinstance(co, types.CodeType): # That's interesting.... return None diff --git a/pypy/doc/embedding.rst b/pypy/doc/embedding.rst --- a/pypy/doc/embedding.rst +++ b/pypy/doc/embedding.rst @@ -46,7 +46,11 @@ source. It'll acquire the GIL. Note: this is meant to be called *only once* or a few times at most. See - the `more complete example`_ below. + the `more complete example`_ below. In PyPy <= 2.6.0, the globals + dictionary is *reused* across multiple calls, giving potentially + strange results (e.g. objects dying too early). In PyPy >= 2.6.1, + you get a new globals dictionary for every call (but then, all globals + dictionaries are all kept alive forever, in ``sys._pypy_execute_source``). .. function:: int pypy_execute_source_ptr(char* source, void* ptr); diff --git a/pypy/goal/targetpypystandalone.py b/pypy/goal/targetpypystandalone.py --- a/pypy/goal/targetpypystandalone.py +++ b/pypy/goal/targetpypystandalone.py @@ -128,13 +128,7 @@ @entrypoint('main', [rffi.CCHARP], c_name='pypy_execute_source') def pypy_execute_source(ll_source): - after = rffi.aroundstate.after - if after: after() - source = rffi.charp2str(ll_source) - res = _pypy_execute_source(source) - before = rffi.aroundstate.before - if before: before() - return rffi.cast(rffi.INT, res) + return pypy_execute_source_ptr(ll_source, 0) @entrypoint('main', [rffi.CCHARP, lltype.Signed], c_name='pypy_execute_source_ptr') @@ -142,9 +136,7 @@ after = rffi.aroundstate.after if after: after() source = rffi.charp2str(ll_source) - space.setitem(w_globals, space.wrap('c_argument'), - space.wrap(ll_ptr)) - res = _pypy_execute_source(source) + res = _pypy_execute_source(source, ll_ptr) before = rffi.aroundstate.before if before: before() return rffi.cast(rffi.INT, res) @@ -169,15 +161,21 @@ before = rffi.aroundstate.before if before: before() - w_globals = space.newdict() - space.setitem(w_globals, space.wrap('__builtins__'), - space.builtin_modules['__builtin__']) - - def _pypy_execute_source(source): + def _pypy_execute_source(source, c_argument): try: - compiler = space.createcompiler() - stmt = compiler.compile(source, 'c callback', 'exec', 0) - stmt.exec_code(space, w_globals, w_globals) + w_globals = space.newdict(module=True) + space.setitem(w_globals, space.wrap('__builtins__'), + space.builtin_modules['__builtin__']) + space.setitem(w_globals, space.wrap('c_argument'), + space.wrap(c_argument)) + space.appexec([space.wrap(source), w_globals], """(src, glob): + import sys + stmt = compile(src, 'c callback', 'exec') + if not hasattr(sys, '_pypy_execute_source'): + sys._pypy_execute_source = [] + sys._pypy_execute_source.append(glob) + exec stmt in glob + """) except OperationError, e: debug("OperationError:") debug(" operror-type: " + e.w_type.getname(space)) diff --git a/pypy/module/_vmprof/test/test__vmprof.py b/pypy/module/_vmprof/test/test__vmprof.py --- a/pypy/module/_vmprof/test/test__vmprof.py +++ b/pypy/module/_vmprof/test/test__vmprof.py @@ -21,11 +21,12 @@ i = 0 count = 0 i += 5 * WORD # header - assert s[i] == '\x04' - i += 1 # marker - 
assert s[i] == '\x04' - i += 1 # length - i += len('pypy') + assert s[i ] == '\x05' # MARKER_HEADER + assert s[i + 1] == '\x00' # 0 + assert s[i + 2] == '\x01' # VERSION_THREAD_ID + assert s[i + 3] == chr(4) # len('pypy') + assert s[i + 4: i + 8] == 'pypy' + i += 8 while i < len(s): if s[i] == '\x03': break diff --git a/pypy/module/_vmprof/test/test_direct.py b/pypy/module/_vmprof/test/test_direct.py --- a/pypy/module/_vmprof/test/test_direct.py +++ b/pypy/module/_vmprof/test/test_direct.py @@ -42,7 +42,7 @@ } -""" + open(str(srcdir.join("rvmprof_get_custom_offset.h"))).read()) +""" + open(str(srcdir.join("vmprof_get_custom_offset.h"))).read()) class TestDirect(object): def test_infrastructure(self): diff --git a/pypy/module/struct/formatiterator.py b/pypy/module/struct/formatiterator.py --- a/pypy/module/struct/formatiterator.py +++ b/pypy/module/struct/formatiterator.py @@ -82,7 +82,13 @@ w_index = space.int(w_obj) # wrapped float -> wrapped int or long if w_index is None: raise StructError("cannot convert argument to integer") - return getattr(space, meth)(w_index) + method = getattr(space, meth) + try: + return method(w_index) + except OperationError as e: + if e.match(self.space, self.space.w_OverflowError): + raise StructError("argument out of range") + raise def accept_bool_arg(self): w_obj = self.accept_obj_arg() diff --git a/pypy/module/struct/test/test_struct.py b/pypy/module/struct/test/test_struct.py --- a/pypy/module/struct/test/test_struct.py +++ b/pypy/module/struct/test/test_struct.py @@ -428,6 +428,9 @@ assert s.unpack(s.pack(42)) == (42,) assert s.unpack_from(memoryview(s.pack(42))) == (42,) + def test_overflow(self): + raises(self.struct.error, self.struct.pack, 'i', 1<<65) + class AppTestStructBuffer(object): spaceconfig = dict(usemodules=['struct', '__pypy__']) diff --git a/rpython/flowspace/objspace.py b/rpython/flowspace/objspace.py --- a/rpython/flowspace/objspace.py +++ b/rpython/flowspace/objspace.py @@ -13,6 +13,11 @@ def _assert_rpythonic(func): """Raise ValueError if ``func`` is obviously not RPython""" + try: + func.func_code.co_cellvars + except AttributeError: + raise ValueError("%r is not RPython: it is likely an unexpected " + "built-in function or type" % (func,)) if func.func_doc and func.func_doc.lstrip().startswith('NOT_RPYTHON'): raise ValueError("%r is tagged as NOT_RPYTHON" % (func,)) if func.func_code.co_cellvars: diff --git a/rpython/flowspace/test/test_objspace.py b/rpython/flowspace/test/test_objspace.py --- a/rpython/flowspace/test/test_objspace.py +++ b/rpython/flowspace/test/test_objspace.py @@ -1363,6 +1363,15 @@ simplify_graph(graph) assert self.all_operations(graph) == {'bool': 1, 'inplace_add': 1} + def test_unexpected_builtin_function(self): + import itertools + e = py.test.raises(ValueError, build_flow, itertools.permutations) + assert ' is not RPython:' in str(e.value) + e = py.test.raises(ValueError, build_flow, itertools.tee) + assert ' is not RPython:' in str(e.value) + e = py.test.raises(ValueError, build_flow, Exception.__init__) + assert ' is not RPython:' in str(e.value) + DATA = {'x': 5, 'y': 6} diff --git a/rpython/jit/backend/detect_cpu.py b/rpython/jit/backend/detect_cpu.py --- a/rpython/jit/backend/detect_cpu.py +++ b/rpython/jit/backend/detect_cpu.py @@ -63,6 +63,7 @@ 'AMD64': MODEL_X86, # win64 'armv7l': MODEL_ARM, 'armv6l': MODEL_ARM, + 'arm': MODEL_ARM, # freebsd }.get(mach) if result is None: diff --git a/rpython/jit/backend/llsupport/src/codemap.c b/rpython/jit/backend/llsupport/src/codemap.c --- 
a/rpython/jit/backend/llsupport/src/codemap.c +++ b/rpython/jit/backend/llsupport/src/codemap.c @@ -6,9 +6,9 @@ #endif #ifdef RPYTHON_VMPROF -RPY_EXTERN void rpython_vmprof_ignore_signals(int ignored); +RPY_EXTERN void vmprof_ignore_signals(int ignored); static void pypy_codemap_invalid_set(int ignored) { - rpython_vmprof_ignore_signals(ignored); + vmprof_ignore_signals(ignored); } #else static void pypy_codemap_invalid_set(int ignored) { diff --git a/rpython/jit/backend/tool/viewcode.py b/rpython/jit/backend/tool/viewcode.py --- a/rpython/jit/backend/tool/viewcode.py +++ b/rpython/jit/backend/tool/viewcode.py @@ -17,18 +17,6 @@ import subprocess from bisect import bisect_left -# don't use rpython.tool.udir here to avoid removing old usessions which -# might still contain interesting executables -udir = py.path.local.make_numbered_dir(prefix='viewcode-', keep=2) -tmpfile = str(udir.join('dump.tmp')) - -# hack hack -import rpython.tool -mod = new.module('rpython.tool.udir') -mod.udir = udir -sys.modules['rpython.tool.udir'] = mod -rpython.tool.udir = mod - # ____________________________________________________________ # Some support code from Psyco. There is more over there, # I am porting it in a lazy fashion... See py-utils/xam.py @@ -438,6 +426,18 @@ # ____________________________________________________________ if __name__ == '__main__': + # don't use rpython.tool.udir here to avoid removing old usessions which + # might still contain interesting executables + udir = py.path.local.make_numbered_dir(prefix='viewcode-', keep=2) + tmpfile = str(udir.join('dump.tmp')) + + # hack hack + import rpython.tool + mod = new.module('rpython.tool.udir') + mod.udir = udir + sys.modules['rpython.tool.udir'] = mod + rpython.tool.udir = mod + if '--text' in sys.argv: sys.argv.remove('--text') showgraph = False @@ -463,3 +463,7 @@ world.show(showtext=True) else: world.showtextonly() +else: + from rpython.tool.udir import udir + tmpfile = str(udir.join('dump.tmp')) + diff --git a/rpython/rlib/jit.py b/rpython/rlib/jit.py --- a/rpython/rlib/jit.py +++ b/rpython/rlib/jit.py @@ -1087,6 +1087,16 @@ """ assert value is not None and type(value) is cls +def ll_record_exact_class(ll_value, ll_cls): + from rpython.rlib.debug import ll_assert + from rpython.rtyper.lltypesystem.lloperation import llop + from rpython.rtyper.lltypesystem import lltype + from rpython.rtyper.rclass import ll_type + ll_assert(ll_value == lltype.nullptr(lltype.typeOf(ll_value).TO), "record_exact_class called with None argument") + ll_assert(ll_type(ll_value) is ll_cls, "record_exact_class called with invalid arguments") + llop.jit_record_exact_class(lltype.Void, ll_value, ll_cls) + + class Entry(ExtRegistryEntry): _about_ = record_exact_class @@ -1099,12 +1109,10 @@ from rpython.rtyper import rclass classrepr = rclass.get_type_repr(hop.rtyper) - - hop.exception_cannot_occur() v_inst = hop.inputarg(hop.args_r[0], arg=0) v_cls = hop.inputarg(classrepr, arg=1) - return hop.genop('jit_record_exact_class', [v_inst, v_cls], - resulttype=lltype.Void) + hop.exception_is_here() + return hop.gendirectcall(ll_record_exact_class, v_inst, v_cls) def _jit_conditional_call(condition, function, *args): pass diff --git a/rpython/rlib/rvmprof/cintf.py b/rpython/rlib/rvmprof/cintf.py --- a/rpython/rlib/rvmprof/cintf.py +++ b/rpython/rlib/rvmprof/cintf.py @@ -40,24 +40,20 @@ **eci_kwds)) - vmprof_init = rffi.llexternal("rpython_vmprof_init", [rffi.INT], rffi.CCHARP, - compilation_info=eci) - vmprof_enable = rffi.llexternal("rpython_vmprof_enable", 
[rffi.LONG], rffi.INT, + vmprof_init = rffi.llexternal("vmprof_init", + [rffi.INT, rffi.DOUBLE, rffi.CCHARP], + rffi.CCHARP, compilation_info=eci) + vmprof_enable = rffi.llexternal("vmprof_enable", [], rffi.INT, compilation_info=eci, save_err=rffi.RFFI_SAVE_ERRNO) - vmprof_disable = rffi.llexternal("rpython_vmprof_disable", [], rffi.INT, + vmprof_disable = rffi.llexternal("vmprof_disable", [], rffi.INT, compilation_info=eci, save_err=rffi.RFFI_SAVE_ERRNO) - vmprof_write_buf = rffi.llexternal("rpython_vmprof_write_buf", - [rffi.CCHARP, rffi.LONG], - lltype.Void, compilation_info=eci) - - ## vmprof_register_virtual_function = rffi.llexternal( - ## "vmprof_register_virtual_function", - ## [rffi.CCHARP, rffi.VOIDP, rffi.VOIDP], lltype.Void, - ## compilation_info=eci, _nowrapper=True) - - vmprof_ignore_signals = rffi.llexternal("rpython_vmprof_ignore_signals", + vmprof_register_virtual_function = rffi.llexternal( + "vmprof_register_virtual_function", + [rffi.CCHARP, rffi.LONG, rffi.INT], + rffi.INT, compilation_info=eci) + vmprof_ignore_signals = rffi.llexternal("vmprof_ignore_signals", [rffi.INT], lltype.Void, compilation_info=eci) return CInterface(locals()) diff --git a/rpython/rlib/rvmprof/rvmprof.py b/rpython/rlib/rvmprof/rvmprof.py --- a/rpython/rlib/rvmprof/rvmprof.py +++ b/rpython/rlib/rvmprof/rvmprof.py @@ -1,14 +1,12 @@ import sys, os from rpython.rlib.objectmodel import specialize, we_are_translated -from rpython.rlib.rstring import StringBuilder from rpython.rlib import jit, rgc, rposix from rpython.rlib.rvmprof import cintf from rpython.rtyper.annlowlevel import cast_instance_to_gcref from rpython.rtyper.annlowlevel import cast_base_ptr_to_instance from rpython.rtyper.lltypesystem import rffi -MAX_CODES = 8000 - 255 -MAX_FUNC_NAME = 255 +MAX_FUNC_NAME = 1023 # ____________________________________________________________ @@ -34,8 +32,6 @@ def _cleanup_(self): self.is_enabled = False - self.fileno = -1 - self._current_codes = None @specialize.argtype(1) def register_code(self, code, full_name_func): @@ -102,18 +98,13 @@ assert fileno >= 0 if self.is_enabled: raise VMProfError("vmprof is already enabled") - if not (1e-6 <= interval < 1.0): - raise VMProfError("bad value for 'interval'") - interval_usec = int(interval * 1000000.0) - p_error = self.cintf.vmprof_init(fileno) + p_error = self.cintf.vmprof_init(fileno, interval, "pypy") if p_error: raise VMProfError(rffi.charp2str(p_error)) - self.fileno = fileno - self._write_header(interval_usec) self._gather_all_code_objs() - res = self.cintf.vmprof_enable(interval_usec) + res = self.cintf.vmprof_enable() if res < 0: raise VMProfError(os.strerror(rposix.get_saved_errno())) self.is_enabled = True @@ -125,9 +116,6 @@ if not self.is_enabled: raise VMProfError("vmprof is not enabled") self.is_enabled = False - if self._current_codes is not None: - self._flush_codes() - self.fileno = -1 res = self.cintf.vmprof_disable() if res < 0: raise VMProfError(os.strerror(rposix.get_saved_errno())) @@ -136,48 +124,8 @@ assert name.count(':') == 3 and len(name) <= MAX_FUNC_NAME, ( "the name must be 'class:func_name:func_line:filename' " "and at most %d characters; got '%s'" % (MAX_FUNC_NAME, name)) - b = self._current_codes - if b is None: - b = self._current_codes = StringBuilder() - b.append('\x02') - _write_long_to_string_builder(uid, b) - _write_long_to_string_builder(len(name), b) - b.append(name) - if b.getlength() >= MAX_CODES: - self._flush_codes() - - def _flush_codes(self): - buf = self._current_codes.build() - self._current_codes = None - 
self.cintf.vmprof_write_buf(buf, len(buf)) - # NOTE: keep in mind that vmprof_write_buf() can only write - # a maximum of 8184 bytes. This should be guaranteed here because: - assert MAX_CODES + 17 + MAX_FUNC_NAME <= 8184 - - def _write_header(self, interval_usec): - b = StringBuilder() - _write_long_to_string_builder(0, b) - _write_long_to_string_builder(3, b) - _write_long_to_string_builder(0, b) - _write_long_to_string_builder(interval_usec, b) - _write_long_to_string_builder(0, b) - b.append('\x04') # interp name - b.append(chr(len('pypy'))) - b.append('pypy') - buf = b.build() - self.cintf.vmprof_write_buf(buf, len(buf)) - - -def _write_long_to_string_builder(l, b): - b.append(chr(l & 0xff)) - b.append(chr((l >> 8) & 0xff)) - b.append(chr((l >> 16) & 0xff)) - b.append(chr((l >> 24) & 0xff)) - if sys.maxint > 2147483647: - b.append(chr((l >> 32) & 0xff)) - b.append(chr((l >> 40) & 0xff)) - b.append(chr((l >> 48) & 0xff)) - b.append(chr((l >> 56) & 0xff)) + if self.cintf.vmprof_register_virtual_function(name, uid, 500000) < 0: + raise VMProfError("vmprof buffers full! disk full or too slow") def vmprof_execute_code(name, get_code_fn, result_class=None): diff --git a/rpython/rlib/rvmprof/src/rvmprof.c b/rpython/rlib/rvmprof/src/rvmprof.c --- a/rpython/rlib/rvmprof/src/rvmprof.c +++ b/rpython/rlib/rvmprof/src/rvmprof.c @@ -1,22 +1,3 @@ -/* VMPROF - * - * statistical sampling profiler specifically designed to profile programs - * which run on a Virtual Machine and/or bytecode interpreter, such as Python, - * etc. - * - * The logic to dump the C stack traces is partly stolen from the code in - * gperftools. - * The file "getpc.h" has been entirely copied from gperftools. - * - * Tested only on gcc, linux, x86_64. - * - * Copyright (C) 2014-2015 - * Antonio Cuni - anto.c...@gmail.com - * Maciej Fijalkowski - fij...@gmail.com - * Armin Rigo - ar...@tunes.org - * - */ - #define _GNU_SOURCE 1 @@ -39,431 +20,4 @@ #endif -#include <dlfcn.h> -#include <assert.h> -#include <pthread.h> -#include <sys/time.h> -#include <errno.h> -#include <unistd.h> -#include <stdio.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <fcntl.h> -#include "rvmprof_getpc.h" -#include "rvmprof_unwind.h" -#include "rvmprof_mt.h" - - -/************************************************************/ - -// functions copied from libunwind using dlopen - -static int (*unw_get_reg)(unw_cursor_t*, int, unw_word_t*) = NULL; -static int (*unw_step)(unw_cursor_t*) = NULL; -static int (*unw_init_local)(unw_cursor_t *, unw_context_t *) = NULL; -static int (*unw_get_proc_info)(unw_cursor_t *, unw_proc_info_t *) = NULL; - -static int profile_file = -1; - - -RPY_EXTERN -char *rpython_vmprof_init(int fd) -{ - if (!unw_get_reg) { - void *libhandle; - - if (!(libhandle = dlopen("libunwind.so", RTLD_LAZY | RTLD_LOCAL))) - goto error; - if (!(unw_get_reg = dlsym(libhandle, "_ULx86_64_get_reg"))) - goto error; - if (!(unw_get_proc_info = dlsym(libhandle, "_ULx86_64_get_proc_info"))) - goto error; - if (!(unw_init_local = dlsym(libhandle, "_ULx86_64_init_local"))) - goto error; - if (!(unw_step = dlsym(libhandle, "_ULx86_64_step"))) - goto error; - } - if (prepare_concurrent_bufs() < 0) - return "out of memory"; - - assert(fd >= 0); - profile_file = fd; - return NULL; - - error: - return dlerror(); -} - -/************************************************************/ - -/* value: last bit is 1 if signals must be ignored; all other bits - are a counter for how many threads are currently in a signal handler */ -static long volatile 
signal_handler_value = 1; - -RPY_EXTERN -void rpython_vmprof_ignore_signals(int ignored) -{ - if (!ignored) { - __sync_fetch_and_and(&signal_handler_value, ~1L); - } - else { - /* set the last bit, and wait until concurrently-running signal - handlers finish */ - while (__sync_or_and_fetch(&signal_handler_value, 1L) != 1L) { - usleep(1); - } - } -} - - -/* ************************************************************* - * functions to write a profile file compatible with gperftools - * ************************************************************* - */ - -#define MAX_FUNC_NAME 128 -#define MAX_STACK_DEPTH \ - ((SINGLE_BUF_SIZE - sizeof(struct prof_stacktrace_s)) / sizeof(void *)) - -#define MARKER_STACKTRACE '\x01' -#define MARKER_VIRTUAL_IP '\x02' -#define MARKER_TRAILER '\x03' - -struct prof_stacktrace_s { - char padding[sizeof(long) - 1]; - char marker; - long count, depth; - void *stack[]; -}; - -static long profile_interval_usec = 0; -static char atfork_hook_installed = 0; - - -/* ****************************************************** - * libunwind workaround for process JIT frames correctly - * ****************************************************** - */ - -#include "rvmprof_get_custom_offset.h" - -typedef struct { - void* _unused1; - void* _unused2; - void* sp; - void* ip; - void* _unused3[sizeof(unw_cursor_t)/sizeof(void*) - 4]; -} vmprof_hacked_unw_cursor_t; - -static int vmprof_unw_step(unw_cursor_t *cp, int first_run) -{ - void* ip; - void* sp; - ptrdiff_t sp_offset; - unw_get_reg (cp, UNW_REG_IP, (unw_word_t*)&ip); - unw_get_reg (cp, UNW_REG_SP, (unw_word_t*)&sp); - if (!first_run) { - // make sure we're pointing to the CALL and not to the first - // instruction after. If the callee adjusts the stack for us - // it's not safe to be at the instruction after - ip -= 1; - } - sp_offset = vmprof_unw_get_custom_offset(ip, cp); - - if (sp_offset == -1) { - // it means that the ip is NOT in JITted code, so we can use the - // stardard unw_step - return unw_step(cp); - } - else { - // this is a horrible hack to manually walk the stack frame, by - // setting the IP and SP in the cursor - vmprof_hacked_unw_cursor_t *cp2 = (vmprof_hacked_unw_cursor_t*)cp; - void* bp = (void*)sp + sp_offset; - cp2->sp = bp; - bp -= sizeof(void*); - cp2->ip = ((void**)bp)[0]; - // the ret is on the top of the stack minus WORD - return 1; - } -} - - -/* ************************************************************* - * functions to dump the stack trace - * ************************************************************* - */ - -static int get_stack_trace(void** result, int max_depth, ucontext_t *ucontext) -{ - void *ip; - int n = 0; - unw_cursor_t cursor; - unw_context_t uc = *ucontext; - - int ret = unw_init_local(&cursor, &uc); - assert(ret >= 0); - (void)ret; - - while (n < max_depth) { - if (unw_get_reg(&cursor, UNW_REG_IP, (unw_word_t *) &ip) < 0) { - break; - } - - unw_proc_info_t pip; - unw_get_proc_info(&cursor, &pip); - - /* if n==0, it means that the signal handler interrupted us while we - were in the trampoline, so we are not executing (yet) the real main - loop function; just skip it */ - if (VMPROF_ADDR_OF_TRAMPOLINE((void*)pip.start_ip) && n > 0) { - // found main loop stack frame - void* sp; - unw_get_reg(&cursor, UNW_REG_SP, (unw_word_t *) &sp); - void *arg_addr = (char*)sp /* + mainloop_sp_offset */; - void **arg_ptr = (void**)arg_addr; - /* if (mainloop_get_virtual_ip) { - ip = mainloop_get_virtual_ip(*arg_ptr); - } else { */ - ip = *arg_ptr; - } - - int first_run = (n == 0); - result[n++] = ip; 
- n = vmprof_write_header_for_jit_addr(result, n, ip, max_depth); - if (vmprof_unw_step(&cursor, first_run) <= 0) - break; - } - return n; -} - - -/* ************************************************************* - * the signal handler - * ************************************************************* - */ - -static void sigprof_handler(int sig_nr, siginfo_t* info, void *ucontext) -{ - long val = __sync_fetch_and_add(&signal_handler_value, 2L); - - if ((val & 1) == 0) { - int saved_errno = errno; - int fd = profile_file; - assert(fd >= 0); - - struct profbuf_s *p = reserve_buffer(fd); - if (p == NULL) { - /* ignore this signal: there are no free buffers right now */ - } - else { - int depth; - struct prof_stacktrace_s *st = (struct prof_stacktrace_s *)p->data; - st->marker = MARKER_STACKTRACE; - st->count = 1; - st->stack[0] = GetPC((ucontext_t*)ucontext); - depth = get_stack_trace(st->stack+1, MAX_STACK_DEPTH-1, ucontext); - depth++; // To account for pc value in stack[0]; - st->depth = depth; - p->data_offset = offsetof(struct prof_stacktrace_s, marker); - p->data_size = (depth * sizeof(void *) + - sizeof(struct prof_stacktrace_s) - - offsetof(struct prof_stacktrace_s, marker)); - commit_buffer(fd, p); - } - - errno = saved_errno; - } - - __sync_sub_and_fetch(&signal_handler_value, 2L); -} - - -/* ************************************************************* - * the setup and teardown functions - * ************************************************************* - */ - -static int install_sigprof_handler(void) -{ - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_sigaction = sigprof_handler; - sa.sa_flags = SA_RESTART | SA_SIGINFO; - if (sigemptyset(&sa.sa_mask) == -1 || - sigaction(SIGPROF, &sa, NULL) == -1) - return -1; - return 0; -} - -static int remove_sigprof_handler(void) -{ - if (signal(SIGPROF, SIG_DFL) == SIG_ERR) - return -1; - return 0; -} - -static int install_sigprof_timer(void) -{ - static struct itimerval timer; - timer.it_interval.tv_sec = 0; - timer.it_interval.tv_usec = profile_interval_usec; - timer.it_value = timer.it_interval; - if (setitimer(ITIMER_PROF, &timer, NULL) != 0) - return -1; - return 0; -} - -static int remove_sigprof_timer(void) { - static struct itimerval timer; - timer.it_interval.tv_sec = 0; - timer.it_interval.tv_usec = 0; - timer.it_value.tv_sec = 0; - timer.it_value.tv_usec = 0; - if (setitimer(ITIMER_PROF, &timer, NULL) != 0) - return -1; - return 0; -} - -static void atfork_disable_timer(void) { - if (profile_interval_usec > 0) { - remove_sigprof_timer(); - } -} - -static void atfork_enable_timer(void) { - if (profile_interval_usec > 0) { - install_sigprof_timer(); - } -} - -static int install_pthread_atfork_hooks(void) { - /* this is needed to prevent the problems described there: - - http://code.google.com/p/gperftools/issues/detail?id=278 - - http://lists.debian.org/debian-glibc/2010/03/msg00161.html - - TL;DR: if the RSS of the process is large enough, the clone() syscall - will be interrupted by the SIGPROF before it can complete, then - retried, interrupted again and so on, in an endless loop. The - solution is to disable the timer around the fork, and re-enable it - only inside the parent. 
- */ - if (atfork_hook_installed) - return 0; - int ret = pthread_atfork(atfork_disable_timer, atfork_enable_timer, NULL); - if (ret != 0) - return -1; - atfork_hook_installed = 1; - return 0; -} - -RPY_EXTERN -int rpython_vmprof_enable(long interval_usec) -{ - assert(profile_file >= 0); - assert(interval_usec > 0); - profile_interval_usec = interval_usec; - - if (install_pthread_atfork_hooks() == -1) - goto error; - if (install_sigprof_handler() == -1) - goto error; - if (install_sigprof_timer() == -1) - goto error; - rpython_vmprof_ignore_signals(0); - return 0; - - error: - profile_file = -1; - profile_interval_usec = 0; - return -1; -} - -static int _write_all(const void *buf, size_t bufsize) -{ - while (bufsize > 0) { - ssize_t count = write(profile_file, buf, bufsize); - if (count <= 0) - return -1; /* failed */ - buf += count; - bufsize -= count; - } - return 0; -} - -static int close_profile(void) -{ - char buf[4096]; - ssize_t size; - unsigned char marker = MARKER_TRAILER; - - if (_write_all(&marker, 1) < 0) - return -1; - -#ifdef __linux__ - // copy /proc/self/maps to the end of the profile file - int srcfd = open("/proc/self/maps", O_RDONLY); - if (srcfd < 0) - return -1; - - while ((size = read(srcfd, buf, sizeof buf)) > 0) { - if (_write_all(buf, size) < 0) { - close(srcfd); - return -1; - } - } - close(srcfd); -#else - // freebsd and mac - sprintf(buf, "procstat -v %d", getpid()); - FILE *srcf = popen(buf, "r"); - if (!srcf) - return -1; - - while ((size = fread(buf, 1, sizeof buf, src))) { - if (_write_all(buf, size) < 0) { - pclose(srcf); - return -1; - } - } - pclose(srcf); -#endif - - /* don't close() the file descriptor from here */ - profile_file = -1; - return 0; -} - -RPY_EXTERN -int rpython_vmprof_disable(void) -{ - rpython_vmprof_ignore_signals(1); - profile_interval_usec = 0; - - if (remove_sigprof_timer() == -1) - return -1; - if (remove_sigprof_handler() == -1) - return -1; - if (shutdown_concurrent_bufs(profile_file) < 0) - return -1; - return close_profile(); -} - -RPY_EXTERN -void rpython_vmprof_write_buf(char *buf, long size) -{ - struct profbuf_s *p; - - while ((p = reserve_buffer(profile_file)) == NULL) { - /* spin loop waiting for a buffer to be ready; should almost never - be the case */ - usleep(1); - } - - if (size > SINGLE_BUF_SIZE) - size = SINGLE_BUF_SIZE; - memcpy(p->data, buf, size); - p->data_size = size; - - commit_buffer(profile_file, p); -} +#include "vmprof_main.h" diff --git a/rpython/rlib/rvmprof/src/rvmprof.h b/rpython/rlib/rvmprof/src/rvmprof.h --- a/rpython/rlib/rvmprof/src/rvmprof.h +++ b/rpython/rlib/rvmprof/src/rvmprof.h @@ -1,6 +1,6 @@ -RPY_EXTERN char *rpython_vmprof_init(int); -RPY_EXTERN void rpython_vmprof_ignore_signals(int); -RPY_EXTERN int rpython_vmprof_enable(long); -RPY_EXTERN int rpython_vmprof_disable(void); -RPY_EXTERN void rpython_vmprof_write_buf(char *, long); +RPY_EXTERN char *vmprof_init(int, double, char *); +RPY_EXTERN void vmprof_ignore_signals(int); +RPY_EXTERN int vmprof_enable(void); +RPY_EXTERN int vmprof_disable(void); +RPY_EXTERN int vmprof_register_virtual_function(char *, long, int); diff --git a/rpython/rlib/rvmprof/src/rvmprof_get_custom_offset.h b/rpython/rlib/rvmprof/src/rvmprof_get_custom_offset.h deleted file mode 100644 --- a/rpython/rlib/rvmprof/src/rvmprof_get_custom_offset.h +++ /dev/null @@ -1,63 +0,0 @@ - -#ifdef PYPY_JIT_CODEMAP -void *pypy_find_codemap_at_addr(long addr, long *start_addr); -long pypy_yield_codemap_at_addr(void *codemap_raw, long addr, - long *current_pos_addr); -long 
pypy_jit_stack_depth_at_loc(long loc); -#endif - - -static ptrdiff_t vmprof_unw_get_custom_offset(void* ip, void *cp) { -#ifdef PYPY_JIT_CODEMAP - intptr_t ip_l = (intptr_t)ip; - return pypy_jit_stack_depth_at_loc(ip_l); -#else - return -1; -#endif -} - -static long vmprof_write_header_for_jit_addr(void **result, long n, - void *ip, int max_depth) -{ -#ifdef PYPY_JIT_CODEMAP - void *codemap; - long current_pos = 0; - intptr_t id; - long start_addr = 0; - intptr_t addr = (intptr_t)ip; - int start, k; - void *tmp; - - codemap = pypy_find_codemap_at_addr(addr, &start_addr); - if (codemap == NULL) - // not a jit code at all - return n; - - // modify the last entry to point to start address and not the random one - // in the middle - result[n - 1] = (void*)start_addr; - result[n] = (void*)2; - n++; - start = n; - while (n < max_depth) { - id = pypy_yield_codemap_at_addr(codemap, addr, &current_pos); - if (id == -1) - // finish - break; - if (id == 0) - continue; // not main codemap - result[n++] = (void *)id; - } - k = 0; - while (k < (n - start) / 2) { - tmp = result[start + k]; - result[start + k] = result[n - k - 1]; - result[n - k - 1] = tmp; - k++; - } - if (n < max_depth) { - result[n++] = (void*)3; - } -#endif - return n; -} diff --git a/rpython/rlib/rvmprof/src/rvmprof_mt.h b/rpython/rlib/rvmprof/src/rvmprof_mt.h deleted file mode 100644 --- a/rpython/rlib/rvmprof/src/rvmprof_mt.h +++ /dev/null @@ -1,210 +0,0 @@ -/* Support for multithreaded write() operations */ - -#include <sys/mman.h> -#include <string.h> - -/* The idea is that we have MAX_NUM_BUFFERS available, all of size - SINGLE_BUF_SIZE. Threads and signal handlers can ask to reserve a - buffer, fill it, and finally "commit" it, at which point its - content is written into the profile file. There is no hard - guarantee about the order in which the committed blocks are - actually written. We do this with two constrains: - - - write() calls should not overlap; only one thread can be - currently calling it. - - - the code needs to be multithread-safe *and* signal-handler-safe, - which means it must be written in a wait-free style: never have - spin loops waiting for some lock to be released, from any of - the functions that can be called from the signal handler! The - code holding the lock could be running in the same thread, - currently interrupted by the signal handler. - - The value of MAX_NUM_BUFFERS is a trade-off between too high - (lots of unnecessary memory, lots of checking all of them) - and too low (risk that there is none left). 
-*/ -#define MAX_NUM_BUFFERS 20 -#define SINGLE_BUF_SIZE (8192 - 2 * sizeof(unsigned int)) - -#if defined(__i386__) || defined(__amd64__) - static inline void write_fence(void) { asm("" : : : "memory"); } -#else - static inline void write_fence(void) { __sync_synchronize(); } -#endif - - -#define PROFBUF_UNUSED 0 -#define PROFBUF_FILLING 1 -#define PROFBUF_READY 2 - - -struct profbuf_s { - unsigned int data_size; - unsigned int data_offset; - char data[SINGLE_BUF_SIZE]; -}; - -static char volatile profbuf_state[MAX_NUM_BUFFERS]; -static struct profbuf_s *profbuf_all_buffers = NULL; -static int volatile profbuf_write_lock = 2; -static long profbuf_pending_write; - - -static void unprepare_concurrent_bufs(void) -{ - if (profbuf_all_buffers != NULL) { - munmap(profbuf_all_buffers, sizeof(struct profbuf_s) * MAX_NUM_BUFFERS); - profbuf_all_buffers = NULL; - } -} - -static int prepare_concurrent_bufs(void) -{ - assert(sizeof(struct profbuf_s) == 8192); - - unprepare_concurrent_bufs(); - profbuf_all_buffers = mmap(NULL, sizeof(struct profbuf_s) * MAX_NUM_BUFFERS, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, - -1, 0); - if (profbuf_all_buffers == MAP_FAILED) { - profbuf_all_buffers = NULL; - return -1; - } - memset((char *)profbuf_state, PROFBUF_UNUSED, sizeof(profbuf_state)); - profbuf_write_lock = 0; - profbuf_pending_write = -1; - return 0; -} - -static int _write_single_ready_buffer(int fd, long i) -{ - /* Try to write to disk the buffer number 'i'. This function must - only be called while we hold the write lock. */ - assert(profbuf_write_lock != 0); - - if (profbuf_pending_write >= 0) { - /* A partially written buffer is waiting. We'll write the - rest of this buffer now, instead of 'i'. */ - i = profbuf_pending_write; - assert(profbuf_state[i] == PROFBUF_READY); - } - - if (profbuf_state[i] != PROFBUF_READY) { - /* this used to be a race condition: the buffer was written by a - different thread already, nothing to do now */ - return 0; - } - - int err; - struct profbuf_s *p = &profbuf_all_buffers[i]; - ssize_t count = write(fd, p->data + p->data_offset, p->data_size); - if (count == p->data_size) { - profbuf_state[i] = PROFBUF_UNUSED; - profbuf_pending_write = -1; - } - else { - if (count > 0) { - p->data_offset += count; - p->data_size -= count; - } - profbuf_pending_write = i; - if (count < 0) - return -1; - } - return 0; -} - -static void _write_ready_buffers(int fd) -{ - long i; - int has_write_lock = 0; - - for (i = 0; i < MAX_NUM_BUFFERS; i++) { - if (profbuf_state[i] == PROFBUF_READY) { - if (!has_write_lock) { - if (!__sync_bool_compare_and_swap(&profbuf_write_lock, 0, 1)) - return; /* can't acquire the write lock, give up */ - has_write_lock = 1; - } - if (_write_single_ready_buffer(fd, i) < 0) - break; - } - } - if (has_write_lock) - profbuf_write_lock = 0; -} - -static struct profbuf_s *reserve_buffer(int fd) -{ - /* Tries to enter a region of code that fills one buffer. If - successful, returns the profbuf_s. It fails only if the - concurrent buffers are all busy (extreme multithreaded usage). - - This might call write() to emit the data sitting in - previously-prepared buffers. In case of write() error, the - error is ignored but unwritten data stays in the buffers. 
- */ - long i; - - _write_ready_buffers(fd); - - for (i = 0; i < MAX_NUM_BUFFERS; i++) { - if (profbuf_state[i] == PROFBUF_UNUSED && - __sync_bool_compare_and_swap(&profbuf_state[i], PROFBUF_UNUSED, - PROFBUF_FILLING)) { - struct profbuf_s *p = &profbuf_all_buffers[i]; - p->data_size = 0; - p->data_offset = 0; - return p; - } - } - /* no unused buffer found */ - return NULL; -} - -static void commit_buffer(int fd, struct profbuf_s *buf) -{ - /* Leaves a region of code that filled 'buf'. - - This might call write() to emit the data now ready. In case of - write() error, the error is ignored but unwritten data stays in - the buffers. - */ - - /* Make sure every thread sees the full content of 'buf' */ - write_fence(); - - /* Then set the 'ready' flag */ - long i = buf - profbuf_all_buffers; - assert(profbuf_state[i] == PROFBUF_FILLING); - profbuf_state[i] = PROFBUF_READY; - - if (!__sync_bool_compare_and_swap(&profbuf_write_lock, 0, 1)) { - /* can't acquire the write lock, ignore */ - } - else { - _write_single_ready_buffer(fd, i); - profbuf_write_lock = 0; - } -} - -static int shutdown_concurrent_bufs(int fd) -{ - /* no signal handler can be running concurrently here, because we - already did rpython_vmprof_ignore_signals(1) */ - assert(profbuf_write_lock == 0); - profbuf_write_lock = 2; - - /* last attempt to flush buffers */ - int i; - for (i = 0; i < MAX_NUM_BUFFERS; i++) { - while (profbuf_state[i] == PROFBUF_READY) { - if (_write_single_ready_buffer(fd, i) < 0) - return -1; - } - } - unprepare_concurrent_bufs(); - return 0; -} diff --git a/rpython/rlib/rvmprof/src/rvmprof_config.h b/rpython/rlib/rvmprof/src/vmprof_config.h rename from rpython/rlib/rvmprof/src/rvmprof_config.h rename to rpython/rlib/rvmprof/src/vmprof_config.h diff --git a/rpython/rlib/rvmprof/src/vmprof_get_custom_offset.h b/rpython/rlib/rvmprof/src/vmprof_get_custom_offset.h new file mode 100644 --- /dev/null +++ b/rpython/rlib/rvmprof/src/vmprof_get_custom_offset.h @@ -0,0 +1,120 @@ + +#ifdef PYPY_JIT_CODEMAP +void *pypy_find_codemap_at_addr(long addr, long *start_addr); +long pypy_yield_codemap_at_addr(void *codemap_raw, long addr, + long *current_pos_addr); +long pypy_jit_stack_depth_at_loc(long loc); +#endif + + +#ifdef CPYTHON_GET_CUSTOM_OFFSET +static void *tramp_start, *tramp_end; +#endif + + +static ptrdiff_t vmprof_unw_get_custom_offset(void* ip, void *cp) { + +#if defined(PYPY_JIT_CODEMAP) + + intptr_t ip_l = (intptr_t)ip; + return pypy_jit_stack_depth_at_loc(ip_l); + +#elif defined(CPYTHON_GET_CUSTOM_OFFSET) + + if (ip >= tramp_start && ip <= tramp_end) { + // XXX the return value is wrong for all the places before push and + // after pop, fix + void *bp; + void *sp; + + /* This is a stage2 trampoline created by hotpatch: + + push %rbx + push %rbp + mov %rsp,%rbp + and $0xfffffffffffffff0,%rsp // make sure the stack is aligned + movabs $0x7ffff687bb10,%rbx + callq *%rbx + leaveq + pop %rbx + retq + + the stack layout is like this: + + +-----------+ high addresses + | ret addr | + +-----------+ + | saved rbx | start of the function frame + +-----------+ + | saved rbp | + +-----------+ + | ........ | <-- rbp + +-----------+ low addresses + + So, the trampoline frame starts at rbp+16, and the return address, + is at rbp+24. The vmprof API requires us to return the offset of + the frame relative to sp, hence we have this weird computation. 
+ + XXX (antocuni): I think we could change the API to return directly + the frame address instead of the offset; however, this require a + change in the PyPy code too */ + + unw_get_reg (cp, UNW_REG_SP, (unw_word_t*)&sp); + unw_get_reg (cp, UNW_X86_64_RBP, (unw_word_t*)&bp); + return bp+16+8-sp; + } + return -1; + +#else + + return -1; + +#endif +} + +static long vmprof_write_header_for_jit_addr(void **result, long n, + void *ip, int max_depth) +{ +#ifdef PYPY_JIT_CODEMAP + void *codemap; + long current_pos = 0; + intptr_t id; + long start_addr = 0; + intptr_t addr = (intptr_t)ip; + int start, k; + void *tmp; + + codemap = pypy_find_codemap_at_addr(addr, &start_addr); + if (codemap == NULL) + // not a jit code at all + return n; + + // modify the last entry to point to start address and not the random one + // in the middle + result[n - 1] = (void*)start_addr; + result[n] = (void*)2; + n++; + start = n; + while (n < max_depth) { + id = pypy_yield_codemap_at_addr(codemap, addr, &current_pos); + if (id == -1) + // finish + break; + if (id == 0) + continue; // not main codemap + result[n++] = (void *)id; + } + k = 0; + while (k < (n - start) / 2) { + tmp = result[start + k]; + result[start + k] = result[n - k - 1]; + result[n - k - 1] = tmp; + k++; + } + if (n < max_depth) { + result[n++] = (void*)3; + } +#endif + return n; +} diff --git a/rpython/rlib/rvmprof/src/rvmprof_getpc.h b/rpython/rlib/rvmprof/src/vmprof_getpc.h rename from rpython/rlib/rvmprof/src/rvmprof_getpc.h rename to rpython/rlib/rvmprof/src/vmprof_getpc.h --- a/rpython/rlib/rvmprof/src/rvmprof_getpc.h +++ b/rpython/rlib/rvmprof/src/vmprof_getpc.h @@ -44,7 +44,7 @@ #ifndef BASE_GETPC_H_ #define BASE_GETPC_H_ -#include "rvmprof_config.h" +#include "vmprof_config.h" // On many linux systems, we may need _GNU_SOURCE to get access to // the defined constants that define the register we want to see (eg diff --git a/rpython/rlib/rvmprof/src/vmprof_main.h b/rpython/rlib/rvmprof/src/vmprof_main.h new file mode 100644 --- /dev/null +++ b/rpython/rlib/rvmprof/src/vmprof_main.h @@ -0,0 +1,556 @@ +/* VMPROF + * + * statistical sampling profiler specifically designed to profile programs + * which run on a Virtual Machine and/or bytecode interpreter, such as Python, + * etc. + * + * The logic to dump the C stack traces is partly stolen from the code in + * gperftools. + * The file "getpc.h" has been entirely copied from gperftools. + * + * Tested only on gcc, linux, x86_64. 
+ * + * Copyright (C) 2014-2015 + * Antonio Cuni - anto.c...@gmail.com + * Maciej Fijalkowski - fij...@gmail.com + * Armin Rigo - ar...@tunes.org + * + */ + +#define _GNU_SOURCE 1 + +#include <dlfcn.h> +#include <assert.h> +#include <pthread.h> +#include <sys/time.h> +#include <errno.h> +#include <unistd.h> +#include <stdio.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include "vmprof_getpc.h" +#include "vmprof_unwind.h" +#include "vmprof_mt.h" + + +/************************************************************/ + +// functions copied from libunwind using dlopen + +static int (*unw_get_reg)(unw_cursor_t*, int, unw_word_t*) = NULL; +static int (*unw_step)(unw_cursor_t*) = NULL; +static int (*unw_init_local)(unw_cursor_t *, unw_context_t *) = NULL; +static int (*unw_get_proc_info)(unw_cursor_t *, unw_proc_info_t *) = NULL; + +static int profile_file = -1; +static long prepare_interval_usec; +static struct profbuf_s *volatile current_codes; +static void *(*mainloop_get_virtual_ip)(char *) = 0; + +static int opened_profile(char *interp_name); +static void flush_codes(void); + +RPY_EXTERN +char *vmprof_init(int fd, double interval, char *interp_name) +{ + if (interval < 1e-6 || interval >= 1.0) + return "bad value for 'interval'"; + prepare_interval_usec = (int)(interval * 1000000.0); + + if (!unw_get_reg) { + void *libhandle; + + if (!(libhandle = dlopen("libunwind.so", RTLD_LAZY | RTLD_LOCAL))) + goto error; + if (!(unw_get_reg = dlsym(libhandle, "_ULx86_64_get_reg"))) + goto error; + if (!(unw_get_proc_info = dlsym(libhandle, "_ULx86_64_get_proc_info"))) + goto error; + if (!(unw_init_local = dlsym(libhandle, "_ULx86_64_init_local"))) + goto error; + if (!(unw_step = dlsym(libhandle, "_ULx86_64_step"))) + goto error; + } + if (prepare_concurrent_bufs() < 0) + return "out of memory"; + + assert(fd >= 0); + profile_file = fd; + if (opened_profile(interp_name) < 0) { + profile_file = -1; + return strerror(errno); + } + return NULL; + + error: + return dlerror(); +} + +/************************************************************/ + +/* value: last bit is 1 if signals must be ignored; all other bits + are a counter for how many threads are currently in a signal handler */ +static long volatile signal_handler_value = 1; + +RPY_EXTERN +void vmprof_ignore_signals(int ignored) +{ + if (!ignored) { + __sync_fetch_and_and(&signal_handler_value, ~1L); + } + else { + /* set the last bit, and wait until concurrently-running signal + handlers finish */ + while (__sync_or_and_fetch(&signal_handler_value, 1L) != 1L) { + usleep(1); + } + } +} + + +/* ************************************************************* + * functions to write a profile file compatible with gperftools + * ************************************************************* + */ + +#define MAX_FUNC_NAME 128 +#define MAX_STACK_DEPTH \ + ((SINGLE_BUF_SIZE - sizeof(struct prof_stacktrace_s)) / sizeof(void *)) + +#define MARKER_STACKTRACE '\x01' +#define MARKER_VIRTUAL_IP '\x02' +#define MARKER_TRAILER '\x03' +#define MARKER_INTERP_NAME '\x04' /* deprecated */ +#define MARKER_HEADER '\x05' + +#define VERSION_BASE '\x00' +#define VERSION_THREAD_ID '\x01' + +struct prof_stacktrace_s { + char padding[sizeof(long) - 1]; + char marker; + long count, depth; + void *stack[]; +}; + +static long profile_interval_usec = 0; +static char atfork_hook_installed = 0; + + +/* ****************************************************** + * libunwind workaround for process JIT frames correctly + * 
****************************************************** + */ + +#include "vmprof_get_custom_offset.h" + +typedef struct { + void* _unused1; + void* _unused2; + void* sp; + void* ip; + void* _unused3[sizeof(unw_cursor_t)/sizeof(void*) - 4]; +} vmprof_hacked_unw_cursor_t; + +static int vmprof_unw_step(unw_cursor_t *cp, int first_run) +{ + void* ip; + void* sp; + ptrdiff_t sp_offset; + unw_get_reg (cp, UNW_REG_IP, (unw_word_t*)&ip); + unw_get_reg (cp, UNW_REG_SP, (unw_word_t*)&sp); + if (!first_run) { + // make sure we're pointing to the CALL and not to the first + // instruction after. If the callee adjusts the stack for us + // it's not safe to be at the instruction after + ip -= 1; + } + sp_offset = vmprof_unw_get_custom_offset(ip, cp); + + if (sp_offset == -1) { + // it means that the ip is NOT in JITted code, so we can use the + // stardard unw_step + return unw_step(cp); + } + else { + // this is a horrible hack to manually walk the stack frame, by + // setting the IP and SP in the cursor + vmprof_hacked_unw_cursor_t *cp2 = (vmprof_hacked_unw_cursor_t*)cp; + void* bp = (void*)sp + sp_offset; + cp2->sp = bp; + bp -= sizeof(void*); + cp2->ip = ((void**)bp)[0]; + // the ret is on the top of the stack minus WORD + return 1; + } +} + + +/* ************************************************************* + * functions to dump the stack trace + * ************************************************************* + */ + +static int get_stack_trace(void** result, int max_depth, ucontext_t *ucontext) +{ + void *ip; + int n = 0; + unw_cursor_t cursor; + unw_context_t uc = *ucontext; + + int ret = unw_init_local(&cursor, &uc); + assert(ret >= 0); + (void)ret; + + while (n < max_depth) { + if (unw_get_reg(&cursor, UNW_REG_IP, (unw_word_t *) &ip) < 0) { + break; + } + + unw_proc_info_t pip; + unw_get_proc_info(&cursor, &pip); + + /* if n==0, it means that the signal handler interrupted us while we + were in the trampoline, so we are not executing (yet) the real main + loop function; just skip it */ + if (VMPROF_ADDR_OF_TRAMPOLINE((void*)pip.start_ip) && n > 0) { + // found main loop stack frame + void* sp; + unw_get_reg(&cursor, UNW_REG_SP, (unw_word_t *) &sp); + if (mainloop_get_virtual_ip) + ip = mainloop_get_virtual_ip((char *)sp); + else + ip = *(void **)sp; + } + + int first_run = (n == 0); + result[n++] = ip; + n = vmprof_write_header_for_jit_addr(result, n, ip, max_depth); + if (vmprof_unw_step(&cursor, first_run) <= 0) + break; + } + return n; +} + +static void *get_current_thread_id(void) +{ + /* xxx This function is a hack on two fronts: + + - It assumes that pthread_self() is async-signal-safe. This + should be true on Linux. I hope it is also true elsewhere. + + - It abuses pthread_self() by assuming it just returns an + integer. According to comments in CPython's source code, the + platforms where it is not the case are rare nowadays. + + An alternative would be to try to look if the information is + available in the ucontext_t in the caller. 
+ */ + return (void *)pthread_self(); +} + + +/* ************************************************************* + * the signal handler + * ************************************************************* + */ + +static void sigprof_handler(int sig_nr, siginfo_t* info, void *ucontext) +{ + long val = __sync_fetch_and_add(&signal_handler_value, 2L); + + if ((val & 1) == 0) { + int saved_errno = errno; + int fd = profile_file; + assert(fd >= 0); + + struct profbuf_s *p = reserve_buffer(fd); + if (p == NULL) { + /* ignore this signal: there are no free buffers right now */ + } + else { + int depth; + struct prof_stacktrace_s *st = (struct prof_stacktrace_s *)p->data; + st->marker = MARKER_STACKTRACE; + st->count = 1; + st->stack[0] = GetPC((ucontext_t*)ucontext); + depth = get_stack_trace(st->stack+1, MAX_STACK_DEPTH-2, ucontext); + depth++; // To account for pc value in stack[0]; + st->depth = depth; + st->stack[depth++] = get_current_thread_id(); + p->data_offset = offsetof(struct prof_stacktrace_s, marker); + p->data_size = (depth * sizeof(void *) + + sizeof(struct prof_stacktrace_s) - + offsetof(struct prof_stacktrace_s, marker)); + commit_buffer(fd, p); + } + + errno = saved_errno; + } + + __sync_sub_and_fetch(&signal_handler_value, 2L); +} + + +/* ************************************************************* + * the setup and teardown functions + * ************************************************************* + */ + +static int install_sigprof_handler(void) +{ + struct sigaction sa; + memset(&sa, 0, sizeof(sa)); + sa.sa_sigaction = sigprof_handler; + sa.sa_flags = SA_RESTART | SA_SIGINFO; + if (sigemptyset(&sa.sa_mask) == -1 || + sigaction(SIGPROF, &sa, NULL) == -1) + return -1; + return 0; +} + +static int remove_sigprof_handler(void) +{ + if (signal(SIGPROF, SIG_DFL) == SIG_ERR) + return -1; + return 0; +} + +static int install_sigprof_timer(void) +{ + static struct itimerval timer; + timer.it_interval.tv_sec = 0; + timer.it_interval.tv_usec = profile_interval_usec; + timer.it_value = timer.it_interval; + if (setitimer(ITIMER_PROF, &timer, NULL) != 0) + return -1; + return 0; +} + +static int remove_sigprof_timer(void) { + static struct itimerval timer; + timer.it_interval.tv_sec = 0; + timer.it_interval.tv_usec = 0; + timer.it_value.tv_sec = 0; + timer.it_value.tv_usec = 0; + if (setitimer(ITIMER_PROF, &timer, NULL) != 0) + return -1; + return 0; +} + +static void atfork_disable_timer(void) { + if (profile_interval_usec > 0) { + remove_sigprof_timer(); + } +} + +static void atfork_enable_timer(void) { + if (profile_interval_usec > 0) { + install_sigprof_timer(); + } +} + +static int install_pthread_atfork_hooks(void) { + /* this is needed to prevent the problems described there: + - http://code.google.com/p/gperftools/issues/detail?id=278 + - http://lists.debian.org/debian-glibc/2010/03/msg00161.html + + TL;DR: if the RSS of the process is large enough, the clone() syscall + will be interrupted by the SIGPROF before it can complete, then + retried, interrupted again and so on, in an endless loop. The + solution is to disable the timer around the fork, and re-enable it + only inside the parent. 
+ */ + if (atfork_hook_installed) + return 0; + int ret = pthread_atfork(atfork_disable_timer, atfork_enable_timer, NULL); + if (ret != 0) + return -1; + atfork_hook_installed = 1; + return 0; +} + +RPY_EXTERN +int vmprof_enable(void) +{ + assert(profile_file >= 0); + assert(prepare_interval_usec > 0); + profile_interval_usec = prepare_interval_usec; + + if (install_pthread_atfork_hooks() == -1) + goto error; + if (install_sigprof_handler() == -1) + goto error; + if (install_sigprof_timer() == -1) + goto error; + vmprof_ignore_signals(0); + return 0; + + error: + profile_file = -1; + profile_interval_usec = 0; + return -1; +} + +static int _write_all(const void *buf, size_t bufsize) +{ + while (bufsize > 0) { + ssize_t count = write(profile_file, buf, bufsize); + if (count <= 0) + return -1; /* failed */ + buf += count; + bufsize -= count; + } + return 0; +} + +static int opened_profile(char *interp_name) +{ + struct { + long hdr[5]; + char interp_name[259]; + } header; + + size_t namelen = strnlen(interp_name, 255); + current_codes = NULL; + + header.hdr[0] = 0; + header.hdr[1] = 3; + header.hdr[2] = 0; + header.hdr[3] = prepare_interval_usec; + header.hdr[4] = 0; + header.interp_name[0] = MARKER_HEADER; + header.interp_name[1] = '\x00'; + header.interp_name[2] = VERSION_THREAD_ID; + header.interp_name[3] = namelen; + memcpy(&header.interp_name[4], interp_name, namelen); + return _write_all(&header, 5 * sizeof(long) + 4 + namelen); +} + +static int close_profile(void) +{ + char buf[4096]; + ssize_t size; + unsigned char marker = MARKER_TRAILER; + + if (_write_all(&marker, 1) < 0) + return -1; + +#ifdef __linux__ + // copy /proc/self/maps to the end of the profile file + int srcfd = open("/proc/self/maps", O_RDONLY); + if (srcfd < 0) + return -1; + + while ((size = read(srcfd, buf, sizeof buf)) > 0) { + if (_write_all(buf, size) < 0) { + close(srcfd); + return -1; + } + } + close(srcfd); +#else + // freebsd and mac + sprintf(buf, "procstat -v %d", getpid()); + FILE *srcf = popen(buf, "r"); + if (!srcf) + return -1; + + while ((size = fread(buf, 1, sizeof buf, src))) { + if (_write_all(buf, size) < 0) { + pclose(srcf); + return -1; + } + } + pclose(srcf); +#endif + + /* don't close() the file descriptor from here */ + profile_file = -1; + return 0; +} + +RPY_EXTERN +int vmprof_disable(void) +{ + vmprof_ignore_signals(1); + profile_interval_usec = 0; + + if (remove_sigprof_timer() == -1) + return -1; + if (remove_sigprof_handler() == -1) + return -1; + flush_codes(); + if (shutdown_concurrent_bufs(profile_file) < 0) + return -1; + return close_profile(); +} + +RPY_EXTERN +int vmprof_register_virtual_function(char *code_name, long code_uid, + int auto_retry) +{ + long namelen = strnlen(code_name, 1023); + long blocklen = 1 + 2 * sizeof(long) + namelen; + struct profbuf_s *p; + char *t; + + retry: + p = current_codes; + if (p != NULL) { + if (__sync_bool_compare_and_swap(&current_codes, p, NULL)) { + /* grabbed 'current_codes': we will append the current block + to it if it contains enough room */ + size_t freesize = SINGLE_BUF_SIZE - p->data_size; + if (freesize < blocklen) { + /* full: flush it */ + commit_buffer(profile_file, p); + p = NULL; + } + } + else { + /* compare-and-swap failed, don't try again */ + p = NULL; + } + } + + if (p == NULL) { + p = reserve_buffer(profile_file); + if (p == NULL) { + /* can't get a free block; should almost never be the + case. Spin loop if allowed, or return a failure code + if not (e.g. 
we're in a signal handler) */ + if (auto_retry > 0) { + auto_retry--; + usleep(1); + goto retry; + } + return -1; + } + } + + t = p->data + p->data_size; + p->data_size += blocklen; + assert(p->data_size <= SINGLE_BUF_SIZE); + *t++ = MARKER_VIRTUAL_IP; + memcpy(t, &code_uid, sizeof(long)); t += sizeof(long); + memcpy(t, &namelen, sizeof(long)); t += sizeof(long); + memcpy(t, code_name, namelen); + + /* try to reattach 'p' to 'current_codes' */ + if (!__sync_bool_compare_and_swap(&current_codes, NULL, p)) { + /* failed, flush it */ + commit_buffer(profile_file, p); + } + return 0; +} + +static void flush_codes(void) +{ + struct profbuf_s *p = current_codes; + if (p != NULL) { + current_codes = NULL; + commit_buffer(profile_file, p); + } +} diff --git a/rpython/rlib/rvmprof/src/vmprof_mt.h b/rpython/rlib/rvmprof/src/vmprof_mt.h new file mode 100644 --- /dev/null +++ b/rpython/rlib/rvmprof/src/vmprof_mt.h @@ -0,0 +1,217 @@ +/* Support for multithreaded write() operations */ + +#include <sys/mman.h> +#include <string.h> + +/* The idea is that we have MAX_NUM_BUFFERS available, all of size + SINGLE_BUF_SIZE. Threads and signal handlers can ask to reserve a + buffer, fill it, and finally "commit" it, at which point its + content is written into the profile file. There is no hard + guarantee about the order in which the committed blocks are + actually written. We do this with two constrains: + + - write() calls should not overlap; only one thread can be + currently calling it. + + - the code needs to be multithread-safe *and* signal-handler-safe, + which means it must be written in a wait-free style: never have + spin loops waiting for some lock to be released, from any of + the functions that can be called from the signal handler! The + code holding the lock could be running in the same thread, + currently interrupted by the signal handler. + + The value of MAX_NUM_BUFFERS is a trade-off between too high + (lots of unnecessary memory, lots of checking all of them) + and too low (risk that there is none left). +*/ +#define MAX_NUM_BUFFERS 20 +#define SINGLE_BUF_SIZE (8192 - 2 * sizeof(unsigned int)) + +#if defined(__i386__) || defined(__amd64__) + static inline void write_fence(void) { asm("" : : : "memory"); } +#else + static inline void write_fence(void) { __sync_synchronize(); } +#endif + + +#define PROFBUF_UNUSED 0 +#define PROFBUF_FILLING 1 +#define PROFBUF_READY 2 + + +struct profbuf_s { + unsigned int data_size; + unsigned int data_offset; + char data[SINGLE_BUF_SIZE]; +}; + +static char volatile profbuf_state[MAX_NUM_BUFFERS]; +static struct profbuf_s *profbuf_all_buffers = NULL; +static int volatile profbuf_write_lock = 2; +static long profbuf_pending_write; + + +static void unprepare_concurrent_bufs(void) +{ + if (profbuf_all_buffers != NULL) { + munmap(profbuf_all_buffers, sizeof(struct profbuf_s) * MAX_NUM_BUFFERS); + profbuf_all_buffers = NULL; + } +} + +static int prepare_concurrent_bufs(void) +{ + assert(sizeof(struct profbuf_s) == 8192); + + unprepare_concurrent_bufs(); + profbuf_all_buffers = mmap(NULL, sizeof(struct profbuf_s) * MAX_NUM_BUFFERS, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + -1, 0); + if (profbuf_all_buffers == MAP_FAILED) { + profbuf_all_buffers = NULL; + return -1; + } + memset((char *)profbuf_state, PROFBUF_UNUSED, sizeof(profbuf_state)); + profbuf_write_lock = 0; + profbuf_pending_write = -1; + return 0; +} + +static int _write_single_ready_buffer(int fd, long i) +{ + /* Try to write to disk the buffer number 'i'. 
This function must + only be called while we hold the write lock. */ + assert(profbuf_write_lock != 0); + + if (profbuf_pending_write >= 0) { + /* A partially written buffer is waiting. We'll write the + rest of this buffer now, instead of 'i'. */
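
A note on the embedding change above: with this merge, each call to pypy_execute_source() on PyPy >= 2.6.1 runs in a fresh globals dictionary (all of them kept alive in sys._pypy_execute_source), and pypy_execute_source_ptr() exposes its second argument as the global c_argument. The following C sketch shows the intended call pattern; it is not part of the changeset, and it assumes the usual initialization (rpython_startup_code(), pypy_setup_home()) described in the full embedding.rst has already been done:

    /* hedged usage sketch, not part of the diff above */
    extern int pypy_execute_source(char *source);
    extern int pypy_execute_source_ptr(char *source, void *ptr);

    int run_snippets(void)
    {
        static int x = 42;
        /* each call gets its own fresh globals dict (PyPy >= 2.6.1) */
        if (pypy_execute_source("print 'hello from embedded pypy'") != 0)
            return -1;
        /* the void* argument appears as the 'c_argument' global */
        return pypy_execute_source_ptr("print c_argument", &x);
    }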
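The new profile header (written by opened_profile() and checked byte-by-byte in test__vmprof above) is: five native longs, then the bytes MARKER_HEADER ('\x05'), '\x00', VERSION_THREAD_ID ('\x01'), the interpreter-name length, and the name itself. A sketch of a reader for that layout, as a reading aid only -- read_vmprof_header is a hypothetical helper, and it assumes the reader runs with the same word size as the writer:

    #include <stdio.h>

    /* hypothetical reader for the header laid out by opened_profile() */
    static int read_vmprof_header(FILE *f)
    {
        long hdr[5];                /* hdr[3] is the interval in usec */
        unsigned char meta[4];
        char interp_name[256];

        if (fread(hdr, sizeof(long), 5, f) != 5)
            return -1;
        if (fread(meta, 1, 4, f) != 4)
            return -1;
        /* MARKER_HEADER, then 0, then VERSION_THREAD_ID */
        if (meta[0] != 0x05 || meta[1] != 0x00 || meta[2] != 0x01)
            return -1;
        size_t namelen = meta[3];   /* e.g. 4 for "pypy" */
        if (fread(interp_name, 1, namelen, f) != namelen)
            return -1;
        interp_name[namelen] = '\0';
        printf("interp=%s interval=%ldus\n", interp_name, hdr[3]);
        return 0;
    }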
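The signal_handler_value word shared by vmprof_ignore_signals() and sigprof_handler() packs two things together: bit 0 is the "ignore" flag, and every running handler adds 2 on entry and subtracts 2 on exit, so the remaining bits count handlers in flight. Disabling therefore sets bit 0 and spins until the whole word reads 1, i.e. until no handler is still running. Extracted here as a standalone sketch for clarity, using the same GCC __sync builtins as the code above:

    #include <unistd.h>

    static long volatile signal_handler_value = 1;   /* start ignored */

    void ignore_signals(int ignored)
    {
        if (!ignored)
            __sync_fetch_and_and(&signal_handler_value, ~1L); /* clear bit 0 */
        else
            /* set bit 0, then wait until no handler is in flight */
            while (__sync_or_and_fetch(&signal_handler_value, 1L) != 1L)
                usleep(1);
    }

    void handler_body(void)
    {
        long val = __sync_fetch_and_add(&signal_handler_value, 2L); /* enter */
        if ((val & 1) == 0) {
            /* ... take the sample only if signals are not ignored ... */
        }
        __sync_sub_and_fetch(&signal_handler_value, 2L);            /* leave */
    }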
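Finally, the vmprof_mt.h API follows a strict reserve/fill/commit discipline: reserve_buffer() may return NULL, in which case a signal handler must simply drop the sample rather than spin, and commit_buffer() publishes the data and opportunistically flushes it. The sketch below shows that calling contract; it is a signal-handler-safe variant of what the deleted rpython_vmprof_write_buf() did (that one could afford to spin, since it was never called from the handler), and it assumes vmprof_mt.h is included for profbuf_s, reserve_buffer and commit_buffer:

    #include <string.h>
    /* #include "vmprof_mt.h"  -- profbuf_s, reserve_buffer, commit_buffer */

    static void emit_block(int fd, const char *payload, unsigned int size)
    {
        struct profbuf_s *p = reserve_buffer(fd);
        if (p == NULL)
            return;                 /* all 20 buffers busy: drop the data */
        if (size > SINGLE_BUF_SIZE)
            size = SINGLE_BUF_SIZE; /* one block never spans buffers */
        memcpy(p->data, payload, size);
        p->data_size = size;
        commit_buffer(fd, p);       /* publish; may write() if lock is free */
    }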