Author: Amaury Forgeot d'Arc <amaur...@gmail.com>
Branch: py3.5
Changeset: r91069:8059c5eb1ca8
Date: 2017-04-17 17:28 +0200
http://bitbucket.org/pypy/pypy/changeset/8059c5eb1ca8/
Log: hg merge default diff too long, truncating to 2000 out of 18946 lines diff --git a/.hgignore b/.hgignore --- a/.hgignore +++ b/.hgignore @@ -86,3 +86,5 @@ ^\.cache$ pypy/module/cppyy/.+/*\.pcm + + diff --git a/.hgtags b/.hgtags --- a/.hgtags +++ b/.hgtags @@ -36,3 +36,5 @@ aff251e543859ce4508159dd9f1a82a2f553de00 release-pypy2.7-v5.6.0 fa3249d55d15b9829e1be69cdf45b5a44cec902d release-pypy2.7-v5.7.0 b16a4363e930f6401bceb499b9520955504c6cb0 release-pypy3.5-v5.7.0 +1aa2d8e03cdfab54b7121e93fda7e98ea88a30bf release-pypy2.7-v5.7.1 +2875f328eae2216a87f3d6f335092832eb031f56 release-pypy3.5-v5.7.1 diff --git a/lib-python/2.7/distutils/sysconfig_pypy.py b/lib-python/2.7/distutils/sysconfig_pypy.py --- a/lib-python/2.7/distutils/sysconfig_pypy.py +++ b/lib-python/2.7/distutils/sysconfig_pypy.py @@ -61,12 +61,12 @@ def _init_posix(): """Initialize the module as appropriate for POSIX systems.""" g = {} - g['CC'] = "gcc -pthread" - g['CXX'] = "g++ -pthread" + g['CC'] = "cc -pthread" + g['CXX'] = "c++ -pthread" g['OPT'] = "-DNDEBUG -O2" g['CFLAGS'] = "-DNDEBUG -O2" g['CCSHARED'] = "-fPIC" - g['LDSHARED'] = "gcc -pthread -shared" + g['LDSHARED'] = "cc -pthread -shared" g['SO'] = [s[0] for s in imp.get_suffixes() if s[2] == imp.C_EXTENSION][0] g['AR'] = "ar" g['ARFLAGS'] = "rc" diff --git a/lib_pypy/_ctypes/array.py b/lib_pypy/_ctypes/array.py --- a/lib_pypy/_ctypes/array.py +++ b/lib_pypy/_ctypes/array.py @@ -83,8 +83,9 @@ res = self.__new__(self) ffiarray = self._ffiarray.fromaddress(resarray.buffer, self._length_) res._buffer = ffiarray - res._base = base - res._index = index + if base is not None: + res._base = base + res._index = index return res def _CData_retval(self, resbuffer): diff --git a/lib_pypy/_ctypes/basics.py b/lib_pypy/_ctypes/basics.py --- a/lib_pypy/_ctypes/basics.py +++ b/lib_pypy/_ctypes/basics.py @@ -64,8 +64,9 @@ res = object.__new__(self) res.__class__ = self res.__dict__['_buffer'] = resbuffer - res.__dict__['_base'] = base - res.__dict__['_index'] = index + if base is not None: + res.__dict__['_base'] = base + res.__dict__['_index'] = index return res def _CData_retval(self, resbuffer): diff --git a/pypy/doc/index-of-release-notes.rst b/pypy/doc/index-of-release-notes.rst --- a/pypy/doc/index-of-release-notes.rst +++ b/pypy/doc/index-of-release-notes.rst @@ -6,6 +6,7 @@ .. toctree:: + release-v5.7.1.rst release-v5.7.0.rst release-pypy2.7-v5.6.0.rst release-pypy2.7-v5.4.1.rst @@ -59,6 +60,7 @@ .. toctree:: + release-v5.7.1.rst release-v5.7.0.rst CPython 3.3 compatible versions diff --git a/pypy/doc/install.rst b/pypy/doc/install.rst --- a/pypy/doc/install.rst +++ b/pypy/doc/install.rst @@ -32,10 +32,10 @@ .. code-block:: console - $ tar xf pypy-2.1.tar.bz2 - $ ./pypy-2.1/bin/pypy - Python 2.7.3 (480845e6b1dd, Jul 31 2013, 11:05:31) - [PyPy 2.1.0 with GCC 4.4.3] on linux2 + $ tar xf pypy-x.y.z.tar.bz2 + $ ./pypy-x.y.z/bin/pypy + Python 2.7.x (xxxxxxxxxxxx, Date, Time) + [PyPy x.y.z with GCC x.y.z] on linux2 Type "help", "copyright", "credits" or "license" for more information. And now for something completely different: ``PyPy is an exciting technology that lets you to write fast, portable, multi-platform interpreters with less diff --git a/pypy/doc/release-v5.7.0.rst b/pypy/doc/release-v5.7.0.rst --- a/pypy/doc/release-v5.7.0.rst +++ b/pypy/doc/release-v5.7.0.rst @@ -99,7 +99,7 @@ tp_dealloc * refactor and clean up poor handling of unicode exposed in work on py3.5 * builtin module cppyy_ supports C++ 11, 14, etc. 
via cling (reflex has been removed) - * adapt ``weakref`` according to CPython issue #19542_, will be in CPython 2.7.14 + * adapt ``weakref`` according to CPython issue 19542_, will be in CPython 2.7.14 * support translations with cpyext and the Boehm GC (for special cases like RevDB_ * implement ``StringBuffer.get_raw_address`` for the buffer protocol, it is @@ -125,18 +125,18 @@ * disable ``clock_gettime()`` on OS/X, since we support 10.11 and it was only added in 10.12 * support ``HAVE_FSTATVFS`` which was unintentionally always false - * fix user-created C-API heaptype, issue #2434_ + * fix user-created C-API heaptype, issue 2434_ * fix ``PyDict_Update`` is not actually the same as ``dict.update`` * assign ``tp_doc`` on ``PyTypeObject`` and tie it to the app-level ``__doc__`` attribute - issue #2446_ + issue 2446_ * clean up memory leaks around ``PyObject_GetBuffer``, ``PyMemoryView_GET_BUFFER``, ``PyMemoryView_FromBuffer``, and ``PyBuffer_Release`` * improve support for creating C-extension objects from app-level classes, filling more slots, especially ``tp_new`` and ``tp_dealloc`` - * fix for ``ctypes.c_bool`` returning ``bool`` restype, issue #2475_ + * fix for ``ctypes.c_bool`` returning ``bool`` restype, issue 2475_ * fix in corner cases with the GIL and C-API functions - * allow overriding thread.local.__init__ in a subclass, issue #2501_ - * allow ``PyClass_New`` to be called with NULL as the first arguemnt, issue #2504_ + * allow overriding thread.local.__init__ in a subclass, issue 2501_ + * allow ``PyClass_New`` to be called with NULL as the first arguemnt, issue 2504_ * Performance improvements: diff --git a/pypy/doc/release-v5.7.1.rst b/pypy/doc/release-v5.7.1.rst new file mode 100644 --- /dev/null +++ b/pypy/doc/release-v5.7.1.rst @@ -0,0 +1,50 @@ +========== +PyPy 5.7.1 +========== + +We have released a bugfix PyPy2.7-v5.7.1 and PyPy3.5-v5.7.1 beta (Linux 64bit), +due to the following issues: + + * correctly handle an edge case in dict.pop (issue 2508_) + + * fix a regression to correctly handle multiple inheritance in a C-API type + where the second base is an app-level class with a ``__new__`` function + + * fix a regression to fill a C-API type's ``tp_getattr`` slot from a + ``__getattr__`` method (issue 2523_) + +Thanks to those who reported the issues. + +.. _2508: https://bitbucket.org/pypy/pypy/issues/2508 +.. _2523: https://bitbucket.org/pypy/pypy/issues/2523 + +What is PyPy? +============= + +PyPy is a very compliant Python interpreter, almost a drop-in replacement for +CPython 2.7 and CPython 3.5. It's fast (`PyPy and CPython 2.7.x`_ performance comparison) +due to its integrated tracing JIT compiler. + +We also welcome developers of other `dynamic languages`_ to see what RPython +can do for them. + +The PyPy 2.7 release supports: + + * **x86** machines on most common operating systems + (Linux 32/64 bits, Mac OS X 64 bits, Windows 32 bits, OpenBSD, FreeBSD) + + * newer **ARM** hardware (ARMv6 or ARMv7, with VFPv3) running Linux, + + * big- and little-endian variants of **PPC64** running Linux, + + * **s390x** running Linux + +.. _`PyPy and CPython 2.7.x`: http://speed.pypy.org +.. _`dynamic languages`: http://rpython.readthedocs.io/en/latest/examples.html + +Please update, and continue to help us make PyPy better. + +Cheers + +The PyPy Team + diff --git a/pypy/doc/whatsnew-head.rst b/pypy/doc/whatsnew-head.rst --- a/pypy/doc/whatsnew-head.rst +++ b/pypy/doc/whatsnew-head.rst @@ -21,3 +21,10 @@ .. 
branch: vmprof-native PyPy support to profile native frames in vmprof. + +.. branch: reusing-r11 +.. branch: branch-prediction + +Performance tweaks in the x86 JIT-generated machine code: rarely taken +blocks are moved off-line. Also, the temporary register used to contain +large constants is reused across instructions. diff --git a/pypy/interpreter/astcompiler/assemble.py b/pypy/interpreter/astcompiler/assemble.py --- a/pypy/interpreter/astcompiler/assemble.py +++ b/pypy/interpreter/astcompiler/assemble.py @@ -7,6 +7,7 @@ from pypy.interpreter.astcompiler import ast, consts, misc, symtable from pypy.interpreter.error import OperationError from pypy.interpreter.pycode import PyCode +from pypy.interpreter.miscutils import string_sort from pypy.tool import stdlib_opcode as ops @@ -138,9 +139,12 @@ def _make_index_dict_filter(syms, flag1, flag2): + names = syms.keys() + string_sort(names) # return cell vars in alphabetical order i = 0 result = {} - for name, scope in syms.iteritems(): + for name in names: + scope = syms[name] if scope in (flag1, flag2): result[name] = i i += 1 @@ -172,6 +176,7 @@ self.cell_vars = _make_index_dict_filter(scope.symbols, symtable.SCOPE_CELL, symtable.SCOPE_CELL_CLASS) + string_sort(scope.free_vars) # return free vars in alphabetical order self.free_vars = _iter_to_dict(scope.free_vars, len(self.cell_vars)) self.w_consts = space.newdict() self.argcount = 0 diff --git a/pypy/interpreter/astcompiler/symtable.py b/pypy/interpreter/astcompiler/symtable.py --- a/pypy/interpreter/astcompiler/symtable.py +++ b/pypy/interpreter/astcompiler/symtable.py @@ -38,7 +38,7 @@ self.roles = {} self.varnames = [] self.children = [] - self.free_vars = {} + self.free_vars = [] # a bag of names: the order doesn't matter here self.temp_name_counter = 1 self.has_free = False self.child_has_free = False diff --git a/pypy/interpreter/miscutils.py b/pypy/interpreter/miscutils.py --- a/pypy/interpreter/miscutils.py +++ b/pypy/interpreter/miscutils.py @@ -2,6 +2,9 @@ Miscellaneous utilities. """ +from rpython.rlib.listsort import make_timsort_class + + class ThreadLocals: """Pseudo thread-local storage, for 'space.threadlocals'. This is not really thread-local at all; the intention is that the PyPy @@ -53,3 +56,15 @@ def set(self, key, value): self._dict[key] = value return FakeWeakValueDict() + + +_StringBaseTimSort = make_timsort_class() + +class StringSort(_StringBaseTimSort): + def lt(self, a, b): + return a < b + +def string_sort(lst): + """Sort a (resizable) list of strings.""" + sorter = StringSort(lst, len(lst)) + sorter.sort() diff --git a/pypy/interpreter/test/test_compiler.py b/pypy/interpreter/test/test_compiler.py --- a/pypy/interpreter/test/test_compiler.py +++ b/pypy/interpreter/test/test_compiler.py @@ -1035,6 +1035,32 @@ else: assert l1 == l2 == l3 == l4 == [1, 3, 2, 4] + def test_freevars_order(self): + # co_cellvars and co_freevars are guaranteed to appear in + # alphabetical order. See CPython Issue #15368 (which does + # not come with tests). 
+ source = """if 1: + def f1(x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15): + def g1(): + return (x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15) + return g1 + def f2(x15,x14,x13,x12,x11,x10,x9,x8,x7,x6,x5,x4,x3,x2,x1): + def g2(): + return (x15,x14,x13,x12,x11,x10,x9,x8,x7,x6,x5,x4,x3,x2,x1) + return g2 + c1 = f1(*range(15)).__code__.co_freevars + c2 = f2(*range(15)).__code__.co_freevars + r1 = f1.__code__.co_cellvars + r2 = f2.__code__.co_cellvars + """ + d = {} + exec(source, d) + assert d['c1'] == d['c2'] + # the test above is important for a few bytecode hacks, + # but actually we get them in alphabetical order, so check that: + assert d['c1'] == tuple(sorted(d['c1'])) + assert d['r1'] == d['r2'] == d['c1'] + def test_ast_equality(self): import _ast sample_code = [ diff --git a/pypy/module/_collections/interp_deque.py b/pypy/module/_collections/interp_deque.py --- a/pypy/module/_collections/interp_deque.py +++ b/pypy/module/_collections/interp_deque.py @@ -406,11 +406,7 @@ def repr(self): space = self.space - ec = space.getexecutioncontext() - w_currently_in_repr = ec._py_repr - if w_currently_in_repr is None: - w_currently_in_repr = ec._py_repr = space.newdict() - return dequerepr(space, w_currently_in_repr, self) + return dequerepr(space, space.get_objects_in_repr(), self) @specialize.arg(2) def compare(self, w_other, op): @@ -523,18 +519,17 @@ app = gateway.applevel(""" def dequerepr(currently_in_repr, d): 'The app-level part of repr().' - deque_id = id(d) - if deque_id in currently_in_repr: + if d in currently_in_repr: return '[...]' # strange because it's a deque and this # strongly suggests it's a list instead, # but confirmed behavior from python-dev else: - currently_in_repr[deque_id] = 1 + currently_in_repr[d] = 1 try: listrepr = "[" + ", ".join([repr(x) for x in d]) + ']' finally: try: - del currently_in_repr[deque_id] + del currently_in_repr[d] except: pass if d.maxlen is None: diff --git a/pypy/module/_vmprof/test/test_direct.py b/pypy/module/_vmprof/test/test_direct.py --- a/pypy/module/_vmprof/test/test_direct.py +++ b/pypy/module/_vmprof/test/test_direct.py @@ -1,4 +1,4 @@ - +import sys import py try: import cffi @@ -7,6 +7,7 @@ from rpython.rlib import rvmprof srcdir = py.path.local(rvmprof.__file__).join("..", "src") +shareddir = srcdir.join('shared') ffi = cffi.FFI() ffi.cdef(""" @@ -43,7 +44,7 @@ } -""" + open(str(srcdir.join("shared/vmprof_get_custom_offset.h"))).read(), include_dirs=[str(srcdir)]) +""" + open(str(srcdir.join("shared/vmprof_get_custom_offset.h"))).read(), include_dirs=[str(srcdir), str(shareddir)]) class TestDirect(object): def test_infrastructure(self): diff --git a/pypy/module/test_lib_pypy/ctypes_tests/test_functions.py b/pypy/module/test_lib_pypy/ctypes_tests/test_functions.py --- a/pypy/module/test_lib_pypy/ctypes_tests/test_functions.py +++ b/pypy/module/test_lib_pypy/ctypes_tests/test_functions.py @@ -596,3 +596,37 @@ get_data.errcheck = ret_list_p(1) assert get_data('testing!') == [-1, -2, -3, -4] + + def test_issue2533(self): + import cffi + ffi = cffi.FFI() + ffi.cdef("int **fetchme(void);") + ffi.set_source("_x_cffi", """ + int **fetchme(void) + { + static int a = 42; + static int *pa = &a; + return &pa; + } + """) + from rpython.tool.udir import udir + ffi.compile(verbose=True, tmpdir=str(udir)) + + import sys + sys.path.insert(0, str(udir)) + try: + from _x_cffi import ffi, lib + finally: + sys.path.pop(0) + fetchme = ffi.addressof(lib, 'fetchme') + fetchme = int(ffi.cast("intptr_t", fetchme)) + + FN = 
CFUNCTYPE(POINTER(POINTER(c_int))) + ff = cast(fetchme, FN) + + g = ff() + assert g.contents.contents.value == 42 + + h = c_int(43) + g[0] = pointer(h) # used to crash here + assert g.contents.contents.value == 43 diff --git a/pypy/objspace/std/dictmultiobject.py b/pypy/objspace/std/dictmultiobject.py --- a/pypy/objspace/std/dictmultiobject.py +++ b/pypy/objspace/std/dictmultiobject.py @@ -143,11 +143,7 @@ init_or_update(space, self, __args__, 'dict') def descr_repr(self, space): - ec = space.getexecutioncontext() - w_currently_in_repr = ec._py_repr - if w_currently_in_repr is None: - w_currently_in_repr = ec._py_repr = space.newdict() - return dictrepr(space, w_currently_in_repr, self) + return dictrepr(space, space.get_objects_in_repr(), self) def descr_eq(self, space, w_other): if space.is_w(self, w_other): @@ -404,10 +400,9 @@ def dictrepr(currently_in_repr, d): if len(d) == 0: return "{}" - dict_id = id(d) - if dict_id in currently_in_repr: + if d in currently_in_repr: return '{...}' - currently_in_repr[dict_id] = 1 + currently_in_repr[d] = 1 try: items = [] # XXX for now, we cannot use items() without list at @@ -419,7 +414,7 @@ return "{" + ', '.join(items) + "}" finally: try: - del currently_in_repr[dict_id] + del currently_in_repr[d] except: pass ''', filename=__file__) diff --git a/pypy/objspace/std/listobject.py b/pypy/objspace/std/listobject.py --- a/pypy/objspace/std/listobject.py +++ b/pypy/objspace/std/listobject.py @@ -23,6 +23,7 @@ WrappedDefault, applevel, interp2app, unwrap_spec) from pypy.interpreter.signature import Signature from pypy.interpreter.typedef import TypeDef +from pypy.interpreter.miscutils import StringSort from pypy.objspace.std.bytesobject import W_BytesObject from pypy.objspace.std.floatobject import W_FloatObject from pypy.objspace.std.intobject import W_IntObject @@ -438,11 +439,7 @@ def descr_repr(self, space): if self.length() == 0: return space.newtext('[]') - ec = space.getexecutioncontext() - w_currently_in_repr = ec._py_repr - if w_currently_in_repr is None: - w_currently_in_repr = ec._py_repr = space.newdict() - return listrepr(space, w_currently_in_repr, self) + return listrepr(space, space.get_objects_in_repr(), self) def descr_eq(self, space, w_other): if not isinstance(w_other, W_ListObject): @@ -2014,15 +2011,14 @@ app = applevel(""" def listrepr(currently_in_repr, l): 'The app-level part of repr().' 
- list_id = id(l) - if list_id in currently_in_repr: + if l in currently_in_repr: return '[...]' - currently_in_repr[list_id] = 1 + currently_in_repr[l] = 1 try: return "[" + ", ".join([repr(x) for x in l]) + ']' finally: try: - del currently_in_repr[list_id] + del currently_in_repr[l] except: pass """, filename=__file__) @@ -2039,7 +2035,6 @@ IntBaseTimSort = make_timsort_class() FloatBaseTimSort = make_timsort_class() IntOrFloatBaseTimSort = make_timsort_class() -StringBaseTimSort = make_timsort_class() UnicodeBaseTimSort = make_timsort_class() @@ -2076,11 +2071,6 @@ return fa < fb -class StringSort(StringBaseTimSort): - def lt(self, a, b): - return a < b - - class UnicodeSort(UnicodeBaseTimSort): def lt(self, a, b): return a < b diff --git a/pypy/objspace/std/objspace.py b/pypy/objspace/std/objspace.py --- a/pypy/objspace/std/objspace.py +++ b/pypy/objspace/std/objspace.py @@ -126,6 +126,14 @@ ec._py_repr = None return ec + def get_objects_in_repr(self): + from pypy.module.__pypy__.interp_identitydict import W_IdentityDict + ec = self.getexecutioncontext() + w_currently_in_repr = ec._py_repr + if w_currently_in_repr is None: + w_currently_in_repr = ec._py_repr = W_IdentityDict(self) + return w_currently_in_repr + def gettypefor(self, cls): return self.gettypeobject(cls.typedef) diff --git a/pypy/objspace/std/setobject.py b/pypy/objspace/std/setobject.py --- a/pypy/objspace/std/setobject.py +++ b/pypy/objspace/std/setobject.py @@ -165,11 +165,7 @@ _initialize_set(space, self, w_iterable) def descr_repr(self, space): - ec = space.getexecutioncontext() - w_currently_in_repr = ec._py_repr - if w_currently_in_repr is None: - w_currently_in_repr = ec._py_repr = space.newdict() - return setrepr(space, w_currently_in_repr, self) + return setrepr(space, space.get_objects_in_repr(), self) def descr_eq(self, space, w_other): if isinstance(w_other, W_BaseSetObject): @@ -1700,10 +1696,9 @@ app = gateway.applevel(""" def setrepr(currently_in_repr, s): 'The app-level part of repr().' - set_id = id(s) - if set_id in currently_in_repr: + if s in currently_in_repr: return '%s(...)' % (s.__class__.__name__,) - currently_in_repr[set_id] = 1 + currently_in_repr[s] = 1 try: if not s: return '%s()' % (s.__class__.__name__,) @@ -1714,7 +1709,7 @@ return '%s({%s})' % (s.__class__.__name__, listrepr[1:-1]) finally: try: - del currently_in_repr[set_id] + del currently_in_repr[s] except: pass """, filename=__file__) diff --git a/pypy/objspace/std/typeobject.py b/pypy/objspace/std/typeobject.py --- a/pypy/objspace/std/typeobject.py +++ b/pypy/objspace/std/typeobject.py @@ -1087,7 +1087,7 @@ return w_name.text_w(space) def create_all_slots(w_self, hasoldstylebase, w_bestbase, force_new_layout): - from pypy.objspace.std.listobject import StringSort + from pypy.interpreter.miscutils import string_sort base_layout = w_bestbase.layout index_next_extra_slot = base_layout.nslots @@ -1120,8 +1120,7 @@ else: newslotnames.append(slot_name) # Sort the list of names collected so far - sorter = StringSort(newslotnames, len(newslotnames)) - sorter.sort() + string_sort(newslotnames) # Try to create all slots in order. The creation of some of # them might silently fail; then we delete the name from the # list. 
At the end, 'index_next_extra_slot' has been advanced diff --git a/pypy/tool/release/package.py b/pypy/tool/release/package.py --- a/pypy/tool/release/package.py +++ b/pypy/tool/release/package.py @@ -241,12 +241,14 @@ zf.close() else: archive = str(builddir.join(name + '.tar.bz2')) - if sys.platform == 'darwin' or sys.platform.startswith('freebsd'): + if sys.platform == 'darwin': print >>sys.stderr, """Warning: tar on current platform does not suport overriding the uid and gid for its contents. The tarball will contain your uid and gid. If you are building the actual release for the PyPy website, you may want to be using another platform...""" e = os.system('tar --numeric-owner -cvjf ' + archive + " " + name) + elif sys.platform.startswith('freebsd'): + e = os.system('tar --uname=root --gname=wheel -cvjf ' + archive + " " + name) elif sys.platform == 'cygwin': e = os.system('tar --owner=Administrator --group=Administrators --numeric-owner -cvjf ' + archive + " " + name) else: diff --git a/rpython/jit/backend/llsupport/asmmemmgr.py b/rpython/jit/backend/llsupport/asmmemmgr.py --- a/rpython/jit/backend/llsupport/asmmemmgr.py +++ b/rpython/jit/backend/llsupport/asmmemmgr.py @@ -250,7 +250,7 @@ return self.rawstart def overwrite(self, index, char): - assert 0 <= index < self.get_relative_pos() + assert 0 <= index < self.get_relative_pos(break_basic_block=False) block = self._cursubblock index -= self._baserelpos while index < 0: @@ -264,7 +264,8 @@ self.overwrite(index + 2, chr((val >> 16) & 0xff)) self.overwrite(index + 3, chr((val >> 24) & 0xff)) - def get_relative_pos(self): + def get_relative_pos(self, break_basic_block=True): + # 'break_basic_block' is only used in x86 return self._baserelpos + self._cursubindex def copy_to_raw_memory(self, addr): @@ -288,7 +289,7 @@ HEX = '0123456789ABCDEF' dump = [] src = rffi.cast(rffi.CCHARP, addr) - end = self.get_relative_pos() + end = self.get_relative_pos(break_basic_block=False) if count != -1: end = offset + count for p in range(offset, end): @@ -336,17 +337,20 @@ def _become_a_plain_block_builder(self): # hack purely for speed of tests - self._data = [] - self.writechar = self._data.append - self.overwrite = self._data.__setitem__ - self.get_relative_pos = self._data.__len__ + self._data = _data = [] + self.writechar = _data.append + self.overwrite = _data.__setitem__ + def get_relative_pos(break_basic_block=True): + return len(_data) + self.get_relative_pos = get_relative_pos def plain_copy_to_raw_memory(addr): dst = rffi.cast(rffi.CCHARP, addr) - for i, c in enumerate(self._data): + for i, c in enumerate(_data): dst[i] = c self._copy_to_raw_memory = plain_copy_to_raw_memory def insert_gcroot_marker(self, mark): if self.gcroot_markers is None: self.gcroot_markers = [] - self.gcroot_markers.append((self.get_relative_pos(), mark)) + self.gcroot_markers.append( + (self.get_relative_pos(break_basic_block=False), mark)) diff --git a/rpython/jit/backend/llsupport/assembler.py b/rpython/jit/backend/llsupport/assembler.py --- a/rpython/jit/backend/llsupport/assembler.py +++ b/rpython/jit/backend/llsupport/assembler.py @@ -265,14 +265,16 @@ def enter_portal_frame(self, op): if self.cpu.HAS_CODEMAP: + pos = self.mc.get_relative_pos(break_basic_block=False) self.codemap_builder.enter_portal_frame(op.getarg(0).getint(), op.getarg(1).getint(), - self.mc.get_relative_pos()) + pos) def leave_portal_frame(self, op): if self.cpu.HAS_CODEMAP: + pos = self.mc.get_relative_pos(break_basic_block=False) 
self.codemap_builder.leave_portal_frame(op.getarg(0).getint(), - self.mc.get_relative_pos()) + pos) def call_assembler(self, op, argloc, vloc, result_loc, tmploc): """ diff --git a/rpython/jit/backend/x86/assembler.py b/rpython/jit/backend/x86/assembler.py --- a/rpython/jit/backend/x86/assembler.py +++ b/rpython/jit/backend/x86/assembler.py @@ -76,6 +76,7 @@ BaseAssembler.setup(self, looptoken) assert self.memcpy_addr != 0, "setup_once() not called?" self.current_clt = looptoken.compiled_loop_token + self.pending_slowpaths = [] self.pending_guard_tokens = [] if WORD == 8: self.pending_memoryerror_trampoline_from = [] @@ -95,6 +96,7 @@ self.pending_memoryerror_trampoline_from = None self.mc = None self.current_clt = None + self.frame_depth_to_patch = None def _build_float_constants(self): # 0x80000000000000008000000000000000 @@ -181,6 +183,7 @@ """ This builds a general call slowpath, for whatever call happens to come. """ + self.pending_slowpaths = [] mc = codebuf.MachineCodeBlockWrapper() # copy registers to the frame, with the exception of the # 'cond_call_register_arguments' and eax, because these have already @@ -211,6 +214,7 @@ self.pop_gcmap(mc) # cancel the push_gcmap(store=True) in the caller self._pop_all_regs_from_frame(mc, [eax], supports_floats, callee_only) mc.RET() + self.flush_pending_slowpaths(mc) return mc.materialize(self.cpu, []) def _build_malloc_slowpath(self, kind): @@ -227,6 +231,7 @@ This function must preserve all registers apart from ecx and edx. """ assert kind in ['fixed', 'str', 'unicode', 'var'] + self.pending_slowpaths = [] mc = codebuf.MachineCodeBlockWrapper() self._push_all_regs_to_frame(mc, [ecx, edx], self.cpu.supports_floats) # the caller already did push_gcmap(store=True) @@ -276,13 +281,13 @@ self.set_extra_stack_depth(mc, 0) # mc.TEST_rr(eax.value, eax.value) + # common case: not taken mc.J_il(rx86.Conditions['Z'], 0xfffff) # patched later - jz_location = mc.get_relative_pos() + jz_location = mc.get_relative_pos(break_basic_block=False) mc.MOV_rr(ecx.value, eax.value) # nursery_free_adr = self.cpu.gc_ll_descr.get_nursery_free_addr() self._pop_all_regs_from_frame(mc, [ecx, edx], self.cpu.supports_floats) - mc.MOV(edx, heap(nursery_free_adr)) # load this in EDX self.pop_gcmap(mc) # push_gcmap(store=True) done by the caller mc.RET() # @@ -298,6 +303,7 @@ mc.force_frame_size(DEFAULT_FRAME_BYTES + WORD) mc.ADD_ri(esp.value, WORD) mc.JMP(imm(self.propagate_exception_path)) + self.flush_pending_slowpaths(mc) # rawstart = mc.materialize(self.cpu, []) return rawstart @@ -718,6 +724,7 @@ if rx86.fits_in_32bits(offset): mc.JMP_l(offset) else: + # mc.forget_scratch_register() not needed here mc.MOV_ri(X86_64_SCRATCH_REG.value, adr_new_target) mc.JMP_r(X86_64_SCRATCH_REG.value) mc.copy_to_raw_memory(adr_jump_offset) @@ -755,7 +762,17 @@ gcreftracers.append(tracer) # keepalive self.teardown_gcrefs_list() + def flush_pending_slowpaths(self, mc): + # for each pending slowpath, generate it now. Note that this + # may occasionally add an extra guard_token in + # pending_guard_tokens, so it must be done before the + # following loop in write_pending_failure_recoveries(). + for sp in self.pending_slowpaths: + sp.generate(self, mc) + self.pending_slowpaths = None + def write_pending_failure_recoveries(self, regalloc): + self.flush_pending_slowpaths(self.mc) # for each pending guard, generate the code of the recovery stub # at the end of self.mc. 
for tok in self.pending_guard_tokens: @@ -821,6 +838,14 @@ for ofs in self.frame_depth_to_patch: self._patch_frame_depth(ofs + rawstart, framedepth) + class IncreaseStackSlowPath(codebuf.SlowPath): + def generate_body(self, assembler, mc): + mc.MOV_si(WORD, 0xffffff) # force writing 32 bit + ofs2 = mc.get_relative_pos(break_basic_block=False) - 4 + assembler.frame_depth_to_patch.append(ofs2) + assembler.push_gcmap(mc, self.gcmap, store=True) + mc.CALL(imm(assembler._frame_realloc_slowpath)) + def _check_frame_depth(self, mc, gcmap): """ check if the frame is of enough depth to follow this bridge. Otherwise reallocate the frame in a helper. @@ -830,16 +855,12 @@ descrs = self.cpu.gc_ll_descr.getframedescrs(self.cpu) ofs = self.cpu.unpack_fielddescr(descrs.arraydescr.lendescr) mc.CMP_bi(ofs, 0xffffff) # force writing 32 bit - stack_check_cmp_ofs = mc.get_relative_pos() - 4 - jg_location = mc.emit_forward_jump('GE') - mc.MOV_si(WORD, 0xffffff) # force writing 32 bit - ofs2 = mc.get_relative_pos() - 4 - self.push_gcmap(mc, gcmap, store=True) - mc.CALL(imm(self._frame_realloc_slowpath)) - # patch the JG above - mc.patch_forward_jump(jg_location) + stack_check_cmp_ofs = mc.get_relative_pos(break_basic_block=False) - 4 self.frame_depth_to_patch.append(stack_check_cmp_ofs) - self.frame_depth_to_patch.append(ofs2) + sp = self.IncreaseStackSlowPath(mc, rx86.Conditions['L']) + sp.gcmap = gcmap + sp.set_continue_addr(mc) + self.pending_slowpaths.append(sp) def _check_frame_depth_debug(self, mc): """ double check the depth size. It prints the error (and potentially @@ -850,11 +871,11 @@ descrs = self.cpu.gc_ll_descr.getframedescrs(self.cpu) ofs = self.cpu.unpack_fielddescr(descrs.arraydescr.lendescr) mc.CMP_bi(ofs, 0xffffff) - stack_check_cmp_ofs = mc.get_relative_pos() - 4 + stack_check_cmp_ofs = mc.get_relative_pos(break_basic_block=False) - 4 jg_location = mc.emit_forward_jump('GE') mc.MOV_rr(edi.value, ebp.value) mc.MOV_ri(esi.value, 0xffffff) - ofs2 = mc.get_relative_pos() - 4 + ofs2 = mc.get_relative_pos(break_basic_block=False) - 4 mc.CALL(imm(self.cpu.realloc_frame_crash)) # patch the JG above mc.patch_forward_jump(jg_location) @@ -895,6 +916,7 @@ # "mov r11, addr; jmp r11" is up to 13 bytes, which fits in there # because we always write "mov r11, imm-as-8-bytes; call *r11" in # the first place. + # mc.forget_scratch_register() not needed here mc.MOV_ri(X86_64_SCRATCH_REG.value, adr_new_target) mc.JMP_r(X86_64_SCRATCH_REG.value) p = rffi.cast(rffi.INTP, adr_jump_offset) @@ -939,7 +961,7 @@ # would be used to pass arguments #3 and #4 (even though, so # far, the assembler only receives two arguments). 
tloc = esi - old = r11 + old = r10 # eax = address in the stack of a 3-words struct vmprof_stack_s self.mc.LEA_rs(eax.value, (FRAME_FIXED_SIZE - 4) * WORD) # old = current value of vmprof_tl_stack @@ -984,6 +1006,10 @@ if gcrootmap and gcrootmap.is_shadow_stack: self._call_header_shadowstack(gcrootmap) + class StackCheckSlowPath(codebuf.SlowPath): + def generate_body(self, assembler, mc): + mc.CALL(imm(assembler.stack_check_slowpath)) + def _call_header_with_stack_check(self): self._call_header() if self.stack_check_slowpath == 0: @@ -993,11 +1019,9 @@ self.mc.MOV(eax, heap(endaddr)) # MOV eax, [start] self.mc.SUB(eax, esp) # SUB eax, current self.mc.CMP(eax, heap(lengthaddr)) # CMP eax, [length] - jb_location = self.mc.emit_forward_jump('BE')#JBE .skip - self.mc.CALL(imm(self.stack_check_slowpath))# CALL slowpath - # patch the JB above # .skip: - self.mc.patch_forward_jump(jb_location) - # + sp = self.StackCheckSlowPath(self.mc, rx86.Conditions['A']) + sp.set_continue_addr(self.mc) + self.pending_slowpaths.append(sp) def _call_footer(self): # the return value is the jitframe @@ -1023,27 +1047,14 @@ fit in 32 bits, it will be loaded in r11. """ rst = gcrootmap.get_root_stack_top_addr() - if rx86.fits_in_32bits(rst): - mc.MOV_rj(ebx.value, rst) # MOV ebx, [rootstacktop] - else: - mc.MOV_ri(X86_64_SCRATCH_REG.value, rst) # MOV r11, rootstacktop - mc.MOV_rm(ebx.value, (X86_64_SCRATCH_REG.value, 0)) - # MOV ebx, [r11] - # + mc.MOV(ebx, heap(rst)) # maybe via loading r11 return rst def _call_header_shadowstack(self, gcrootmap): rst = self._load_shadowstack_top_in_ebx(self.mc, gcrootmap) self.mc.MOV_mr((ebx.value, 0), ebp.value) # MOV [ebx], ebp self.mc.ADD_ri(ebx.value, WORD) - if rx86.fits_in_32bits(rst): - self.mc.MOV_jr(rst, ebx.value) # MOV [rootstacktop], ebx - else: - # The integer 'rst' doesn't fit in 32 bits, so we know that - # _load_shadowstack_top_in_ebx() above loaded it in r11. - # Reuse it. Be careful not to overwrite r11 in the middle! - self.mc.MOV_mr((X86_64_SCRATCH_REG.value, 0), - ebx.value) # MOV [r11], ebx + self.mc.MOV(heap(rst), ebx) # MOV [rootstacktop], ebx def _call_footer_shadowstack(self, gcrootmap): rst = gcrootmap.get_root_stack_top_addr() @@ -1166,9 +1177,9 @@ faillocs, frame_depth) genop_guard_list[guard_opnum](self, guard_op, guard_token, arglocs, resloc) - if not we_are_translated(): - # must be added by the genop_guard_list[]() - assert guard_token is self.pending_guard_tokens[-1] + # this must usually have added guard_token as last element + # of self.pending_guard_tokens, but not always (see + # genop_guard_guard_no_exception) def load_effective_addr(self, sizereg, baseofs, scale, result, frm=imm0): self.mc.LEA(result, addr_add(frm, sizereg, baseofs, scale)) @@ -1449,7 +1460,7 @@ # has been emitted. 64-bit mode only. 
assert IS_X86_64 address_in_buffer = index * WORD # at the start of the buffer - p_location = self.mc.get_relative_pos() + p_location = self.mc.get_relative_pos(break_basic_block=False) offset = address_in_buffer - p_location self.mc.overwrite32(p_location-4, offset) @@ -1547,11 +1558,13 @@ reg = arglocs[0] self.mc.TEST(reg, reg) if WORD == 4: + # common case: not taken self.mc.J_il(rx86.Conditions['Z'], self.propagate_exception_path) self.mc.add_pending_relocation() elif WORD == 8: + # common case: not taken self.mc.J_il(rx86.Conditions['Z'], 0) - pos = self.mc.get_relative_pos() + pos = self.mc.get_relative_pos(break_basic_block=False) self.pending_memoryerror_trampoline_from.append(pos) # ---------- @@ -1706,23 +1719,29 @@ genop_guard_guard_isnull = genop_guard_guard_false def genop_guard_guard_no_exception(self, guard_op, guard_token, locs, ign): + # If the previous operation was a COND_CALL, don't emit + # anything now. Instead, we'll emit the GUARD_NO_EXCEPTION at + # the end of the slowpath in CondCallSlowPath. + if self._find_nearby_operation(-1).getopnum() in ( + rop.COND_CALL, rop.COND_CALL_VALUE_I, rop.COND_CALL_VALUE_R): + sp = self.pending_slowpaths[-1] + assert isinstance(sp, self.CondCallSlowPath) + sp.guard_token_no_exception = guard_token + else: + self.generate_guard_no_exception(guard_token) + + def generate_guard_no_exception(self, guard_token): self.mc.CMP(heap(self.cpu.pos_exception()), imm0) self.guard_success_cc = rx86.Conditions['Z'] self.implement_guard(guard_token) - # If the previous operation was a COND_CALL, overwrite its conditional - # jump to jump over this GUARD_NO_EXCEPTION as well, if we can - if self._find_nearby_operation(-1).getopnum() in ( - rop.COND_CALL, rop.COND_CALL_VALUE_I, rop.COND_CALL_VALUE_R): - j_location = self.previous_cond_call_jcond - try: - self.mc.patch_forward_jump(j_location) - except codebuf.ShortJumpTooFar: - pass # ignore this case def genop_guard_guard_not_invalidated(self, guard_op, guard_token, locs, ign): - pos = self.mc.get_relative_pos() + 1 # after potential jmp + pos = self.mc.get_relative_pos(break_basic_block=False) + pos += 1 # after potential jmp guard_token.pos_jump_offset = pos + saved = self.mc.get_scratch_register_known_value() + guard_token.known_scratch_value = saved self.pending_guard_tokens.append(guard_token) def genop_guard_guard_exception(self, guard_op, guard_token, locs, resloc): @@ -1918,6 +1937,8 @@ """ self.mc.force_frame_size(DEFAULT_FRAME_BYTES) startpos = self.mc.get_relative_pos() + self.mc.restore_scratch_register_known_value( + guardtok.known_scratch_value) # self._update_at_exit(guardtok.fail_locs, guardtok.failargs, guardtok.faildescr, regalloc) @@ -2075,9 +2096,13 @@ def implement_guard(self, guard_token): # These jumps are patched later. 
assert self.guard_success_cc >= 0 + # common case: not taken self.mc.J_il(rx86.invert_condition(self.guard_success_cc), 0) self.guard_success_cc = rx86.cond_none - guard_token.pos_jump_offset = self.mc.get_relative_pos() - 4 + pos = self.mc.get_relative_pos(break_basic_block=False) + guard_token.pos_jump_offset = pos - 4 + saved = self.mc.get_scratch_register_known_value() + guard_token.known_scratch_value = saved self.pending_guard_tokens.append(guard_token) def _genop_real_call(self, op, arglocs, resloc): @@ -2125,6 +2150,7 @@ faildescrindex = self.get_gcref_from_faildescr(faildescr) if IS_X86_64: + self.mc.forget_scratch_register() self.mc.MOV_rp(X86_64_SCRATCH_REG.value, 0) self._patch_load_from_gc_table(faildescrindex) self.mc.MOV(raw_stack(ofs), X86_64_SCRATCH_REG) @@ -2238,6 +2264,84 @@ # ------------------- END CALL ASSEMBLER ----------------------- + class WriteBarrierSlowPath(codebuf.SlowPath): + def generate_body(self, assembler, mc): + mc.force_frame_size(DEFAULT_FRAME_BYTES) + # for cond_call_gc_wb_array, also add another fast path: + # if GCFLAG_CARDS_SET, then we can just set one bit and be done + card_marking = (self.loc_index is not None) + if card_marking: + # GCFLAG_CARDS_SET is in this byte at 0x80, so this fact can + # been checked by the sign flags of the previous TEST8 + js_location = mc.emit_forward_jump('S') # patched later + else: + js_location = 0 + + # Write only a CALL to the helper prepared in advance, passing it as + # argument the address of the structure we are writing into + # (the first argument to COND_CALL_GC_WB). + helper_num = self.helper_num + is_frame = (helper_num == 4) + descr = self.descr + loc_base = self.loc_base + # + if not is_frame: + mc.PUSH(loc_base) + mc.CALL(imm(assembler.wb_slowpath[helper_num])) + if not is_frame: + mc.stack_frame_size_delta(-WORD) + + if card_marking: + # The helper ends again with a check of the flag in the object. + # So here, we can simply write again a 'JNS', which will be + # taken if GCFLAG_CARDS_SET is still not set. + jns_location = mc.emit_forward_jump('NS') # patched later + # + # patch the JS above + mc.patch_forward_jump(js_location) + # + # case GCFLAG_CARDS_SET: emit a few instructions to do + # directly the card flag setting + loc_index = self.loc_index + if isinstance(loc_index, RegLoc): + if IS_X86_64 and isinstance(loc_base, RegLoc): + # copy loc_index into r11 + tmp1 = X86_64_SCRATCH_REG + mc.forget_scratch_register() + mc.MOV_rr(tmp1.value, loc_index.value) + final_pop = False + else: + # must save the register loc_index before it is mutated + mc.PUSH_r(loc_index.value) + tmp1 = loc_index + final_pop = True + # SHR tmp, card_page_shift + mc.SHR_ri(tmp1.value, descr.jit_wb_card_page_shift) + # XOR tmp, -8 + mc.XOR_ri(tmp1.value, -8) + # BTS [loc_base], tmp + if final_pop: + # r11 is not specially used, fall back to regloc.py + mc.BTS(addr_add_const(loc_base, 0), tmp1) + else: + # tmp1 is r11! 
but in this case, loc_base is a + # register so we can invoke directly rx86.py + mc.BTS_mr((loc_base.value, 0), tmp1.value) + # done + if final_pop: + mc.POP_r(loc_index.value) + # + elif isinstance(loc_index, ImmedLoc): + byte_index = loc_index.value >> descr.jit_wb_card_page_shift + byte_ofs = ~(byte_index >> 3) + byte_val = 1 << (byte_index & 7) + mc.OR8(addr_add_const(loc_base, byte_ofs), imm(byte_val)) + else: + raise AssertionError("index is neither RegLoc nor ImmedLoc") + # + # patch the JNS above + mc.patch_forward_jump(jns_location) + def _write_barrier_fastpath(self, mc, descr, arglocs, array=False, is_frame=False): # Write code equivalent to write_barrier() in the GC: it checks @@ -2249,6 +2353,7 @@ assert cls is not None and isinstance(descr, cls) # card_marking = False + loc_index = None mask = descr.jit_wb_if_flag_singlebyte if array and descr.jit_wb_cards_set != 0: # assumptions the rest of the function depends on: @@ -2256,6 +2361,7 @@ descr.jit_wb_if_flag_byteofs) assert descr.jit_wb_cards_set_singlebyte == -0x80 card_marking = True + loc_index = arglocs[1] mask = descr.jit_wb_if_flag_singlebyte | -0x80 # loc_base = arglocs[0] @@ -2264,21 +2370,7 @@ loc = raw_stack(descr.jit_wb_if_flag_byteofs) else: loc = addr_add_const(loc_base, descr.jit_wb_if_flag_byteofs) - mc.TEST8(loc, imm(mask)) - jz_location = mc.emit_forward_jump('Z') # patched later - - # for cond_call_gc_wb_array, also add another fast path: - # if GCFLAG_CARDS_SET, then we can just set one bit and be done - if card_marking: - # GCFLAG_CARDS_SET is in this byte at 0x80, so this fact can - # been checked by the sign flags of the previous TEST8 - js_location = mc.emit_forward_jump('S') # patched later - else: - js_location = 0 - - # Write only a CALL to the helper prepared in advance, passing it as - # argument the address of the structure we are writing into - # (the first argument to COND_CALL_GC_WB). + # helper_num = card_marking if is_frame: helper_num = 4 @@ -2291,58 +2383,14 @@ bool(self._regalloc.xrm.reg_bindings)) assert self.wb_slowpath[helper_num] != 0 # - if not is_frame: - mc.PUSH(loc_base) - mc.CALL(imm(self.wb_slowpath[helper_num])) - if not is_frame: - mc.stack_frame_size_delta(-WORD) - - if card_marking: - # The helper ends again with a check of the flag in the object. - # So here, we can simply write again a 'JNS', which will be - # taken if GCFLAG_CARDS_SET is still not set. 
- jns_location = mc.emit_forward_jump('NS') # patched later - # - # patch the JS above - mc.patch_forward_jump(js_location) - # - # case GCFLAG_CARDS_SET: emit a few instructions to do - # directly the card flag setting - loc_index = arglocs[1] - if isinstance(loc_index, RegLoc): - if IS_X86_64 and isinstance(loc_base, RegLoc): - # copy loc_index into r11 - tmp1 = X86_64_SCRATCH_REG - mc.MOV_rr(tmp1.value, loc_index.value) - final_pop = False - else: - # must save the register loc_index before it is mutated - mc.PUSH_r(loc_index.value) - tmp1 = loc_index - final_pop = True - # SHR tmp, card_page_shift - mc.SHR_ri(tmp1.value, descr.jit_wb_card_page_shift) - # XOR tmp, -8 - mc.XOR_ri(tmp1.value, -8) - # BTS [loc_base], tmp - mc.BTS(addr_add_const(loc_base, 0), tmp1) - # done - if final_pop: - mc.POP_r(loc_index.value) - # - elif isinstance(loc_index, ImmedLoc): - byte_index = loc_index.value >> descr.jit_wb_card_page_shift - byte_ofs = ~(byte_index >> 3) - byte_val = 1 << (byte_index & 7) - mc.OR8(addr_add_const(loc_base, byte_ofs), imm(byte_val)) - else: - raise AssertionError("index is neither RegLoc nor ImmedLoc") - # - # patch the JNS above - mc.patch_forward_jump(jns_location) - - # patch the JZ above - mc.patch_forward_jump(jz_location) + mc.TEST8(loc, imm(mask)) + sp = self.WriteBarrierSlowPath(mc, rx86.Conditions['NZ']) + sp.loc_base = loc_base + sp.loc_index = loc_index + sp.helper_num = helper_num + sp.descr = descr + sp.set_continue_addr(mc) + self.pending_slowpaths.append(sp) def genop_discard_cond_call_gc_wb(self, op, arglocs): self._write_barrier_fastpath(self.mc, op.getdescr(), arglocs) @@ -2373,37 +2421,66 @@ def label(self): self._check_frame_depth_debug(self.mc) + class CondCallSlowPath(codebuf.SlowPath): + guard_token_no_exception = None + + def generate_body(self, assembler, mc): + assembler.push_gcmap(mc, self.gcmap, store=True) + # + # first save away the 4 registers from + # 'cond_call_register_arguments' plus the register 'eax' + base_ofs = assembler.cpu.get_baseofs_of_frame_field() + should_be_saved = self.should_be_saved + restore_eax = False + for gpr in cond_call_register_arguments + [eax]: + if gpr not in should_be_saved or gpr is self.resloc: + continue + v = gpr_reg_mgr_cls.all_reg_indexes[gpr.value] + mc.MOV_br(v * WORD + base_ofs, gpr.value) + if gpr is eax: + restore_eax = True + # + # load the 0-to-4 arguments into these registers + from rpython.jit.backend.x86.jump import remap_frame_layout + arglocs = self.arglocs + remap_frame_layout(assembler, arglocs, + cond_call_register_arguments[:len(arglocs)], + X86_64_SCRATCH_REG if IS_X86_64 else None) + # + # load the constant address of the function to call into eax + mc.MOV(eax, self.imm_func) + # + # figure out which variant of cond_call_slowpath to call, + # and call it + cond_call_adr = assembler.cond_call_slowpath[self.variant_num] + mc.CALL(imm(follow_jump(cond_call_adr))) + # if this is a COND_CALL_VALUE, we need to move the result in place + resloc = self.resloc + if resloc is not None and resloc is not eax: + mc.MOV(resloc, eax) + # restoring the registers saved above, and doing pop_gcmap(), is + # left to the cond_call_slowpath helper. We must only restore eax, + # if needed. 
+ if restore_eax: + v = gpr_reg_mgr_cls.all_reg_indexes[eax.value] + mc.MOV_rb(eax.value, v * WORD + base_ofs) + # + # if needed, emit now the guard_no_exception + if self.guard_token_no_exception is not None: + assembler.generate_guard_no_exception( + self.guard_token_no_exception) + def cond_call(self, gcmap, imm_func, arglocs, resloc=None): assert self.guard_success_cc >= 0 - j_location = self.mc.emit_forward_jump_cond( - rx86.invert_condition(self.guard_success_cc)) + sp = self.CondCallSlowPath(self.mc, self.guard_success_cc) + sp.set_continue_addr(self.mc) self.guard_success_cc = rx86.cond_none + sp.gcmap = gcmap + sp.imm_func = imm_func + sp.arglocs = arglocs + sp.resloc = resloc + sp.should_be_saved = self._regalloc.rm.reg_bindings.values() # - self.push_gcmap(self.mc, gcmap, store=True) - # - # first save away the 4 registers from 'cond_call_register_arguments' - # plus the register 'eax' - base_ofs = self.cpu.get_baseofs_of_frame_field() - should_be_saved = self._regalloc.rm.reg_bindings.values() - restore_eax = False - for gpr in cond_call_register_arguments + [eax]: - if gpr not in should_be_saved or gpr is resloc: - continue - v = gpr_reg_mgr_cls.all_reg_indexes[gpr.value] - self.mc.MOV_br(v * WORD + base_ofs, gpr.value) - if gpr is eax: - restore_eax = True - # - # load the 0-to-4 arguments into these registers - from rpython.jit.backend.x86.jump import remap_frame_layout - remap_frame_layout(self, arglocs, - cond_call_register_arguments[:len(arglocs)], - X86_64_SCRATCH_REG if IS_X86_64 else None) - # - # load the constant address of the function to call into eax - self.mc.MOV(eax, imm_func) - # - # figure out which variant of cond_call_slowpath to call, and call it callee_only = False floats = False if self._regalloc is not None: @@ -2414,34 +2491,25 @@ callee_only = True if self._regalloc.xrm.reg_bindings: floats = True - cond_call_adr = self.cond_call_slowpath[floats * 2 + callee_only] - self.mc.CALL(imm(follow_jump(cond_call_adr))) - # if this is a COND_CALL_VALUE, we need to move the result in place - if resloc is not None and resloc is not eax: - self.mc.MOV(resloc, eax) - # restoring the registers saved above, and doing pop_gcmap(), is left - # to the cond_call_slowpath helper. We must only restore eax, if - # needed. 
- if restore_eax: - v = gpr_reg_mgr_cls.all_reg_indexes[eax.value] - self.mc.MOV_rb(eax.value, v * WORD + base_ofs) + sp.variant_num = floats * 2 + callee_only # - self.mc.patch_forward_jump(j_location) - # might be overridden again to skip over the following - # guard_no_exception too - self.previous_cond_call_jcond = j_location + self.pending_slowpaths.append(sp) + + class MallocCondSlowPath(codebuf.SlowPath): + def generate_body(self, assembler, mc): + assembler.push_gcmap(mc, self.gcmap, store=True) + mc.CALL(imm(follow_jump(assembler.malloc_slowpath))) def malloc_cond(self, nursery_free_adr, nursery_top_adr, size, gcmap): assert size & (WORD-1) == 0 # must be correctly aligned self.mc.MOV(ecx, heap(nursery_free_adr)) self.mc.LEA_rm(edx.value, (ecx.value, size)) self.mc.CMP(edx, heap(nursery_top_adr)) - jna_location = self.mc.emit_forward_jump('NA') # patched later - # save the gcmap - self.push_gcmap(self.mc, gcmap, store=True) - self.mc.CALL(imm(follow_jump(self.malloc_slowpath))) - self.mc.patch_forward_jump(jna_location) + sp = self.MallocCondSlowPath(self.mc, rx86.Conditions['A']) + sp.gcmap = gcmap self.mc.MOV(heap(nursery_free_adr), edx) + sp.set_continue_addr(self.mc) + self.pending_slowpaths.append(sp) def malloc_cond_varsize_frame(self, nursery_free_adr, nursery_top_adr, sizeloc, gcmap): @@ -2454,12 +2522,31 @@ else: self.mc.LEA_ra(edx.value, (ecx.value, sizeloc.value, 0, 0)) self.mc.CMP(edx, heap(nursery_top_adr)) - jna_location = self.mc.emit_forward_jump('NA') # patched later - # save the gcmap - self.push_gcmap(self.mc, gcmap, store=True) - self.mc.CALL(imm(follow_jump(self.malloc_slowpath))) - self.mc.patch_forward_jump(jna_location) + sp = self.MallocCondSlowPath(self.mc, rx86.Conditions['A']) + sp.gcmap = gcmap self.mc.MOV(heap(nursery_free_adr), edx) + sp.set_continue_addr(self.mc) + self.pending_slowpaths.append(sp) + + class MallocCondVarsizeSlowPath(codebuf.SlowPath): + def generate_body(self, assembler, mc): + # save the gcmap + assembler.push_gcmap(mc, self.gcmap, store=True) + kind = self.kind + if kind == rewrite.FLAG_ARRAY: + mc.MOV_si(WORD, self.itemsize) + mc.MOV_ri(ecx.value, self.arraydescr.tid) + addr = assembler.malloc_slowpath_varsize + else: + if kind == rewrite.FLAG_STR: + addr = assembler.malloc_slowpath_str + else: + assert kind == rewrite.FLAG_UNICODE + addr = assembler.malloc_slowpath_unicode + lengthloc = self.lengthloc + assert lengthloc is not ecx and lengthloc is not edx + mc.MOV(edx, lengthloc) + mc.CALL(imm(follow_jump(addr))) def malloc_cond_varsize(self, kind, nursery_free_adr, nursery_top_adr, lengthloc, itemsize, maxlength, gcmap, @@ -2500,34 +2587,24 @@ # now edx contains the total size in bytes, rounded up to a multiple # of WORD, plus nursery_free_adr self.mc.CMP(edx, heap(nursery_top_adr)) - jna_location = self.mc.emit_forward_jump('NA') # patched later - # self.mc.patch_forward_jump(ja_location) - # save the gcmap - self.push_gcmap(self.mc, gcmap, store=True) - if kind == rewrite.FLAG_ARRAY: - self.mc.MOV_si(WORD, itemsize) - self.mc.MOV(edx, lengthloc) - self.mc.MOV_ri(ecx.value, arraydescr.tid) - addr = self.malloc_slowpath_varsize - else: - if kind == rewrite.FLAG_STR: - addr = self.malloc_slowpath_str - else: - assert kind == rewrite.FLAG_UNICODE - addr = self.malloc_slowpath_unicode - self.mc.MOV(edx, lengthloc) - self.mc.CALL(imm(follow_jump(addr))) - jmp_location = self.mc.emit_forward_jump_uncond() # jump to later - # - self.mc.patch_forward_jump(jna_location) - self.mc.force_frame_size(DEFAULT_FRAME_BYTES) - # write down 
the tid, but not if it's the result of the CALL + # Note: we call the slow path in condition 'A', which may be + # true either because the CMP just above really got that + # condition, or because we jumped here from ja_location before. + # In both cases, the jumps are forward-going and the expected + # common case is "not taken". + sp = self.MallocCondVarsizeSlowPath(self.mc, rx86.Conditions['A']) + sp.gcmap = gcmap + sp.kind = kind + sp.itemsize = itemsize + sp.lengthloc = lengthloc + sp.arraydescr = arraydescr + # some more code that is only if we *don't* call the slow + # path: write down the tid, and save edx into nursery_free_adr self.mc.MOV(mem(ecx, 0), imm(arraydescr.tid)) - # while we're at it, this line is not needed if we've done the CALL self.mc.MOV(heap(nursery_free_adr), edx) - # - self.mc.patch_forward_jump(jmp_location) + sp.set_continue_addr(self.mc) + self.pending_slowpaths.append(sp) def store_force_descr(self, op, fail_locs, frame_depth): guard_token = self.implement_guard_recovery(op.opnum, diff --git a/rpython/jit/backend/x86/callbuilder.py b/rpython/jit/backend/x86/callbuilder.py --- a/rpython/jit/backend/x86/callbuilder.py +++ b/rpython/jit/backend/x86/callbuilder.py @@ -11,6 +11,7 @@ r12, r13, r14, r15, X86_64_SCRATCH_REG, X86_64_XMM_SCRATCH_REG, RegLoc, RawEspLoc, RawEbpLoc, imm, ImmedLoc) from rpython.jit.backend.x86.jump import remap_frame_layout +from rpython.jit.backend.x86 import codebuf from rpython.jit.backend.llsupport.callbuilder import AbstractCallBuilder from rpython.jit.backend.llsupport import llerrno from rpython.rtyper.lltypesystem import llmemory, rffi @@ -239,7 +240,7 @@ if IS_X86_32: tmpreg = edx else: - tmpreg = r11 # edx is used for 3rd argument + tmpreg = r10 # edx is used for 3rd argument mc.MOV_rm(tmpreg.value, (tlofsreg.value, p_errno)) mc.MOV32_rm(eax.value, (tlofsreg.value, rpy_errno)) mc.MOV32_mr((tmpreg.value, 0), eax.value) @@ -293,6 +294,41 @@ tlofsreg = self.get_tlofs_reg() # => esi (possibly reused) mc.MOV32_mr((tlofsreg.value, lasterror), eax.value) + class ReacqGilSlowPath(codebuf.SlowPath): + early_jump_addr = 0 + + def generate_body(self, assembler, mc): + if self.early_jump_addr != 0: + # This slow-path has two entry points, with two + # conditional jumps. We can jump to the regular start + # of this slow-path with the 2nd conditional jump. Or, + # we can jump past the "MOV(heap(fastgil), ecx)" + # instruction from the 1st conditional jump. + # This instruction reverts the rpy_fastgil acquired + # previously, so that the general 'reacqgil_addr' + # function can acquire it again. It must only be done + # if we actually succeeded in acquiring rpy_fastgil. 
+ from rpython.jit.backend.x86.assembler import heap + mc.MOV(heap(self.fastgil), ecx) + offset = mc.get_relative_pos() - self.early_jump_addr + mc.overwrite32(self.early_jump_addr-4, offset) + # scratch register forgotten here, by get_relative_pos() + + # call the reacqgil() function + cb = self.callbuilder + if not cb.result_value_saved_early: + cb.save_result_value(save_edx=False) + if assembler._is_asmgcc(): + if IS_X86_32: + css_value = edx + old_value = ecx + mc.MOV_sr(4, old_value.value) + mc.MOV_sr(0, css_value.value) + # on X86_64, they are already in the right registers + mc.CALL(imm(follow_jump(assembler.reacqgil_addr))) + if not cb.result_value_saved_early: + cb.restore_result_value(save_edx=False) + def move_real_result_and_call_reacqgil_addr(self, fastgil): from rpython.jit.backend.x86 import rx86 # @@ -314,8 +350,8 @@ if not self.result_value_saved_early: mc.MOV_sr(12, edx.value) restore_edx = True - css_value = edx - old_value = ecx + css_value = edx # note: duplicated in ReacqGilSlowPath + old_value = ecx # elif IS_X86_64: css_value = edi old_value = esi @@ -341,35 +377,25 @@ # thread. So here we check if the shadowstack pointer # is still the same as before we released the GIL (saved # in 'ebx'), and if not, we fall back to 'reacqgil_addr'. - jne_location = mc.emit_forward_jump('NE') + mc.J_il(rx86.Conditions['NE'], 0xfffff) # patched later + early_jump_addr = mc.get_relative_pos(break_basic_block=False) + # ^^^ this jump will go to almost the same place as the + # ReacqGilSlowPath() computes, but one instruction farther, + # i.e. just after the "MOV(heap(fastgil), ecx)". + # here, ecx (=old_value) is zero (so rpy_fastgil was in 'released' # state before the XCHG, but the XCHG acquired it by writing 1) rst = gcrootmap.get_root_stack_top_addr() mc = self.mc mc.CMP(ebx, heap(rst)) - je_location = mc.emit_forward_jump('E') - # revert the rpy_fastgil acquired above, so that the - # general 'reacqgil_addr' below can acquire it again... 
-            mc.MOV(heap(fastgil), ecx)
-            # patch the JNE above
-            mc.patch_forward_jump(jne_location)
+            sp = self.ReacqGilSlowPath(mc, rx86.Conditions['NE'])
+            sp.early_jump_addr = early_jump_addr
+            sp.fastgil = fastgil
         else:
-            je_location = mc.emit_forward_jump('E')
-            #
-            # Yes, we need to call the reacqgil() function
-            if not self.result_value_saved_early:
-                self.save_result_value(save_edx=False)
-            if self.asm._is_asmgcc():
-                if IS_X86_32:
-                    mc.MOV_sr(4, old_value.value)
-                    mc.MOV_sr(0, css_value.value)
-                # on X86_64, they are already in the right registers
-            mc.CALL(imm(follow_jump(self.asm.reacqgil_addr)))
-            if not self.result_value_saved_early:
-                self.restore_result_value(save_edx=False)
-            #
-            # patch the JE above
-            mc.patch_forward_jump(je_location)
+            sp = self.ReacqGilSlowPath(mc, rx86.Conditions['NE'])
+            sp.callbuilder = self
+            sp.set_continue_addr(mc)
+            self.asm.pending_slowpaths.append(sp)
         #
         if restore_edx:
             mc.MOV_rs(edx.value, 12)   # restore this
diff --git a/rpython/jit/backend/x86/codebuf.py b/rpython/jit/backend/x86/codebuf.py
--- a/rpython/jit/backend/x86/codebuf.py
+++ b/rpython/jit/backend/x86/codebuf.py
@@ -42,10 +42,10 @@
         self.ops_offset = {}
 
     def add_pending_relocation(self):
-        self.relocations.append(self.get_relative_pos())
+        self.relocations.append(self.get_relative_pos(break_basic_block=False))
 
     def mark_op(self, op):
-        pos = self.get_relative_pos()
+        pos = self.get_relative_pos(break_basic_block=False)
         self.ops_offset[op] = pos
 
     def copy_to_raw_memory(self, addr):
@@ -64,11 +64,11 @@
 
     def emit_forward_jump_cond(self, cond):
         self.J_il8(cond, 0)
-        return self.get_relative_pos()
+        return self.get_relative_pos(break_basic_block=False)
 
     def emit_forward_jump_uncond(self):
         self.JMP_l8(0)
-        return self.get_relative_pos()
+        return self.get_relative_pos(break_basic_block=False)
 
     def patch_forward_jump(self, jcond_location):
         offset = self.get_relative_pos() - jcond_location
@@ -76,3 +76,38 @@
         if offset > 127:
             raise ShortJumpTooFar
         self.overwrite(jcond_location-1, chr(offset))
+
+    def get_relative_pos(self, break_basic_block=True):
+        if break_basic_block:
+            self.forget_scratch_register()
+        return BlockBuilderMixin.get_relative_pos(self)
+
+
+class SlowPath(object):
+    def __init__(self, mc, condition):
+        mc.J_il(condition, 0xfffff)    # patched later
+        self.cond_jump_addr = mc.get_relative_pos(break_basic_block=False)
+        self.saved_scratch_value_1 = mc.get_scratch_register_known_value()
+        self.frame_size = mc._frame_size
+
+    def set_continue_addr(self, mc):
+        self.continue_addr = mc.get_relative_pos(break_basic_block=False)
+        self.saved_scratch_value_2 = mc.get_scratch_register_known_value()
+        assert self.frame_size == mc._frame_size
+
+    def generate(self, assembler, mc):
+        # no alignment here, prefer compactness for these slow-paths.
+        # patch the original jump to go here
+        offset = mc.get_relative_pos() - self.cond_jump_addr
+        mc.overwrite32(self.cond_jump_addr-4, offset)
+        # restore the knowledge of the scratch register value
+        # (this does not emit any code)
+        mc.force_frame_size(self.frame_size)
+        mc.restore_scratch_register_known_value(self.saved_scratch_value_1)
+        # generate the body of the slow-path
+        self.generate_body(assembler, mc)
+        # reload (if needed) the (possibly different) scratch register value
+        mc.load_scratch_if_known(self.saved_scratch_value_2)
+        # jump back
+        curpos = mc.get_relative_pos() + 5
+        mc.JMP_l(self.continue_addr - curpos)
diff --git a/rpython/jit/backend/x86/jump.py b/rpython/jit/backend/x86/jump.py
--- a/rpython/jit/backend/x86/jump.py
+++ b/rpython/jit/backend/x86/jump.py
@@ -77,6 +77,7 @@
             assembler.regalloc_pop(dst)
             return
         assembler.regalloc_mov(src, tmpreg)
+        assembler.mc.forget_scratch_register()
         src = tmpreg
     assembler.regalloc_mov(src, dst)
 
diff --git a/rpython/jit/backend/x86/regalloc.py b/rpython/jit/backend/x86/regalloc.py
--- a/rpython/jit/backend/x86/regalloc.py
+++ b/rpython/jit/backend/x86/regalloc.py
@@ -435,9 +435,9 @@
 
     def consider_guard_not_invalidated(self, op):
         mc = self.assembler.mc
-        n = mc.get_relative_pos()
+        n = mc.get_relative_pos(break_basic_block=False)
         self.perform_guard(op, [], None)
-        assert n == mc.get_relative_pos()
+        assert n == mc.get_relative_pos(break_basic_block=False)
         # ensure that the next label is at least 5 bytes farther than
         # the current position. Otherwise, when invalidating the guard,
         # we would overwrite randomly the next label's position.
diff --git a/rpython/jit/backend/x86/regloc.py b/rpython/jit/backend/x86/regloc.py
--- a/rpython/jit/backend/x86/regloc.py
+++ b/rpython/jit/backend/x86/regloc.py
@@ -4,7 +4,7 @@
 from rpython.jit.backend.x86.arch import WORD, IS_X86_32, IS_X86_64
 from rpython.tool.sourcetools import func_with_new_name
 from rpython.rlib.objectmodel import specialize, instantiate
-from rpython.rlib.rarithmetic import intmask
+from rpython.rlib.rarithmetic import intmask, r_uint
 from rpython.jit.metainterp.history import FLOAT, INT
 from rpython.jit.codewriter import longlong
 from rpython.rtyper.lltypesystem import rffi, lltype
@@ -355,7 +355,8 @@
 # without an xmm scratch reg.
 X86_64_XMM_SCRATCH_REG = xmm15
 
-unrolling_location_codes = unrolling_iterable(list("rbsmajix"))
+# note: 'r' is after 'i' in this list, for _binaryop()
+unrolling_location_codes = unrolling_iterable(list("irbsmajx"))
 
 @specialize.arg(1)
 def _rx86_getattr(obj, methname):
@@ -372,9 +373,7 @@
 
 class LocationCodeBuilder(object):
     _mixin_ = True
 
-    _reuse_scratch_register = False   # for now, this is always False
-    _scratch_register_known = False   # for now, this is always False
-    _scratch_register_value = 0
+    _scratch_register_value = -1    # -1 means 'unknown'
 
     def _binaryop(name):
@@ -383,7 +382,7 @@
             val2 = loc2.value_i()
             if name == 'MOV' and isinstance(loc1, RegLoc):
                 self.MOV_ri(loc1.value, val2)
-                return
+                return True
             code1 = loc1.location_code()
             if code1 == 'j':
                 checkvalue = loc1.value_j()
@@ -402,10 +401,11 @@
                 self.MOV_ri(freereg.value, val2)
                 INSN(self, loc1, freereg)
                 self.POP_r(freereg.value)
+                return True
             else:
                 # For this case, we should not need the scratch register more than here.
                 self._load_scratch(val2)
-                INSN(self, loc1, X86_64_SCRATCH_REG)
+                return False
 
         def invoke(self, codes, val1, val2):
             methname = name + "_" + codes
@@ -433,15 +433,15 @@
             code1 = loc1.location_code()
             code2 = loc2.location_code()
 
-            # You can pass in the scratch register as a location, but you
-            # must be careful not to combine it with location types that
-            # might need to use the scratch register themselves.
-            if loc2 is X86_64_SCRATCH_REG:
-                if code1 == 'j':
-                    assert (name.startswith("MOV") and
-                            rx86.fits_in_32bits(loc1.value_j()))
-            if loc1 is X86_64_SCRATCH_REG and not name.startswith("MOV"):
-                assert code2 not in ('j', 'i')
+            # You cannot pass in the scratch register as a location,
+            # except with a MOV instruction.
+            if name.startswith('MOV'):
+                if loc2 is X86_64_SCRATCH_REG:
+                    assert code1 != 'j' and code1 != 'm' and code1 != 'a'
+                if loc1 is X86_64_SCRATCH_REG:
+                    self.forget_scratch_register()
+            elif loc1 is X86_64_SCRATCH_REG or loc2 is X86_64_SCRATCH_REG:
+                raise AssertionError("%s with scratch reg specified" % name)
 
             for possible_code2 in unrolling_location_codes:
                 if not has_implementation_for('?', possible_code2):
@@ -451,8 +451,14 @@
                     #
                     # Fake out certain operations for x86_64
                     if self.WORD == 8 and possible_code2 == 'i' and not rx86.fits_in_32bits(val2):
-                        insn_with_64_bit_immediate(self, loc1, loc2)
-                        return
+                        if insn_with_64_bit_immediate(self, loc1, loc2):
+                            return    # done
+                        loc2 = X86_64_SCRATCH_REG
+                        code2 = 'r'
+                        # NB. unrolling_location_codes contains 'r'
+                        # after 'i', so that it will be found after
+                        # this iteration
+                        continue
                     #
                     # Regular case
                     for possible_code1 in unrolling_location_codes:
@@ -487,6 +493,9 @@
     def _unaryop(name):
 
         def INSN(self, loc):
+            if loc is X86_64_SCRATCH_REG:
+                raise AssertionError("%s with scratch reg specified" % name)
+
             code = loc.location_code()
             for possible_code in unrolling_location_codes:
                 if code == possible_code:
@@ -532,6 +541,9 @@
                    else:
                        methname = name + "_" + possible_code
                        _rx86_getattr(self, methname)(val)
+            # This is for CALL and JMP, so it's correct to forget
+            # the value of the R11 register here.
+            self.forget_scratch_register()
 
         return func_with_new_name(INSN, "INSN_" + name)
 
@@ -540,16 +552,18 @@
         # If we are within a "reuse_scratch_register" block, we remember the
         # last value we loaded to the scratch register and encode the address
         # as an offset from that if we can
-        if self._scratch_register_known:
-            offset = addr - self._scratch_register_value
+        if self._scratch_register_value != -1:
+            offset = r_uint(addr) - r_uint(self._scratch_register_value)
+            offset = intmask(offset)
             if rx86.fits_in_32bits(offset):
+                #print '_addr_as_reg_offset(%x) [REUSED r11+%d]' % (
+                #    addr, offset)
                 return (X86_64_SCRATCH_REG.value, offset)
+            #print '_addr_as_reg_offset(%x) [too far]' % (addr,)
             # else: fall through
-
-        if self._reuse_scratch_register:
-            self._scratch_register_known = True
-            self._scratch_register_value = addr
-
+        #else:
+        #    print '_addr_as_reg_offset(%x) [new]' % (addr,)
+        self._scratch_register_value = addr
        self.MOV_ri(X86_64_SCRATCH_REG.value, addr)
        return (X86_64_SCRATCH_REG.value, 0)
 
@@ -557,12 +571,10 @@
         # For cases where an AddressLoc has the location_code 'm', but
         # where the static offset does not fit in 32-bits. We have to fall
         # back to the X86_64_SCRATCH_REG. Returns a new location encoded
-        # as mode 'm' too. These are all possibly rare cases; don't try
-        # to reuse a past value of the scratch register at all.
-        self._scratch_register_known = False
-        self.MOV_ri(X86_64_SCRATCH_REG.value, static_offset)
-        self.LEA_ra(X86_64_SCRATCH_REG.value,
-                    (basereg, X86_64_SCRATCH_REG.value, 0, 0))
+        # as mode 'm' too. These are all possibly rare cases.
+        reg, ofs = self._addr_as_reg_offset(static_offset)
+        self.forget_scratch_register()
+        self.LEA_ra(X86_64_SCRATCH_REG.value, (basereg, reg, 0, ofs))
         return (X86_64_SCRATCH_REG.value, 0)
 
     def _fix_static_offset_64_a(self, (basereg, scalereg,
@@ -570,41 +582,59 @@
         # For cases where an AddressLoc has the location_code 'a', but
         # where the static offset does not fit in 32-bits. We have to fall
         # back to the X86_64_SCRATCH_REG. In one case it is even more
-        # annoying. These are all possibly rare cases; don't try to reuse a
-        # past value of the scratch register at all.
-        self._scratch_register_known = False
-        self.MOV_ri(X86_64_SCRATCH_REG.value, static_offset)
+        # annoying. These are all possibly rare cases.
+        reg, ofs = self._addr_as_reg_offset(static_offset)
         #
         if basereg != rx86.NO_BASE_REGISTER:
-            self.LEA_ra(X86_64_SCRATCH_REG.value,
-                        (basereg, X86_64_SCRATCH_REG.value, 0, 0))
-        return (X86_64_SCRATCH_REG.value, scalereg, scale, 0)
+            self.forget_scratch_register()
+            self.LEA_ra(X86_64_SCRATCH_REG.value, (basereg, reg, 0, ofs))
+            reg = X86_64_SCRATCH_REG.value
+            ofs = 0
+        return (reg, scalereg, scale, ofs)
 
     def _load_scratch(self, value):
-        if (self._scratch_register_known
-            and value == self._scratch_register_value):
-            return
-        if self._reuse_scratch_register:
-            self._scratch_register_known = True
-            self._scratch_register_value = value
+        if self._scratch_register_value != -1:
+            if self._scratch_register_value == value:
+                #print '_load_scratch(%x) [REUSED]' % (value,)
+                return
+            offset = r_uint(value) - r_uint(self._scratch_register_value)
+            offset = intmask(offset)
+            if rx86.fits_in_32bits(offset):
+                #print '_load_scratch(%x) [LEA r11+%d]' % (value, offset)
+                #global COUNT_
+                #try:
+                #    COUNT_ += 1
+                #except NameError:
+                #    COUNT_ = 1
+                #if COUNT_ % 182 == 0:
+                #    import pdb;pdb.set_trace()
+                self.LEA_rm(X86_64_SCRATCH_REG.value,
+                            (X86_64_SCRATCH_REG.value, offset))
+                self._scratch_register_value = value
+                return
+            #print '_load_scratch(%x) [too far]' % (value,)
+        #else:
+        #    print '_load_scratch(%x) [new]' % (value,)
+        self._scratch_register_value = value
         self.MOV_ri(X86_64_SCRATCH_REG.value, value)
 
+    def forget_scratch_register(self):
+        self._scratch_register_value = -1
+
+    def get_scratch_register_known_value(self):
+        return self._scratch_register_value
+
+    def restore_scratch_register_known_value(self, saved_value):
+        self._scratch_register_value = saved_value
+
+    def load_scratch_if_known(self, saved_value):
+        if saved_value != -1:
+            assert IS_X86_64
+            self._load_scratch(saved_value)
+
     def trap(self):
         self.INT3()
 
-    def begin_reuse_scratch_register(self):
-        # --NEVER CALLED (only from a specific test)--
-        # Flag the beginning of a block where it is okay to reuse the value
-        # of the scratch register. In theory we shouldn't have to do this if
-        # we were careful to mark all possible targets of a jump or call, and
-        # "forget" the value of the scratch register at those positions, but
-        # for now this seems safer.
-        self._reuse_scratch_register = True
-
-    def end_reuse_scratch_register(self):
-        self._reuse_scratch_register = False
-        self._scratch_register_known = False
-
     def _vector_size_choose(name):
         def invoke(self, suffix, val1, val2):
             methname = name + suffix
diff --git a/rpython/jit/backend/x86/test/test_jump.py b/rpython/jit/backend/x86/test/test_jump.py
--- a/rpython/jit/backend/x86/test/test_jump.py
+++ b/rpython/jit/backend/x86/test/test_jump.py
@@ -26,6 +26,11 @@
         assert isinstance(to_loc, FrameLoc)
         self.ops.append(('immedmem2mem', from_loc, to_loc))
 
+    class mc:
+        @staticmethod
+        def forget_scratch_register():
+            pass
+
     def got(self, expected):
         print '------------------------ comparing ---------------------------'
         for op1, op2 in zip(self.ops, expected):
@@ -405,6 +410,10 @@
             print "pop", x
         def regalloc_immedmem2mem(self, x, y):
             print "?????????????????????????"
+        class mc:
+            @staticmethod
+            def forget_scratch_register():
+                pass
     def main():
         srclocs = [FrameLoc(9999, x, 'i') for x,y in CASE]
         dstlocs = [FrameLoc(9999, y, 'i') for x,y in CASE]
diff --git a/rpython/jit/backend/x86/test/test_regloc.py b/rpython/jit/backend/x86/test/test_regloc.py
--- a/rpython/jit/backend/x86/test/test_regloc.py
+++ b/rpython/jit/backend/x86/test/test_regloc.py
@@ -149,10 +149,8 @@
     def test_reuse_scratch_register(self):
         base_addr = intmask(0xFEDCBA9876543210)
         cb = LocationCodeBuilder64()
-        cb.begin_reuse_scratch_register()
         cb.MOV(ecx, heap(base_addr))
         cb.MOV(ecx, heap(base_addr + 8))
-        cb.end_reuse_scratch_register()
 
         expected_instructions = (
             # mov r11, 0xFEDCBA9876543210
@@ -213,12 +211,9 @@
     def test_64bit_address_4(self):
         base_addr = intmask(0xFEDCBA9876543210)
         cb = LocationCodeBuilder64()
-        cb.begin_reuse_scratch_register()
-        assert cb._reuse_scratch_register is True
-        assert cb._scratch_register_known is False
+        assert cb._scratch_register_value == -1
         cb.MOV(ecx, AddressLoc(edx, esi, 2, base_addr))
-        assert cb._reuse_scratch_register is True
-        assert cb._scratch_register_known is False
+        assert cb._scratch_register_value == -1
         # this case is a CMP_ra
         #
         expected_instructions = (
diff --git a/rpython/jit/backend/x86/test/test_runner.py b/rpython/jit/backend/x86/test/test_runner.py
--- a/rpython/jit/backend/x86/test/test_runner.py
+++ b/rpython/jit/backend/x86/test/test_runner.py
@@ -39,7 +39,7 @@
                              'nop; '   # for the label
                              'add; test; je; jmp;')   # plus some padding
     bridge_loop_instructions = (
-        'cmp; jge; mov;( movabs;)? mov; mov(abs)?; call; mov(abs)?; jmp;')
+        'cmp; jl; mov(abs)?; jmp;')
 
     def get_cpu(self):
         cpu = CPU(rtyper=None, stats=FakeStats())
diff --git a/rpython/jit/backend/x86/vector_ext.py b/rpython/jit/backend/x86/vector_ext.py
--- a/rpython/jit/backend/x86/vector_ext.py
+++ b/rpython/jit/backend/x86/vector_ext.py
@@ -173,9 +173,10 @@
                 return
             elif arg.type == INT:
                 scratchloc = X86_64_SCRATCH_REG
+                self.mc.forget_scratch_register()
                 self.mc.PEXTRQ_rxi(targetloc.value, accumloc.value, 0)
                 self.mc.PEXTRQ_rxi(scratchloc.value, accumloc.value, 1)
-                self.mc.ADD(targetloc, scratchloc)
+                self.mc.ADD_rr(targetloc.value, scratchloc.value)
                 return
         not_implemented("reduce sum for %s not impl."
                         % arg)
@@ -387,6 +388,7 @@
             return    # already the right size
         if size == 4 and tosize == 8:
             scratch = X86_64_SCRATCH_REG.value
+            self.mc.forget_scratch_register()
            self.mc.PEXTRD_rxi(scratch, srcloc.value, 1)
            self.mc.PINSRQ_xri(resloc.value, scratch, 1)
            self.mc.PEXTRD_rxi(scratch, srcloc.value, 0)
@@ -394,6 +396,7 @@

_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit
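
The new SlowPath class in codebuf.py follows the usual out-of-line slow-path pattern: emit a conditional jump with a placeholder target, keep assembling the fast path, and only at the very end generate the slow-path body, patch the jump to reach it, and jump back. The following is a rough, pure-Python sketch of that control flow, for illustration only; MiniAssembler, MiniSlowPath and the (opcode, arg) pseudo-instructions are invented names, not part of the changeset.

class MiniAssembler(object):
    def __init__(self):
        self.code = []              # list of (opcode, arg) pseudo-instructions
        self.pending_slowpaths = []

    def get_relative_pos(self):
        return len(self.code)


class MiniSlowPath(object):
    def __init__(self, mc, condition):
        # emit a conditional jump whose target is patched later
        mc.code.append(('J' + condition, None))
        self.cond_jump_addr = mc.get_relative_pos() - 1   # index of that jump

    def set_continue_addr(self, mc):
        # the fast path resumes here once the slow path is done
        self.continue_addr = mc.get_relative_pos()

    def generate_body(self, mc):
        mc.code.append(('CALL', 'reacqgil'))              # placeholder body

    def generate(self, mc):
        # patch the original conditional jump to point at the slow path,
        # emit the out-of-line body, then jump back to the main sequence
        op, _ = mc.code[self.cond_jump_addr]
        mc.code[self.cond_jump_addr] = (op, mc.get_relative_pos())
        self.generate_body(mc)
        mc.code.append(('JMP', self.continue_addr))


mc = MiniAssembler()
mc.code.append(('CMP', 'rpy_fastgil'))
sp = MiniSlowPath(mc, 'NE')            # taken only in the contended case
sp.set_continue_addr(mc)
mc.pending_slowpaths.append(sp)
mc.code.append(('RET', None))
for sp in mc.pending_slowpaths:        # slow paths are emitted at the very end
    sp.generate(mc)
assert mc.code[sp.cond_jump_addr] == ('JNE', 3)     # patched to the slow path
assert mc.code[-1] == ('JMP', sp.continue_addr)     # jumps back afterwards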
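
The regloc.py changes replace the old begin/end_reuse_scratch_register machinery with a single cached value: whenever a 64-bit constant is needed in the scratch register R11, the builder first checks whether the previously loaded value is close enough that a short LEA with a 32-bit displacement can be used instead of a full 64-bit reload, and the cache is dropped (forget_scratch_register) at every position that a jump or call can reach. Below is a minimal standalone sketch of that decision logic, assuming invented names (R11Cache, fits_in_32bits) and string pseudo-instructions instead of the real MOV_ri/LEA_rm encodings.

def fits_in_32bits(value):
    return -2**31 <= value < 2**31


class R11Cache(object):
    def __init__(self):
        self.known_value = -1      # -1 means 'unknown', as in the diff

    def load(self, value):
        """Return the pseudo-instruction that puts 'value' into R11."""
        if self.known_value != -1:
            if self.known_value == value:
                return 'reuse r11'                    # nothing to emit at all
            offset = value - self.known_value
            if fits_in_32bits(offset):
                self.known_value = value
                return 'lea r11, [r11%+d]' % offset   # short 32-bit displacement
        self.known_value = value
        return 'movabs r11, 0x%x' % value             # full 64-bit reload

    def forget(self):
        # called wherever a jump or call could land, since the register
        # contents can no longer be assumed to be known there
        self.known_value = -1


cache = R11Cache()
assert cache.load(0x12340000).startswith('movabs')    # nothing known yet
assert cache.load(0x12340008) == 'lea r11, [r11+8]'   # close to the last value
assert cache.load(0x12340008) == 'reuse r11'          # exact match, free
cache.forget()                                        # e.g. right after a CALL
assert cache.load(0x12340010).startswith('movabs')    # reloaded from scratch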