Author: Armin Rigo <ar...@tunes.org>
Branch: reusing-r11
Changeset: r90965:d8f5a5347abb
Date: 2017-04-05 15:29 +0200
http://bitbucket.org/pypy/pypy/changeset/d8f5a5347abb/

Log:    in-progress

diff --git a/rpython/jit/backend/x86/assembler.py b/rpython/jit/backend/x86/assembler.py
--- a/rpython/jit/backend/x86/assembler.py
+++ b/rpython/jit/backend/x86/assembler.py
@@ -277,7 +277,7 @@
         #
         mc.TEST_rr(eax.value, eax.value)
         mc.J_il(rx86.Conditions['Z'], 0xfffff) # patched later
-        jz_location = mc.get_relative_pos()
+        jz_location = mc.get_relative_pos(break_basic_block=False)
         mc.MOV_rr(ecx.value, eax.value)
         #
         nursery_free_adr = self.cpu.gc_ll_descr.get_nursery_free_addr()
@@ -718,6 +718,7 @@
         if rx86.fits_in_32bits(offset):
             mc.JMP_l(offset)
         else:
+            # mc.forget_scratch_register() not needed here
             mc.MOV_ri(X86_64_SCRATCH_REG.value, adr_new_target)
             mc.JMP_r(X86_64_SCRATCH_REG.value)
         mc.copy_to_raw_memory(adr_jump_offset)
@@ -830,10 +831,10 @@
         descrs = self.cpu.gc_ll_descr.getframedescrs(self.cpu)
         ofs = self.cpu.unpack_fielddescr(descrs.arraydescr.lendescr)
         mc.CMP_bi(ofs, 0xffffff)     # force writing 32 bit
-        stack_check_cmp_ofs = mc.get_relative_pos() - 4
+        stack_check_cmp_ofs = mc.get_relative_pos(break_basic_block=False) - 4
         jg_location = mc.emit_forward_jump('GE')
         mc.MOV_si(WORD, 0xffffff)     # force writing 32 bit
-        ofs2 = mc.get_relative_pos() - 4
+        ofs2 = mc.get_relative_pos(break_basic_block=False) - 4
         self.push_gcmap(mc, gcmap, store=True)
         mc.CALL(imm(self._frame_realloc_slowpath))
         # patch the JG above
@@ -850,11 +851,11 @@
         descrs = self.cpu.gc_ll_descr.getframedescrs(self.cpu)
         ofs = self.cpu.unpack_fielddescr(descrs.arraydescr.lendescr)
         mc.CMP_bi(ofs, 0xffffff)
-        stack_check_cmp_ofs = mc.get_relative_pos() - 4
+        stack_check_cmp_ofs = mc.get_relative_pos(break_basic_block=False) - 4
         jg_location = mc.emit_forward_jump('GE')
         mc.MOV_rr(edi.value, ebp.value)
         mc.MOV_ri(esi.value, 0xffffff)
-        ofs2 = mc.get_relative_pos() - 4
+        ofs2 = mc.get_relative_pos(break_basic_block=False) - 4
         mc.CALL(imm(self.cpu.realloc_frame_crash))
         # patch the JG above
         mc.patch_forward_jump(jg_location)
@@ -895,6 +896,7 @@
             # "mov r11, addr; jmp r11" is up to 13 bytes, which fits in there
             # because we always write "mov r11, imm-as-8-bytes; call *r11" in
             # the first place.
+            # mc.forget_scratch_register() not needed here
             mc.MOV_ri(X86_64_SCRATCH_REG.value, adr_new_target)
             mc.JMP_r(X86_64_SCRATCH_REG.value)
             p = rffi.cast(rffi.INTP, adr_jump_offset)
@@ -939,7 +941,7 @@
             # would be used to pass arguments #3 and #4 (even though, so
             # far, the assembler only receives two arguments).
             tloc = esi
-            old = r11
+            old = r10
         # eax = address in the stack of a 3-words struct vmprof_stack_s
         self.mc.LEA_rs(eax.value, (FRAME_FIXED_SIZE - 4) * WORD)
         # old = current value of vmprof_tl_stack
@@ -1023,27 +1025,14 @@
         fit in 32 bits, it will be loaded in r11.
         """
         rst = gcrootmap.get_root_stack_top_addr()
-        if rx86.fits_in_32bits(rst):
-            mc.MOV_rj(ebx.value, rst)            # MOV ebx, [rootstacktop]
-        else:
-            mc.MOV_ri(X86_64_SCRATCH_REG.value, rst) # MOV r11, rootstacktop
-            mc.MOV_rm(ebx.value, (X86_64_SCRATCH_REG.value, 0))
-            # MOV ebx, [r11]
-        #
+        mc.MOV(ebx, heap(rst))                  # maybe via loading r11
         return rst
 
     def _call_header_shadowstack(self, gcrootmap):
         rst = self._load_shadowstack_top_in_ebx(self.mc, gcrootmap)
         self.mc.MOV_mr((ebx.value, 0), ebp.value)      # MOV [ebx], ebp
         self.mc.ADD_ri(ebx.value, WORD)
-        if rx86.fits_in_32bits(rst):
-            self.mc.MOV_jr(rst, ebx.value)            # MOV [rootstacktop], ebx
-        else:
-            # The integer 'rst' doesn't fit in 32 bits, so we know that
-            # _load_shadowstack_top_in_ebx() above loaded it in r11.
-            # Reuse it.  Be careful not to overwrite r11 in the middle!
-            self.mc.MOV_mr((X86_64_SCRATCH_REG.value, 0),
-                           ebx.value) # MOV [r11], ebx
+        self.mc.MOV(heap(rst), ebx)                   # MOV [rootstacktop], ebx
 
     def _call_footer_shadowstack(self, gcrootmap):
         rst = gcrootmap.get_root_stack_top_addr()
@@ -1449,7 +1438,7 @@
         # has been emitted.  64-bit mode only.
         assert IS_X86_64
         address_in_buffer = index * WORD   # at the start of the buffer
-        p_location = self.mc.get_relative_pos()
+        p_location = self.mc.get_relative_pos(break_basic_block=False)
         offset = address_in_buffer - p_location
         self.mc.overwrite32(p_location-4, offset)
 
@@ -1551,7 +1540,7 @@
             self.mc.add_pending_relocation()
         elif WORD == 8:
             self.mc.J_il(rx86.Conditions['Z'], 0)
-            pos = self.mc.get_relative_pos()
+            pos = self.mc.get_relative_pos(break_basic_block=False)
             self.pending_memoryerror_trampoline_from.append(pos)
 
     # ----------
@@ -1721,7 +1710,8 @@
 
     def genop_guard_guard_not_invalidated(self, guard_op, guard_token,
                                           locs, ign):
-        pos = self.mc.get_relative_pos() + 1 # after potential jmp
+        pos = self.mc.get_relative_pos(break_basic_block=False)
+        pos += 1   # after potential jmp
         guard_token.pos_jump_offset = pos
         self.pending_guard_tokens.append(guard_token)
 
@@ -2077,7 +2067,8 @@
         assert self.guard_success_cc >= 0
         self.mc.J_il(rx86.invert_condition(self.guard_success_cc), 0)
         self.guard_success_cc = rx86.cond_none
-        guard_token.pos_jump_offset = self.mc.get_relative_pos() - 4
+        pos = self.mc.get_relative_pos(break_basic_block=False)
+        guard_token.pos_jump_offset = pos - 4
         self.pending_guard_tokens.append(guard_token)
 
     def _genop_real_call(self, op, arglocs, resloc):
@@ -2125,6 +2116,7 @@
 
         faildescrindex = self.get_gcref_from_faildescr(faildescr)
         if IS_X86_64:
+            self.mc.forget_scratch_register()
             self.mc.MOV_rp(X86_64_SCRATCH_REG.value, 0)
             self._patch_load_from_gc_table(faildescrindex)
             self.mc.MOV(raw_stack(ofs), X86_64_SCRATCH_REG)
@@ -2313,6 +2305,7 @@
                 if IS_X86_64 and isinstance(loc_base, RegLoc):
                     # copy loc_index into r11
                     tmp1 = X86_64_SCRATCH_REG
+                    mc.forget_scratch_register()
                     mc.MOV_rr(tmp1.value, loc_index.value)
                     final_pop = False
                 else:
@@ -2325,7 +2318,13 @@
                 # XOR tmp, -8
                 mc.XOR_ri(tmp1.value, -8)
                 # BTS [loc_base], tmp
-                mc.BTS(addr_add_const(loc_base, 0), tmp1)
+                if final_pop:
+                    # r11 is not specially used, fall back to regloc.py
+                    mc.BTS(addr_add_const(loc_base, 0), tmp1)
+                else:
+                    # tmp1 is r11!  but in this case, loc_base is a
+                    # register so we can invoke directly rx86.py
+                    mc.BTS_mr((loc_base.value, 0), tmp1.value)
                 # done
                 if final_pop:
                     mc.POP_r(loc_index.value)
diff --git a/rpython/jit/backend/x86/callbuilder.py b/rpython/jit/backend/x86/callbuilder.py
--- a/rpython/jit/backend/x86/callbuilder.py
+++ b/rpython/jit/backend/x86/callbuilder.py
@@ -239,7 +239,7 @@
             if IS_X86_32:
                 tmpreg = edx
             else:
-                tmpreg = r11     # edx is used for 3rd argument
+                tmpreg = r10                   # edx is used for 3rd argument
             mc.MOV_rm(tmpreg.value, (tlofsreg.value, p_errno))
             mc.MOV32_rm(eax.value, (tlofsreg.value, rpy_errno))
             mc.MOV32_mr((tmpreg.value, 0), eax.value)
diff --git a/rpython/jit/backend/x86/codebuf.py b/rpython/jit/backend/x86/codebuf.py
--- a/rpython/jit/backend/x86/codebuf.py
+++ b/rpython/jit/backend/x86/codebuf.py
@@ -42,10 +42,10 @@
         self.ops_offset = {}
 
     def add_pending_relocation(self):
-        self.relocations.append(self.get_relative_pos())
+        self.relocations.append(self.get_relative_pos(break_basic_block=False))
 
     def mark_op(self, op):
-        pos = self.get_relative_pos()
+        pos = self.get_relative_pos(break_basic_block=False)
         self.ops_offset[op] = pos
 
     def copy_to_raw_memory(self, addr):
@@ -64,11 +64,11 @@
 
     def emit_forward_jump_cond(self, cond):
         self.J_il8(cond, 0)
-        return self.get_relative_pos()
+        return self.get_relative_pos(break_basic_block=False)
 
     def emit_forward_jump_uncond(self):
         self.JMP_l8(0)
-        return self.get_relative_pos()
+        return self.get_relative_pos(break_basic_block=False)
 
     def patch_forward_jump(self, jcond_location):
         offset = self.get_relative_pos() - jcond_location
@@ -76,3 +76,8 @@
         if offset > 127:
             raise ShortJumpTooFar
         self.overwrite(jcond_location-1, chr(offset))
+
+    def get_relative_pos(self, break_basic_block=True):
+        if break_basic_block:
+            self.forget_scratch_register()
+        return BlockBuilderMixin.get_relative_pos(self)
diff --git a/rpython/jit/backend/x86/jump.py b/rpython/jit/backend/x86/jump.py
--- a/rpython/jit/backend/x86/jump.py
+++ b/rpython/jit/backend/x86/jump.py
@@ -77,6 +77,7 @@
             assembler.regalloc_pop(dst)
             return
         assembler.regalloc_mov(src, tmpreg)
+        assembler.mc.forget_scratch_register()
         src = tmpreg
     assembler.regalloc_mov(src, dst)
 
diff --git a/rpython/jit/backend/x86/regalloc.py b/rpython/jit/backend/x86/regalloc.py
--- a/rpython/jit/backend/x86/regalloc.py
+++ b/rpython/jit/backend/x86/regalloc.py
@@ -435,9 +435,9 @@
 
     def consider_guard_not_invalidated(self, op):
         mc = self.assembler.mc
-        n = mc.get_relative_pos()
+        n = mc.get_relative_pos(break_basic_block=False)
         self.perform_guard(op, [], None)
-        assert n == mc.get_relative_pos()
+        assert n == mc.get_relative_pos(break_basic_block=False)
         # ensure that the next label is at least 5 bytes farther than
         # the current position.  Otherwise, when invalidating the guard,
         # we would overwrite randomly the next label's position.
diff --git a/rpython/jit/backend/x86/regloc.py b/rpython/jit/backend/x86/regloc.py
--- a/rpython/jit/backend/x86/regloc.py
+++ b/rpython/jit/backend/x86/regloc.py
@@ -4,7 +4,7 @@
 from rpython.jit.backend.x86.arch import WORD, IS_X86_32, IS_X86_64
 from rpython.tool.sourcetools import func_with_new_name
 from rpython.rlib.objectmodel import specialize, instantiate
-from rpython.rlib.rarithmetic import intmask
+from rpython.rlib.rarithmetic import intmask, r_uint
 from rpython.jit.metainterp.history import FLOAT, INT
 from rpython.jit.codewriter import longlong
 from rpython.rtyper.lltypesystem import rffi, lltype
@@ -355,7 +355,8 @@
 # without an xmm scratch reg.
 X86_64_XMM_SCRATCH_REG = xmm15
 
-unrolling_location_codes = unrolling_iterable(list("rbsmajix"))
+# note: 'r' is after 'i' in this list, for _binaryop()
+unrolling_location_codes = unrolling_iterable(list("irbsmajx"))
 
 @specialize.arg(1)
 def _rx86_getattr(obj, methname):
@@ -372,9 +373,7 @@
 class LocationCodeBuilder(object):
     _mixin_ = True
 
-    _reuse_scratch_register = False   # for now, this is always False
-    _scratch_register_known = False   # for now, this is always False
-    _scratch_register_value = 0
+    _scratch_register_value = 0    # 0 means 'unknown'
 
     def _binaryop(name):
 
@@ -383,7 +382,7 @@
             val2 = loc2.value_i()
             if name == 'MOV' and isinstance(loc1, RegLoc):
                 self.MOV_ri(loc1.value, val2)
-                return
+                return True
             code1 = loc1.location_code()
             if code1 == 'j':
                 checkvalue = loc1.value_j()
@@ -402,10 +401,11 @@
                 self.MOV_ri(freereg.value, val2)
                 INSN(self, loc1, freereg)
                 self.POP_r(freereg.value)
+                return True
             else:
                 # For this case, we should not need the scratch register more than here.
                 self._load_scratch(val2)
-                INSN(self, loc1, X86_64_SCRATCH_REG)
+                return False
 
         def invoke(self, codes, val1, val2):
             methname = name + "_" + codes
@@ -433,15 +433,15 @@
             code1 = loc1.location_code()
             code2 = loc2.location_code()
 
-            # You can pass in the scratch register as a location, but you
-            # must be careful not to combine it with location types that
-            # might need to use the scratch register themselves.
-            if loc2 is X86_64_SCRATCH_REG:
-                if code1 == 'j':
-                    assert (name.startswith("MOV") and
-                            rx86.fits_in_32bits(loc1.value_j()))
-            if loc1 is X86_64_SCRATCH_REG and not name.startswith("MOV"):
-                assert code2 not in ('j', 'i')
+            # You cannot pass in the scratch register as a location,
+            # except with a MOV instruction.
+            if name.startswith('MOV'):
+                if loc2 is X86_64_SCRATCH_REG:
+                    assert code1 != 'j' and code1 != 'm' and code1 != 'a'
+                if loc1 is X86_64_SCRATCH_REG:
+                    self.forget_scratch_register()
+            elif loc1 is X86_64_SCRATCH_REG or loc2 is X86_64_SCRATCH_REG:
+                raise AssertionError("%s with scratch reg specified" % name)
 
             for possible_code2 in unrolling_location_codes:
                 if not has_implementation_for('?', possible_code2):
@@ -451,8 +451,14 @@
                     #
                     # Fake out certain operations for x86_64
                     if self.WORD == 8 and possible_code2 == 'i' and not rx86.fits_in_32bits(val2):
-                        insn_with_64_bit_immediate(self, loc1, loc2)
-                        return
+                        if insn_with_64_bit_immediate(self, loc1, loc2):
+                            return      # done
+                        loc2 = X86_64_SCRATCH_REG
+                        code2 = 'r'
+                        # NB. unrolling_location_codes contains 'r'
+                        # after 'i', so that it will be found after
+                        # this iteration
+                        continue
                     #
                     # Regular case
                     for possible_code1 in unrolling_location_codes:
@@ -487,6 +493,9 @@
 
     def _unaryop(name):
         def INSN(self, loc):
+            if loc is X86_64_SCRATCH_REG:
+                raise AssertionError("%s with scratch reg specified" % name)
+
             code = loc.location_code()
             for possible_code in unrolling_location_codes:
                 if code == possible_code:
@@ -532,6 +541,9 @@
                     else:
                         methname = name + "_" + possible_code
                         _rx86_getattr(self, methname)(val)
+            # This is for CALL and JMP, so it's correct to forget
+            # the value of the R11 register here.
+            self.forget_scratch_register()
 
         return func_with_new_name(INSN, "INSN_" + name)
 
@@ -540,16 +552,18 @@
         # If we are within a "reuse_scratch_register" block, we remember the
         # last value we loaded to the scratch register and encode the address
         # as an offset from that if we can
-        if self._scratch_register_known:
-            offset = addr - self._scratch_register_value
+        if self._scratch_register_value != 0:
+            offset = r_uint(addr) - r_uint(self._scratch_register_value)
+            offset = intmask(offset)
             if rx86.fits_in_32bits(offset):
+                print '_addr_as_reg_offset(%x) [REUSED r11+%d]' % (
+                    addr, offset)
                 return (X86_64_SCRATCH_REG.value, offset)
+            print '_addr_as_reg_offset(%x) [too far]' % (addr,)
             # else: fall through
-
-        if self._reuse_scratch_register:
-            self._scratch_register_known = True
-            self._scratch_register_value = addr
-
+        else:
+            print '_addr_as_reg_offset(%x) [new]' % (addr,)
+        self._scratch_register_value = addr
         self.MOV_ri(X86_64_SCRATCH_REG.value, addr)
         return (X86_64_SCRATCH_REG.value, 0)
 
@@ -557,12 +571,11 @@
         # For cases where an AddressLoc has the location_code 'm', but
         # where the static offset does not fit in 32-bits.  We have to fall
         # back to the X86_64_SCRATCH_REG.  Returns a new location encoded
-        # as mode 'm' too.  These are all possibly rare cases; don't try
-        # to reuse a past value of the scratch register at all.
-        self._scratch_register_known = False
-        self.MOV_ri(X86_64_SCRATCH_REG.value, static_offset)
+        # as mode 'm' too.  These are all possibly rare cases.
+        ofs = self._addr_as_reg_offset(static_offset)
+        self.forget_scratch_register()
         self.LEA_ra(X86_64_SCRATCH_REG.value,
-                    (basereg, X86_64_SCRATCH_REG.value, 0, 0))
+                    (basereg, X86_64_SCRATCH_REG.value, 0, ofs))
         return (X86_64_SCRATCH_REG.value, 0)
 
     def _fix_static_offset_64_a(self, (basereg, scalereg,
@@ -570,41 +583,48 @@
         # For cases where an AddressLoc has the location_code 'a', but
         # where the static offset does not fit in 32-bits.  We have to fall
         # back to the X86_64_SCRATCH_REG.  In one case it is even more
-        # annoying.  These are all possibly rare cases; don't try to reuse a
-        # past value of the scratch register at all.
-        self._scratch_register_known = False
-        self.MOV_ri(X86_64_SCRATCH_REG.value, static_offset)
+        # annoying.  These are all possibly rare cases.
+        ofs = self._addr_as_reg_offset(static_offset)
         #
         if basereg != rx86.NO_BASE_REGISTER:
+            self.forget_scratch_register()
             self.LEA_ra(X86_64_SCRATCH_REG.value,
-                        (basereg, X86_64_SCRATCH_REG.value, 0, 0))
-        return (X86_64_SCRATCH_REG.value, scalereg, scale, 0)
+                        (basereg, X86_64_SCRATCH_REG.value, 0, ofs))
+            ofs = 0
+        return (X86_64_SCRATCH_REG.value, scalereg, scale, ofs)
 
     def _load_scratch(self, value):
-        if (self._scratch_register_known
-            and value == self._scratch_register_value):
-            return
-        if self._reuse_scratch_register:
-            self._scratch_register_known = True
-            self._scratch_register_value = value
+        if self._scratch_register_value != 0:
+            if self._scratch_register_value == value:
+                print '_load_scratch(%x) [REUSED]' % (value,)
+                return
+            offset = r_uint(value) - r_uint(self._scratch_register_value)
+            offset = intmask(offset)
+            if rx86.fits_in_32bits(offset):
+                print '_load_scratch(%x) [LEA r11+%d]' % (value, offset)
+                global COUNT_
+                try:
+                    COUNT_ += 1
+                except NameError:
+                    COUNT_ = 1
+                if COUNT_ % 182 == 0:
+                    import pdb;pdb.set_trace()
+                self.LEA_rm(X86_64_SCRATCH_REG.value,
+                    (X86_64_SCRATCH_REG.value, offset))
+                self._scratch_register_value = value
+                return
+            print '_load_scratch(%x) [too far]' % (value,)
+        else:
+            print '_load_scratch(%x) [new]' % (value,)
+        self._scratch_register_value = value
         self.MOV_ri(X86_64_SCRATCH_REG.value, value)
 
+    def forget_scratch_register(self):
+        self._scratch_register_value = 0
+
     def trap(self):
         self.INT3()
 
-    def begin_reuse_scratch_register(self):
-        # --NEVER CALLED (only from a specific test)--
-        # Flag the beginning of a block where it is okay to reuse the value
-        # of the scratch register. In theory we shouldn't have to do this if
-        # we were careful to mark all possible targets of a jump or call, and
-        # "forget" the value of the scratch register at those positions, but
-        # for now this seems safer.
-        self._reuse_scratch_register = True
-
-    def end_reuse_scratch_register(self):
-        self._reuse_scratch_register = False
-        self._scratch_register_known = False
-
     def _vector_size_choose(name):
         def invoke(self, suffix, val1, val2):
             methname = name + suffix
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit

Reply via email to