Author: Armin Rigo <ar...@tunes.org>
Branch: 
Changeset: r90976:6b21a8908e9f
Date: 2017-04-05 19:22 +0200
http://bitbucket.org/pypy/pypy/changeset/6b21a8908e9f/

Log:    hg merge reusing-r11

        Improve the generated machine code by tracking the (constant) value
        of r11 across instructions. This lets us avoid reloading r11 with
        another (apparently slowish) "movabs" instruction, replacing it with
        either nothing or a cheaper variant.
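
For illustration, a minimal sketch of the technique (hypothetical names, not
the code in this diff): remember the last constant loaded into the scratch
register, and when the next constant is close enough, emit a short LEA with a
32-bit displacement, or nothing at all, instead of a fresh 10-byte "movabs".

    # Sketch of constant-tracking for a scratch register (r11).  All names
    # here are hypothetical; the real logic is in regloc.py's
    # _load_scratch() below.

    def fits_in_32bits(x):
        return -2**31 <= x <= 2**31 - 1

    class ScratchTracker(object):
        def __init__(self):
            self.scratch_value = 0      # 0 means 'unknown'
            self.code = []

        def load_scratch(self, value):
            if self.scratch_value != 0:
                if self.scratch_value == value:
                    return              # r11 already holds 'value': no code
                delta = value - self.scratch_value
                if fits_in_32bits(delta):
                    # cheap variant instead of a fresh movabs
                    self.code.append("lea r11, [r11%+d]" % delta)
                    self.scratch_value = value
                    return
            self.scratch_value = value
            self.code.append("movabs r11, %d" % value)

        def forget_scratch_register(self):
            # must be called wherever r11 may be clobbered or jumped over
            self.scratch_value = 0

    t = ScratchTracker()
    t.load_scratch(0x10000)             # movabs
    t.load_scratch(0x10008)             # lea r11, [r11+8]
    t.load_scratch(0x10008)             # nothing at all
    assert t.code == ["movabs r11, 65536", "lea r11, [r11+8]"]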

diff --git a/rpython/jit/backend/x86/assembler.py b/rpython/jit/backend/x86/assembler.py
--- a/rpython/jit/backend/x86/assembler.py
+++ b/rpython/jit/backend/x86/assembler.py
@@ -277,7 +277,7 @@
         #
         mc.TEST_rr(eax.value, eax.value)
         mc.J_il(rx86.Conditions['Z'], 0xfffff) # patched later
-        jz_location = mc.get_relative_pos()
+        jz_location = mc.get_relative_pos(break_basic_block=False)
         mc.MOV_rr(ecx.value, eax.value)
         #
         nursery_free_adr = self.cpu.gc_ll_descr.get_nursery_free_addr()
@@ -718,6 +718,7 @@
         if rx86.fits_in_32bits(offset):
             mc.JMP_l(offset)
         else:
+            # mc.forget_scratch_register() not needed here
             mc.MOV_ri(X86_64_SCRATCH_REG.value, adr_new_target)
             mc.JMP_r(X86_64_SCRATCH_REG.value)
         mc.copy_to_raw_memory(adr_jump_offset)
@@ -830,10 +831,10 @@
         descrs = self.cpu.gc_ll_descr.getframedescrs(self.cpu)
         ofs = self.cpu.unpack_fielddescr(descrs.arraydescr.lendescr)
         mc.CMP_bi(ofs, 0xffffff)     # force writing 32 bit
-        stack_check_cmp_ofs = mc.get_relative_pos() - 4
+        stack_check_cmp_ofs = mc.get_relative_pos(break_basic_block=False) - 4
         jg_location = mc.emit_forward_jump('GE')
         mc.MOV_si(WORD, 0xffffff)     # force writing 32 bit
-        ofs2 = mc.get_relative_pos() - 4
+        ofs2 = mc.get_relative_pos(break_basic_block=False) - 4
         self.push_gcmap(mc, gcmap, store=True)
         mc.CALL(imm(self._frame_realloc_slowpath))
         # patch the JG above
@@ -850,11 +851,11 @@
         descrs = self.cpu.gc_ll_descr.getframedescrs(self.cpu)
         ofs = self.cpu.unpack_fielddescr(descrs.arraydescr.lendescr)
         mc.CMP_bi(ofs, 0xffffff)
-        stack_check_cmp_ofs = mc.get_relative_pos() - 4
+        stack_check_cmp_ofs = mc.get_relative_pos(break_basic_block=False) - 4
         jg_location = mc.emit_forward_jump('GE')
         mc.MOV_rr(edi.value, ebp.value)
         mc.MOV_ri(esi.value, 0xffffff)
-        ofs2 = mc.get_relative_pos() - 4
+        ofs2 = mc.get_relative_pos(break_basic_block=False) - 4
         mc.CALL(imm(self.cpu.realloc_frame_crash))
         # patch the JG above
         mc.patch_forward_jump(jg_location)
@@ -895,6 +896,7 @@
             # "mov r11, addr; jmp r11" is up to 13 bytes, which fits in there
             # because we always write "mov r11, imm-as-8-bytes; call *r11" in
             # the first place.
+            # mc.forget_scratch_register() not needed here
             mc.MOV_ri(X86_64_SCRATCH_REG.value, adr_new_target)
             mc.JMP_r(X86_64_SCRATCH_REG.value)
             p = rffi.cast(rffi.INTP, adr_jump_offset)
@@ -939,7 +941,7 @@
             # would be used to pass arguments #3 and #4 (even though, so
             # far, the assembler only receives two arguments).
             tloc = esi
-            old = r11
+            old = r10
         # eax = address in the stack of a 3-words struct vmprof_stack_s
         self.mc.LEA_rs(eax.value, (FRAME_FIXED_SIZE - 4) * WORD)
         # old = current value of vmprof_tl_stack
@@ -1023,27 +1025,14 @@
         fit in 32 bits, it will be loaded in r11.
         """
         rst = gcrootmap.get_root_stack_top_addr()
-        if rx86.fits_in_32bits(rst):
-            mc.MOV_rj(ebx.value, rst)            # MOV ebx, [rootstacktop]
-        else:
-            mc.MOV_ri(X86_64_SCRATCH_REG.value, rst) # MOV r11, rootstacktop
-            mc.MOV_rm(ebx.value, (X86_64_SCRATCH_REG.value, 0))
-            # MOV ebx, [r11]
-        #
+        mc.MOV(ebx, heap(rst))                  # maybe via loading r11
         return rst
 
     def _call_header_shadowstack(self, gcrootmap):
         rst = self._load_shadowstack_top_in_ebx(self.mc, gcrootmap)
         self.mc.MOV_mr((ebx.value, 0), ebp.value)      # MOV [ebx], ebp
         self.mc.ADD_ri(ebx.value, WORD)
-        if rx86.fits_in_32bits(rst):
-            self.mc.MOV_jr(rst, ebx.value)            # MOV [rootstacktop], ebx
-        else:
-            # The integer 'rst' doesn't fit in 32 bits, so we know that
-            # _load_shadowstack_top_in_ebx() above loaded it in r11.
-            # Reuse it.  Be careful not to overwrite r11 in the middle!
-            self.mc.MOV_mr((X86_64_SCRATCH_REG.value, 0),
-                           ebx.value) # MOV [r11], ebx
+        self.mc.MOV(heap(rst), ebx)                   # MOV [rootstacktop], ebx
 
     def _call_footer_shadowstack(self, gcrootmap):
         rst = gcrootmap.get_root_stack_top_addr()
@@ -1449,7 +1438,7 @@
         # has been emitted.  64-bit mode only.
         assert IS_X86_64
         address_in_buffer = index * WORD   # at the start of the buffer
-        p_location = self.mc.get_relative_pos()
+        p_location = self.mc.get_relative_pos(break_basic_block=False)
         offset = address_in_buffer - p_location
         self.mc.overwrite32(p_location-4, offset)
 
@@ -1551,7 +1540,7 @@
             self.mc.add_pending_relocation()
         elif WORD == 8:
             self.mc.J_il(rx86.Conditions['Z'], 0)
-            pos = self.mc.get_relative_pos()
+            pos = self.mc.get_relative_pos(break_basic_block=False)
             self.pending_memoryerror_trampoline_from.append(pos)
 
     # ----------
@@ -1721,7 +1710,8 @@
 
     def genop_guard_guard_not_invalidated(self, guard_op, guard_token,
                                           locs, ign):
-        pos = self.mc.get_relative_pos() + 1 # after potential jmp
+        pos = self.mc.get_relative_pos(break_basic_block=False)
+        pos += 1   # after potential jmp
         guard_token.pos_jump_offset = pos
         self.pending_guard_tokens.append(guard_token)
 
@@ -2077,7 +2067,8 @@
         assert self.guard_success_cc >= 0
         self.mc.J_il(rx86.invert_condition(self.guard_success_cc), 0)
         self.guard_success_cc = rx86.cond_none
-        guard_token.pos_jump_offset = self.mc.get_relative_pos() - 4
+        pos = self.mc.get_relative_pos(break_basic_block=False)
+        guard_token.pos_jump_offset = pos - 4
         self.pending_guard_tokens.append(guard_token)
 
     def _genop_real_call(self, op, arglocs, resloc):
@@ -2125,6 +2116,7 @@
 
         faildescrindex = self.get_gcref_from_faildescr(faildescr)
         if IS_X86_64:
+            self.mc.forget_scratch_register()
             self.mc.MOV_rp(X86_64_SCRATCH_REG.value, 0)
             self._patch_load_from_gc_table(faildescrindex)
             self.mc.MOV(raw_stack(ofs), X86_64_SCRATCH_REG)
@@ -2313,6 +2305,7 @@
                 if IS_X86_64 and isinstance(loc_base, RegLoc):
                     # copy loc_index into r11
                     tmp1 = X86_64_SCRATCH_REG
+                    mc.forget_scratch_register()
                     mc.MOV_rr(tmp1.value, loc_index.value)
                     final_pop = False
                 else:
@@ -2325,7 +2318,13 @@
                 # XOR tmp, -8
                 mc.XOR_ri(tmp1.value, -8)
                 # BTS [loc_base], tmp
-                mc.BTS(addr_add_const(loc_base, 0), tmp1)
+                if final_pop:
+                    # r11 is not specially used, fall back to regloc.py
+                    mc.BTS(addr_add_const(loc_base, 0), tmp1)
+                else:
+                    # tmp1 is r11!  but in this case, loc_base is a
+                    # register so we can invoke directly rx86.py
+                    mc.BTS_mr((loc_base.value, 0), tmp1.value)
                 # done
                 if final_pop:
                     mc.POP_r(loc_index.value)
diff --git a/rpython/jit/backend/x86/callbuilder.py b/rpython/jit/backend/x86/callbuilder.py
--- a/rpython/jit/backend/x86/callbuilder.py
+++ b/rpython/jit/backend/x86/callbuilder.py
@@ -239,7 +239,7 @@
             if IS_X86_32:
                 tmpreg = edx
             else:
-                tmpreg = r11     # edx is used for 3rd argument
+                tmpreg = r10                   # edx is used for 3rd argument
             mc.MOV_rm(tmpreg.value, (tlofsreg.value, p_errno))
             mc.MOV32_rm(eax.value, (tlofsreg.value, rpy_errno))
             mc.MOV32_mr((tmpreg.value, 0), eax.value)
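
The switch from r11 to r10 here, like the vmprof hunk above, follows from the
new tracking: r11 can no longer serve as a casual temporary, because
clobbering it without a forget_scratch_register() call would leave a stale
tracked value behind. A stand-alone illustration of that hazard (hypothetical
helpers, not backend code):

    tracked = {'r11': 0}                # 0 means 'unknown'
    emitted = []

    def mov_ri_r11(value):
        if tracked['r11'] == value:
            return                      # reuse: emit nothing
        tracked['r11'] = value
        emitted.append('movabs r11, %d' % value)

    def use_as_temp(reg):
        emitted.append('mov %s, [tls+p_errno]' % reg)
        # if reg were 'r11' we would also have to reset tracked['r11'] = 0

    mov_ri_r11(0x1234)
    use_as_temp('r10')                  # safe: tracking of r11 unaffected
    mov_ri_r11(0x1234)                  # correctly emits nothing
    assert emitted == ['movabs r11, 4660', 'mov r10, [tls+p_errno]']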
diff --git a/rpython/jit/backend/x86/codebuf.py b/rpython/jit/backend/x86/codebuf.py
--- a/rpython/jit/backend/x86/codebuf.py
+++ b/rpython/jit/backend/x86/codebuf.py
@@ -42,10 +42,10 @@
         self.ops_offset = {}
 
     def add_pending_relocation(self):
-        self.relocations.append(self.get_relative_pos())
+        self.relocations.append(self.get_relative_pos(break_basic_block=False))
 
     def mark_op(self, op):
-        pos = self.get_relative_pos()
+        pos = self.get_relative_pos(break_basic_block=False)
         self.ops_offset[op] = pos
 
     def copy_to_raw_memory(self, addr):
@@ -64,11 +64,11 @@
 
     def emit_forward_jump_cond(self, cond):
         self.J_il8(cond, 0)
-        return self.get_relative_pos()
+        return self.get_relative_pos(break_basic_block=False)
 
     def emit_forward_jump_uncond(self):
         self.JMP_l8(0)
-        return self.get_relative_pos()
+        return self.get_relative_pos(break_basic_block=False)
 
     def patch_forward_jump(self, jcond_location):
         offset = self.get_relative_pos() - jcond_location
@@ -76,3 +76,8 @@
         if offset > 127:
             raise ShortJumpTooFar
         self.overwrite(jcond_location-1, chr(offset))
+
+    def get_relative_pos(self, break_basic_block=True):
+        if break_basic_block:
+            self.forget_scratch_register()
+        return BlockBuilderMixin.get_relative_pos(self)
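
The override above makes forgetting the scratch register the default whenever
code asks for the current position, since such positions are usually recorded
as jump targets, and a jump ends the basic block in which r11's value is
known. Call sites that only record a spot to patch later pass
break_basic_block=False explicitly. A minimal sketch of the convention
(hypothetical Builder class, standing in for the real mixin):

    class Builder(object):
        def __init__(self):
            self.buf = []
            self.scratch_value = 0      # 0 means 'unknown'

        def forget_scratch_register(self):
            self.scratch_value = 0

        def get_relative_pos(self, break_basic_block=True):
            # by default, someone may later jump here, so the known
            # value of r11 becomes stale
            if break_basic_block:
                self.forget_scratch_register()
            return len(self.buf)

    b = Builder()
    b.scratch_value = 0x1234
    ofs = b.get_relative_pos(break_basic_block=False)   # patch offset only
    assert b.scratch_value == 0x1234                    # r11 still tracked
    pos = b.get_relative_pos()                          # possible jump target
    assert b.scratch_value == 0                         # forgotten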
diff --git a/rpython/jit/backend/x86/jump.py b/rpython/jit/backend/x86/jump.py
--- a/rpython/jit/backend/x86/jump.py
+++ b/rpython/jit/backend/x86/jump.py
@@ -77,6 +77,7 @@
             assembler.regalloc_pop(dst)
             return
         assembler.regalloc_mov(src, tmpreg)
+        assembler.mc.forget_scratch_register()
         src = tmpreg
     assembler.regalloc_mov(src, dst)
 
diff --git a/rpython/jit/backend/x86/regalloc.py b/rpython/jit/backend/x86/regalloc.py
--- a/rpython/jit/backend/x86/regalloc.py
+++ b/rpython/jit/backend/x86/regalloc.py
@@ -435,9 +435,9 @@
 
     def consider_guard_not_invalidated(self, op):
         mc = self.assembler.mc
-        n = mc.get_relative_pos()
+        n = mc.get_relative_pos(break_basic_block=False)
         self.perform_guard(op, [], None)
-        assert n == mc.get_relative_pos()
+        assert n == mc.get_relative_pos(break_basic_block=False)
         # ensure that the next label is at least 5 bytes farther than
         # the current position.  Otherwise, when invalidating the guard,
         # we would overwrite randomly the next label's position.
diff --git a/rpython/jit/backend/x86/regloc.py b/rpython/jit/backend/x86/regloc.py
--- a/rpython/jit/backend/x86/regloc.py
+++ b/rpython/jit/backend/x86/regloc.py
@@ -4,7 +4,7 @@
 from rpython.jit.backend.x86.arch import WORD, IS_X86_32, IS_X86_64
 from rpython.tool.sourcetools import func_with_new_name
 from rpython.rlib.objectmodel import specialize, instantiate
-from rpython.rlib.rarithmetic import intmask
+from rpython.rlib.rarithmetic import intmask, r_uint
 from rpython.jit.metainterp.history import FLOAT, INT
 from rpython.jit.codewriter import longlong
 from rpython.rtyper.lltypesystem import rffi, lltype
@@ -355,7 +355,8 @@
 # without an xmm scratch reg.
 X86_64_XMM_SCRATCH_REG = xmm15
 
-unrolling_location_codes = unrolling_iterable(list("rbsmajix"))
+# note: 'r' is after 'i' in this list, for _binaryop()
+unrolling_location_codes = unrolling_iterable(list("irbsmajx"))
 
 @specialize.arg(1)
 def _rx86_getattr(obj, methname):
@@ -372,9 +373,7 @@
 class LocationCodeBuilder(object):
     _mixin_ = True
 
-    _reuse_scratch_register = False   # for now, this is always False
-    _scratch_register_known = False   # for now, this is always False
-    _scratch_register_value = 0
+    _scratch_register_value = 0    # 0 means 'unknown'
 
     def _binaryop(name):
 
@@ -383,7 +382,7 @@
             val2 = loc2.value_i()
             if name == 'MOV' and isinstance(loc1, RegLoc):
                 self.MOV_ri(loc1.value, val2)
-                return
+                return True
             code1 = loc1.location_code()
             if code1 == 'j':
                 checkvalue = loc1.value_j()
@@ -402,10 +401,11 @@
                 self.MOV_ri(freereg.value, val2)
                 INSN(self, loc1, freereg)
                 self.POP_r(freereg.value)
+                return True
             else:
                 # For this case, we should not need the scratch register more than here.
                 self._load_scratch(val2)
-                INSN(self, loc1, X86_64_SCRATCH_REG)
+                return False
 
         def invoke(self, codes, val1, val2):
             methname = name + "_" + codes
@@ -433,15 +433,15 @@
             code1 = loc1.location_code()
             code2 = loc2.location_code()
 
-            # You can pass in the scratch register as a location, but you
-            # must be careful not to combine it with location types that
-            # might need to use the scratch register themselves.
-            if loc2 is X86_64_SCRATCH_REG:
-                if code1 == 'j':
-                    assert (name.startswith("MOV") and
-                            rx86.fits_in_32bits(loc1.value_j()))
-            if loc1 is X86_64_SCRATCH_REG and not name.startswith("MOV"):
-                assert code2 not in ('j', 'i')
+            # You cannot pass in the scratch register as a location,
+            # except with a MOV instruction.
+            if name.startswith('MOV'):
+                if loc2 is X86_64_SCRATCH_REG:
+                    assert code1 != 'j' and code1 != 'm' and code1 != 'a'
+                if loc1 is X86_64_SCRATCH_REG:
+                    self.forget_scratch_register()
+            elif loc1 is X86_64_SCRATCH_REG or loc2 is X86_64_SCRATCH_REG:
+                raise AssertionError("%s with scratch reg specified" % name)
 
             for possible_code2 in unrolling_location_codes:
                 if not has_implementation_for('?', possible_code2):
@@ -451,8 +451,14 @@
                     #
                     # Fake out certain operations for x86_64
                     if self.WORD == 8 and possible_code2 == 'i' and not rx86.fits_in_32bits(val2):
-                        insn_with_64_bit_immediate(self, loc1, loc2)
-                        return
+                        if insn_with_64_bit_immediate(self, loc1, loc2):
+                            return      # done
+                        loc2 = X86_64_SCRATCH_REG
+                        code2 = 'r'
+                        # NB. unrolling_location_codes contains 'r'
+                        # after 'i', so that it will be found after
+                        # this iteration
+                        continue
                     #
                     # Regular case
                     for possible_code1 in unrolling_location_codes:
@@ -487,6 +493,9 @@
 
     def _unaryop(name):
         def INSN(self, loc):
+            if loc is X86_64_SCRATCH_REG:
+                raise AssertionError("%s with scratch reg specified" % name)
+
             code = loc.location_code()
             for possible_code in unrolling_location_codes:
                 if code == possible_code:
@@ -532,6 +541,9 @@
                     else:
                         methname = name + "_" + possible_code
                         _rx86_getattr(self, methname)(val)
+            # This is for CALL and JMP, so it's correct to forget
+            # the value of the R11 register here.
+            self.forget_scratch_register()
 
         return func_with_new_name(INSN, "INSN_" + name)
 
@@ -540,16 +552,18 @@
         # If we are within a "reuse_scratch_register" block, we remember the
         # last value we loaded to the scratch register and encode the address
         # as an offset from that if we can
-        if self._scratch_register_known:
-            offset = addr - self._scratch_register_value
+        if self._scratch_register_value != 0:
+            offset = r_uint(addr) - r_uint(self._scratch_register_value)
+            offset = intmask(offset)
             if rx86.fits_in_32bits(offset):
+                #print '_addr_as_reg_offset(%x) [REUSED r11+%d]' % (
+                #    addr, offset)
                 return (X86_64_SCRATCH_REG.value, offset)
+            #print '_addr_as_reg_offset(%x) [too far]' % (addr,)
             # else: fall through
-
-        if self._reuse_scratch_register:
-            self._scratch_register_known = True
-            self._scratch_register_value = addr
-
+        #else:
+        #    print '_addr_as_reg_offset(%x) [new]' % (addr,)
+        self._scratch_register_value = addr
         self.MOV_ri(X86_64_SCRATCH_REG.value, addr)
         return (X86_64_SCRATCH_REG.value, 0)
 
@@ -557,12 +571,10 @@
         # For cases where an AddressLoc has the location_code 'm', but
         # where the static offset does not fit in 32-bits.  We have to fall
         # back to the X86_64_SCRATCH_REG.  Returns a new location encoded
-        # as mode 'm' too.  These are all possibly rare cases; don't try
-        # to reuse a past value of the scratch register at all.
-        self._scratch_register_known = False
-        self.MOV_ri(X86_64_SCRATCH_REG.value, static_offset)
-        self.LEA_ra(X86_64_SCRATCH_REG.value,
-                    (basereg, X86_64_SCRATCH_REG.value, 0, 0))
+        # as mode 'm' too.  These are all possibly rare cases.
+        reg, ofs = self._addr_as_reg_offset(static_offset)
+        self.forget_scratch_register()
+        self.LEA_ra(X86_64_SCRATCH_REG.value, (basereg, reg, 0, ofs))
         return (X86_64_SCRATCH_REG.value, 0)
 
     def _fix_static_offset_64_a(self, (basereg, scalereg,
@@ -570,41 +582,48 @@
         # For cases where an AddressLoc has the location_code 'a', but
         # where the static offset does not fit in 32-bits.  We have to fall
         # back to the X86_64_SCRATCH_REG.  In one case it is even more
-        # annoying.  These are all possibly rare cases; don't try to reuse a
-        # past value of the scratch register at all.
-        self._scratch_register_known = False
-        self.MOV_ri(X86_64_SCRATCH_REG.value, static_offset)
+        # annoying.  These are all possibly rare cases.
+        reg, ofs = self._addr_as_reg_offset(static_offset)
         #
         if basereg != rx86.NO_BASE_REGISTER:
-            self.LEA_ra(X86_64_SCRATCH_REG.value,
-                        (basereg, X86_64_SCRATCH_REG.value, 0, 0))
-        return (X86_64_SCRATCH_REG.value, scalereg, scale, 0)
+            self.forget_scratch_register()
+            self.LEA_ra(X86_64_SCRATCH_REG.value, (basereg, reg, 0, ofs))
+            reg = X86_64_SCRATCH_REG.value
+            ofs = 0
+        return (reg, scalereg, scale, ofs)
 
     def _load_scratch(self, value):
-        if (self._scratch_register_known
-            and value == self._scratch_register_value):
-            return
-        if self._reuse_scratch_register:
-            self._scratch_register_known = True
-            self._scratch_register_value = value
+        if self._scratch_register_value != 0:
+            if self._scratch_register_value == value:
+                #print '_load_scratch(%x) [REUSED]' % (value,)
+                return
+            offset = r_uint(value) - r_uint(self._scratch_register_value)
+            offset = intmask(offset)
+            if rx86.fits_in_32bits(offset):
+                #print '_load_scratch(%x) [LEA r11+%d]' % (value, offset)
+                #global COUNT_
+                #try:
+                #    COUNT_ += 1
+                #except NameError:
+                #    COUNT_ = 1
+                #if COUNT_ % 182 == 0:
+                #    import pdb;pdb.set_trace()
+                self.LEA_rm(X86_64_SCRATCH_REG.value,
+                    (X86_64_SCRATCH_REG.value, offset))
+                self._scratch_register_value = value
+                return
+            #print '_load_scratch(%x) [too far]' % (value,)
+        #else:
+        #    print '_load_scratch(%x) [new]' % (value,)
+        self._scratch_register_value = value
         self.MOV_ri(X86_64_SCRATCH_REG.value, value)
 
+    def forget_scratch_register(self):
+        self._scratch_register_value = 0
+
     def trap(self):
         self.INT3()
 
-    def begin_reuse_scratch_register(self):
-        # --NEVER CALLED (only from a specific test)--
-        # Flag the beginning of a block where it is okay to reuse the value
-        # of the scratch register. In theory we shouldn't have to do this if
-        # we were careful to mark all possible targets of a jump or call, and
-        # "forget" the value of the scratch register at those positions, but
-        # for now this seems safer.
-        self._reuse_scratch_register = True
-
-    def end_reuse_scratch_register(self):
-        self._reuse_scratch_register = False
-        self._scratch_register_known = False
-
     def _vector_size_choose(name):
         def invoke(self, suffix, val1, val2):
             methname = name + suffix
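
In _addr_as_reg_offset() and _load_scratch() above, the difference between
two addresses is computed on r_uint values and converted back with intmask():
subtracting two arbitrary machine words as signed integers could overflow in
RPython, while unsigned subtraction wraps around harmlessly. A plain-Python
approximation of the same computation (the real helpers live in
rpython.rlib.rarithmetic and rx86):

    import sys

    MASK = 2 * sys.maxint + 1           # all bits set, like an r_uint

    def intmask(n):
        # wrap an unsigned word back into a signed one
        n &= MASK
        if n > sys.maxint:
            n -= MASK + 1
        return n

    def fits_in_32bits(x):
        return -2**31 <= x <= 2**31 - 1

    def delta(addr, known):
        # unsigned subtraction wraps instead of overflowing
        return intmask((addr - known) & MASK)

    assert delta(0x7fff00000008, 0x7fff00000000) == 8
    assert delta(0x7fff00000000, 0x7fff00000008) == -8
    assert fits_in_32bits(delta(0x7fff00000008, 0x7fff00000000))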
diff --git a/rpython/jit/backend/x86/test/test_jump.py b/rpython/jit/backend/x86/test/test_jump.py
--- a/rpython/jit/backend/x86/test/test_jump.py
+++ b/rpython/jit/backend/x86/test/test_jump.py
@@ -26,6 +26,11 @@
         assert isinstance(to_loc,   FrameLoc)
         self.ops.append(('immedmem2mem', from_loc, to_loc))
 
+    class mc:
+        @staticmethod
+        def forget_scratch_register():
+            pass
+
     def got(self, expected):
         print '------------------------ comparing ---------------------------'
         for op1, op2 in zip(self.ops, expected):
@@ -405,6 +410,10 @@
             print "pop", x
         def regalloc_immedmem2mem(self, x, y):
             print "?????????????????????????"
+        class mc:
+            @staticmethod
+            def forget_scratch_register():
+                pass
     def main():
         srclocs = [FrameLoc(9999, x, 'i') for x,y in CASE]
         dstlocs = [FrameLoc(9999, y, 'i') for x,y in CASE]
diff --git a/rpython/jit/backend/x86/test/test_regloc.py b/rpython/jit/backend/x86/test/test_regloc.py
--- a/rpython/jit/backend/x86/test/test_regloc.py
+++ b/rpython/jit/backend/x86/test/test_regloc.py
@@ -149,10 +149,8 @@
     def test_reuse_scratch_register(self):
         base_addr = intmask(0xFEDCBA9876543210)
         cb = LocationCodeBuilder64()
-        cb.begin_reuse_scratch_register()
         cb.MOV(ecx, heap(base_addr))
         cb.MOV(ecx, heap(base_addr + 8))
-        cb.end_reuse_scratch_register()
 
         expected_instructions = (
                 # mov r11, 0xFEDCBA9876543210
@@ -213,12 +211,9 @@
     def test_64bit_address_4(self):
         base_addr = intmask(0xFEDCBA9876543210)
         cb = LocationCodeBuilder64()
-        cb.begin_reuse_scratch_register()
-        assert cb._reuse_scratch_register is True
-        assert cb._scratch_register_known is False
+        assert cb._scratch_register_value == 0
         cb.MOV(ecx, AddressLoc(edx, esi, 2, base_addr))
-        assert cb._reuse_scratch_register is True
-        assert cb._scratch_register_known is False
+        assert cb._scratch_register_value == 0
         # this case is a CMP_ra
         #
         expected_instructions = (
diff --git a/rpython/jit/backend/x86/test/test_runner.py b/rpython/jit/backend/x86/test/test_runner.py
--- a/rpython/jit/backend/x86/test/test_runner.py
+++ b/rpython/jit/backend/x86/test/test_runner.py
@@ -39,7 +39,7 @@
                                  'nop; '    # for the label
                                  'add; test; je; jmp;')   # plus some padding
         bridge_loop_instructions = (
-            'cmp; jge; mov;( movabs;)? mov; mov(abs)?; call; mov(abs)?; jmp;')
+            'cmp; jge; mov;( movabs;| lea;)? mov; (mov|movabs|lea); call; mov(abs)?; jmp;')
 
     def get_cpu(self):
         cpu = CPU(rtyper=None, stats=FakeStats())
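
The relaxed expectation above accepts either a fresh "movabs" or the new
cheap "lea" reload of r11. A quick check that the updated pattern matches
both shapes (the sample instruction streams are made up for illustration):

    import re

    pat = re.compile(r'cmp; jge; mov;( movabs;| lea;)? mov; '
                     r'(mov|movabs|lea); call; mov(abs)?; jmp;')
    assert pat.match('cmp; jge; mov; movabs; mov; movabs; call; movabs; jmp;')
    assert pat.match('cmp; jge; mov; lea; mov; lea; call; mov; jmp;')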
diff --git a/rpython/jit/backend/x86/vector_ext.py b/rpython/jit/backend/x86/vector_ext.py
--- a/rpython/jit/backend/x86/vector_ext.py
+++ b/rpython/jit/backend/x86/vector_ext.py
@@ -173,9 +173,10 @@
             return
         elif arg.type == INT:
             scratchloc = X86_64_SCRATCH_REG
+            self.mc.forget_scratch_register()
             self.mc.PEXTRQ_rxi(targetloc.value, accumloc.value, 0)
             self.mc.PEXTRQ_rxi(scratchloc.value, accumloc.value, 1)
-            self.mc.ADD(targetloc, scratchloc)
+            self.mc.ADD_rr(targetloc.value, scratchloc.value)
             return
 
         not_implemented("reduce sum for %s not impl." % arg)
@@ -387,6 +388,7 @@
             return # already the right size
         if size == 4 and tosize == 8:
             scratch = X86_64_SCRATCH_REG.value
+            self.mc.forget_scratch_register()
             self.mc.PEXTRD_rxi(scratch, srcloc.value, 1)
             self.mc.PINSRQ_xri(resloc.value, scratch, 1)
             self.mc.PEXTRD_rxi(scratch, srcloc.value, 0)
@@ -394,6 +396,7 @@
         elif size == 8 and tosize == 4:
             # is there a better sequence to move them?
             scratch = X86_64_SCRATCH_REG.value
+            self.mc.forget_scratch_register()
             self.mc.PEXTRQ_rxi(scratch, srcloc.value, 0)
             self.mc.PINSRD_xri(resloc.value, scratch, 0)
             self.mc.PEXTRQ_rxi(scratch, srcloc.value, 1)
@@ -426,6 +429,7 @@
     def genop_vec_expand_i(self, op, arglocs, resloc):
         srcloc, sizeloc = arglocs
         if not isinstance(srcloc, RegLoc):
+            # self.mc.forget_scratch_register(): done by self.mov()
             self.mov(srcloc, X86_64_SCRATCH_REG)
             srcloc = X86_64_SCRATCH_REG
         assert not srcloc.is_xmm
@@ -465,6 +469,7 @@
         while k > 0:
             if size == 8:
                 if resultloc.is_xmm and sourceloc.is_xmm: # both xmm
+                    self.mc.forget_scratch_register()
                     self.mc.PEXTRQ_rxi(X86_64_SCRATCH_REG.value, sourceloc.value, si)
                     self.mc.PINSRQ_xri(resultloc.value, X86_64_SCRATCH_REG.value, ri)
                 elif resultloc.is_xmm: # xmm <- reg
@@ -473,6 +478,7 @@
                     self.mc.PEXTRQ_rxi(resultloc.value, sourceloc.value, si)
             elif size == 4:
                 if resultloc.is_xmm and sourceloc.is_xmm:
+                    self.mc.forget_scratch_register()
                     self.mc.PEXTRD_rxi(X86_64_SCRATCH_REG.value, sourceloc.value, si)
                     self.mc.PINSRD_xri(resultloc.value, X86_64_SCRATCH_REG.value, ri)
                 elif resultloc.is_xmm:
@@ -481,6 +487,7 @@
                     self.mc.PEXTRD_rxi(resultloc.value, sourceloc.value, si)
             elif size == 2:
                 if resultloc.is_xmm and sourceloc.is_xmm:
+                    self.mc.forget_scratch_register()
                     self.mc.PEXTRW_rxi(X86_64_SCRATCH_REG.value, sourceloc.value, si)
                     self.mc.PINSRW_xri(resultloc.value, X86_64_SCRATCH_REG.value, ri)
                 elif resultloc.is_xmm:
@@ -489,6 +496,7 @@
                     self.mc.PEXTRW_rxi(resultloc.value, sourceloc.value, si)
             elif size == 1:
                 if resultloc.is_xmm and sourceloc.is_xmm:
+                    self.mc.forget_scratch_register()
                     self.mc.PEXTRB_rxi(X86_64_SCRATCH_REG.value, sourceloc.value, si)
                     self.mc.PINSRB_xri(resultloc.value, X86_64_SCRATCH_REG.value, ri)
                 elif resultloc.is_xmm:
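
All the vector_ext.py hunks follow the same invariant introduced by this
branch: the PEXTRQ/PEXTRD/PEXTRW/PEXTRB family writes r11 outside the tracked
MOV_ri/_load_scratch() path, so the tracked value must be dropped first, or a
later "cheap variant" would be computed from a stale constant. If one were to
factor the repeated pattern out, it might look like this (FakeMC and
pextr_into_scratch are made-up names for illustration):

    class FakeMC(object):
        def __init__(self):
            self.log = []
        def forget_scratch_register(self):
            self.log.append('forget')
        def PEXTRQ_rxi(self, *args):
            self.log.append('pextrq')

    def pextr_into_scratch(mc, mnemonic, *args):
        # r11 is about to be overwritten behind _load_scratch()'s back,
        # so drop the tracked value first
        mc.forget_scratch_register()
        getattr(mc, mnemonic)(*args)

    mc = FakeMC()
    pextr_into_scratch(mc, 'PEXTRQ_rxi', 11, 0, 1)
    assert mc.log == ['forget', 'pextrq']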