Author: Armin Rigo <ar...@tunes.org>
Branch:
Changeset: r90976:6b21a8908e9f
Date: 2017-04-05 19:22 +0200
http://bitbucket.org/pypy/pypy/changeset/6b21a8908e9f/
Log: hg merge reusing-r11

    Improve the generated machine code by tracking the (constant) value
    of r11 across instructions. This lets us avoid reloading r11 with
    another (apparently slowish) "movabs" instruction, replacing it with
    either nothing or a cheaper variant.

diff --git a/rpython/jit/backend/x86/assembler.py b/rpython/jit/backend/x86/assembler.py
--- a/rpython/jit/backend/x86/assembler.py
+++ b/rpython/jit/backend/x86/assembler.py
@@ -277,7 +277,7 @@
         #
         mc.TEST_rr(eax.value, eax.value)
         mc.J_il(rx86.Conditions['Z'], 0xfffff) # patched later
-        jz_location = mc.get_relative_pos()
+        jz_location = mc.get_relative_pos(break_basic_block=False)
         mc.MOV_rr(ecx.value, eax.value)
         #
         nursery_free_adr = self.cpu.gc_ll_descr.get_nursery_free_addr()
@@ -718,6 +718,7 @@
         if rx86.fits_in_32bits(offset):
             mc.JMP_l(offset)
         else:
+            # mc.forget_scratch_register() not needed here
             mc.MOV_ri(X86_64_SCRATCH_REG.value, adr_new_target)
             mc.JMP_r(X86_64_SCRATCH_REG.value)
         mc.copy_to_raw_memory(adr_jump_offset)
@@ -830,10 +831,10 @@
         descrs = self.cpu.gc_ll_descr.getframedescrs(self.cpu)
         ofs = self.cpu.unpack_fielddescr(descrs.arraydescr.lendescr)
         mc.CMP_bi(ofs, 0xffffff)     # force writing 32 bit
-        stack_check_cmp_ofs = mc.get_relative_pos() - 4
+        stack_check_cmp_ofs = mc.get_relative_pos(break_basic_block=False) - 4
         jg_location = mc.emit_forward_jump('GE')
         mc.MOV_si(WORD, 0xffffff)     # force writing 32 bit
-        ofs2 = mc.get_relative_pos() - 4
+        ofs2 = mc.get_relative_pos(break_basic_block=False) - 4
         self.push_gcmap(mc, gcmap, store=True)
         mc.CALL(imm(self._frame_realloc_slowpath))
         # patch the JG above
@@ -850,11 +851,11 @@
         descrs = self.cpu.gc_ll_descr.getframedescrs(self.cpu)
         ofs = self.cpu.unpack_fielddescr(descrs.arraydescr.lendescr)
         mc.CMP_bi(ofs, 0xffffff)
-        stack_check_cmp_ofs = mc.get_relative_pos() - 4
+        stack_check_cmp_ofs = mc.get_relative_pos(break_basic_block=False) - 4
         jg_location = mc.emit_forward_jump('GE')
         mc.MOV_rr(edi.value, ebp.value)
         mc.MOV_ri(esi.value, 0xffffff)
-        ofs2 = mc.get_relative_pos() - 4
+        ofs2 = mc.get_relative_pos(break_basic_block=False) - 4
         mc.CALL(imm(self.cpu.realloc_frame_crash))
         # patch the JG above
         mc.patch_forward_jump(jg_location)
@@ -895,6 +896,7 @@
         # "mov r11, addr; jmp r11" is up to 13 bytes, which fits in there
         # because we always write "mov r11, imm-as-8-bytes; call *r11" in
         # the first place.
+        # mc.forget_scratch_register() not needed here
         mc.MOV_ri(X86_64_SCRATCH_REG.value, adr_new_target)
         mc.JMP_r(X86_64_SCRATCH_REG.value)
         p = rffi.cast(rffi.INTP, adr_jump_offset)
@@ -939,7 +941,7 @@
         # would be used to pass arguments #3 and #4 (even though, so
         # far, the assembler only receives two arguments).
         tloc = esi
-        old = r11
+        old = r10
         # eax = address in the stack of a 3-words struct vmprof_stack_s
         self.mc.LEA_rs(eax.value, (FRAME_FIXED_SIZE - 4) * WORD)
         # old = current value of vmprof_tl_stack
@@ -1023,27 +1025,14 @@
         fit in 32 bits, it will be loaded in r11.
         """
         rst = gcrootmap.get_root_stack_top_addr()
-        if rx86.fits_in_32bits(rst):
-            mc.MOV_rj(ebx.value, rst)            # MOV ebx, [rootstacktop]
-        else:
-            mc.MOV_ri(X86_64_SCRATCH_REG.value, rst) # MOV r11, rootstacktop
-            mc.MOV_rm(ebx.value, (X86_64_SCRATCH_REG.value, 0))
-                                                 # MOV ebx, [r11]
-        #
+        mc.MOV(ebx, heap(rst))                   # maybe via loading r11
         return rst

     def _call_header_shadowstack(self, gcrootmap):
         rst = self._load_shadowstack_top_in_ebx(self.mc, gcrootmap)
         self.mc.MOV_mr((ebx.value, 0), ebp.value)      # MOV [ebx], ebp
         self.mc.ADD_ri(ebx.value, WORD)
-        if rx86.fits_in_32bits(rst):
-            self.mc.MOV_jr(rst, ebx.value)            # MOV [rootstacktop], ebx
-        else:
-            # The integer 'rst' doesn't fit in 32 bits, so we know that
-            # _load_shadowstack_top_in_ebx() above loaded it in r11.
-            # Reuse it.  Be careful not to overwrite r11 in the middle!
-            self.mc.MOV_mr((X86_64_SCRATCH_REG.value, 0),
-                           ebx.value)                 # MOV [r11], ebx
+        self.mc.MOV(heap(rst), ebx)           # MOV [rootstacktop], ebx

     def _call_footer_shadowstack(self, gcrootmap):
         rst = gcrootmap.get_root_stack_top_addr()
@@ -1449,7 +1438,7 @@
         # has been emitted.  64-bit mode only.
         assert IS_X86_64
         address_in_buffer = index * WORD   # at the start of the buffer
-        p_location = self.mc.get_relative_pos()
+        p_location = self.mc.get_relative_pos(break_basic_block=False)
         offset = address_in_buffer - p_location
         self.mc.overwrite32(p_location-4, offset)

@@ -1551,7 +1540,7 @@
             self.mc.add_pending_relocation()
         elif WORD == 8:
             self.mc.J_il(rx86.Conditions['Z'], 0)
-            pos = self.mc.get_relative_pos()
+            pos = self.mc.get_relative_pos(break_basic_block=False)
             self.pending_memoryerror_trampoline_from.append(pos)

     # ----------
@@ -1721,7 +1710,8 @@

     def genop_guard_guard_not_invalidated(self, guard_op, guard_token,
                                           locs, ign):
-        pos = self.mc.get_relative_pos() + 1 # after potential jmp
+        pos = self.mc.get_relative_pos(break_basic_block=False)
+        pos += 1  # after potential jmp
         guard_token.pos_jump_offset = pos
         self.pending_guard_tokens.append(guard_token)

@@ -2077,7 +2067,8 @@
         assert self.guard_success_cc >= 0
         self.mc.J_il(rx86.invert_condition(self.guard_success_cc), 0)
         self.guard_success_cc = rx86.cond_none
-        guard_token.pos_jump_offset = self.mc.get_relative_pos() - 4
+        pos = self.mc.get_relative_pos(break_basic_block=False)
+        guard_token.pos_jump_offset = pos - 4
         self.pending_guard_tokens.append(guard_token)

     def _genop_real_call(self, op, arglocs, resloc):
@@ -2125,6 +2116,7 @@

         faildescrindex = self.get_gcref_from_faildescr(faildescr)
         if IS_X86_64:
+            self.mc.forget_scratch_register()
             self.mc.MOV_rp(X86_64_SCRATCH_REG.value, 0)
             self._patch_load_from_gc_table(faildescrindex)
             self.mc.MOV(raw_stack(ofs), X86_64_SCRATCH_REG)
@@ -2313,6 +2305,7 @@
             if IS_X86_64 and isinstance(loc_base, RegLoc):
                 # copy loc_index into r11
                 tmp1 = X86_64_SCRATCH_REG
+                mc.forget_scratch_register()
                 mc.MOV_rr(tmp1.value, loc_index.value)
                 final_pop = False
             else:
@@ -2325,7 +2318,13 @@
             # XOR tmp, -8
             mc.XOR_ri(tmp1.value, -8)
             # BTS [loc_base], tmp
-            mc.BTS(addr_add_const(loc_base, 0), tmp1)
+            if final_pop:
+                # r11 is not specially used, fall back to regloc.py
+                mc.BTS(addr_add_const(loc_base, 0), tmp1)
+            else:
+                # tmp1 is r11!  but in this case, loc_base is a
+                # register so we can invoke directly rx86.py
+                mc.BTS_mr((loc_base.value, 0), tmp1.value)
             # done
             if final_pop:
                 mc.POP_r(loc_index.value)
diff --git a/rpython/jit/backend/x86/callbuilder.py b/rpython/jit/backend/x86/callbuilder.py
--- a/rpython/jit/backend/x86/callbuilder.py
+++ b/rpython/jit/backend/x86/callbuilder.py
@@ -239,7 +239,7 @@
         if IS_X86_32:
             tmpreg = edx
         else:
-            tmpreg = r11     # edx is used for 3rd argument
+            tmpreg = r10     # edx is used for 3rd argument
         mc.MOV_rm(tmpreg.value, (tlofsreg.value, p_errno))
         mc.MOV32_rm(eax.value, (tlofsreg.value, rpy_errno))
         mc.MOV32_mr((tmpreg.value, 0), eax.value)
diff --git a/rpython/jit/backend/x86/codebuf.py b/rpython/jit/backend/x86/codebuf.py
--- a/rpython/jit/backend/x86/codebuf.py
+++ b/rpython/jit/backend/x86/codebuf.py
@@ -42,10 +42,10 @@
         self.ops_offset = {}

     def add_pending_relocation(self):
-        self.relocations.append(self.get_relative_pos())
+        self.relocations.append(self.get_relative_pos(break_basic_block=False))

     def mark_op(self, op):
-        pos = self.get_relative_pos()
+        pos = self.get_relative_pos(break_basic_block=False)
         self.ops_offset[op] = pos

     def copy_to_raw_memory(self, addr):
@@ -64,11 +64,11 @@

     def emit_forward_jump_cond(self, cond):
         self.J_il8(cond, 0)
-        return self.get_relative_pos()
+        return self.get_relative_pos(break_basic_block=False)

     def emit_forward_jump_uncond(self):
         self.JMP_l8(0)
-        return self.get_relative_pos()
+        return self.get_relative_pos(break_basic_block=False)

     def patch_forward_jump(self, jcond_location):
         offset = self.get_relative_pos() - jcond_location
@@ -76,3 +76,8 @@
         if offset > 127:
             raise ShortJumpTooFar
         self.overwrite(jcond_location-1, chr(offset))
+
+    def get_relative_pos(self, break_basic_block=True):
+        if break_basic_block:
+            self.forget_scratch_register()
+        return BlockBuilderMixin.get_relative_pos(self)
diff --git a/rpython/jit/backend/x86/jump.py b/rpython/jit/backend/x86/jump.py
--- a/rpython/jit/backend/x86/jump.py
+++ b/rpython/jit/backend/x86/jump.py
@@ -77,6 +77,7 @@
             assembler.regalloc_pop(dst)
             return
         assembler.regalloc_mov(src, tmpreg)
+        assembler.mc.forget_scratch_register()
         src = tmpreg
     assembler.regalloc_mov(src, dst)

diff --git a/rpython/jit/backend/x86/regalloc.py b/rpython/jit/backend/x86/regalloc.py
--- a/rpython/jit/backend/x86/regalloc.py
+++ b/rpython/jit/backend/x86/regalloc.py
@@ -435,9 +435,9 @@

     def consider_guard_not_invalidated(self, op):
         mc = self.assembler.mc
-        n = mc.get_relative_pos()
+        n = mc.get_relative_pos(break_basic_block=False)
         self.perform_guard(op, [], None)
-        assert n == mc.get_relative_pos()
+        assert n == mc.get_relative_pos(break_basic_block=False)
         # ensure that the next label is at least 5 bytes farther than
         # the current position.  Otherwise, when invalidating the guard,
         # we would overwrite randomly the next label's position.
diff --git a/rpython/jit/backend/x86/regloc.py b/rpython/jit/backend/x86/regloc.py
--- a/rpython/jit/backend/x86/regloc.py
+++ b/rpython/jit/backend/x86/regloc.py
@@ -4,7 +4,7 @@
 from rpython.jit.backend.x86.arch import WORD, IS_X86_32, IS_X86_64
 from rpython.tool.sourcetools import func_with_new_name
 from rpython.rlib.objectmodel import specialize, instantiate
-from rpython.rlib.rarithmetic import intmask
+from rpython.rlib.rarithmetic import intmask, r_uint
 from rpython.jit.metainterp.history import FLOAT, INT
 from rpython.jit.codewriter import longlong
 from rpython.rtyper.lltypesystem import rffi, lltype
@@ -355,7 +355,8 @@
 # without an xmm scratch reg.
 X86_64_XMM_SCRATCH_REG = xmm15

-unrolling_location_codes = unrolling_iterable(list("rbsmajix"))
+# note: 'r' is after 'i' in this list, for _binaryop()
+unrolling_location_codes = unrolling_iterable(list("irbsmajx"))

 @specialize.arg(1)
 def _rx86_getattr(obj, methname):
@@ -372,9 +373,7 @@

 class LocationCodeBuilder(object):
     _mixin_ = True

-    _reuse_scratch_register = False   # for now, this is always False
-    _scratch_register_known = False   # for now, this is always False
-    _scratch_register_value = 0
+    _scratch_register_value = 0    # 0 means 'unknown'

     def _binaryop(name):
@@ -383,7 +382,7 @@
             val2 = loc2.value_i()
             if name == 'MOV' and isinstance(loc1, RegLoc):
                 self.MOV_ri(loc1.value, val2)
-                return
+                return True
             code1 = loc1.location_code()
             if code1 == 'j':
                 checkvalue = loc1.value_j()
@@ -402,10 +401,11 @@
                 self.MOV_ri(freereg.value, val2)
                 INSN(self, loc1, freereg)
                 self.POP_r(freereg.value)
+                return True
             else:
                 # For this case, we should not need the scratch register more than here.
                 self._load_scratch(val2)
-                INSN(self, loc1, X86_64_SCRATCH_REG)
+                return False

         def invoke(self, codes, val1, val2):
             methname = name + "_" + codes
@@ -433,15 +433,15 @@
             code1 = loc1.location_code()
             code2 = loc2.location_code()

-            # You can pass in the scratch register as a location, but you
-            # must be careful not to combine it with location types that
-            # might need to use the scratch register themselves.
-            if loc2 is X86_64_SCRATCH_REG:
-                if code1 == 'j':
-                    assert (name.startswith("MOV") and
-                            rx86.fits_in_32bits(loc1.value_j()))
-            if loc1 is X86_64_SCRATCH_REG and not name.startswith("MOV"):
-                assert code2 not in ('j', 'i')
+            # You cannot pass in the scratch register as a location,
+            # except with a MOV instruction.
+            if name.startswith('MOV'):
+                if loc2 is X86_64_SCRATCH_REG:
+                    assert code1 != 'j' and code1 != 'm' and code1 != 'a'
+                if loc1 is X86_64_SCRATCH_REG:
+                    self.forget_scratch_register()
+            elif loc1 is X86_64_SCRATCH_REG or loc2 is X86_64_SCRATCH_REG:
+                raise AssertionError("%s with scratch reg specified" % name)

             for possible_code2 in unrolling_location_codes:
                 if not has_implementation_for('?', possible_code2):
@@ -451,8 +451,14 @@
                     #
                     # Fake out certain operations for x86_64
                     if self.WORD == 8 and possible_code2 == 'i' and not rx86.fits_in_32bits(val2):
-                        insn_with_64_bit_immediate(self, loc1, loc2)
-                        return
+                        if insn_with_64_bit_immediate(self, loc1, loc2):
+                            return       # done
+                        loc2 = X86_64_SCRATCH_REG
+                        code2 = 'r'
+                        # NB. unrolling_location_codes contains 'r'
+                        # after 'i', so that it will be found after
+                        # this iteration
+                        continue
                     #
                     # Regular case
                     for possible_code1 in unrolling_location_codes:
@@ -487,6 +493,9 @@

     def _unaryop(name):
         def INSN(self, loc):
+            if loc is X86_64_SCRATCH_REG:
+                raise AssertionError("%s with scratch reg specified" % name)
+
             code = loc.location_code()
             for possible_code in unrolling_location_codes:
                 if code == possible_code:
@@ -532,6 +541,9 @@
                     else:
                         methname = name + "_" + possible_code
                         _rx86_getattr(self, methname)(val)
+                    # This is for CALL and JMP, so it's correct to forget
+                    # the value of the R11 register here.
+                    self.forget_scratch_register()

         return func_with_new_name(INSN, "INSN_" + name)

@@ -540,16 +552,18 @@
         # If we are within a "reuse_scratch_register" block, we remember the
         # last value we loaded to the scratch register and encode the address
         # as an offset from that if we can
-        if self._scratch_register_known:
-            offset = addr - self._scratch_register_value
+        if self._scratch_register_value != 0:
+            offset = r_uint(addr) - r_uint(self._scratch_register_value)
+            offset = intmask(offset)
             if rx86.fits_in_32bits(offset):
+                #print '_addr_as_reg_offset(%x) [REUSED r11+%d]' % (
+                #    addr, offset)
                 return (X86_64_SCRATCH_REG.value, offset)
+            #print '_addr_as_reg_offset(%x) [too far]' % (addr,)
             # else: fall through
-
-        if self._reuse_scratch_register:
-            self._scratch_register_known = True
-            self._scratch_register_value = addr
-
+        #else:
+        #    print '_addr_as_reg_offset(%x) [new]' % (addr,)
+        self._scratch_register_value = addr
         self.MOV_ri(X86_64_SCRATCH_REG.value, addr)
         return (X86_64_SCRATCH_REG.value, 0)

@@ -557,12 +571,10 @@
         # For cases where an AddressLoc has the location_code 'm', but
         # where the static offset does not fit in 32-bits.  We have to fall
         # back to the X86_64_SCRATCH_REG.  Returns a new location encoded
-        # as mode 'm' too.  These are all possibly rare cases; don't try
-        # to reuse a past value of the scratch register at all.
-        self._scratch_register_known = False
-        self.MOV_ri(X86_64_SCRATCH_REG.value, static_offset)
-        self.LEA_ra(X86_64_SCRATCH_REG.value,
-                    (basereg, X86_64_SCRATCH_REG.value, 0, 0))
+        # as mode 'm' too.  These are all possibly rare cases.
+        reg, ofs = self._addr_as_reg_offset(static_offset)
+        self.forget_scratch_register()
+        self.LEA_ra(X86_64_SCRATCH_REG.value, (basereg, reg, 0, ofs))
         return (X86_64_SCRATCH_REG.value, 0)

     def _fix_static_offset_64_a(self, (basereg, scalereg,
@@ -570,41 +582,48 @@
         # For cases where an AddressLoc has the location_code 'a', but
         # where the static offset does not fit in 32-bits.  We have to fall
        # back to the X86_64_SCRATCH_REG.  In one case it is even more
-        # annoying.  These are all possibly rare cases; don't try to reuse a
-        # past value of the scratch register at all.
-        self._scratch_register_known = False
-        self.MOV_ri(X86_64_SCRATCH_REG.value, static_offset)
+        # annoying.  These are all possibly rare cases.
+        reg, ofs = self._addr_as_reg_offset(static_offset)
         #
         if basereg != rx86.NO_BASE_REGISTER:
-            self.LEA_ra(X86_64_SCRATCH_REG.value,
-                        (basereg, X86_64_SCRATCH_REG.value, 0, 0))
-        return (X86_64_SCRATCH_REG.value, scalereg, scale, 0)
+            self.forget_scratch_register()
+            self.LEA_ra(X86_64_SCRATCH_REG.value, (basereg, reg, 0, ofs))
+            reg = X86_64_SCRATCH_REG.value
+            ofs = 0
+        return (reg, scalereg, scale, ofs)

     def _load_scratch(self, value):
-        if (self._scratch_register_known
-            and value == self._scratch_register_value):
-            return
-        if self._reuse_scratch_register:
-            self._scratch_register_known = True
-            self._scratch_register_value = value
+        if self._scratch_register_value != 0:
+            if self._scratch_register_value == value:
+                #print '_load_scratch(%x) [REUSED]' % (value,)
+                return
+            offset = r_uint(value) - r_uint(self._scratch_register_value)
+            offset = intmask(offset)
+            if rx86.fits_in_32bits(offset):
+                #print '_load_scratch(%x) [LEA r11+%d]' % (value, offset)
+                #global COUNT_
+                #try:
+                #    COUNT_ += 1
+                #except NameError:
+                #    COUNT_ = 1
+                #if COUNT_ % 182 == 0:
+                #    import pdb;pdb.set_trace()
+                self.LEA_rm(X86_64_SCRATCH_REG.value,
+                            (X86_64_SCRATCH_REG.value, offset))
+                self._scratch_register_value = value
+                return
+            #print '_load_scratch(%x) [too far]' % (value,)
+        #else:
+        #    print '_load_scratch(%x) [new]' % (value,)
+        self._scratch_register_value = value
         self.MOV_ri(X86_64_SCRATCH_REG.value, value)

+    def forget_scratch_register(self):
+        self._scratch_register_value = 0
+
     def trap(self):
         self.INT3()

-    def begin_reuse_scratch_register(self):
-        # --NEVER CALLED (only from a specific test)--
-        # Flag the beginning of a block where it is okay to reuse the value
-        # of the scratch register.  In theory we shouldn't have to do this if
-        # we were careful to mark all possible targets of a jump or call, and
-        # "forget" the value of the scratch register at those positions, but
-        # for now this seems safer.
-        self._reuse_scratch_register = True
-
-    def end_reuse_scratch_register(self):
-        self._reuse_scratch_register = False
-        self._scratch_register_known = False
-
     def _vector_size_choose(name):
         def invoke(self, suffix, val1, val2):
             methname = name + suffix
diff --git a/rpython/jit/backend/x86/test/test_jump.py b/rpython/jit/backend/x86/test/test_jump.py
--- a/rpython/jit/backend/x86/test/test_jump.py
+++ b/rpython/jit/backend/x86/test/test_jump.py
@@ -26,6 +26,11 @@
         assert isinstance(to_loc, FrameLoc)
         self.ops.append(('immedmem2mem', from_loc, to_loc))

+    class mc:
+        @staticmethod
+        def forget_scratch_register():
+            pass
+
     def got(self, expected):
         print '------------------------ comparing ---------------------------'
         for op1, op2 in zip(self.ops, expected):
@@ -405,6 +410,10 @@
             print "pop", x
         def regalloc_immedmem2mem(self, x, y):
             print "?????????????????????????"
+        class mc:
+            @staticmethod
+            def forget_scratch_register():
+                pass
     def main():
         srclocs = [FrameLoc(9999, x, 'i') for x,y in CASE]
         dstlocs = [FrameLoc(9999, y, 'i') for x,y in CASE]
diff --git a/rpython/jit/backend/x86/test/test_regloc.py b/rpython/jit/backend/x86/test/test_regloc.py
--- a/rpython/jit/backend/x86/test/test_regloc.py
+++ b/rpython/jit/backend/x86/test/test_regloc.py
@@ -149,10 +149,8 @@
     def test_reuse_scratch_register(self):
         base_addr = intmask(0xFEDCBA9876543210)
         cb = LocationCodeBuilder64()
-        cb.begin_reuse_scratch_register()
         cb.MOV(ecx, heap(base_addr))
         cb.MOV(ecx, heap(base_addr + 8))
-        cb.end_reuse_scratch_register()

         expected_instructions = (
                 # mov r11, 0xFEDCBA9876543210
@@ -213,12 +211,9 @@
     def test_64bit_address_4(self):
         base_addr = intmask(0xFEDCBA9876543210)
         cb = LocationCodeBuilder64()
-        cb.begin_reuse_scratch_register()
-        assert cb._reuse_scratch_register is True
-        assert cb._scratch_register_known is False
+        assert cb._scratch_register_value == 0
         cb.MOV(ecx, AddressLoc(edx, esi, 2, base_addr))
-        assert cb._reuse_scratch_register is True
-        assert cb._scratch_register_known is False
+        assert cb._scratch_register_value == 0
         # this case is a CMP_ra
         #
         expected_instructions = (
diff --git a/rpython/jit/backend/x86/test/test_runner.py b/rpython/jit/backend/x86/test/test_runner.py
--- a/rpython/jit/backend/x86/test/test_runner.py
+++ b/rpython/jit/backend/x86/test/test_runner.py
@@ -39,7 +39,7 @@
                                     'nop; '    # for the label
                                     'add; test; je; jmp;')   # plus some padding
     bridge_loop_instructions = (
-        'cmp; jge; mov;( movabs;)? mov; mov(abs)?; call; mov(abs)?; jmp;')
+        'cmp; jge; mov;( movabs;| lea;)? mov; (mov|movabs|lea); call; mov(abs)?; jmp;')

     def get_cpu(self):
         cpu = CPU(rtyper=None, stats=FakeStats())
diff --git a/rpython/jit/backend/x86/vector_ext.py b/rpython/jit/backend/x86/vector_ext.py
--- a/rpython/jit/backend/x86/vector_ext.py
+++ b/rpython/jit/backend/x86/vector_ext.py
@@ -173,9 +173,10 @@
             return
         elif arg.type == INT:
             scratchloc = X86_64_SCRATCH_REG
+            self.mc.forget_scratch_register()
             self.mc.PEXTRQ_rxi(targetloc.value, accumloc.value, 0)
             self.mc.PEXTRQ_rxi(scratchloc.value, accumloc.value, 1)
-            self.mc.ADD(targetloc, scratchloc)
+            self.mc.ADD_rr(targetloc.value, scratchloc.value)
             return

         not_implemented("reduce sum for %s not impl." % arg)
@@ -387,6 +388,7 @@
             return # already the right size
         if size == 4 and tosize == 8:
             scratch = X86_64_SCRATCH_REG.value
+            self.mc.forget_scratch_register()
             self.mc.PEXTRD_rxi(scratch, srcloc.value, 1)
             self.mc.PINSRQ_xri(resloc.value, scratch, 1)
             self.mc.PEXTRD_rxi(scratch, srcloc.value, 0)
@@ -394,6 +396,7 @@
         elif size == 8 and tosize == 4:
             # is there a better sequence to move them?
             scratch = X86_64_SCRATCH_REG.value
+            self.mc.forget_scratch_register()
             self.mc.PEXTRQ_rxi(scratch, srcloc.value, 0)
             self.mc.PINSRD_xri(resloc.value, scratch, 0)
             self.mc.PEXTRQ_rxi(scratch, srcloc.value, 1)
@@ -426,6 +429,7 @@
     def genop_vec_expand_i(self, op, arglocs, resloc):
         srcloc, sizeloc = arglocs
         if not isinstance(srcloc, RegLoc):
+            # self.mc.forget_scratch_register(): done by self.mov()
             self.mov(srcloc, X86_64_SCRATCH_REG)
             srcloc = X86_64_SCRATCH_REG
         assert not srcloc.is_xmm
@@ -465,6 +469,7 @@
         while k > 0:
             if size == 8:
                 if resultloc.is_xmm and sourceloc.is_xmm: # both xmm
+                    self.mc.forget_scratch_register()
                     self.mc.PEXTRQ_rxi(X86_64_SCRATCH_REG.value, sourceloc.value, si)
                     self.mc.PINSRQ_xri(resultloc.value, X86_64_SCRATCH_REG.value, ri)
                 elif resultloc.is_xmm: # xmm <- reg
@@ -473,6 +478,7 @@
                     self.mc.PEXTRQ_rxi(resultloc.value, sourceloc.value, si)
             elif size == 4:
                 if resultloc.is_xmm and sourceloc.is_xmm:
+                    self.mc.forget_scratch_register()
                     self.mc.PEXTRD_rxi(X86_64_SCRATCH_REG.value, sourceloc.value, si)
                     self.mc.PINSRD_xri(resultloc.value, X86_64_SCRATCH_REG.value, ri)
                 elif resultloc.is_xmm:
@@ -481,6 +487,7 @@
                     self.mc.PEXTRD_rxi(resultloc.value, sourceloc.value, si)
             elif size == 2:
                 if resultloc.is_xmm and sourceloc.is_xmm:
+                    self.mc.forget_scratch_register()
                     self.mc.PEXTRW_rxi(X86_64_SCRATCH_REG.value, sourceloc.value, si)
                     self.mc.PINSRW_xri(resultloc.value, X86_64_SCRATCH_REG.value, ri)
                 elif resultloc.is_xmm:
@@ -489,6 +496,7 @@
                     self.mc.PEXTRW_rxi(resultloc.value, sourceloc.value, si)
             elif size == 1:
                 if resultloc.is_xmm and sourceloc.is_xmm:
+                    self.mc.forget_scratch_register()
                     self.mc.PEXTRB_rxi(X86_64_SCRATCH_REG.value, sourceloc.value, si)
                     self.mc.PINSRB_xri(resultloc.value, X86_64_SCRATCH_REG.value, ri)
                 elif resultloc.is_xmm:
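The heart of the merge is the bookkeeping added to regloc.py: remember which constant r11 currently holds, reuse it outright (or adjust it with a cheap LEA) when the next constant is within a 32-bit offset, and forget the tracked value at every position that may become a jump target, which is why codebuf.get_relative_pos(break_basic_block=True) now calls forget_scratch_register(). Below is a minimal standalone sketch of that idea, not the rpython code itself: ToyAssembler, load_scratch and forget_scratch are illustrative names only, and the x86 encoding is reduced to text strings.

def fits_in_32bits(value):
    return -2147483648 <= value <= 2147483647

class ToyAssembler(object):
    def __init__(self):
        self.instructions = []      # textual pseudo-instructions
        self._scratch_value = None  # None means "value of r11 unknown"

    def emit(self, insn):
        self.instructions.append(insn)

    def load_scratch(self, imm):
        # Load the 64-bit constant 'imm' into r11, reusing whatever
        # constant is already known to be there.
        if self._scratch_value is not None:
            if self._scratch_value == imm:
                return                      # r11 already holds it: no code
            offset = imm - self._scratch_value
            if fits_in_32bits(offset):
                # shorter than a fresh 10-byte "movabs r11, imm64"
                self.emit("lea r11, [r11%+d]" % offset)
                self._scratch_value = imm
                return
        self.emit("movabs r11, 0x%x" % imm)
        self._scratch_value = imm

    def forget_scratch(self):
        # Called at every position that may be reached by a jump, i.e.
        # whenever the current basic block ends: the tracked value is
        # only valid along straight-line code.
        self._scratch_value = None

asm = ToyAssembler()
asm.load_scratch(0x7fff12340000)    # movabs r11, 0x7fff12340000
asm.load_scratch(0x7fff12340000)    # reused: emits nothing
asm.load_scratch(0x7fff12340010)    # lea r11, [r11+16]
asm.forget_scratch()                # e.g. a label was just emitted here
asm.load_scratch(0x7fff12340010)    # movabs again
print('\n'.join(asm.instructions))

The real LocationCodeBuilder in the diff uses 0 rather than None as the "unknown" marker and does the offset computation with r_uint/intmask so it cannot overflow, but the control flow is the same as in _load_scratch() above.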