Author: Armin Rigo <ar...@tunes.org>
Branch: reusing-r11
Changeset: r90965:d8f5a5347abb
Date: 2017-04-05 15:29 +0200
http://bitbucket.org/pypy/pypy/changeset/d8f5a5347abb/
Log:	in-progress

diff --git a/rpython/jit/backend/x86/assembler.py b/rpython/jit/backend/x86/assembler.py
--- a/rpython/jit/backend/x86/assembler.py
+++ b/rpython/jit/backend/x86/assembler.py
@@ -277,7 +277,7 @@
         #
         mc.TEST_rr(eax.value, eax.value)
         mc.J_il(rx86.Conditions['Z'], 0xfffff)    # patched later
-        jz_location = mc.get_relative_pos()
+        jz_location = mc.get_relative_pos(break_basic_block=False)
         mc.MOV_rr(ecx.value, eax.value)
         #
         nursery_free_adr = self.cpu.gc_ll_descr.get_nursery_free_addr()
@@ -718,6 +718,7 @@
         if rx86.fits_in_32bits(offset):
             mc.JMP_l(offset)
         else:
+            # mc.forget_scratch_register() not needed here
             mc.MOV_ri(X86_64_SCRATCH_REG.value, adr_new_target)
             mc.JMP_r(X86_64_SCRATCH_REG.value)
         mc.copy_to_raw_memory(adr_jump_offset)
@@ -830,10 +831,10 @@
         descrs = self.cpu.gc_ll_descr.getframedescrs(self.cpu)
         ofs = self.cpu.unpack_fielddescr(descrs.arraydescr.lendescr)
         mc.CMP_bi(ofs, 0xffffff)     # force writing 32 bit
-        stack_check_cmp_ofs = mc.get_relative_pos() - 4
+        stack_check_cmp_ofs = mc.get_relative_pos(break_basic_block=False) - 4
         jg_location = mc.emit_forward_jump('GE')
         mc.MOV_si(WORD, 0xffffff)    # force writing 32 bit
-        ofs2 = mc.get_relative_pos() - 4
+        ofs2 = mc.get_relative_pos(break_basic_block=False) - 4
         self.push_gcmap(mc, gcmap, store=True)
         mc.CALL(imm(self._frame_realloc_slowpath))
         # patch the JG above
@@ -850,11 +851,11 @@
         descrs = self.cpu.gc_ll_descr.getframedescrs(self.cpu)
         ofs = self.cpu.unpack_fielddescr(descrs.arraydescr.lendescr)
         mc.CMP_bi(ofs, 0xffffff)
-        stack_check_cmp_ofs = mc.get_relative_pos() - 4
+        stack_check_cmp_ofs = mc.get_relative_pos(break_basic_block=False) - 4
         jg_location = mc.emit_forward_jump('GE')
         mc.MOV_rr(edi.value, ebp.value)
         mc.MOV_ri(esi.value, 0xffffff)
-        ofs2 = mc.get_relative_pos() - 4
+        ofs2 = mc.get_relative_pos(break_basic_block=False) - 4
         mc.CALL(imm(self.cpu.realloc_frame_crash))
         # patch the JG above
         mc.patch_forward_jump(jg_location)
@@ -895,6 +896,7 @@
             # "mov r11, addr; jmp r11" is up to 13 bytes, which fits in there
             # because we always write "mov r11, imm-as-8-bytes; call *r11" in
             # the first place.
+            # mc.forget_scratch_register() not needed here
             mc.MOV_ri(X86_64_SCRATCH_REG.value, adr_new_target)
             mc.JMP_r(X86_64_SCRATCH_REG.value)
         p = rffi.cast(rffi.INTP, adr_jump_offset)
@@ -939,7 +941,7 @@
             # would be used to pass arguments #3 and #4 (even though, so
             # far, the assembler only receives two arguments).
             tloc = esi
-            old = r11
+            old = r10
         # eax = address in the stack of a 3-words struct vmprof_stack_s
         self.mc.LEA_rs(eax.value, (FRAME_FIXED_SIZE - 4) * WORD)
         # old = current value of vmprof_tl_stack
@@ -1023,27 +1025,14 @@
         fit in 32 bits, it will be loaded in r11.
         """
         rst = gcrootmap.get_root_stack_top_addr()
-        if rx86.fits_in_32bits(rst):
-            mc.MOV_rj(ebx.value, rst)            # MOV ebx, [rootstacktop]
-        else:
-            mc.MOV_ri(X86_64_SCRATCH_REG.value, rst) # MOV r11, rootstacktop
-            mc.MOV_rm(ebx.value, (X86_64_SCRATCH_REG.value, 0))
-                                                 # MOV ebx, [r11]
-        #
+        mc.MOV(ebx, heap(rst))                   # maybe via loading r11
         return rst

     def _call_header_shadowstack(self, gcrootmap):
         rst = self._load_shadowstack_top_in_ebx(self.mc, gcrootmap)
         self.mc.MOV_mr((ebx.value, 0), ebp.value)      # MOV [ebx], ebp
         self.mc.ADD_ri(ebx.value, WORD)
-        if rx86.fits_in_32bits(rst):
-            self.mc.MOV_jr(rst, ebx.value)             # MOV [rootstacktop], ebx
-        else:
-            # The integer 'rst' doesn't fit in 32 bits, so we know that
-            # _load_shadowstack_top_in_ebx() above loaded it in r11.
-            # Reuse it.  Be careful not to overwrite r11 in the middle!
-            self.mc.MOV_mr((X86_64_SCRATCH_REG.value, 0),
-                           ebx.value)                  # MOV [r11], ebx
+        self.mc.MOV(heap(rst), ebx)                    # MOV [rootstacktop], ebx

     def _call_footer_shadowstack(self, gcrootmap):
         rst = gcrootmap.get_root_stack_top_addr()
@@ -1449,7 +1438,7 @@
         # has been emitted.  64-bit mode only.
         assert IS_X86_64
         address_in_buffer = index * WORD   # at the start of the buffer
-        p_location = self.mc.get_relative_pos()
+        p_location = self.mc.get_relative_pos(break_basic_block=False)
         offset = address_in_buffer - p_location
         self.mc.overwrite32(p_location-4, offset)

@@ -1551,7 +1540,7 @@
             self.mc.add_pending_relocation()
         elif WORD == 8:
             self.mc.J_il(rx86.Conditions['Z'], 0)
-            pos = self.mc.get_relative_pos()
+            pos = self.mc.get_relative_pos(break_basic_block=False)
             self.pending_memoryerror_trampoline_from.append(pos)

         # ----------
@@ -1721,7 +1710,8 @@

     def genop_guard_guard_not_invalidated(self, guard_op, guard_token,
                                           locs, ign):
-        pos = self.mc.get_relative_pos() + 1 # after potential jmp
+        pos = self.mc.get_relative_pos(break_basic_block=False)
+        pos += 1 # after potential jmp
         guard_token.pos_jump_offset = pos
         self.pending_guard_tokens.append(guard_token)

@@ -2077,7 +2067,8 @@
         assert self.guard_success_cc >= 0
         self.mc.J_il(rx86.invert_condition(self.guard_success_cc), 0)
         self.guard_success_cc = rx86.cond_none
-        guard_token.pos_jump_offset = self.mc.get_relative_pos() - 4
+        pos = self.mc.get_relative_pos(break_basic_block=False)
+        guard_token.pos_jump_offset = pos - 4
         self.pending_guard_tokens.append(guard_token)

     def _genop_real_call(self, op, arglocs, resloc):
@@ -2125,6 +2116,7 @@

         faildescrindex = self.get_gcref_from_faildescr(faildescr)
         if IS_X86_64:
+            self.mc.forget_scratch_register()
            self.mc.MOV_rp(X86_64_SCRATCH_REG.value, 0)
             self._patch_load_from_gc_table(faildescrindex)
             self.mc.MOV(raw_stack(ofs), X86_64_SCRATCH_REG)
@@ -2313,6 +2305,7 @@
             if IS_X86_64 and isinstance(loc_base, RegLoc):
                 # copy loc_index into r11
                 tmp1 = X86_64_SCRATCH_REG
+                mc.forget_scratch_register()
                 mc.MOV_rr(tmp1.value, loc_index.value)
                 final_pop = False
             else:
@@ -2325,7 +2318,13 @@
             # XOR tmp, -8
             mc.XOR_ri(tmp1.value, -8)
             # BTS [loc_base], tmp
-            mc.BTS(addr_add_const(loc_base, 0), tmp1)
+            if final_pop:
+                # r11 is not specially used, fall back to regloc.py
+                mc.BTS(addr_add_const(loc_base, 0), tmp1)
+            else:
+                # tmp1 is r11!  but in this case, loc_base is a
+                # register so we can invoke directly rx86.py
+                mc.BTS_mr((loc_base.value, 0), tmp1.value)
             # done
             if final_pop:
                 mc.POP_r(loc_index.value)
diff --git a/rpython/jit/backend/x86/callbuilder.py b/rpython/jit/backend/x86/callbuilder.py
--- a/rpython/jit/backend/x86/callbuilder.py
+++ b/rpython/jit/backend/x86/callbuilder.py
@@ -239,7 +239,7 @@
             if IS_X86_32:
                 tmpreg = edx
             else:
-                tmpreg = r11     # edx is used for 3rd argument
+                tmpreg = r10     # edx is used for 3rd argument
             mc.MOV_rm(tmpreg.value, (tlofsreg.value, p_errno))
             mc.MOV32_rm(eax.value, (tlofsreg.value, rpy_errno))
             mc.MOV32_mr((tmpreg.value, 0), eax.value)
diff --git a/rpython/jit/backend/x86/codebuf.py b/rpython/jit/backend/x86/codebuf.py
--- a/rpython/jit/backend/x86/codebuf.py
+++ b/rpython/jit/backend/x86/codebuf.py
@@ -42,10 +42,10 @@
         self.ops_offset = {}

     def add_pending_relocation(self):
-        self.relocations.append(self.get_relative_pos())
+        self.relocations.append(self.get_relative_pos(break_basic_block=False))

     def mark_op(self, op):
-        pos = self.get_relative_pos()
+        pos = self.get_relative_pos(break_basic_block=False)
         self.ops_offset[op] = pos

     def copy_to_raw_memory(self, addr):
@@ -64,11 +64,11 @@

     def emit_forward_jump_cond(self, cond):
         self.J_il8(cond, 0)
-        return self.get_relative_pos()
+        return self.get_relative_pos(break_basic_block=False)

     def emit_forward_jump_uncond(self):
         self.JMP_l8(0)
-        return self.get_relative_pos()
+        return self.get_relative_pos(break_basic_block=False)

     def patch_forward_jump(self, jcond_location):
         offset = self.get_relative_pos() - jcond_location
@@ -76,3 +76,8 @@
         if offset > 127:
             raise ShortJumpTooFar
         self.overwrite(jcond_location-1, chr(offset))
+
+    def get_relative_pos(self, break_basic_block=True):
+        if break_basic_block:
+            self.forget_scratch_register()
+        return BlockBuilderMixin.get_relative_pos(self)
diff --git a/rpython/jit/backend/x86/jump.py b/rpython/jit/backend/x86/jump.py
--- a/rpython/jit/backend/x86/jump.py
+++ b/rpython/jit/backend/x86/jump.py
@@ -77,6 +77,7 @@
             assembler.regalloc_pop(dst)
             return
         assembler.regalloc_mov(src, tmpreg)
+        assembler.mc.forget_scratch_register()
         src = tmpreg
     assembler.regalloc_mov(src, dst)

diff --git a/rpython/jit/backend/x86/regalloc.py b/rpython/jit/backend/x86/regalloc.py
--- a/rpython/jit/backend/x86/regalloc.py
+++ b/rpython/jit/backend/x86/regalloc.py
@@ -435,9 +435,9 @@

     def consider_guard_not_invalidated(self, op):
         mc = self.assembler.mc
-        n = mc.get_relative_pos()
+        n = mc.get_relative_pos(break_basic_block=False)
         self.perform_guard(op, [], None)
-        assert n == mc.get_relative_pos()
+        assert n == mc.get_relative_pos(break_basic_block=False)
         # ensure that the next label is at least 5 bytes farther than
         # the current position.  Otherwise, when invalidating the guard,
         # we would overwrite randomly the next label's position.
diff --git a/rpython/jit/backend/x86/regloc.py b/rpython/jit/backend/x86/regloc.py
--- a/rpython/jit/backend/x86/regloc.py
+++ b/rpython/jit/backend/x86/regloc.py
@@ -4,7 +4,7 @@
 from rpython.jit.backend.x86.arch import WORD, IS_X86_32, IS_X86_64
 from rpython.tool.sourcetools import func_with_new_name
 from rpython.rlib.objectmodel import specialize, instantiate
-from rpython.rlib.rarithmetic import intmask
+from rpython.rlib.rarithmetic import intmask, r_uint
 from rpython.jit.metainterp.history import FLOAT, INT
 from rpython.jit.codewriter import longlong
 from rpython.rtyper.lltypesystem import rffi, lltype
@@ -355,7 +355,8 @@
 # without an xmm scratch reg.
 X86_64_XMM_SCRATCH_REG = xmm15

-unrolling_location_codes = unrolling_iterable(list("rbsmajix"))
+# note: 'r' is after 'i' in this list, for _binaryop()
+unrolling_location_codes = unrolling_iterable(list("irbsmajx"))

 @specialize.arg(1)
 def _rx86_getattr(obj, methname):
@@ -372,9 +373,7 @@

 class LocationCodeBuilder(object):
     _mixin_ = True

-    _reuse_scratch_register = False    # for now, this is always False
-    _scratch_register_known = False    # for now, this is always False
-    _scratch_register_value = 0
+    _scratch_register_value = 0        # 0 means 'unknown'

     def _binaryop(name):
@@ -383,7 +382,7 @@
             val2 = loc2.value_i()
             if name == 'MOV' and isinstance(loc1, RegLoc):
                 self.MOV_ri(loc1.value, val2)
-                return
+                return True
             code1 = loc1.location_code()
             if code1 == 'j':
                 checkvalue = loc1.value_j()
@@ -402,10 +401,11 @@
                 self.MOV_ri(freereg.value, val2)
                 INSN(self, loc1, freereg)
                 self.POP_r(freereg.value)
+                return True
             else:
                 # For this case, we should not need the scratch register more than here.
                 self._load_scratch(val2)
-                INSN(self, loc1, X86_64_SCRATCH_REG)
+                return False

         def invoke(self, codes, val1, val2):
             methname = name + "_" + codes
@@ -433,15 +433,15 @@
             code1 = loc1.location_code()
             code2 = loc2.location_code()

-            # You can pass in the scratch register as a location, but you
-            # must be careful not to combine it with location types that
-            # might need to use the scratch register themselves.
-            if loc2 is X86_64_SCRATCH_REG:
-                if code1 == 'j':
-                    assert (name.startswith("MOV") and
-                            rx86.fits_in_32bits(loc1.value_j()))
-            if loc1 is X86_64_SCRATCH_REG and not name.startswith("MOV"):
-                assert code2 not in ('j', 'i')
+            # You cannot pass in the scratch register as a location,
+            # except with a MOV instruction.
+            if name.startswith('MOV'):
+                if loc2 is X86_64_SCRATCH_REG:
+                    assert code1 != 'j' and code1 != 'm' and code1 != 'a'
+                if loc1 is X86_64_SCRATCH_REG:
+                    self.forget_scratch_register()
+            elif loc1 is X86_64_SCRATCH_REG or loc2 is X86_64_SCRATCH_REG:
+                raise AssertionError("%s with scratch reg specified" % name)

             for possible_code2 in unrolling_location_codes:
                 if not has_implementation_for('?', possible_code2):
@@ -451,8 +451,14 @@
                     #
                    # Fake out certain operations for x86_64
                     if self.WORD == 8 and possible_code2 == 'i' and not rx86.fits_in_32bits(val2):
-                        insn_with_64_bit_immediate(self, loc1, loc2)
-                        return
+                        if insn_with_64_bit_immediate(self, loc1, loc2):
+                            return    # done
+                        loc2 = X86_64_SCRATCH_REG
+                        code2 = 'r'
+                        # NB. unrolling_location_codes contains 'r'
+                        # after 'i', so that it will be found after
+                        # this iteration
+                        continue
                     #
                     # Regular case
                     for possible_code1 in unrolling_location_codes:
@@ -487,6 +493,9 @@

     def _unaryop(name):
         def INSN(self, loc):
+            if loc is X86_64_SCRATCH_REG:
+                raise AssertionError("%s with scratch reg specified" % name)
+
             code = loc.location_code()
             for possible_code in unrolling_location_codes:
                 if code == possible_code:
@@ -532,6 +541,9 @@
                 else:
                     methname = name + "_" + possible_code
                     _rx86_getattr(self, methname)(val)
+            # This is for CALL and JMP, so it's correct to forget
+            # the value of the R11 register here.
+            self.forget_scratch_register()

         return func_with_new_name(INSN, "INSN_" + name)

@@ -540,16 +552,18 @@
         # If we are within a "reuse_scratch_register" block, we remember the
        # last value we loaded to the scratch register and encode the address
         # as an offset from that if we can
-        if self._scratch_register_known:
-            offset = addr - self._scratch_register_value
+        if self._scratch_register_value != 0:
+            offset = r_uint(addr) - r_uint(self._scratch_register_value)
+            offset = intmask(offset)
             if rx86.fits_in_32bits(offset):
+                print '_addr_as_reg_offset(%x) [REUSED r11+%d]' % (
+                    addr, offset)
                 return (X86_64_SCRATCH_REG.value, offset)
+            print '_addr_as_reg_offset(%x) [too far]' % (addr,)
             # else: fall through
-
-        if self._reuse_scratch_register:
-            self._scratch_register_known = True
-            self._scratch_register_value = addr
-
+        else:
+            print '_addr_as_reg_offset(%x) [new]' % (addr,)
+        self._scratch_register_value = addr
         self.MOV_ri(X86_64_SCRATCH_REG.value, addr)
         return (X86_64_SCRATCH_REG.value, 0)

@@ -557,12 +571,11 @@
         # For cases where an AddressLoc has the location_code 'm', but
         # where the static offset does not fit in 32-bits.  We have to fall
         # back to the X86_64_SCRATCH_REG.  Returns a new location encoded
-        # as mode 'm' too.  These are all possibly rare cases; don't try
-        # to reuse a past value of the scratch register at all.
-        self._scratch_register_known = False
-        self.MOV_ri(X86_64_SCRATCH_REG.value, static_offset)
+        # as mode 'm' too.  These are all possibly rare cases.
+        ofs = self._addr_as_reg_offset(static_offset)
+        self.forget_scratch_register()
         self.LEA_ra(X86_64_SCRATCH_REG.value,
-                    (basereg, X86_64_SCRATCH_REG.value, 0, 0))
+                    (basereg, X86_64_SCRATCH_REG.value, 0, ofs))
         return (X86_64_SCRATCH_REG.value, 0)

     def _fix_static_offset_64_a(self, (basereg, scalereg,
@@ -570,41 +583,48 @@
         # For cases where an AddressLoc has the location_code 'a', but
         # where the static offset does not fit in 32-bits.  We have to fall
         # back to the X86_64_SCRATCH_REG.  In one case it is even more
-        # annoying.  These are all possibly rare cases; don't try to reuse a
-        # past value of the scratch register at all.
-        self._scratch_register_known = False
-        self.MOV_ri(X86_64_SCRATCH_REG.value, static_offset)
+        # annoying.  These are all possibly rare cases.
+        ofs = self._addr_as_reg_offset(static_offset)
         #
         if basereg != rx86.NO_BASE_REGISTER:
+            self.forget_scratch_register()
             self.LEA_ra(X86_64_SCRATCH_REG.value,
-                        (basereg, X86_64_SCRATCH_REG.value, 0, 0))
-        return (X86_64_SCRATCH_REG.value, scalereg, scale, 0)
+                        (basereg, X86_64_SCRATCH_REG.value, 0, ofs))
+            ofs = 0
+        return (X86_64_SCRATCH_REG.value, scalereg, scale, ofs)

     def _load_scratch(self, value):
-        if (self._scratch_register_known
-            and value == self._scratch_register_value):
-            return
-        if self._reuse_scratch_register:
-            self._scratch_register_known = True
-            self._scratch_register_value = value
+        if self._scratch_register_value != 0:
+            if self._scratch_register_value == value:
+                print '_load_scratch(%x) [REUSED]' % (value,)
+                return
+            offset = r_uint(value) - r_uint(self._scratch_register_value)
+            offset = intmask(offset)
+            if rx86.fits_in_32bits(offset):
+                print '_load_scratch(%x) [LEA r11+%d]' % (value, offset)
+                global COUNT_
+                try:
+                    COUNT_ += 1
+                except NameError:
+                    COUNT_ = 1
+                if COUNT_ % 182 == 0:
+                    import pdb;pdb.set_trace()
+                self.LEA_rm(X86_64_SCRATCH_REG.value,
+                            (X86_64_SCRATCH_REG.value, offset))
+                self._scratch_register_value = value
+                return
+            print '_load_scratch(%x) [too far]' % (value,)
+        else:
+            print '_load_scratch(%x) [new]' % (value,)
+        self._scratch_register_value = value
         self.MOV_ri(X86_64_SCRATCH_REG.value, value)

+    def forget_scratch_register(self):
+        self._scratch_register_value = 0
+
     def trap(self):
         self.INT3()

-    def begin_reuse_scratch_register(self):
-        # --NEVER CALLED (only from a specific test)--
-        # Flag the beginning of a block where it is okay to reuse the value
-        # of the scratch register.  In theory we shouldn't have to do this if
-        # we were careful to mark all possible targets of a jump or call, and
-        # "forget" the value of the scratch register at those positions, but
-        # for now this seems safer.
-        self._reuse_scratch_register = True
-
-    def end_reuse_scratch_register(self):
-        self._reuse_scratch_register = False
-        self._scratch_register_known = False
-
     def _vector_size_choose(name):
         def invoke(self, suffix, val1, val2):
             methname = name + suffix
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit
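[Editorial note, not part of the commit]  The mechanism this in-progress
branch builds up is a value cache for the x86-64 scratch register R11:
_scratch_register_value remembers the constant last loaded into R11,
forget_scratch_register() clears it, get_relative_pos(break_basic_block=True)
clears it at every position that may become a jump target, and
_load_scratch() / _addr_as_reg_offset() can then materialize a nearby 64-bit
constant with a short LEA r11, [r11+ofs] instead of a full 10-byte
MOV r11, imm64.  A minimal sketch of that caching idea in plain Python --
the class and the emit callback below are hypothetical stand-ins, not the
RPython LocationCodeBuilder:

    def fits_in_32bits(value):
        return -2**31 <= value < 2**31

    class ScratchRegisterCache(object):
        # Mirrors the new _scratch_register_value logic from regloc.py,
        # but as ordinary Python; 'emit' stands in for the machine-code
        # builder and only collects assembly strings.

        def __init__(self):
            self._scratch_register_value = 0      # 0 means 'unknown'

        def forget_scratch_register(self):
            # called at every basic-block boundary, i.e. whenever the
            # current position may become the target of a jump or call
            self._scratch_register_value = 0

        def load_scratch(self, value, emit):
            cached = self._scratch_register_value
            if cached != 0:
                if cached == value:
                    return                        # R11 already holds it
                offset = value - cached
                if fits_in_32bits(offset):
                    emit("LEA r11, [r11%+d]" % offset)   # cheap re-derivation
                    self._scratch_register_value = value
                    return
            self._scratch_register_value = value
            emit("MOV r11, 0x%x" % value)         # full 64-bit immediate load

    out = []
    cache = ScratchRegisterCache()
    cache.load_scratch(0x7f0000000000, out.append)   # MOV r11, 0x7f0000000000
    cache.load_scratch(0x7f0000000040, out.append)   # LEA r11, [r11+64]
    cache.forget_scratch_register()                  # e.g. after a label
    cache.load_scratch(0x7f0000000040, out.append)   # MOV again

In the real patch the offset arithmetic goes through r_uint()/intmask() to
stay overflow-safe in RPython, and the cache is invalidated conservatively
(jump targets via get_relative_pos, CALL/JMP emission, and non-MOV uses of
R11) rather than tracked precisely.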