Author: Armin Rigo <[email protected]>
Branch: continulet-jit-2
Changeset: r53148:4c28945b23e6
Date: 2012-03-03 19:22 +0100
http://bitbucket.org/pypy/pypy/changeset/4c28945b23e6/

Log:    Hack hack hack. Now realloc() is only called if needed. Should
        improve things a lot.

        On the negative side it seems that we need to save and restore the
        XMM registers too.

diff --git a/pypy/jit/backend/x86/arch.py b/pypy/jit/backend/x86/arch.py
--- a/pypy/jit/backend/x86/arch.py
+++ b/pypy/jit/backend/x86/arch.py
@@ -39,6 +39,9 @@
 # has them stored in (ebp+8), (ebp+12), etc.
 OFFSTACK_START_AT_WORD = 2
 #
+# (ebp+4) has the size allocated so far
+OFFSTACK_SIZE_ALLOCATED = 1
+#
 # In stacklet mode, the real frame contains always just OFFSTACK_REAL_FRAME
 # words reserved for temporary usage like call arguments.  To maintain
 # alignment on 32-bit, OFFSTACK_REAL_FRAME % 4 == 3, and it is at least 17
diff --git a/pypy/jit/backend/x86/assembler.py b/pypy/jit/backend/x86/assembler.py
--- a/pypy/jit/backend/x86/assembler.py
+++ b/pypy/jit/backend/x86/assembler.py
@@ -15,7 +15,8 @@
 from pypy.jit.backend.x86.arch import (FRAME_FIXED_SIZE, FORCE_INDEX_OFS, WORD,
                                        IS_X86_32, IS_X86_64,
                                        OFFSTACK_REAL_FRAME,
-                                       OFFSTACK_START_AT_WORD)
+                                       OFFSTACK_START_AT_WORD,
+                                       OFFSTACK_SIZE_ALLOCATED)
 
 from pypy.jit.backend.x86.regloc import (eax, ecx, edx, ebx,
                                          esp, ebp, esi, edi,
@@ -125,6 +126,7 @@
             support.ensure_sse2_floats()
             self._build_float_constants()
         self._build_propagate_exception_path()
+        self._build_realloc_bridge_slowpath()
         if gc_ll_descr.get_malloc_slowpath_addr is not None:
             self._build_malloc_slowpath()
         self._build_stack_check_slowpath()
@@ -281,6 +283,93 @@
         self.propagate_exception_path = rawstart
         self.mc = None
 
+    def _build_realloc_bridge_slowpath(self):
+        from pypy.jit.backend.x86.regalloc import gpr_reg_mgr_cls
+        # This defines a function called at the start of a bridge to
+        # increase the size of the off-stack frame.  It must preserve
+        # all registers.
+        #
+        # XXX optimize more: should also patch the original malloc()
+        # call to directly allocate enough
+        #
+        # see _enter_bridge_code() for the following constant: the
+        # new size is not passed explicitly, but needs to be fished
+        # from the code at (retaddr - realloc_bridge_ofs), in bytes.
+        # This is commented as "fish fish fish" below.
+        if IS_X86_32:
+            self.realloc_bridge_ofs = 11
+        elif IS_X86_64:
+            self.realloc_bridge_ofs = 19
+        #
+        self.mc = codebuf.MachineCodeBlockWrapper()
+        #
+        # First, save all registers that this code might modify.
+        # Assume that the xmm registers are safe.  Note that this code
+        # will save some registers in the caller's frame, in the
+        # temporary OFFSTACK_REAL_FRAME words.
+        save_regs = gpr_reg_mgr_cls.save_around_call_regs
+        if IS_X86_32:
+            assert OFFSTACK_REAL_FRAME >= 2
+            assert len(save_regs) == 3
+            # there are 3 PUSHes in total here.  With the retaddr, the
+            # stack remains aligned.
+            self.mc.MOV_sr(1*WORD, save_regs[0].value)
+            self.mc.MOV_sr(2*WORD, save_regs[1].value)
+            self.mc.PUSH_r(save_regs[2].value)
+            #
+            # fish fish fish (see above)
+            self.mc.MOV_rs(eax.value, WORD)     # load the retaddr
+            self.mc.PUSH_m((eax.value, -self.realloc_bridge_ofs))
+            #
+            self.mc.LEA_rb(eax.value, -WORD * (FRAME_FIXED_SIZE-1))
+            self.mc.PUSH_r(eax.value)
+            #
+        elif IS_X86_64:
+            assert OFFSTACK_REAL_FRAME >= len(save_regs) - 1
+            # there is only 1 PUSH in total here.  With the retaddr, the
+            # stack remains aligned.
+            for j in range(len(save_regs)-1, 0, -1):
+                self.mc.MOV_sr(j*WORD, save_regs[j].value)
+            self.mc.PUSH_r(save_regs[0].value)
+            #
+            # fish fish fish (see above)
+            self.mc.MOV_rs(esi.value, WORD)     # load the retaddr
+            self.mc.MOV32_rm(esi.value, (esi.value,
+                                         -self.realloc_bridge_ofs))
+            #
+            self.mc.LEA_rb(edi.value, -WORD * (FRAME_FIXED_SIZE-1))
+        #
+        self.mc.CALL(imm(self.offstack_realloc_addr))
+        #
+        # load the updated ebp
+        self.mc.LEA_rm(ebp.value, (eax.value, WORD * (FRAME_FIXED_SIZE-1)))
+        #
+        # fix the OFFSTACK_SIZE_ALLOCATED in the updated memory location
+        if IS_X86_32:
+            self.mc.ADD_ri(esp.value, 2*WORD)
+        self.mc.MOV_rs(eax.value, WORD)      # load the retaddr again
+        self.mc.MOV32_rm(eax.value, (eax.value, -self.realloc_bridge_ofs))
+        self.mc.MOV_br(WORD * OFFSTACK_SIZE_ALLOCATED, eax.value)
+        #
+        gcrootmap = self.cpu.gc_ll_descr.gcrootmap
+        if gcrootmap is not None and gcrootmap.is_shadow_stack:
+            self._fixup_shadowstack_location(gcrootmap)
+        #
+        # restore all registers and return
+        if IS_X86_32:
+            self.mc.POP_r(save_regs[2].value)
+            self.mc.MOV_rs(save_regs[1].value, 2*WORD)
+            self.mc.MOV_rs(save_regs[0].value, 1*WORD)
+        elif IS_X86_64:
+            self.mc.POP_r(save_regs[0].value)
+            for j in range(len(save_regs)-1, 0, -1):
+                self.mc.MOV_rs(save_regs[j].value, j*WORD)
+        self.mc.RET()
+        #
+        rawstart = self.mc.materialize(self.cpu.asmmemmgr, [])
+        self.realloc_bridge_addr = rawstart
+        self.mc = None
+
     def _build_stack_check_slowpath(self):
         _, _, slowpathaddr = self.cpu.insert_stack_check()
         if slowpathaddr == 0 or self.cpu.propagate_exception_v < 0:
@@ -727,56 +816,20 @@
         return self.mc.get_relative_pos() - 4
 
     def _enter_bridge_code(self, regalloc):
-        # XXX XXX far too heavy saving and restoring
-        j = 0
-        if self.cpu.supports_floats:
-            for reg in regalloc.xrm.save_around_call_regs:
-                self.mc.MOVSD_sx(j, reg.value)
-                j += 8
-        #
-        save_regs = regalloc.rm.save_around_call_regs
-        if IS_X86_32:
-            assert len(save_regs) == 3
-            self.mc.MOV_sr(j, save_regs[0].value)
-            self.mc.PUSH_r(save_regs[1].value)
-            self.mc.PUSH_r(save_regs[2].value)
-            # 4 PUSHes in total, stack remains aligned
-            self.mc.PUSH_i32(0x77777777)     # patched later
-            result = self.mc.get_relative_pos() - 4
-            self.mc.LEA_rb(eax.value, -WORD * (FRAME_FIXED_SIZE-1))
-            self.mc.PUSH_r(eax.value)
-        elif IS_X86_64:
-            # an even number of PUSHes, stack remains aligned
-            assert len(save_regs) & 1 == 0
-            for reg in save_regs:
-                self.mc.PUSH_r(reg.value)
-            self.mc.LEA_rb(edi.value, -WORD * (FRAME_FIXED_SIZE-1))
-            self.mc.MOV_riu32(esi.value, 0x77777777)   # patched later
-            result = self.mc.get_relative_pos() - 4
-        #
-        self.mc.CALL(imm(self.offstack_realloc_addr))
-        #
-        self.mc.LEA_rm(ebp.value, (eax.value, WORD * (FRAME_FIXED_SIZE-1)))
-        #
-        gcrootmap = self.cpu.gc_ll_descr.gcrootmap
-        if gcrootmap is not None and gcrootmap.is_shadow_stack:
-            self._fixup_shadowstack_location(gcrootmap)
-        #
-        if IS_X86_32:
-            self.mc.ADD_ri(esp.value, 2*WORD)
-            self.mc.POP_r(save_regs[2].value)
-            self.mc.POP_r(save_regs[1].value)
-            self.mc.MOV_rs(save_regs[0].value, j)
-        elif IS_X86_64:
-            for i in range(len(save_regs)-1, -1, -1):
-                self.mc.POP_r(save_regs[i].value)
-        #
-        if self.cpu.supports_floats:
-            j = 0
-            for reg in regalloc.xrm.save_around_call_regs:
-                self.mc.MOVSD_xs(reg.value, j)
-                j += 8
-        #
+        self.mc.CMP_bi(WORD * OFFSTACK_SIZE_ALLOCATED, 0x77777777)
+        result = self.mc.get_relative_pos() - 4
+        self.mc.J_il8(rx86.Conditions['NB'], 0)     # JNB .skip
+        jnb_location = self.mc.get_relative_pos()
+        if WORD == 4:
+            self.mc.CALL(imm(self.realloc_bridge_addr))
+        else:
+            # must always use the long, 13-bytes encoding here
+            self.mc.MOV_ri64(r11.value, self.realloc_bridge_addr)
+            self.mc.CALL_r(r11.value)
+        assert self.mc.get_relative_pos() - result == self.realloc_bridge_ofs
+        offset = self.mc.get_relative_pos() - jnb_location
+        assert 0 <= offset <= 127
+        self.mc.overwrite(jnb_location-1, chr(offset))
         return result
 
     def _patch_stackadjust(self, adr_to_fix, allocated_depth):
@@ -802,38 +855,49 @@
         return -WORD * aligned_words
 
     def _call_header(self):
-        # NB. the shape of the frame is hard-coded in get_basic_shape() too.
-        # Also, make sure this is consistent with FRAME_FIXED_SIZE.
+        # the frame always has a fixed size of OFFSTACK_REAL_FRAME words.
+        self.mc.SUB_ri(esp.value, WORD * OFFSTACK_REAL_FRAME)
+        #
         if IS_X86_32:
-            self.mc.SUB_ri(esp.value, WORD * (OFFSTACK_REAL_FRAME-1))
-            self.mc.PUSH_i32(0x77777777)     # temporary
+            # save (and later restore) the value of edi
+            self.mc.MOV_sr(WORD, edi.value)
+            self.mc.MOV_ri(edi.value, 0x77777777)     # temporary
         elif IS_X86_64:
-            # XXX very heavily save and restore all possible argument registers
+            # XXX need to save and restore all possible argument registers
             save_regs = [r9, r8, ecx, edx, esi, edi]
-            save_xmm_regs = [xmm7, xmm6, xmm5, xmm4, xmm3, xmm2, xmm1, xmm0]
-            assert OFFSTACK_REAL_FRAME >= len(save_regs) + len(save_xmm_regs)
-            self.mc.SUB_ri(esp.value, WORD * OFFSTACK_REAL_FRAME)
+            assert OFFSTACK_REAL_FRAME > len(save_regs)
             for i in range(len(save_regs)):
-                self.mc.MOV_sr(WORD * i, save_regs[i].value)
-            base = len(save_regs)
-            for i in range(len(save_xmm_regs)):
-                self.mc.MOVSD_sx(WORD * (base + i), save_xmm_regs[i].value)
-            #
+                self.mc.MOV_sr(WORD * (1 + i), save_regs[i].value)
+            # assume that the XMM registers are safe.
             self.mc.MOV_riu32(edi.value, 0x77777777)     # temporary
         frame_size_pos = self.mc.get_relative_pos() - 4
         #
+        self.mc.MOV_sr(0, edi.value)
         self.mc.CALL(imm(self.offstack_malloc_addr))
         #
-        if IS_X86_64:
-            for i in range(len(save_regs)):
-                self.mc.MOV_rs(save_regs[i].value, WORD * i)
-            base = len(save_regs)
-            for i in range(len(save_xmm_regs)):
-                self.mc.MOVSD_xs(save_xmm_regs[i].value, WORD * (base + i))
-        #
+        # save in the freshly malloc'ed block the original value of ebp
         self.mc.MOV_mr((eax.value, WORD * (FRAME_FIXED_SIZE-1)),
                        ebp.value)                      # (new ebp) <- ebp
         self.mc.LEA_rm(ebp.value, (eax.value, WORD * (FRAME_FIXED_SIZE-1)))
+        #
+        # save in OFFSTACK_SIZE_ALLOCATED the allocated size
+        if IS_X86_32:
+            # edi is preserved by the CALL above
+            self.mc.MOV_br(WORD * OFFSTACK_SIZE_ALLOCATED, edi.value)
+            # now restore to original value of edi
+            self.mc.MOV_rs(edi.value, WORD)
+            #
+        elif IS_X86_64:
+            # reload edi from the stack and save it in the freshly
+            # malloc'ed block
+            self.mc.MOV_rs(edi.value, 0)
+            self.mc.MOV_br(WORD * OFFSTACK_SIZE_ALLOCATED, edi.value)
+            # reload the original value of the save_regs (including edi)
+            for i in range(len(save_regs)):
+                self.mc.MOV_rs(save_regs[i].value, WORD * (1 + i))
+        #
+        # save in the freshly malloc'ed block the original value of
+        # all other callee-saved registers
         for i in range(len(self.cpu.CALLEE_SAVE_REGISTERS)):
             loc = self.cpu.CALLEE_SAVE_REGISTERS[i]
             self.mc.MOV_br(WORD*(-1-i), loc.value)     # (ebp-4-4*i) <- reg
diff --git a/pypy/jit/backend/x86/rx86.py b/pypy/jit/backend/x86/rx86.py
--- a/pypy/jit/backend/x86/rx86.py
+++ b/pypy/jit/backend/x86/rx86.py
@@ -531,6 +531,7 @@
 
     PUSH_r = insn(rex_nw, register(1), '\x50')
     PUSH_b = insn(rex_nw, '\xFF', orbyte(6<<3), stack_bp(1))
+    PUSH_m = insn(rex_nw, '\xFF', orbyte(6<<3), mem_reg_plus_const(1))
     PUSH_i32 = insn('\x68', immediate(1, 'i'))
 
     POP_r = insn(rex_nw, register(1), '\x58')
_______________________________________________
pypy-commit mailing list
[email protected]
http://mail.python.org/mailman/listinfo/pypy-commit

Reply via email to