Author: Armin Rigo <[email protected]>
Branch: 
Changeset: r85021:78ccb9fbb54f
Date: 2016-06-08 08:53 +0200
http://bitbucket.org/pypy/pypy/changeset/78ccb9fbb54f/

Log:    Change the registers used by 'malloc_nursery' so that it no longer
        forces away 'eax', which likely still holds the result of a
        preceding 'call'.

        Add a before_call()-like step before a few operations such as
        'malloc_nursery', to move the unrelated values out of the needed
        registers into other registers instead of spilling them to the
        stack.

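To illustrate the hint added below, here is a self-contained toy sketch of
the idea behind spill_or_move_registers_before_call() (plain Python, not the
actual RPython register manager; ToyAllocator, its 'survives' argument and
the register lists are simplified stand-ins): for every register in
'save_sublist' that holds a value surviving the current operation, prefer a
cheap register-register move into a free callee-saved register over a spill
to the stack.

    CALLER_SAVED = ["eax", "ecx", "edx"]
    CALLEE_SAVED = ["ebx", "esi", "edi"]

    class ToyAllocator(object):
        def __init__(self, bindings):
            self.reg_bindings = dict(bindings)   # variable name -> register
            self.stack = {}                      # variable name -> spill slot

        def _free_regs(self):
            used = set(self.reg_bindings.values())
            return [r for r in CALLER_SAVED + CALLEE_SAVED if r not in used]

        def spill_or_move_registers_before_call(self, save_sublist, survives):
            for var, reg in list(self.reg_bindings.items()):
                if reg not in save_sublist or not survives(var):
                    continue
                # prefer moving into a free register *outside* the
                # caller-saved set (cheap on x86, and avoids ping-pong)
                candidates = [r for r in self._free_regs()
                              if r not in CALLER_SAVED]
                if candidates:
                    self.reg_bindings[var] = candidates[0]  # MOV cand, reg
                else:
                    del self.reg_bindings[var]              # MOV [slot], reg
                    self.stack[var] = len(self.stack)

    # usage: 'x' lives in ecx and survives the allocation; the hint frees
    # ecx/edx for malloc_nursery by moving 'x' into a callee-saved register
    alloc = ToyAllocator({"x": "ecx", "y": "ebx"})
    alloc.spill_or_move_registers_before_call(["ecx", "edx"],
                                              survives=lambda v: True)
    assert alloc.reg_bindings["x"] in CALLEE_SAVED
    assert not alloc.stack

Unlike before_call(), which always considers the whole caller-saved set, the
new helper takes only the sublist of registers that the upcoming operation
actually needs, so values parked in other caller-saved registers stay put.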
diff --git a/rpython/jit/backend/arm/regalloc.py b/rpython/jit/backend/arm/regalloc.py
--- a/rpython/jit/backend/arm/regalloc.py
+++ b/rpython/jit/backend/arm/regalloc.py
@@ -901,6 +901,8 @@
         size_box = op.getarg(0)
         assert isinstance(size_box, ConstInt)
         size = size_box.getint()
+        # hint: try to move unrelated registers away from r0 and r1 now
+        self.rm.spill_or_move_registers_before_call([r.r0, r.r1])
 
         self.rm.force_allocate_reg(op, selected_reg=r.r0)
         t = TempInt()
@@ -924,6 +926,7 @@
         # sizeloc must be in a register, but we can free it now
         # (we take care explicitly of conflicts with r0 or r1)
         sizeloc = self.rm.make_sure_var_in_reg(size_box)
+        self.rm.spill_or_move_registers_before_call([r.r0, r.r1]) # sizeloc safe
         self.rm.possibly_free_var(size_box)
         #
         self.rm.force_allocate_reg(op, selected_reg=r.r0)
@@ -951,6 +954,11 @@
         arraydescr = op.getdescr()
         length_box = op.getarg(2)
         assert not isinstance(length_box, Const) # we cannot have a const here!
+        # can only use spill_or_move_registers_before_call() as a hint if
+        # we are sure that length_box stays alive and won't be overridden
+        # (it should always be the case, see below, but better safe than sorry)
+        if self.rm.stays_alive(length_box):
+            self.rm.spill_or_move_registers_before_call([r.r0, r.r1])
         # the result will be in r0
         self.rm.force_allocate_reg(op, selected_reg=r.r0)
         # we need r1 as a temporary
diff --git a/rpython/jit/backend/llsupport/regalloc.py b/rpython/jit/backend/llsupport/regalloc.py
--- a/rpython/jit/backend/llsupport/regalloc.py
+++ b/rpython/jit/backend/llsupport/regalloc.py
@@ -579,11 +579,26 @@
         new_free_regs.append(self.reg_bindings.pop(v))
 
     def before_call(self, force_store=[], save_all_regs=0):
-        """Spill or move some registers before a call.  By default,
-        this means: for every register in 'self.save_around_call_regs',
+        self.spill_or_move_registers_before_call(self.save_around_call_regs,
+                                                 force_store, save_all_regs)
+
+    def spill_or_move_registers_before_call(self, save_sublist,
+                                            force_store=[], save_all_regs=0):
+        """Spill or move some registers before a call.
+
+        By default, this means: for every register in 'save_sublist',
         if there is a variable there and it survives longer than
         the current operation, then it is spilled/moved somewhere else.
 
+        WARNING: this might do the equivalent of possibly_free_vars()
+        on variables dying in the current operation.  It won't
+        immediately overwrite registers that used to be occupied by
+        these variables, though.  Use this function *after* you finished
+        calling self.loc() or self.make_sure_var_in_reg(), i.e. when you
+        know the location of all input arguments.  These locations stay
+        valid, but only *if they are in self.save_around_call_regs,*
+        not if they are callee-saved registers!
+
         'save_all_regs' can be 0 (default set of registers), 1 (do that
         for all registers), or 2 (default + gc ptrs).
 
@@ -612,6 +627,16 @@
         anyway, as a local hack in this function, because on x86 CPUs
         such register-register moves are almost free.
         """
+        if not we_are_translated():
+            # 'save_sublist' is either the whole
+            # 'self.save_around_call_regs', or a sublist thereof, and
+            # then only those registers are spilled/moved.  But when
+            # we move them, we never move them to other registers in
+            # 'self.save_around_call_regs', to avoid ping-pong effects
+            # where the same value is constantly moved around.
+            for reg in save_sublist:
+                assert reg in self.save_around_call_regs
+
         new_free_regs = []
         move_or_spill = []
 
@@ -631,7 +656,7 @@
                 # we need to spill all GC ptrs in this mode
                 self._bc_spill(v, new_free_regs)
                 #
-            elif reg not in self.save_around_call_regs:
+            elif reg not in save_sublist:
                 continue  # in a register like ebx/rbx: it is fine where it is
                 #
             else:
@@ -663,6 +688,7 @@
                 if not we_are_translated():
                     if move_or_spill:
                         assert max_age <= min([_a for _, _a in move_or_spill])
+                    assert reg in save_sublist
                     assert reg in self.save_around_call_regs
                     assert new_reg not in self.save_around_call_regs
                 self.assembler.regalloc_mov(reg, new_reg)
diff --git a/rpython/jit/backend/x86/assembler.py b/rpython/jit/backend/x86/assembler.py
--- a/rpython/jit/backend/x86/assembler.py
+++ b/rpython/jit/backend/x86/assembler.py
@@ -204,20 +204,20 @@
 
     def _build_malloc_slowpath(self, kind):
         """ While arriving on slowpath, we have a gcpattern on stack 0.
-        The arguments are passed in eax and edi, as follows:
+        The arguments are passed in ecx and edx, as follows:
 
-        kind == 'fixed': nursery_head in eax and the size in edi - eax.
+        kind == 'fixed': nursery_head in ecx and the size in (edx - ecx).
 
-        kind == 'str/unicode': length of the string to allocate in edi.
+        kind == 'str/unicode': length of the string to allocate in edx.
 
-        kind == 'var': length to allocate in edi, tid in eax,
+        kind == 'var': length to allocate in edx, tid in ecx,
                        and itemsize in the stack 1 (position esp+WORD).
 
-        This function must preserve all registers apart from eax and edi.
+        This function must preserve all registers apart from ecx and edx.
         """
         assert kind in ['fixed', 'str', 'unicode', 'var']
         mc = codebuf.MachineCodeBlockWrapper()
-        self._push_all_regs_to_frame(mc, [eax, edi], self.cpu.supports_floats)
+        self._push_all_regs_to_frame(mc, [ecx, edx], self.cpu.supports_floats)
         # the caller already did push_gcmap(store=True)
         #
         if kind == 'fixed':
@@ -231,32 +231,32 @@
         mc.SUB_ri(esp.value, 16 - WORD)  # restore 16-byte alignment
         # magically, the above is enough on X86_32 to reserve 3 stack places
         if kind == 'fixed':
-            mc.SUB_rr(edi.value, eax.value) # compute the size we want
-            # the arg is already in edi
+            mc.SUB_rr(edx.value, ecx.value) # compute the size we want
             if IS_X86_32:
-                mc.MOV_sr(0, edi.value)
+                mc.MOV_sr(0, edx.value)     # store the length
                 if hasattr(self.cpu.gc_ll_descr, 'passes_frame'):
-                    mc.MOV_sr(WORD, ebp.value)
-            elif hasattr(self.cpu.gc_ll_descr, 'passes_frame'):
-                # for tests only
-                mc.MOV_rr(esi.value, ebp.value)
+                    mc.MOV_sr(WORD, ebp.value)        # for tests only
+            else:
+                mc.MOV_rr(edi.value, edx.value)   # length argument
+                if hasattr(self.cpu.gc_ll_descr, 'passes_frame'):
+                    mc.MOV_rr(esi.value, ebp.value)   # for tests only
         elif kind == 'str' or kind == 'unicode':
             if IS_X86_32:
                 # stack layout: [---][---][---][ret].. with 3 free stack places
-                mc.MOV_sr(0, edi.value)     # store the length
-            else:
-                pass                        # length already in edi
+                mc.MOV_sr(0, edx.value)     # store the length
+            elif IS_X86_64:
+                mc.MOV_rr(edi.value, edx.value)   # length argument
         else:
             if IS_X86_32:
                 # stack layout: [---][---][---][ret][gcmap][itemsize]...
-                mc.MOV_sr(WORD * 2, edi.value)  # store the length
-                mc.MOV_sr(WORD * 1, eax.value)  # store the tid
-                mc.MOV_rs(edi.value, WORD * 5)  # load the itemsize
-                mc.MOV_sr(WORD * 0, edi.value)  # store the itemsize
+                mc.MOV_sr(WORD * 2, edx.value)  # store the length
+                mc.MOV_sr(WORD * 1, ecx.value)  # store the tid
+                mc.MOV_rs(edx.value, WORD * 5)  # load the itemsize
+                mc.MOV_sr(WORD * 0, edx.value)  # store the itemsize
             else:
                 # stack layout: [---][ret][gcmap][itemsize]...
-                mc.MOV_rr(edx.value, edi.value) # length
-                mc.MOV_rr(esi.value, eax.value) # tid
+                # (already in edx)              # length
+                mc.MOV_rr(esi.value, ecx.value) # tid
                 mc.MOV_rs(edi.value, WORD * 3)  # load the itemsize
         self.set_extra_stack_depth(mc, 16)
         mc.CALL(imm(follow_jump(addr)))
@@ -267,10 +267,11 @@
         mc.TEST_rr(eax.value, eax.value)
         mc.J_il(rx86.Conditions['Z'], 0xfffff) # patched later
         jz_location = mc.get_relative_pos()
+        mc.MOV_rr(ecx.value, eax.value)
         #
         nursery_free_adr = self.cpu.gc_ll_descr.get_nursery_free_addr()
-        self._pop_all_regs_from_frame(mc, [eax, edi], self.cpu.supports_floats)
-        mc.MOV(edi, heap(nursery_free_adr))   # load this in EDI
+        self._pop_all_regs_from_frame(mc, [ecx, edx], self.cpu.supports_floats)
+        mc.MOV(edx, heap(nursery_free_adr))   # load this in EDX
         self.pop_gcmap(mc)   # push_gcmap(store=True) done by the caller
         mc.RET()
         #
@@ -2441,9 +2442,9 @@
 
     def malloc_cond(self, nursery_free_adr, nursery_top_adr, size, gcmap):
         assert size & (WORD-1) == 0     # must be correctly aligned
-        self.mc.MOV(eax, heap(nursery_free_adr))
-        self.mc.LEA_rm(edi.value, (eax.value, size))
-        self.mc.CMP(edi, heap(nursery_top_adr))
+        self.mc.MOV(ecx, heap(nursery_free_adr))
+        self.mc.LEA_rm(edx.value, (ecx.value, size))
+        self.mc.CMP(edx, heap(nursery_top_adr))
         self.mc.J_il8(rx86.Conditions['NA'], 0) # patched later
         jmp_adr = self.mc.get_relative_pos()
         # save the gcmap
@@ -2452,19 +2453,19 @@
         offset = self.mc.get_relative_pos() - jmp_adr
         assert 0 < offset <= 127
         self.mc.overwrite(jmp_adr-1, chr(offset))
-        self.mc.MOV(heap(nursery_free_adr), edi)
+        self.mc.MOV(heap(nursery_free_adr), edx)
 
     def malloc_cond_varsize_frame(self, nursery_free_adr, nursery_top_adr,
                                   sizeloc, gcmap):
-        if sizeloc is eax:
-            self.mc.MOV(edi, sizeloc)
-            sizeloc = edi
-        self.mc.MOV(eax, heap(nursery_free_adr))
-        if sizeloc is edi:
-            self.mc.ADD_rr(edi.value, eax.value)
+        if sizeloc is ecx:
+            self.mc.MOV(edx, sizeloc)
+            sizeloc = edx
+        self.mc.MOV(ecx, heap(nursery_free_adr))
+        if sizeloc is edx:
+            self.mc.ADD_rr(edx.value, ecx.value)
         else:
-            self.mc.LEA_ra(edi.value, (eax.value, sizeloc.value, 0, 0))
-        self.mc.CMP(edi, heap(nursery_top_adr))
+            self.mc.LEA_ra(edx.value, (ecx.value, sizeloc.value, 0, 0))
+        self.mc.CMP(edx, heap(nursery_top_adr))
         self.mc.J_il8(rx86.Conditions['NA'], 0) # patched later
         jmp_adr = self.mc.get_relative_pos()
         # save the gcmap
@@ -2473,7 +2474,7 @@
         offset = self.mc.get_relative_pos() - jmp_adr
         assert 0 < offset <= 127
         self.mc.overwrite(jmp_adr-1, chr(offset))
-        self.mc.MOV(heap(nursery_free_adr), edi)
+        self.mc.MOV(heap(nursery_free_adr), edx)
 
     def malloc_cond_varsize(self, kind, nursery_free_adr, nursery_top_adr,
                             lengthloc, itemsize, maxlength, gcmap,
@@ -2482,39 +2483,39 @@
         assert isinstance(arraydescr, ArrayDescr)
 
         # lengthloc is the length of the array, which we must not modify!
-        assert lengthloc is not eax and lengthloc is not edi
+        assert lengthloc is not ecx and lengthloc is not edx
         if isinstance(lengthloc, RegLoc):
             varsizeloc = lengthloc
         else:
-            self.mc.MOV(edi, lengthloc)
-            varsizeloc = edi
+            self.mc.MOV(edx, lengthloc)
+            varsizeloc = edx
 
         self.mc.CMP(varsizeloc, imm(maxlength))
         self.mc.J_il8(rx86.Conditions['A'], 0) # patched later
         jmp_adr0 = self.mc.get_relative_pos()
 
-        self.mc.MOV(eax, heap(nursery_free_adr))
+        self.mc.MOV(ecx, heap(nursery_free_adr))
         if valid_addressing_size(itemsize):
             shift = get_scale(itemsize)
         else:
-            shift = self._imul_const_scaled(self.mc, edi.value,
+            shift = self._imul_const_scaled(self.mc, edx.value,
                                             varsizeloc.value, itemsize)
-            varsizeloc = edi
+            varsizeloc = edx
 
-        # now varsizeloc is a register != eax.  The size of
+        # now varsizeloc is a register != ecx.  The size of
         # the variable part of the array is (varsizeloc << shift)
         assert arraydescr.basesize >= self.gc_minimal_size_in_nursery
         constsize = arraydescr.basesize + self.gc_size_of_header
         force_realignment = (itemsize % WORD) != 0
         if force_realignment:
             constsize += WORD - 1
-        self.mc.LEA_ra(edi.value, (eax.value, varsizeloc.value, shift,
+        self.mc.LEA_ra(edx.value, (ecx.value, varsizeloc.value, shift,
                                    constsize))
         if force_realignment:
-            self.mc.AND_ri(edi.value, ~(WORD - 1))
-        # now edi contains the total size in bytes, rounded up to a multiple
+            self.mc.AND_ri(edx.value, ~(WORD - 1))
+        # now edx contains the total size in bytes, rounded up to a multiple
         # of WORD, plus nursery_free_adr
-        self.mc.CMP(edi, heap(nursery_top_adr))
+        self.mc.CMP(edx, heap(nursery_top_adr))
         self.mc.J_il8(rx86.Conditions['NA'], 0) # patched later
         jmp_adr1 = self.mc.get_relative_pos()
         #
@@ -2525,8 +2526,8 @@
         self.push_gcmap(self.mc, gcmap, store=True)
         if kind == rewrite.FLAG_ARRAY:
             self.mc.MOV_si(WORD, itemsize)
-            self.mc.MOV(edi, lengthloc)
-            self.mc.MOV_ri(eax.value, arraydescr.tid)
+            self.mc.MOV(edx, lengthloc)
+            self.mc.MOV_ri(ecx.value, arraydescr.tid)
             addr = self.malloc_slowpath_varsize
         else:
             if kind == rewrite.FLAG_STR:
@@ -2534,7 +2535,7 @@
             else:
                 assert kind == rewrite.FLAG_UNICODE
                 addr = self.malloc_slowpath_unicode
-            self.mc.MOV(edi, lengthloc)
+            self.mc.MOV(edx, lengthloc)
         self.mc.CALL(imm(follow_jump(addr)))
         self.mc.JMP_l8(0)      # jump to done, patched later
         jmp_location = self.mc.get_relative_pos()
@@ -2544,9 +2545,9 @@
         self.mc.overwrite(jmp_adr1-1, chr(offset))
         self.mc.force_frame_size(DEFAULT_FRAME_BYTES)
         # write down the tid, but not if it's the result of the CALL
-        self.mc.MOV(mem(eax, 0), imm(arraydescr.tid))
+        self.mc.MOV(mem(ecx, 0), imm(arraydescr.tid))
         # while we're at it, this line is not needed if we've done the CALL
-        self.mc.MOV(heap(nursery_free_adr), edi)
+        self.mc.MOV(heap(nursery_free_adr), edx)
         #
         offset = self.mc.get_relative_pos() - jmp_location
         assert 0 < offset <= 127
diff --git a/rpython/jit/backend/x86/regalloc.py b/rpython/jit/backend/x86/regalloc.py
--- a/rpython/jit/backend/x86/regalloc.py
+++ b/rpython/jit/backend/x86/regalloc.py
@@ -952,14 +952,16 @@
         size_box = op.getarg(0)
         assert isinstance(size_box, ConstInt)
         size = size_box.getint()
-        # looking at the result
-        self.rm.force_allocate_reg(op, selected_reg=eax)
+        # hint: try to move unrelated registers away from ecx and edx now
+        self.rm.spill_or_move_registers_before_call([ecx, edx])
+        # the result will be in ecx
+        self.rm.force_allocate_reg(op, selected_reg=ecx)
         #
-        # We need edi as a temporary, but otherwise don't save any more
+        # We need edx as a temporary, but otherwise don't save any more
         # register.  See comments in _build_malloc_slowpath().
         tmp_box = TempVar()
-        self.rm.force_allocate_reg(tmp_box, selected_reg=edi)
-        gcmap = self.get_gcmap([eax, edi]) # allocate the gcmap *before*
+        self.rm.force_allocate_reg(tmp_box, selected_reg=edx)
+        gcmap = self.get_gcmap([ecx, edx]) # allocate the gcmap *before*
         self.rm.possibly_free_var(tmp_box)
         #
         gc_ll_descr = self.assembler.cpu.gc_ll_descr
@@ -972,15 +974,16 @@
         size_box = op.getarg(0)
         assert not isinstance(size_box, Const) # we cannot have a const here!
         # sizeloc must be in a register, but we can free it now
-        # (we take care explicitly of conflicts with eax or edi)
+        # (we take care explicitly of conflicts with ecx or edx)
         sizeloc = self.rm.make_sure_var_in_reg(size_box)
+        self.rm.spill_or_move_registers_before_call([ecx, edx])  # sizeloc safe
         self.rm.possibly_free_var(size_box)
-        # the result will be in eax
-        self.rm.force_allocate_reg(op, selected_reg=eax)
-        # we need edi as a temporary
+        # the result will be in ecx
+        self.rm.force_allocate_reg(op, selected_reg=ecx)
+        # we need edx as a temporary
         tmp_box = TempVar()
-        self.rm.force_allocate_reg(tmp_box, selected_reg=edi)
-        gcmap = self.get_gcmap([eax, edi]) # allocate the gcmap *before*
+        self.rm.force_allocate_reg(tmp_box, selected_reg=edx)
+        gcmap = self.get_gcmap([ecx, edx]) # allocate the gcmap *before*
         self.rm.possibly_free_var(tmp_box)
         #
         gc_ll_descr = self.assembler.cpu.gc_ll_descr
@@ -997,16 +1000,21 @@
         arraydescr = op.getdescr()
         length_box = op.getarg(2)
         assert not isinstance(length_box, Const) # we cannot have a const here!
-        # the result will be in eax
-        self.rm.force_allocate_reg(op, selected_reg=eax)
-        # we need edi as a temporary
+        # can only use spill_or_move_registers_before_call() as a hint if
+        # we are sure that length_box stays alive and won't be overridden
+        # (it should always be the case, see below, but better safe than sorry)
+        if self.rm.stays_alive(length_box):
+            self.rm.spill_or_move_registers_before_call([ecx, edx])
+        # the result will be in ecx
+        self.rm.force_allocate_reg(op, selected_reg=ecx)
+        # we need edx as a temporary
         tmp_box = TempVar()
-        self.rm.force_allocate_reg(tmp_box, selected_reg=edi)
-        gcmap = self.get_gcmap([eax, edi]) # allocate the gcmap *before*
+        self.rm.force_allocate_reg(tmp_box, selected_reg=edx)
+        gcmap = self.get_gcmap([ecx, edx]) # allocate the gcmap *before*
         self.rm.possibly_free_var(tmp_box)
         # length_box always survives: it's typically also present in the
         # next operation that will copy it inside the new array.  It's
-        # fine to load it from the stack too, as long as it's != eax, edi.
+        # fine to load it from the stack too, as long as it is != ecx, edx.
         lengthloc = self.rm.loc(length_box)
         self.rm.possibly_free_var(length_box)
         #
@@ -1225,6 +1233,8 @@
             raise AssertionError("bad unicode item size")
 
     def _consider_math_read_timestamp(self, op):
+        # hint: try to move unrelated registers away from eax and edx now
+        self.rm.spill_or_move_registers_before_call([eax, edx])
         tmpbox_high = TempVar()
         self.rm.force_allocate_reg(tmpbox_high, selected_reg=eax)
         if longlong.is_64_bit:
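
A closing note on the register choice: on x86 the result of a CALL arrives
in eax, so the old eax/edi pair used by malloc_nursery forced that result to
be moved or spilled whenever an allocation followed a call.  A minimal
illustration (assumed, simplified model; not backend code):

    CALL_RESULT_REG = "eax"

    def regs_forced_away(malloc_regs, live_regs):
        """Registers whose current values must be moved or spilled
        before emitting the inlined nursery allocation."""
        return sorted(set(malloc_regs) & set(live_regs))

    live = {CALL_RESULT_REG}                       # value just returned by a CALL
    print(regs_forced_away(["eax", "edi"], live))  # old choice -> ['eax']
    print(regs_forced_away(["ecx", "edx"], live))  # new choice -> []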