Author: Armin Rigo <ar...@tunes.org>
Branch: ppc-updated-backend
Changeset: r79807:293103ba23de
Date: 2015-09-24 10:45 +0200
http://bitbucket.org/pypy/pypy/changeset/293103ba23de/

Log:    PPC Backend #5: GC

        Support for a non-testing, non-Boehm GC. Implemented the various
        kinds of mallocs, the write barriers, and other GC-related code like
        pushing and popping the gcmap, reloading the jitframe, etc.

        Update some tests outside the PPC backend, too.

diff too long, truncating to 2000 out of 2034 lines

diff --git a/rpython/jit/backend/arm/test/test_runner.py 
b/rpython/jit/backend/arm/test/test_runner.py
--- a/rpython/jit/backend/arm/test/test_runner.py
+++ b/rpython/jit/backend/arm/test/test_runner.py
@@ -26,24 +26,24 @@
     # for the individual tests see
     # ====> ../../test/runner_test.py
 
-    add_loop_instructions = ['ldr', 'adds', 'cmp', 'beq', 'b']
-    bridge_loop_instructions = ['ldr', 'mov', 'nop', 'cmp', 'bge',
-                                'push', 'mov', 'mov', 'push', 'mov', 'mov',
-                                'blx', 'mov', 'mov', 'bx']
+    add_loop_instructions = 'ldr; adds; cmp; beq; b;$'
+    bridge_loop_instructions = ('ldr; mov; nop; cmp; bge; '
+                                'push; mov; mov; push; mov; mov; '
+                                'blx; mov; mov; bx;$')
     arch_version = detect_arch_version()
     if arch_version == 7:
-        bridge_loop_instructions = ['ldr', 'mov', 'nop', 'cmp', 'bge',
-                                    'push', 'mov', 'mov', 'push', 'mov', 'mov',
-                                    'blx', 'mov', 'mov', 'bx']
+        bridge_loop_instructions = ('ldr; mov; nop; cmp; bge; '
+                                    'push; mov; mov; push; mov; mov; '
+                                    'blx; mov; mov; bx;$')
     else:
-        bridge_loop_instructions = ['ldr', 'mov', 'nop', 'nop', 'nop', 'cmp', 
'bge',
-                              'push', 'ldr', 'mov',
-                              '*', # inline constant
-                              'push', 'ldr', 'mov',
-                              '*', # inline constant
-                              'blx', 'ldr', 'mov',
-                              '*', # inline constant
-                              'bx']
+        bridge_loop_instructions = ('ldr; mov; nop; nop; nop; cmp; bge; '
+                                    'push; ldr; mov; '
+                                    '[^;]+; ' # inline constant
+                                    'push; ldr; mov; '
+                                    '[^;]+; ' # inline constant
+                                    'blx; ldr; mov; '
+                                    '[^;]+; ' # inline constant
+                                    'bx;$')
 
     def get_cpu(self):
         cpu = CPU(rtyper=None, stats=FakeStats())
diff --git a/rpython/jit/backend/llsupport/llmodel.py 
b/rpython/jit/backend/llsupport/llmodel.py
--- a/rpython/jit/backend/llsupport/llmodel.py
+++ b/rpython/jit/backend/llsupport/llmodel.py
@@ -25,6 +25,12 @@
 
     HAS_CODEMAP = False
 
+    done_with_this_frame_descr_int      = None   # overridden by pyjitpl.py
+    done_with_this_frame_descr_float    = None
+    done_with_this_frame_descr_ref      = None
+    done_with_this_frame_descr_void     = None
+    exit_frame_with_exception_descr_ref = None
+
     def __init__(self, rtyper, stats, opts, translate_support_code=False,
                  gcdescr=None):
         assert type(opts) is not bool
diff --git a/rpython/jit/backend/llsupport/test/test_gc_integration.py 
b/rpython/jit/backend/llsupport/test/test_gc_integration.py
--- a/rpython/jit/backend/llsupport/test/test_gc_integration.py
+++ b/rpython/jit/backend/llsupport/test/test_gc_integration.py
@@ -90,6 +90,8 @@
                 assert nos ==  [0, 1, 25]
         elif self.cpu.backend_name.startswith('arm'):
             assert nos == [0, 1, 47]
+        elif self.cpu.backend_name.startswith('ppc64'):
+            assert nos == [0, 1, 33]
         else:
             raise Exception("write the data here")
         assert frame.jf_frame[nos[0]]
@@ -155,6 +157,8 @@
         self.nursery = lltype.malloc(NTP, 64, flavor='raw')
         for i in range(64):
             self.nursery[i] = NOT_INITIALIZED
+        self.nursery_words = rffi.cast(rffi.CArrayPtr(lltype.Signed),
+                                       self.nursery)
         self.addrs = lltype.malloc(rffi.CArray(lltype.Signed), 2,
                                    flavor='raw')
         self.addrs[0] = rffi.cast(lltype.Signed, self.nursery)
@@ -263,11 +267,11 @@
         # slowpath never called
         assert gc_ll_descr.calls == []
 
-    def test_malloc_nursery_varsize(self):
+    def test_malloc_nursery_varsize_nonframe(self):
         self.cpu = self.getcpu(None)
         A = lltype.GcArray(lltype.Signed)
         arraydescr = self.cpu.arraydescrof(A)
-        arraydescr.tid = 15
+        arraydescr.tid = 1515
         ops = '''
         [i0, i1, i2]
         p0 = call_malloc_nursery_varsize(0, 8, i0, descr=arraydescr)
@@ -283,8 +287,8 @@
         assert rffi.cast(lltype.Signed, ref(0)) == nurs_adr + 0
         assert rffi.cast(lltype.Signed, ref(1)) == nurs_adr + 2*WORD + 8*1
         # check the nursery content and state
-        assert gc_ll_descr.nursery[0] == chr(15)
-        assert gc_ll_descr.nursery[2 * WORD + 8] == chr(15)
+        assert gc_ll_descr.nursery_words[0] == 1515
+        assert gc_ll_descr.nursery_words[2 + 8 // WORD] == 1515
         assert gc_ll_descr.addrs[0] == nurs_adr + (((4 * WORD + 8*1 + 5*2) + 
(WORD - 1)) & ~(WORD - 1))
         # slowpath never called
         assert gc_ll_descr.calls == []
@@ -323,11 +327,11 @@
                 idx = 1
             assert len(frame.jf_gcmap) == expected_size
             if self.cpu.IS_64_BIT:
-                assert frame.jf_gcmap[idx] == (1<<29) | (1 << 30)
+                exp_idx = self.cpu.JITFRAME_FIXED_SIZE + 1  # +1 from i0
             else:
                 assert frame.jf_gcmap[idx]
                 exp_idx = self.cpu.JITFRAME_FIXED_SIZE - 32 * idx + 1 # +1 
from i0
-                assert frame.jf_gcmap[idx] == (1 << (exp_idx + 1)) | (1 << 
exp_idx)
+            assert frame.jf_gcmap[idx] == (1 << (exp_idx + 1)) | (1 << exp_idx)
 
         self.cpu = self.getcpu(check)
         ops = '''
@@ -636,7 +640,9 @@
             frames.append(frame)
             new_frame = JITFRAME.allocate(frame.jf_frame_info)
             gcmap = unpack_gcmap(frame)
-            if self.cpu.IS_64_BIT:
+            if self.cpu.backend_name.startswith('ppc64'):
+                assert gcmap == [30, 31, 32]
+            elif self.cpu.IS_64_BIT:
                 assert gcmap == [28, 29, 30]
             elif self.cpu.backend_name.startswith('arm'):
                 assert gcmap == [44, 45, 46]
@@ -647,6 +653,8 @@
                 new_frame.jf_frame[item] = rffi.cast(lltype.Signed, s)
             assert cpu.gc_ll_descr.gcrootmap.stack[0] == 
rffi.cast(lltype.Signed, frame)
             cpu.gc_ll_descr.gcrootmap.stack[0] = rffi.cast(lltype.Signed, 
new_frame)
+            print '"Collecting" moved the frame from %d to %d' % (
+                i, cpu.gc_ll_descr.gcrootmap.stack[0])
             frames.append(new_frame)
 
         def check2(i):
diff --git a/rpython/jit/backend/ppc/callbuilder.py 
b/rpython/jit/backend/ppc/callbuilder.py
--- a/rpython/jit/backend/ppc/callbuilder.py
+++ b/rpython/jit/backend/ppc/callbuilder.py
@@ -111,10 +111,23 @@
 
 
     def push_gcmap(self):
-        pass  # XXX
+        # we push *now* the gcmap, describing the status of GC registers
+        # after the rearrangements done just before, ignoring the return
+        # value r3, if necessary
+        assert not self.is_call_release_gil
+        noregs = self.asm.cpu.gc_ll_descr.is_shadow_stack()
+        gcmap = self.asm._regalloc.get_gcmap([r.r3], noregs=noregs)
+        self.asm.push_gcmap(self.mc, gcmap, store=True)
 
     def pop_gcmap(self):
-        pass  # XXX
+        ssreg = None
+        gcrootmap = self.asm.cpu.gc_ll_descr.gcrootmap
+        if gcrootmap:
+            if gcrootmap.is_shadow_stack and self.is_call_release_gil:
+                # in this mode, 'ebx' happens to contain the shadowstack
+                # top at this point, so reuse it instead of loading it again
+                ssreg = ebx
+        self.asm._reload_frame_if_necessary(self.mc)
 
     def emit_raw_call(self):
         self.mc.raw_call()
diff --git a/rpython/jit/backend/ppc/codebuilder.py 
b/rpython/jit/backend/ppc/codebuilder.py
--- a/rpython/jit/backend/ppc/codebuilder.py
+++ b/rpython/jit/backend/ppc/codebuilder.py
@@ -1049,20 +1049,26 @@
     else:
         RAW_CALL_REG = r.r12
 
-    def raw_call(self):
-        """Emit a call to the address stored in the register RAW_CALL_REG."""
+    def raw_call(self, call_reg=RAW_CALL_REG):
+        """Emit a call to the address stored in the register 'call_reg',
+        which must be either RAW_CALL_REG or r12.  This is a regular C
+        function pointer, which means on big-endian that it is actually
+        the address of a three-words descriptor.
+        """
         if IS_BIG_ENDIAN:
             # Load the function descriptor (currently in r2) from memory:
             #  [r2 + 0]  -> ctr
             #  [r2 + 16] -> r11
             #  [r2 + 8]  -> r2  (= TOC)
             assert self.RAW_CALL_REG is r.r2
-            self.ld(r.SCRATCH.value, r.r2.value, 0)
-            self.ld(r.r11.value, r.r2.value, 16)
+            assert call_reg is r.r2 or call_reg is r.r12
+            self.ld(r.SCRATCH.value, call_reg.value, 0)
+            self.ld(r.r11.value, call_reg.value, 16)
             self.mtctr(r.SCRATCH.value)
-            self.ld(r.TOC.value, r.r2.value, 8)   # must be last: TOC is r2
+            self.ld(r.TOC.value, call_reg.value, 8)  # must be last: TOC is r2
         elif IS_LITTLE_ENDIAN:
             assert self.RAW_CALL_REG is r.r12     # 'r12' is fixed by this ABI
+            assert call_reg is r.r12
             self.mtctr(r.r12.value)
         # Call the function
         self.bctrl()
diff --git a/rpython/jit/backend/ppc/opassembler.py 
b/rpython/jit/backend/ppc/opassembler.py
--- a/rpython/jit/backend/ppc/opassembler.py
+++ b/rpython/jit/backend/ppc/opassembler.py
@@ -11,14 +11,15 @@
                                           IS_BIG_ENDIAN)
 
 from rpython.jit.metainterp.history import (JitCellToken, TargetToken, Box,
-                                            AbstractFailDescr, FLOAT, INT, REF)
+                                            AbstractFailDescr, FLOAT, INT, REF,
+                                            ConstInt)
 from rpython.rlib.objectmodel import we_are_translated
 from rpython.jit.backend.ppc.helper.assembler import (Saved_Volatiles)
 from rpython.jit.backend.ppc.jump import remap_frame_layout
 from rpython.jit.backend.ppc.codebuilder import (OverwritingBuilder, 
scratch_reg,
                                                  PPCBuilder, PPCGuardToken)
 from rpython.jit.backend.ppc.regalloc import TempPtr, TempInt
-from rpython.jit.backend.llsupport import symbolic
+from rpython.jit.backend.llsupport import symbolic, jitframe
 from rpython.jit.backend.llsupport.descr import InteriorFieldDescr, CallDescr
 from rpython.jit.backend.llsupport.gcmap import allocate_gcmap
 from rpython.rtyper.lltypesystem import rstr, rffi, lltype
@@ -26,6 +27,7 @@
 from rpython.jit.metainterp.resoperation import rop
 from rpython.jit.codewriter.effectinfo import EffectInfo
 from rpython.jit.backend.ppc import callbuilder
+from rpython.rlib.rarithmetic import r_uint
 
 class IntOpAssembler(object):
         
@@ -391,30 +393,31 @@
             [fail_descr_loc] = arglocs
 
         ofs = self.cpu.get_ofs_of_frame_field('jf_descr')
+        ofs2 = self.cpu.get_ofs_of_frame_field('jf_gcmap')
+
         self.mc.load_imm(r.r5, fail_descr_loc.getint())
+
+        # gcmap logic here:
+        arglist = op.getarglist()
+        if arglist and arglist[0].type == REF:
+            if self._finish_gcmap:
+                # we're returning with a guard_not_forced_2, and
+                # additionally we need to say that the result contains
+                # a reference too:
+                self._finish_gcmap[0] |= r_uint(1)
+                gcmap = self._finish_gcmap
+            else:
+                gcmap = self.gcmap_for_finish
+        elif self._finish_gcmap:
+            # we're returning with a guard_not_forced_2
+            gcmap = self._finish_gcmap
+        else:
+            gcmap = lltype.nullptr(jitframe.GCMAP)
+        self.load_gcmap(self.mc, r.r2, gcmap)
+
         self.mc.std(r.r5.value, r.SPP.value, ofs)
+        self.mc.store(r.r2.value, r.SPP.value, ofs2)
 
-        ## XXX: gcmap logic here:
-        ## arglist = op.getarglist()
-        ## if arglist and arglist[0].type == REF:
-        ##     if self._finish_gcmap:
-        ##         # we're returning with a guard_not_forced_2, and
-        ##         # additionally we need to say that eax/rax contains
-        ##         # a reference too:
-        ##         self._finish_gcmap[0] |= r_uint(1)
-        ##         gcmap = self._finish_gcmap
-        ##     else:
-        ##         gcmap = self.gcmap_for_finish
-        ##     self.push_gcmap(self.mc, gcmap, store=True)
-        ## elif self._finish_gcmap:
-        ##     # we're returning with a guard_not_forced_2
-        ##     gcmap = self._finish_gcmap
-        ##     self.push_gcmap(self.mc, gcmap, store=True)
-        ## else:
-        ##     # note that the 0 here is redundant, but I would rather
-        ##     # keep that one and kill all the others
-        ##     ofs = self.cpu.get_ofs_of_frame_field('jf_gcmap')
-        ##     self.mc.MOV_bi(ofs, 0)
         # exit function
         self._call_footer()
 
@@ -527,6 +530,8 @@
     def _find_nearby_operation(self, regalloc, delta):
         return regalloc.operations[regalloc.rm.position + delta]
 
+    _COND_CALL_SAVE_REGS = [r.r3, r.r4, r.r5, r.r6, r.r12]
+
     def emit_cond_call(self, op, arglocs, regalloc):
         fcond = self.guard_success_cc
         self.guard_success_cc = c.cond_none
@@ -536,16 +541,13 @@
         jmp_adr = self.mc.get_relative_pos()
         self.mc.trap()        # patched later to a 'bc'
 
-        # XXX load_gcmap XXX -> r2
+        self.load_gcmap(self.mc, r.r2, regalloc.get_gcmap())
 
         # save away r3, r4, r5, r6, r12 into the jitframe
-        base_ofs = self.cpu.get_baseofs_of_frame_field()
-        should_be_saved = self._regalloc.rm.reg_bindings.values()
-        for gpr in [r.r3, r.r4, r.r5, r.r6, r.r12]:
-            if gpr not in should_be_saved:
-                continue
-            v = self.cpu.all_reg_indexes[gpr.value]
-            self.mc.std(gpr.value, r.SPP.value, v * WORD + base_ofs)
+        should_be_saved = [
+            reg for reg in self._regalloc.rm.reg_bindings.itervalues()
+                if reg in self._COND_CALL_SAVE_REGS]
+        self._push_core_regs_to_jitframe(self.mc, should_be_saved)
         #
         # load the 0-to-4 arguments into these registers, with the address of
         # the function to call into r12
@@ -676,6 +678,7 @@
     SIZE2SCALE = dict([(1<<_i, _i) for _i in range(32)])
 
     def _multiply_by_constant(self, loc, multiply_by, scratch_loc):
+        assert loc.is_reg()
         if multiply_by == 1:
             return loc
         try:
@@ -910,9 +913,8 @@
         self.mc.addi(r.r4.value, r.r4.value, basesize)
         self.mc.addi(r.r3.value, r.r2.value, basesize)
 
-        cb = callbuilder.CallBuilder(self, imm(self.memcpy_addr),
-                                     [r.r3, r.r4, r.r5], None)
-        cb.emit()
+        self.mc.load_imm(self.mc.RAW_CALL_REG, self.memcpy_addr)
+        self.mc.raw_call()
 
 
 class UnicodeOpAssembler(object):
@@ -933,15 +935,40 @@
         self.propagate_memoryerror_if_r3_is_null()
 
     def emit_call_malloc_nursery(self, op, arglocs, regalloc):
-        # registers r3 and r4 are allocated for this call
-        assert len(arglocs) == 1
-        size = arglocs[0].value
+        # registers r.RES and r.RSZ are allocated for this call
+        size_box = op.getarg(0)
+        assert isinstance(size_box, ConstInt)
+        size = size_box.getint()
         gc_ll_descr = self.cpu.gc_ll_descr
+        gcmap = regalloc.get_gcmap([r.RES, r.RSZ])
         self.malloc_cond(
             gc_ll_descr.get_nursery_free_addr(),
             gc_ll_descr.get_nursery_top_addr(),
-            size
-            )
+            size, gcmap)
+
+    def emit_call_malloc_nursery_varsize_frame(self, op, arglocs, regalloc):
+        # registers r.RES and r.RSZ are allocated for this call
+        [sizeloc] = arglocs
+        gc_ll_descr = self.cpu.gc_ll_descr
+        gcmap = regalloc.get_gcmap([r.RES, r.RSZ])
+        self.malloc_cond_varsize_frame(
+            gc_ll_descr.get_nursery_free_addr(),
+            gc_ll_descr.get_nursery_top_addr(),
+            sizeloc, gcmap)
+
+    def emit_call_malloc_nursery_varsize(self, op, arglocs, regalloc):
+        # registers r.RES and r.RSZ are allocated for this call
+        [lengthloc] = arglocs
+        arraydescr = op.getdescr()
+        itemsize = op.getarg(1).getint()
+        gc_ll_descr = self.cpu.gc_ll_descr
+        maxlength = (gc_ll_descr.max_size_of_young_obj - WORD * 2) / itemsize
+        gcmap = regalloc.get_gcmap([r.RES, r.RSZ])
+        self.malloc_cond_varsize(
+            op.getarg(0).getint(),
+            gc_ll_descr.get_nursery_free_addr(),
+            gc_ll_descr.get_nursery_top_addr(),
+            lengthloc, itemsize, maxlength, gcmap, arraydescr)
 
     def emit_debug_merge_point(self, op, arglocs, regalloc):
         pass
@@ -950,7 +977,7 @@
     emit_keepalive = emit_debug_merge_point
 
     def _write_barrier_fastpath(self, mc, descr, arglocs, regalloc, 
array=False,
-                                is_frame=False, align_stack=False):
+                                is_frame=False):
         # Write code equivalent to write_barrier() in the GC: it checks
         # a flag in the object at arglocs[0], and if set, it calls a
         # helper piece of assembler.  The latter saves registers as needed
@@ -999,6 +1026,7 @@
             helper_num += 2
         if self.wb_slowpath[helper_num] == 0:    # tests only
             assert not we_are_translated()
+            assert not is_frame
             self.cpu.gc_ll_descr.write_barrier_descr = descr
             self._build_wb_slowpath(card_marking_mask != 0,
                                     bool(regalloc.fprm.reg_bindings))
@@ -1006,15 +1034,9 @@
         #
         if not is_frame:
             mc.mr(r.r0.value, loc_base.value)    # unusual argument location
-        if is_frame and align_stack:
-            XXXX
-            mc.SUB_ri(esp.value, 16 - WORD) # erase the return address
         mc.load_imm(r.SCRATCH2, self.wb_slowpath[helper_num])
         mc.mtctr(r.SCRATCH2.value)
         mc.bctrl()
-        if is_frame and align_stack:
-            XXXX
-            mc.ADD_ri(esp.value, 16 - WORD) # erase the return address
 
         if card_marking_mask:
             # The helper ends again with a check of the flag in the object.
diff --git a/rpython/jit/backend/ppc/ppc_assembler.py 
b/rpython/jit/backend/ppc/ppc_assembler.py
--- a/rpython/jit/backend/ppc/ppc_assembler.py
+++ b/rpython/jit/backend/ppc/ppc_assembler.py
@@ -16,7 +16,7 @@
 from rpython.jit.backend.ppc.register import JITFRAME_FIXED_SIZE
 from rpython.jit.metainterp.history import AbstractFailDescr
 from rpython.jit.metainterp.history import ConstInt, BoxInt
-from rpython.jit.backend.llsupport import jitframe
+from rpython.jit.backend.llsupport import jitframe, rewrite
 from rpython.jit.backend.llsupport.asmmemmgr import MachineDataBlockWrapper
 from rpython.jit.backend.llsupport.assembler import (DEBUG_COUNTER, 
debug_bridge,
                                                      BaseAssembler)
@@ -124,27 +124,23 @@
             mc.lfd(reg.value, spp_reg.value,
                         self.OFFSET_SPP_TO_FPR_SAVE_AREA + WORD * i)
 
-    def gen_shadowstack_header(self, gcrootmap):
-        # we need to put two words into the shadowstack: the MARKER_FRAME
-        # and the address of the frame (fp, actually)
-        rst = gcrootmap.get_root_stack_top_addr()
-        self.mc.load_imm(r.r14, rst)
-        self.mc.load(r.r15.value, r.r14.value, 0) # LD r15 [rootstacktop]
+    def _call_header_shadowstack(self, gcrootmap):
+        # we need to put one word into the shadowstack: the jitframe (SPP)
+        mc = self.mc
+        mc.load_imm(r.RCS1, gcrootmap.get_root_stack_top_addr())
+        mc.load(r.RCS2.value, r.RCS1.value, 0)    # ld RCS2, [rootstacktop]
         #
-        MARKER = gcrootmap.MARKER_FRAME
-        self.mc.addi(r.r16.value, r.r15.value, 2 * WORD) # ADD r16, r15, 2*WORD
-        self.mc.load_imm(r.r17, MARKER)
-        self.mc.store(r.r17.value, r.r15.value, WORD)  # STR MARKER, r15+WORD
-        self.mc.store(r.SPP.value, r.r15.value, 0)  # STR spp, r15
+        mc.addi(r.RCS3.value, r.RCS2.value, WORD) # add RCS3, RCS2, WORD
+        mc.store(r.SPP.value, r.RCS2.value, 0)    # std SPP, RCS2
         #
-        self.mc.store(r.r16.value, r.r14.value, 0)  # STR r16, [rootstacktop]
+        mc.store(r.RCS3.value, r.RCS1.value, 0)   # std RCS3, [rootstacktop]
 
-    def gen_footer_shadowstack(self, gcrootmap, mc):
-        rst = gcrootmap.get_root_stack_top_addr()
-        mc.load_imm(r.r14, rst)
-        mc.load(r.r15.value, r.r14.value, 0)  # LD r15, [rootstacktop]
-        mc.addi(r.r15.value, r.r15.value, -2 * WORD)  # SUB r15, r15, 2*WORD
-        mc.store(r.r15.value, r.r14.value, 0) # STR r15, [rootstacktop]
+    def _call_footer_shadowstack(self, gcrootmap):
+        mc = self.mc
+        mc.load_imm(r.RCS1, gcrootmap.get_root_stack_top_addr())
+        mc.load(r.RCS2.value, r.RCS1.value, 0)     # ld RCS2, [rootstacktop]
+        mc.addi(r.RCS2.value, r.RCS2.value, -WORD) # sub RCS2, RCS2, WORD
+        mc.store(r.RCS2.value, r.RCS1.value, 0)    # std RCS2, [rootstacktop]
 
     def new_stack_loc(self, i, tp):
         base_ofs = self.cpu.get_baseofs_of_frame_field()
@@ -153,41 +149,29 @@
     def setup_failure_recovery(self):
         self.failure_recovery_code = [0, 0, 0, 0]
 
-    def _push_all_regs_to_jitframe(self, mc, ignored_regs, withfloats,
-                                   callee_only=False):
+    def _push_core_regs_to_jitframe(self, mc, includes=r.MANAGED_REGS):
         base_ofs = self.cpu.get_baseofs_of_frame_field()
-        if callee_only:
-            regs = PPCRegisterManager.save_around_call_regs
-        else:
-            regs = PPCRegisterManager.all_regs
-        #
-        for reg in regs:
-            if reg not in ignored_regs:
-                v = r.ALL_REG_INDEXES[reg]
-                mc.std(reg.value, r.SPP.value, base_ofs + v * WORD)
-        #
-        if withfloats:
-            for reg in r.MANAGED_FP_REGS:
-                v = r.ALL_REG_INDEXES[reg]
-                mc.stfd(reg.value, r.SPP.value, base_ofs + v * WORD)
+        for reg in includes:
+            v = r.ALL_REG_INDEXES[reg]
+            mc.std(reg.value, r.SPP.value, base_ofs + v * WORD)
 
-    def _pop_all_regs_from_jitframe(self, mc, ignored_regs, withfloats,
-                                    callee_only=False):
+    def _push_fp_regs_to_jitframe(self, mc, includes=r.MANAGED_FP_REGS):
         base_ofs = self.cpu.get_baseofs_of_frame_field()
-        if callee_only:
-            regs = PPCRegisterManager.save_around_call_regs
-        else:
-            regs = PPCRegisterManager.all_regs
-        #
-        for reg in regs:
-            if reg not in ignored_regs:
-                v = r.ALL_REG_INDEXES[reg]
-                mc.ld(reg.value, r.SPP.value, base_ofs + v * WORD)
-        #
-        if withfloats:
-            for reg in r.MANAGED_FP_REGS:
-                v = r.ALL_REG_INDEXES[reg]
-                mc.lfd(reg.value, r.SPP.value, base_ofs + v * WORD)
+        for reg in includes:
+            v = r.ALL_REG_INDEXES[reg]
+            mc.stfd(reg.value, r.SPP.value, base_ofs + v * WORD)
+
+    def _pop_core_regs_from_jitframe(self, mc, includes=r.MANAGED_REGS):
+        base_ofs = self.cpu.get_baseofs_of_frame_field()
+        for reg in includes:
+            v = r.ALL_REG_INDEXES[reg]
+            mc.ld(reg.value, r.SPP.value, base_ofs + v * WORD)
+
+    def _pop_fp_regs_from_jitframe(self, mc, includes=r.MANAGED_FP_REGS):
+        base_ofs = self.cpu.get_baseofs_of_frame_field()
+        for reg in includes:
+            v = r.ALL_REG_INDEXES[reg]
+            mc.lfd(reg.value, r.SPP.value, base_ofs + v * WORD)
 
     def _build_failure_recovery(self, exc, withfloats=False):
         mc = PPCBuilder()
@@ -201,7 +185,9 @@
         mc.store(r.r0.value, r.SPP.value, ofs)
         mc.store(r.r2.value, r.SPP.value, ofs2)
 
-        self._push_all_regs_to_jitframe(mc, [], withfloats)
+        self._push_core_regs_to_jitframe(mc)
+        if withfloats:
+            self._push_fp_regs_to_jitframe(mc)
 
         if exc:
             # We might have an exception pending.
@@ -236,7 +222,8 @@
         ofs2 = self.cpu.get_ofs_of_frame_field('jf_gcmap')
         mc.store(r.r2.value, r.SPP.value, ofs2)
 
-        self._push_all_regs_to_jitframe(mc, [], self.cpu.supports_floats)
+        self._push_core_regs_to_jitframe(mc)
+        self._push_fp_regs_to_jitframe(mc)
 
         # Save away the LR inside r30
         mc.mflr(r.RCS1.value)
@@ -251,8 +238,8 @@
 
         # Do the call
         adr = rffi.cast(lltype.Signed, self.cpu.realloc_frame)
-        cb = callbuilder.CallBuilder(self, imm(adr), [r.r3, r.r4], r.r3)
-        cb.emit()
+        mc.load_imm(mc.RAW_CALL_REG, adr)
+        mc.raw_call()
 
         # The result is stored back into SPP (= r31)
         mc.mr(r.SPP.value, r.r3.value)
@@ -261,11 +248,13 @@
 
         gcrootmap = self.cpu.gc_ll_descr.gcrootmap
         if gcrootmap and gcrootmap.is_shadow_stack:
-            self._load_shadowstack_top_in_ebx(mc, gcrootmap)
-            mc.MOV_mr((ebx.value, -WORD), eax.value)
+            mc.load_imm(r.r5, gcrootmap.get_root_stack_top_addr())
+            mc.load(r.r5.value, r.r5.value, 0)
+            mc.store(r.r3.value, r.r5.value, -WORD)
 
         mc.mtlr(r.RCS1.value)     # restore LR
-        self._pop_all_regs_from_jitframe(mc, [], self.cpu.supports_floats)
+        self._pop_core_regs_from_jitframe(mc)
+        self._pop_fp_regs_from_jitframe(mc)
         mc.blr()
 
         self._frame_realloc_slowpath = mc.materialize(self.cpu, [])
@@ -294,6 +283,20 @@
         mc.store(excvalloc.value, r.r2.value, 0)
         mc.store(exctploc.value, r.r2.value, diff)
 
+    def _reload_frame_if_necessary(self, mc):
+        gcrootmap = self.cpu.gc_ll_descr.gcrootmap
+        if gcrootmap:
+            if gcrootmap.is_shadow_stack:
+                mc.load_imm(r.SPP, gcrootmap.get_root_stack_top_addr())
+                mc.load(r.SPP.value, r.SPP.value, 0)
+                mc.load(r.SPP.value, r.SPP.value, -WORD)
+        wbdescr = self.cpu.gc_ll_descr.write_barrier_descr
+        if gcrootmap and wbdescr:
+            # frame never uses card marking, so we enforce this is not
+            # an array
+            self._write_barrier_fastpath(mc, wbdescr, [r.SPP], regalloc=None,
+                                         array=False, is_frame=True)
+
     def _build_cond_call_slowpath(self, supports_floats, callee_only):
         """ This builds a general call slowpath, for whatever call happens to
         come.
@@ -314,87 +317,113 @@
         # because these have already been saved by the caller.  Note that
         # this is not symmetrical: these 5 registers are saved by the caller
         # but restored here at the end of this function.
-        self._push_all_regs_to_jitframe(mc, [r.r3, r.r4, r.r5, r.r6, r.r12],
-                                        supports_floats, callee_only)
+        if callee_only:
+            saved_regs = PPCRegisterManager.save_around_call_regs
+        else:
+            saved_regs = PPCRegisterManager.all_regs
+        self._push_core_regs_to_jitframe(mc, [reg for reg in saved_regs
+                                              if reg is not r.r3 and
+                                                 reg is not r.r4 and
+                                                 reg is not r.r5 and
+                                                 reg is not r.r6 and
+                                                 reg is not r.r12])
+        if supports_floats:
+            self._push_fp_regs_to_jitframe(mc)
 
         # Save away the LR inside r30
         mc.mflr(r.RCS1.value)
 
         # Do the call
-        cb = callbuilder.CallBuilder(self, r.r12, [r.r3, r.r4, r.r5, r.r6],
-                                     None)
-        cb.emit()
+        mc.raw_call(r.r12)
 
         # Finish
-        # XXX self._reload_frame_if_necessary(mc, align_stack=True)
+        self._reload_frame_if_necessary(mc)
 
         mc.mtlr(r.RCS1.value)     # restore LR
-        self._pop_all_regs_from_jitframe(mc, [], supports_floats, callee_only)
+        self._pop_core_regs_from_jitframe(mc, saved_regs)
+        if supports_floats:
+            self._pop_fp_regs_from_jitframe(mc)
         mc.blr()
         self.mc = None
         return mc.materialize(self.cpu, [])
 
-    def _build_malloc_slowpath(self):
-        xxxxxxx
+    def _build_malloc_slowpath(self, kind):
+        """ While arriving on slowpath, we have a gcmap in r2.
+        The arguments are passed in r.RES and r.RSZ, as follows:
+
+        kind == 'fixed': nursery_head in r.RES and the size in r.RSZ - r.RES.
+
+        kind == 'str/unicode': length of the string to allocate in r.RES.
+
+        kind == 'var': itemsize in r.RES, length to allocate in r.RSZ,
+                       and tid in r.SCRATCH.
+
+        This function must preserve all registers apart from r.RES and r.RSZ.
+        On return, r2 must contain the address of nursery_free.
+        """
+        assert kind in ['fixed', 'str', 'unicode', 'var']
         mc = PPCBuilder()
-        frame_size = (len(r.MANAGED_FP_REGS) * WORD
-                    + (BACKCHAIN_SIZE + MAX_REG_PARAMS) * WORD)
+        self.mc = mc
+        ofs2 = self.cpu.get_ofs_of_frame_field('jf_gcmap')
+        mc.store(r.r2.value, r.SPP.value, ofs2)
+        saved_regs = [reg for reg in r.MANAGED_REGS
+                          if reg is not r.RES and reg is not r.RSZ]
+        self._push_core_regs_to_jitframe(mc, saved_regs)
+        self._push_fp_regs_to_jitframe(mc)
+        #
+        if kind == 'fixed':
+            addr = self.cpu.gc_ll_descr.get_malloc_slowpath_addr()
+        elif kind == 'str':
+            addr = self.cpu.gc_ll_descr.get_malloc_fn_addr('malloc_str')
+        elif kind == 'unicode':
+            addr = self.cpu.gc_ll_descr.get_malloc_fn_addr('malloc_unicode')
+        else:
+            addr = self.cpu.gc_ll_descr.get_malloc_slowpath_array_addr()
 
-        mc.make_function_prologue(frame_size)
-        # managed volatiles are saved below
-        if self.cpu.supports_floats:
-            for i in range(len(r.MANAGED_FP_REGS)):
-                mc.stfd(r.MANAGED_FP_REGS[i].value, r.SP.value,
-                        (BACKCHAIN_SIZE + MAX_REG_PARAMS + i) * WORD)
-        # Values to compute size stored in r3 and r4
-        mc.subf(r.RES.value, r.RES.value, r.r4.value)
-        addr = self.cpu.gc_ll_descr.get_malloc_slowpath_addr()
-        for reg, ofs in PPCRegisterManager.REGLOC_TO_COPY_AREA_OFS.items():
-            mc.store(reg.value, r.SPP.value, ofs)
-        mc.call(rffi.cast(lltype.Signed, addr))
-        for reg, ofs in PPCRegisterManager.REGLOC_TO_COPY_AREA_OFS.items():
-            mc.load(reg.value, r.SPP.value, ofs)
-        # restore floats
-        if self.cpu.supports_floats:
-            for i in range(len(r.MANAGED_FP_REGS)):
-                mc.lfd(r.MANAGED_FP_REGS[i].value, r.SP.value,
-                       (BACKCHAIN_SIZE + MAX_REG_PARAMS + i) * WORD)
+        # Save away the LR inside r30
+        mc.mflr(r.RCS1.value)
 
-        mc.cmp_op(0, r.RES.value, 0, imm=True)
-        jmp_pos = mc.currpos()
-        mc.trap()
+        if kind == 'fixed':
+            # compute the size we want
+            mc.subf(r.r3.value, r.RES.value, r.RSZ.value)
+            if hasattr(self.cpu.gc_ll_descr, 'passes_frame'):
+                # for tests only
+                mc.mr(r.r4.value, r.SPP.value)
+        elif kind == 'str' or kind == 'unicode':
+            pass  # length is already in r3
+        else:
+            # arguments to the called function are [itemsize, tid, length]
+            # itemsize is already in r3
+            mc.mr(r.r5.value, r.RSZ.value)       # length
+            mc.mr(r.r4.value, r.SCRATCH.value)   # tid
+
+        # Do the call
+        addr = rffi.cast(lltype.Signed, addr)
+        mc.load_imm(mc.RAW_CALL_REG, addr)
+        mc.raw_call()
+
+        self._reload_frame_if_necessary(mc)
+
+        # Check that we don't get NULL; if we do, we always interrupt the
+        # current loop, as a "good enough" approximation (same as
+        # emit_call_malloc_gc()).
+        self.propagate_memoryerror_if_r3_is_null()
+
+        mc.mtlr(r.RCS1.value)     # restore LR
+        self._pop_core_regs_from_jitframe(mc, saved_regs)
+        self._pop_fp_regs_from_jitframe(mc)
 
         nursery_free_adr = self.cpu.gc_ll_descr.get_nursery_free_addr()
-        mc.load_imm(r.r4, nursery_free_adr)
-        mc.load(r.r4.value, r.r4.value, 0)
- 
-        if IS_PPC_32:
-            ofs = WORD
-        else:
-            ofs = WORD * 2
-        
-        with scratch_reg(mc):
-            mc.load(r.SCRATCH.value, r.SP.value, frame_size + ofs) 
-            mc.mtlr(r.SCRATCH.value)
-        mc.addi(r.SP.value, r.SP.value, frame_size)
+        self.mc.load_imm(r.r2, nursery_free_adr)
+
+        # r2 is now the address of nursery_free
+        # r.RES is still the result of the call done above
+        # r.RSZ is loaded from [r2], to make the caller's store a no-op here
+        mc.load(r.RSZ.value, r.r2.value, 0)
+        #
         mc.blr()
-
-        # if r3 == 0 we skip the return above and jump to the exception path
-        offset = mc.currpos() - jmp_pos
-        pmc = OverwritingBuilder(mc, jmp_pos, 1)
-        pmc.beq(offset)
-        pmc.overwrite()
-        # restore the frame before leaving
-        with scratch_reg(mc):
-            mc.load(r.SCRATCH.value, r.SP.value, frame_size + ofs) 
-            mc.mtlr(r.SCRATCH.value)
-        mc.addi(r.SP.value, r.SP.value, frame_size)
-        mc.b_abs(self.propagate_exception_path)
-
-        rawstart = mc.materialize(self.cpu, [])
-        # here we do not need a function descr. This is being only called using
-        # an internal ABI
-        self.malloc_slowpath = rawstart
+        self.mc = None
+        return mc.materialize(self.cpu, [])
 
     def _build_stack_check_slowpath(self):
         _, _, slowpathaddr = self.cpu.insert_stack_check()
@@ -512,54 +541,65 @@
         #
         # This builds a helper function called from the slow path of
         # write barriers.  It must save all registers, and optionally
-        # all fp registers.  It takes its single argument in r0.
+        # all fp registers.  It takes its single argument in r0
+        # (or in SPP if 'for_frame').
+        if for_frame:
+            argument_loc = r.SPP
+        else:
+            argument_loc = r.r0
+
         mc = PPCBuilder()
         old_mc = self.mc
         self.mc = mc
-        #
-        ignored_regs = [reg for reg in r.MANAGED_REGS if not (
-                            # 'reg' will be pushed if the following is true:
-                            reg in r.VOLATILES or
-                            reg is r.RCS1 or
-                            (withcards and reg is r.RCS2))]
-        if not for_frame:
+
+        if for_frame:
+            # This 'for_frame' version is called after a CALL.  It does not
+            # need to save many registers: the registers that are anyway
+            # destroyed by the call can be ignored (VOLATILES), and the
+            # non-volatile registers won't be changed here.  It only needs
+            # to save r.RCS1 (used below), r3 and f1 (possible results of
+            # the call), and two more non-volatile registers (used to store
+            # the RPython exception that occurred in the CALL, if any).
+            saved_regs = [r.r3, r.RCS1, r.RCS2, r.RCS3]
+            saved_fp_regs = [r.f1]
+        else:
             # push all volatile registers, push RCS1, and sometimes push RCS2
-            self._push_all_regs_to_jitframe(mc, ignored_regs, withfloats)
-        else:
-            return #XXXXX
-            # we have one word to align
-            mc.SUB_ri(esp.value, 7 * WORD) # align and reserve some space
-            mc.MOV_sr(WORD, eax.value) # save for later
-            if self.cpu.supports_floats:
-                mc.MOVSD_sx(2 * WORD, xmm0.value)   # 32-bit: also 3 * WORD
-            if IS_X86_32:
-                mc.MOV_sr(4 * WORD, edx.value)
-                mc.MOV_sr(0, ebp.value)
-                exc0, exc1 = esi, edi
+            if withcards:
+                saved_regs = r.VOLATILES + [r.RCS1, r.RCS2]
             else:
-                mc.MOV_rr(edi.value, ebp.value)
-                exc0, exc1 = ebx, r12
-            mc.MOV(RawEspLoc(WORD * 5, REF), exc0)
-            mc.MOV(RawEspLoc(WORD * 6, INT), exc1)
-            # note that it's save to store the exception in register,
+                saved_regs = r.VOLATILES + [r.RCS1]
+            if withfloats:
+                saved_fp_regs = r.MANAGED_FP_REGS
+            else:
+                saved_fp_regs = []
+
+        self._push_core_regs_to_jitframe(mc, saved_regs)
+        self._push_fp_regs_to_jitframe(mc, saved_fp_regs)
+
+        if for_frame:
+            # note that it's safe to store the exception in register,
             # since the call to write barrier can't collect
             # (and this is assumed a bit left and right here, like lack
             # of _reload_frame_if_necessary)
-            self._store_and_reset_exception(mc, exc0, exc1)
+            self._store_and_reset_exception(mc, r.RCS2, r.RCS3)
 
         if withcards:
-            mc.mr(r.RCS2.value, r.r0.value)
+            mc.mr(r.RCS2.value, argument_loc.value)
         #
         # Save the lr into r.RCS1
         mc.mflr(r.RCS1.value)
         #
         func = rffi.cast(lltype.Signed, func)
-        cb = callbuilder.CallBuilder(self, imm(func), [r.r0], None)
-        cb.emit()
+        mc.mr(r.r3.value, argument_loc.value)
+        mc.load_imm(mc.RAW_CALL_REG, func)
+        mc.raw_call()
         #
         # Restore lr
         mc.mtlr(r.RCS1.value)
-        #
+
+        if for_frame:
+            self._restore_exception(mc, r.RCS2, r.RCS3)
+
         if withcards:
             # A final andix before the blr, for the caller.  Careful to
             # not follow this instruction with another one that changes
@@ -567,23 +607,10 @@
             card_marking_mask = descr.jit_wb_cards_set_singlebyte
             mc.lbz(r.RCS2.value, r.RCS2.value, descr.jit_wb_if_flag_byteofs)
             mc.andix(r.RCS2.value, r.RCS2.value, card_marking_mask & 0xFF)
-        #
 
-        if not for_frame:
-            self._pop_all_regs_from_jitframe(mc, ignored_regs, withfloats)
-            mc.blr()
-        else:
-            XXXXXXX
-            if IS_X86_32:
-                mc.MOV_rs(edx.value, 4 * WORD)
-            if self.cpu.supports_floats:
-                mc.MOVSD_xs(xmm0.value, 2 * WORD)
-            mc.MOV_rs(eax.value, WORD) # restore
-            self._restore_exception(mc, exc0, exc1)
-            mc.MOV(exc0, RawEspLoc(WORD * 5, REF))
-            mc.MOV(exc1, RawEspLoc(WORD * 6, INT))
-            mc.LEA_rs(esp.value, 7 * WORD)
-            mc.RET()
+        self._pop_core_regs_from_jitframe(mc, saved_regs)
+        self._pop_fp_regs_from_jitframe(mc, saved_fp_regs)
+        mc.blr()
 
         self.mc = old_mc
         rawstart = mc.materialize(self.cpu, [])
@@ -615,52 +642,6 @@
         self.propagate_exception_path = rawstart
         self.mc = None
 
-    # The code generated here serves as an exit stub from
-    # the executed machine code.
-    # It is generated only once when the backend is initialized.
-    #
-    # The following actions are performed:
-    #   - The fail boxes are filled with the computed values 
-    #        (failure_recovery_func)
-    #   - The nonvolatile registers are restored 
-    #   - jump back to the calling code
-    def _gen_exit_path(self):
-        mc = PPCBuilder() 
-        self._save_managed_regs(mc)
-        decode_func_addr = llhelper(self.recovery_func_sign,
-                self.failure_recovery_func)
-        addr = rffi.cast(lltype.Signed, decode_func_addr)
-
-        # load parameters into parameter registers
-        # address of state encoding 
-        mc.load(r.RES.value, r.SPP.value, FORCE_INDEX_OFS)
-        mc.mr(r.r4.value, r.SPP.value)  # load spilling pointer
-        mc.mr(r.r5.value, r.SPP.value)  # load managed registers pointer
-        #
-        # call decoding function
-        mc.call(addr)
-
-        # generate return and restore registers
-        self._gen_epilogue(mc)
-
-        return mc.materialize(self.cpu, [], self.cpu.gc_ll_descr.gcrootmap)
-
-    def _save_managed_regs(self, mc):
-        """ store managed registers in ENCODING AREA
-        """
-        for i in range(len(r.MANAGED_REGS)):
-            reg = r.MANAGED_REGS[i]
-            mc.store(reg.value, r.SPP.value, i * WORD)
-        FLOAT_OFFSET = len(r.MANAGED_REGS)
-        for i in range(len(r.MANAGED_FP_REGS)):
-            fpreg = r.MANAGED_FP_REGS[i]
-            mc.stfd(fpreg.value, r.SPP.value, (i + FLOAT_OFFSET) * WORD)
-
-    #def gen_bootstrap_code(self, loophead, spilling_area):
-    #    self._insert_stack_check()
-    #    self._make_frame(spilling_area)
-    #    self.mc.b_offset(loophead)
-
     def _call_header(self):
         if IS_PPC_64 and IS_BIG_ENDIAN:
             # Reserve space for a function descriptor, 3 words
@@ -687,8 +668,7 @@
 
         gcrootmap = self.cpu.gc_ll_descr.gcrootmap
         if gcrootmap and gcrootmap.is_shadow_stack:
-            XXX
-            self.gen_shadowstack_header(gcrootmap)
+            self._call_header_shadowstack(gcrootmap)
 
     def _call_header_with_stack_check(self):
         self._call_header()
@@ -814,8 +794,7 @@
         mc.trap()     # placeholder for li(r0, ...)
         mc.load_imm(r.SCRATCH2, self._frame_realloc_slowpath)
         mc.mtctr(r.SCRATCH2.value)
-        #XXXXX:
-        if we_are_translated(): XXX #self.load_gcmap(mc, gcmap)  # -> r2
+        self.load_gcmap(mc, r.r2, gcmap)
         mc.bctrl()
 
         self.frame_depth_to_patch.append((patch_pos, mc.currpos()))
@@ -927,7 +906,7 @@
                                              operations,
                                              self.current_clt.allgcrefs,
                                              self.current_clt.frame_info)
-        self._check_frame_depth(self.mc, "??")
+        self._check_frame_depth(self.mc, regalloc.get_gcmap())
         frame_depth_no_fixed_size = self._assemble(regalloc, inputargs, 
operations)
         codeendpos = self.mc.get_relative_pos()
         self.write_pending_failure_recoveries()
@@ -975,14 +954,17 @@
         #print "=== Loop start is at %s ===" % hex(r_uint(start))
         return start
 
-    def load_gcmap(self, mc, gcmap):
-        # load the current gcmap into register r2
+    def load_gcmap(self, mc, reg, gcmap):
+        # Emit, on the builder 'mc', a load-immediate that puts the
+        # address of 'gcmap' (cast to a Signed integer) into the
+        # register 'reg'.  The register is chosen by the caller.
         ptr = rffi.cast(lltype.Signed, gcmap)
-        mc.load_imm(r.r2, ptr)
+        mc.load_imm(reg, ptr)
 
-    def push_gcmap(self, mc, gcmap, store):
+    def push_gcmap(self, mc, gcmap, store=True):
+        # Emit code that writes 'gcmap' into the 'jf_gcmap' frame field,
+        # addressed relative to r.SPP.  r.SCRATCH is overwritten.
+        # Only 'store=True' is supported (asserted below).
+        # (called from callbuilder.py and ../llsupport/callbuilder.py)
         assert store is True
-        # XXX IGNORED FOR NOW
+        self.load_gcmap(mc, r.SCRATCH, gcmap)
+        ofs = self.cpu.get_ofs_of_frame_field('jf_gcmap')
+        mc.store(r.SCRATCH.value, r.SPP.value, ofs)
 
     def break_long_loop(self):
         # If the loop is too long, the guards in it will jump forward
@@ -1003,7 +985,7 @@
         startpos = self.mc.currpos()
         fail_descr, target = self.store_info_on_descr(startpos, guardtok)
         assert target != 0
-        self.load_gcmap(self.mc, gcmap=guardtok.gcmap)   # -> r2
+        self.load_gcmap(self.mc, r.r2, gcmap=guardtok.gcmap)
         self.mc.load_imm(r.r0, target)
         self.mc.mtctr(r.r0.value)
         self.mc.load_imm(r.r0, fail_descr)
@@ -1187,56 +1169,201 @@
                 self.mc.ld(r.SCRATCH.value, r.SP.value, index)
                 self.regalloc_mov(r.SCRATCH, loc)
 
-    def malloc_cond(self, nursery_free_adr, nursery_top_adr, size):
+    def malloc_cond(self, nursery_free_adr, nursery_top_adr, size, gcmap):
+        # Emit the inline nursery allocation of 'size' bytes, where
+        # 'size' is a constant known when the code is assembled (it
+        # must be a multiple of WORD).  'gcmap' is loaded into r2 right
+        # before the call to malloc_slowpath.  Registers written by the
+        # emitted code: r2, RES, RSZ and SCRATCH.
         assert size & (WORD-1) == 0     # must be correctly aligned
 
-        self.mc.load_imm(r.RES, nursery_free_adr)
-        self.mc.load(r.RES.value, r.RES.value, 0)
+        # We load into RES the address stored at nursery_free_adr. We
+        # calculate the new value for nursery_free_adr and store it in
+        # RSZ.  Then we load the address stored in nursery_top_adr
+        # into SCRATCH.  In the rare case where the value in RSZ is
+        # (unsigned) bigger than the one in SCRATCH we call
+        # malloc_slowpath.  In the common case where malloc_slowpath
+        # is not called, we must still write RSZ back into
+        # nursery_free_adr (r2); so we do it always, even if we called
+        # malloc_slowpath.
+
+        # nursery_top is addressed as an immediate offset from
+        # nursery_free_adr, so the distance must fit an immediate
+        diff = nursery_top_adr - nursery_free_adr
+        assert _check_imm_arg(diff)
+        mc = self.mc
+        mc.load_imm(r.r2, nursery_free_adr)
+
+        mc.load(r.RES.value, r.r2.value, 0)         # load nursery_free
+        mc.load(r.SCRATCH.value, r.r2.value, diff)  # load nursery_top
 
         if _check_imm_arg(size):
-            self.mc.addi(r.r4.value, r.RES.value, size)
+            mc.addi(r.RSZ.value, r.RES.value, size)
         else:
-            self.mc.load_imm(r.r4, size)
-            self.mc.add(r.r4.value, r.RES.value, r.r4.value)
+            mc.load_imm(r.RSZ, size)
+            mc.add(r.RSZ.value, r.RES.value, r.RSZ.value)
 
-        with scratch_reg(self.mc):
-            self.mc.load_imm(r.SCRATCH, nursery_top_adr)
-            self.mc.loadx(r.SCRATCH.value, 0, r.SCRATCH.value)
-            self.mc.cmp_op(0, r.r4.value, r.SCRATCH.value, signed=False)
+        mc.cmp_op(0, r.RSZ.value, r.SCRATCH.value, signed=False)
 
-        fast_jmp_pos = self.mc.currpos()
-        self.mc.trap()
+        fast_jmp_pos = mc.currpos()
+        mc.trap()        # conditional jump, patched later
 
-        # We load into r3 the address stored at nursery_free_adr. We calculate
-        # the new value for nursery_free_adr and store in r1 The we load the
-        # address stored in nursery_top_adr into IP If the value in r4 is
-        # (unsigned) bigger than the one in ip we conditionally call
-        # malloc_slowpath in case we called malloc_slowpath, which returns the
-        # new value of nursery_free_adr in r4 and the adr of the new object in
-        # r3.
-        self.mark_gc_roots(self.write_new_force_index(),
-                           use_copy_area=True)
+        # Slow path.  At this point RSZ holds the would-be new value of
+        # nursery_free, and RES the old one (the address of the new object).
+        self.load_gcmap(mc, r.r2, gcmap)
         # We are jumping to malloc_slowpath without a call through a function
-        # descriptor, because it is an internal call and "call" would trash r11
-        self.mc.bl_abs(self.malloc_slowpath)
+        # descriptor, because it is an internal call and "call" would trash
+        # r2 and r11
+        mc.bl_abs(self.malloc_slowpath)
 
-        offset = self.mc.currpos() - fast_jmp_pos
-        pmc = OverwritingBuilder(self.mc, fast_jmp_pos, 1)
-        pmc.ble(offset) # jump if LE (not GT)
+        # patch the trap() above into a branch that skips the slow path
+        offset = mc.currpos() - fast_jmp_pos
+        pmc = OverwritingBuilder(mc, fast_jmp_pos, 1)
+        pmc.bc(7, 1, offset)    # jump if LE (not GT), predicted to be true
         pmc.overwrite()
-        
-        with scratch_reg(self.mc):
-            self.mc.load_imm(r.SCRATCH, nursery_free_adr)
-            self.mc.storex(r.r4.value, 0, r.SCRATCH.value)
 
-    def mark_gc_roots(self, force_index, use_copy_area=False):
-        if force_index < 0:
-            return     # not needed
-        gcrootmap = self.cpu.gc_ll_descr.gcrootmap
-        if gcrootmap:
-            mark = self._regalloc.get_mark_gc_roots(gcrootmap, use_copy_area)
-            assert gcrootmap.is_shadow_stack
-            gcrootmap.write_callshape(mark, force_index)
+        mc.store(r.RSZ.value, r.r2.value, 0)    # store into nursery_free
+
+    def malloc_cond_varsize_frame(self, nursery_free_adr, nursery_top_adr,
+                                  sizeloc, gcmap):
+        # Same inline nursery allocation as malloc_cond(), except that
+        # the size in bytes is not a constant: it is read at run time
+        # from the register 'sizeloc'.  'gcmap' is loaded into r2 right
+        # before the call to malloc_slowpath.  Registers written by the
+        # emitted code: r2, RES, RSZ and SCRATCH.
+        diff = nursery_top_adr - nursery_free_adr
+        assert _check_imm_arg(diff)
+        mc = self.mc
+        mc.load_imm(r.r2, nursery_free_adr)
+
+        # RES is overwritten just below with nursery_free; if the size
+        # lives there, move it to RSZ first
+        if sizeloc is r.RES:
+            mc.mr(r.RSZ.value, r.RES.value)
+            sizeloc = r.RSZ
+
+        mc.load(r.RES.value, r.r2.value, 0)         # load nursery_free
+        mc.load(r.SCRATCH.value, r.r2.value, diff)  # load nursery_top
+
+        # RSZ = nursery_free + size: the would-be new nursery_free
+        mc.add(r.RSZ.value, r.RES.value, sizeloc.value)
+
+        mc.cmp_op(0, r.RSZ.value, r.SCRATCH.value, signed=False)
+
+        fast_jmp_pos = mc.currpos()
+        mc.trap()        # conditional jump, patched later
+
+        # Slow path.  At this point RSZ holds the would-be new value of
+        # nursery_free, and RES the old one (the address of the new object).
+        self.load_gcmap(mc, r.r2, gcmap)
+        mc.bl_abs(self.malloc_slowpath)
+
+        # patch the trap() above into a branch that skips the slow path
+        offset = mc.currpos() - fast_jmp_pos
+        pmc = OverwritingBuilder(mc, fast_jmp_pos, 1)
+        pmc.bc(7, 1, offset)    # jump if LE (not GT), predicted to be true
+        pmc.overwrite()
+
+        mc.store(r.RSZ.value, r.r2.value, 0)    # store into nursery_free
+
+    def malloc_cond_varsize(self, kind, nursery_free_adr, nursery_top_adr,
+                            lengthloc, itemsize, maxlength, gcmap,
+                            arraydescr):
+        # Emit the inline allocation of a variable-sized object (array,
+        # string or unicode, selected by 'kind' which is one of
+        # rewrite.FLAG_ARRAY, FLAG_STR, FLAG_UNICODE).
+        #
+        # 'lengthloc' is a register (neither RES nor RSZ) holding the
+        # number of items; it is left unmodified.  'itemsize' is the
+        # constant size in bytes of one item.  Lengths (unsigned) above
+        # 'maxlength' always go to the slow path; 'maxlength' is capped
+        # at 2**16-1 here.  'gcmap' is loaded into r2 before the
+        # slow-path call.  'arraydescr' provides 'basesize' and 'tid'.
+        #
+        # Registers written by the emitted code: r2, RES, RSZ, SCRATCH
+        # and CTR.
+        from rpython.jit.backend.llsupport.descr import ArrayDescr
+        assert isinstance(arraydescr, ArrayDescr)
+
+        # lengthloc is the length of the array, which we must not modify!
+        assert lengthloc is not r.RES and lengthloc is not r.RSZ
+        assert lengthloc.is_reg()
+
+        if maxlength > 2**16-1:
+            maxlength = 2**16-1      # makes things easier
+        mc = self.mc
+        mc.cmp_op(0, lengthloc.value, maxlength, imm=True, signed=False)
+
+        jmp_adr0 = mc.currpos()
+        mc.trap()       # conditional jump, patched later
+
+        # ------------------------------------------------------------
+        # block of code for the case: the length is <= maxlength
+
+        diff = nursery_top_adr - nursery_free_adr
+        assert _check_imm_arg(diff)
+        mc.load_imm(r.r2, nursery_free_adr)
+
+        varsizeloc = self._multiply_by_constant(lengthloc, itemsize,
+                                                r.RSZ)
+        # varsizeloc is either RSZ here, or equal to lengthloc if
+        # itemsize == 1.  It is the size of the variable part of the
+        # array, in bytes.
+
+        mc.load(r.RES.value, r.r2.value, 0)         # load nursery_free
+        mc.load(r.SCRATCH.value, r.r2.value, diff)  # load nursery_top
+
+        assert arraydescr.basesize >= self.gc_minimal_size_in_nursery
+        constsize = arraydescr.basesize + self.gc_size_of_header
+        force_realignment = (itemsize % WORD) != 0
+        if force_realignment:
+            constsize += WORD - 1
+        # Read the variable size from 'varsizeloc', not from RSZ: when
+        # itemsize == 1 nothing has been written into RSZ yet.
+        mc.addi(r.RSZ.value, varsizeloc.value, constsize)
+        if force_realignment:
+            # "& ~(WORD-1)"
+            bit_limit = 60 if WORD == 8 else 61
+            mc.rldicr(r.RSZ.value, r.RSZ.value, 0, bit_limit)
+
+        mc.add(r.RSZ.value, r.RES.value, r.RSZ.value)
+        # now RSZ contains the total size in bytes, rounded up to a multiple
+        # of WORD, plus nursery_free_adr
+
+        mc.cmp_op(0, r.RSZ.value, r.SCRATCH.value, signed=False)
+
+        jmp_adr1 = mc.currpos()
+        mc.trap()        # conditional jump, patched later
+
+        # ------------------------------------------------------------
+        # block of code for two cases: either the length is > maxlength
+        # (jump from jmp_adr0), or the length is small enough but there
+        # is not enough space in the nursery (fall-through)
+        #
+        offset = mc.currpos() - jmp_adr0
+        pmc = OverwritingBuilder(mc, jmp_adr0, 1)
+        pmc.bgt(offset)    # jump if GT
+        pmc.overwrite()
+        #
+        # save the gcmap
+        self.load_gcmap(mc, r.r2, gcmap)
+        #
+        # load the function to call into CTR
+        if kind == rewrite.FLAG_ARRAY:
+            addr = self.malloc_slowpath_varsize
+        elif kind == rewrite.FLAG_STR:
+            addr = self.malloc_slowpath_str
+        elif kind == rewrite.FLAG_UNICODE:
+            addr = self.malloc_slowpath_unicode
+        else:
+            raise AssertionError(kind)
+        mc.load_imm(r.SCRATCH, addr)
+        mc.mtctr(r.SCRATCH.value)
+        #
+        # load the argument(s): for arrays, the length in RSZ, the
+        # itemsize in RES and the tid in SCRATCH; otherwise only the
+        # length, in RES
+        if kind == rewrite.FLAG_ARRAY:
+            mc.mr(r.RSZ.value, lengthloc.value)
+            mc.load_imm(r.RES, itemsize)
+            mc.load_imm(r.SCRATCH, arraydescr.tid)
+        else:
+            mc.mr(r.RES.value, lengthloc.value)
+        #
+        # call!
+        mc.bctrl()
+
+        jmp_location = mc.currpos()
+        mc.trap()      # jump forward, patched later
+
+        # ------------------------------------------------------------
+        # block of code for the common case: the length is <= maxlength
+        # and there is enough space in the nursery
+
+        offset = mc.currpos() - jmp_adr1
+        pmc = OverwritingBuilder(mc, jmp_adr1, 1)
+        pmc.ble(offset)    # jump if LE
+        pmc.overwrite()
+        #
+        # write down the tid, but only in this case (not in other cases
+        # where r.RES is the result of the CALL)
+        mc.load_imm(r.SCRATCH, arraydescr.tid)
+        mc.store(r.SCRATCH.value, r.RES.value, 0)
+        # while we're at it, this line is not needed if we've done the CALL
+        mc.store(r.RSZ.value, r.r2.value, 0)    # store into nursery_free
+
+        # ------------------------------------------------------------
+
+        # patch the forward jump that follows the slow-path call so that
+        # it lands here, past the fast-path stores
+        offset = mc.currpos() - jmp_location
+        pmc = OverwritingBuilder(mc, jmp_location, 1)
+        pmc.b(offset)    # jump always
+        pmc.overwrite()
 
     def propagate_memoryerror_if_r3_is_null(self):
         # if self.propagate_exception_path == 0 (tests), this may jump to 0
diff --git a/rpython/jit/backend/ppc/regalloc.py 
b/rpython/jit/backend/ppc/regalloc.py
--- a/rpython/jit/backend/ppc/regalloc.py
+++ b/rpython/jit/backend/ppc/regalloc.py
@@ -24,9 +24,11 @@
 from rpython.jit.backend.llsupport.descr import unpack_arraydescr
 from rpython.jit.backend.llsupport.descr import unpack_fielddescr
 from rpython.jit.backend.llsupport.descr import unpack_interiorfielddescr
+from rpython.jit.backend.llsupport.gcmap import allocate_gcmap
 from rpython.rlib.objectmodel import we_are_translated
 from rpython.jit.codewriter.effectinfo import EffectInfo
 from rpython.rlib import rgc
+from rpython.rlib.rarithmetic import r_uint
 
 LIMIT_LOOP_BREAK = 15000      # should be much smaller than 32 KB
 
@@ -56,7 +58,8 @@
 class FPRegisterManager(RegisterManager):
     all_regs              = r.MANAGED_FP_REGS
     box_types             = [FLOAT]
-    save_around_call_regs = [_r for _r in all_regs if _r in r.VOLATILES_FLOAT]
+    save_around_call_regs = r.VOLATILES_FLOAT
+    assert set(save_around_call_regs).issubset(all_regs)
 
     def convert_to_imm(self, c):
         assert isinstance(c, ConstFloat)
@@ -93,8 +96,9 @@
     all_regs              = r.MANAGED_REGS
     box_types             = None       # or a list of acceptable types
     no_lower_byte_regs    = all_regs
-    save_around_call_regs = [_r for _r in all_regs if _r in r.VOLATILES]
+    save_around_call_regs = r.VOLATILES
     frame_reg             = r.SPP
+    assert set(save_around_call_regs).issubset(all_regs)
 
     REGLOC_TO_COPY_AREA_OFS = {
         r.r5:   MY_COPY_OF_REGS + 0 * WORD,
@@ -349,9 +353,24 @@
         while self.min_bytes_before_label > mc.get_relative_pos():
             mc.nop()
 
-    def get_gcmap(self, noregs=False):
-        #xxxxxx
-        return '???'
+    def get_gcmap(self, forbidden_regs=[], noregs=False):
+        # Build and return a gcmap: a bitmap, obtained from
+        # allocate_gcmap(), with one bit set for every location that
+        # currently holds a REF box which is still alive.
+        #
+        # Registers listed in 'forbidden_regs' are left out.  (The
+        # default list is only read here, never mutated.)  If 'noregs'
+        # is true, no live REF box may sit in a non-forbidden register;
+        # this is checked with an assert.
+        frame_depth = self.fm.get_frame_depth()
+        gcmap = allocate_gcmap(self.assembler, frame_depth,
+                               r.JITFRAME_FIXED_SIZE)
+        # live REF boxes held in registers: the bit index comes from
+        # cpu.all_reg_indexes
+        for box, loc in self.rm.reg_bindings.iteritems():
+            if loc in forbidden_regs:
+                continue
+            if box.type == REF and self.rm.is_still_alive(box):
+                assert not noregs
+                assert loc.is_reg()
+                val = self.assembler.cpu.all_reg_indexes[loc.value]
+                gcmap[val // WORD // 8] |= r_uint(1) << (val % (WORD * 8))
+        # live REF boxes spilled to the frame: the bit index is the
+        # stack position shifted by JITFRAME_FIXED_SIZE
+        for box, loc in self.fm.bindings.iteritems():
+            if box.type == REF and self.rm.is_still_alive(box):
+                assert isinstance(loc, locations.StackLocation)
+                val = loc.get_position() + r.JITFRAME_FIXED_SIZE
+                gcmap[val // WORD // 8] |= r_uint(1) << (val % (WORD * 8))
+        return gcmap
 
     def loc(self, var):
         if var.type == FLOAT:
@@ -921,36 +940,46 @@
         return args
 
     def prepare_call_malloc_nursery(self, op):
-        size_box = op.getarg(0)
-        assert isinstance(size_box, ConstInt)
-        size = size_box.getint()
+        # Force op.result into r.RES and a fresh temporary box into
+        # r.RSZ; both are recorded in rm.temp_boxes.  No argument
+        # location is returned (empty list).
+        self.rm.force_allocate_reg(op.result, selected_reg=r.RES)
+        self.rm.temp_boxes.append(op.result)
+        tmp_box = TempInt()
+        self.rm.force_allocate_reg(tmp_box, selected_reg=r.RSZ)
+        self.rm.temp_boxes.append(tmp_box)
+        return []
 
-        self.rm.force_allocate_reg(op.result, selected_reg=r.r3)
-        t = TempInt()
-        self.rm.force_allocate_reg(t, selected_reg=r.r4)
-        self.possibly_free_var(op.result)
-        self.possibly_free_var(t)
-        return [imm(size)]
+    def prepare_call_malloc_nursery_varsize_frame(self, op):
+        # Register allocation for a nursery allocation whose size is
+        # the run-time value of op.getarg(0).  Returns [sizeloc], the
+        # register holding that size; op.result is forced into r.RES
+        # and a temporary box into r.RSZ.
+        sizeloc = self.ensure_reg(op.getarg(0))
+        # sizeloc must be in a register, but we can free it now
+        # (we take care explicitly of conflicts with r.RES or r.RSZ)
+        self.free_op_vars()
+        # the result will be in r.RES
+        self.rm.force_allocate_reg(op.result, selected_reg=r.RES)
+        self.rm.temp_boxes.append(op.result)
+        # we need r.RSZ as a temporary
+        tmp_box = TempInt()
+        self.rm.force_allocate_reg(tmp_box, selected_reg=r.RSZ)
+        self.rm.temp_boxes.append(tmp_box)
+        return [sizeloc]
 
-    def get_mark_gc_roots(self, gcrootmap, use_copy_area=False):
-        shape = gcrootmap.get_basic_shape()
-        for v, val in self.frame_manager.bindings.items():
-            if (isinstance(v, BoxPtr) and self.rm.stays_alive(v)):
-                assert val.is_stack()
-                gcrootmap.add_frame_offset(shape, val.value)
-        for v, reg in self.rm.reg_bindings.items():
-            gcrootmap = self.assembler.cpu.gc_ll_descr.gcrootmap
-            assert gcrootmap is not None and gcrootmap.is_shadow_stack
-            if reg is r.r3:
-                continue
-            if (isinstance(v, BoxPtr) and self.rm.stays_alive(v)):
-                assert use_copy_area
-                xxxxxxxxxx   # check REGLOC_TO_COPY_AREA_OFS
-                assert reg in self.rm.REGLOC_TO_COPY_AREA_OFS
-                area_offset = self.rm.REGLOC_TO_COPY_AREA_OFS[reg]
-                gcrootmap.add_frame_offset(shape, area_offset)
-        return gcrootmap.compress_callshape(shape,
-                                            self.assembler.datablockwrapper)
+    def prepare_call_malloc_nursery_varsize(self, op):
+        # Register allocation for a variable-sized nursery allocation.
+        # Returns [lengthloc], the register holding op.getarg(2) (the
+        # length); op.result is forced into r.RES and a temporary box
+        # into r.RSZ.  Raises Exception if the GC description has no
+        # 'max_size_of_young_obj' attribute.
+        gc_ll_descr = self.assembler.cpu.gc_ll_descr
+        if not hasattr(gc_ll_descr, 'max_size_of_young_obj'):
+            raise Exception("unreachable code")
+            # for boehm, this function should never be called
+        # the result will be in r.RES
+        self.rm.force_allocate_reg(op.result, selected_reg=r.RES)
+        self.rm.temp_boxes.append(op.result)
+        # we need r.RSZ as a temporary
+        tmp_box = TempInt()
+        self.rm.force_allocate_reg(tmp_box, selected_reg=r.RSZ)
+        self.rm.temp_boxes.append(tmp_box)
+        # length_box always survives: it's typically also present in the
+        # next operation that will copy it inside the new array.  Make
+        # sure it is in a register different from r.RES and r.RSZ.  (It
+        # should not be a ConstInt at all.)
+        length_box = op.getarg(2)
+        lengthloc = self.ensure_reg(length_box)
+        return [lengthloc]
 
     prepare_debug_merge_point = void
     prepare_jit_debug = void
diff --git a/rpython/jit/backend/ppc/register.py 
b/rpython/jit/backend/ppc/register.py
--- a/rpython/jit/backend/ppc/register.py
+++ b/rpython/jit/backend/ppc/register.py
@@ -14,14 +14,15 @@
 
 NONVOLATILES        = [r14, r15, r16, r17, r18, r19, r20, r21, r22, r23,
                     r24, r25, r26, r27, r28, r29, r30, r31]
-VOLATILES           = [r0, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12]
-# volatile r2 is persisted around calls and r13 can be ignored
+VOLATILES           = [r3, r4, r5, r6, r7, r8, r9, r10, r11, r12]
+# volatiles r0 and r2 are special, and r13 should be fully ignored
 
 # we don't use any non-volatile float register, to keep the frame header
 # code short-ish
 #NONVOLATILES_FLOAT  = [f14, f15, f16, f17, f18, f19, f20, f21, f22, f23,
 #                    f24, f25, f26, f27, f28, f29, f30, f31]
-VOLATILES_FLOAT  = [f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11, f12, f13]
+VOLATILES_FLOAT  = [f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11, f12, f13]
+# volatile f0 is special
 
 SCRATCH    = r0
 SCRATCH2   = r2
@@ -33,17 +34,19 @@
 RCS1       = r30    # a random managed non-volatile register
 RCS2       = r29    # a random managed non-volatile register
 RCS3       = r28    # a random managed non-volatile register
+RSZ        = r25    # size argument to malloc_slowpath
 
 MANAGED_REGS = [r3, r4, r5, r6, r7, r8, r9, r10, r11, r12,
                 r25, r26, r27, r28, r29, r30]
                 # registers r14 to r24 are not touched, we have enough
                 # registers already
 
-MANAGED_FP_REGS = VOLATILES_FLOAT[1:] #+ NONVOLATILES_FLOAT
+MANAGED_FP_REGS = VOLATILES_FLOAT #+ NONVOLATILES_FLOAT
 
 assert RCS1 in MANAGED_REGS and RCS1 in NONVOLATILES
 assert RCS2 in MANAGED_REGS and RCS2 in NONVOLATILES
 assert RCS3 in MANAGED_REGS and RCS3 in NONVOLATILES
+assert RSZ in MANAGED_REGS
 
 
 # The JITFRAME_FIXED_SIZE is measured in words, and should be the
diff --git a/rpython/jit/backend/ppc/runner.py 
b/rpython/jit/backend/ppc/runner.py
--- a/rpython/jit/backend/ppc/runner.py
+++ b/rpython/jit/backend/ppc/runner.py
@@ -18,6 +18,7 @@
     # missing: supports_singlefloats
 
     IS_64_BIT = True
+    backend_name = 'ppc64'
 
     from rpython.jit.backend.ppc.register import JITFRAME_FIXED_SIZE
     frame_reg = r.SP
diff --git a/rpython/jit/backend/ppc/test/conftest.py 
b/rpython/jit/backend/ppc/test/conftest.py
new file mode 100644
--- /dev/null
+++ b/rpython/jit/backend/ppc/test/conftest.py
@@ -0,0 +1,12 @@
+"""
+This conftest disables the backend tests on non PPC platforms
+"""
+import py, os
+from rpython.jit.backend import detect_cpu
+
+# value returned by detect_cpu.autodetect(), computed once when this
+# conftest is imported
+cpu = detect_cpu.autodetect()
+
+def pytest_collect_directory(path, parent):
+    # collection hook: skip unless the detected cpu string starts
+    # with 'ppc'; 'path' and 'parent' are not used
+    if not cpu.startswith('ppc'):
+        py.test.skip("PPC tests skipped: cpu is %r" % (cpu,))
+# the same check is installed as the per-file collection hook
+pytest_collect_file = pytest_collect_directory
diff --git a/rpython/jit/backend/ppc/test/test_calling_convention.py 
b/rpython/jit/backend/ppc/test/test_calling_convention.py
--- a/rpython/jit/backend/ppc/test/test_calling_convention.py
+++ b/rpython/jit/backend/ppc/test/test_calling_convention.py
@@ -1,9 +1,16 @@
-from rpython.rtyper.annlowlevel import llhelper
-from rpython.jit.metainterp.history import JitCellToken
-from rpython.jit.backend.test.calling_convention_test import CallingConvTests, 
parse
-from rpython.rtyper.lltypesystem import lltype
-from rpython.jit.codewriter.effectinfo import EffectInfo
+from rpython.jit.backend.test.calling_convention_test import CallingConvTests
+from rpython.jit.backend.ppc.codebuilder import PPCBuilder
+import rpython.jit.backend.ppc.register as r
+
 
 class TestPPCCallingConvention(CallingConvTests):
     # ../../test/calling_convention_test.py
-    pass
+
+    def make_function_returning_stack_pointer(self):
+        # Assemble a two-instruction function: copy r1 into r3, then
+        # return with blr.  Returns the result of materialize().
+        # (Going by the method name, r1 is the stack pointer and r3
+        # carries the return value.)
+        mc = PPCBuilder()
+        mc.mr(r.r3.value, r.r1.value)
+        mc.blr()
+        return mc.materialize(self.cpu, [])
+
+    def get_alignment_requirements(self):
+        # presumably the required stack alignment in bytes; confirm
+        # against the base class in calling_convention_test.py
+        return 16
diff --git a/rpython/jit/backend/ppc/test/test_gc_integration.py 
b/rpython/jit/backend/ppc/test/test_gc_integration.py
deleted file mode 100644
--- a/rpython/jit/backend/ppc/test/test_gc_integration.py
+++ /dev/null
@@ -1,403 +0,0 @@
-
-""" Tests for register allocation for common constructs
-"""
-
-import py
-from rpython.jit.metainterp.history import BoxInt, ConstInt,\
-     BoxPtr, ConstPtr, TreeLoop, TargetToken
-from rpython.jit.metainterp.resoperation import rop, ResOperation
-from rpython.jit.codewriter import heaptracker
-from rpython.jit.codewriter.effectinfo import EffectInfo
-from rpython.jit.backend.llsupport.descr import GcCache, FieldDescr, FLAG_SIGNED
-from rpython.jit.backend.llsupport.gc import GcLLDescription
-from rpython.jit.backend.detect_cpu import getcpuclass
-from rpython.jit.backend.ppc.regalloc import Regalloc
-from rpython.jit.backend.ppc.arch import WORD
-from rpython.jit.tool.oparser import parse
-from rpython.rtyper.lltypesystem import lltype, llmemory, rffi
-from rpython.rtyper.annlowlevel import llhelper
-from rpython.rtyper.lltypesystem import rclass, rstr
-from rpython.jit.backend.llsupport.gc import GcLLDescr_framework
-
-from rpython.jit.backend.arm.test.test_regalloc import MockAssembler
-from rpython.jit.backend.ppc.test.test_regalloc import BaseTestRegalloc
-from rpython.jit.backend.ppc.regalloc import PPCRegisterManager, PPCFrameManager,\
-     FPRegisterManager
-
-CPU = getcpuclass()
-
-class MockGcRootMap(object):
-    is_shadow_stack = False
-    def get_basic_shape(self):
-        return ['shape']
-    def add_frame_offset(self, shape, offset):
-        shape.append(offset)
-    def add_callee_save_reg(self, shape, reg_index):
-        index_to_name = { 1: 'ebx', 2: 'esi', 3: 'edi' }
-        shape.append(index_to_name[reg_index])
-    def compress_callshape(self, shape, datablockwrapper):
-        assert datablockwrapper == 'fakedatablockwrapper'
-        assert shape[0] == 'shape'
-        return ['compressed'] + shape[1:]
-
-class MockGcDescr(GcCache):
-    get_malloc_slowpath_addr = None
-    write_barrier_descr = None
-    moving_gc = True
-    gcrootmap = MockGcRootMap()
-
-    def initialize(self):
-        pass
-
-    _record_constptrs = GcLLDescr_framework._record_constptrs.im_func
-    rewrite_assembler = GcLLDescr_framework.rewrite_assembler.im_func
-
-class TestRegallocGcIntegration(BaseTestRegalloc):
-    
-    cpu = CPU(None, None)
-    cpu.gc_ll_descr = MockGcDescr(False)
-    cpu.setup_once()
-    
-    S = lltype.GcForwardReference()
-    S.become(lltype.GcStruct('S', ('field', lltype.Ptr(S)),
-                             ('int', lltype.Signed)))
-
-    fielddescr = cpu.fielddescrof(S, 'field')
-
-    struct_ptr = lltype.malloc(S)
-    struct_ref = lltype.cast_opaque_ptr(llmemory.GCREF, struct_ptr)
-    child_ptr = lltype.nullptr(S)
-    struct_ptr.field = child_ptr
-
-
-    descr0 = cpu.fielddescrof(S, 'int')
-    ptr0 = struct_ref
-
-    targettoken = TargetToken()
-
-    namespace = locals().copy()
-
-    def test_basic(self):
-        ops = '''
-        [p0]
-        p1 = getfield_gc(p0, descr=fielddescr)
-        finish(p1)
-        '''
-        self.interpret(ops, [self.struct_ptr])
-        assert not self.getptr(0, lltype.Ptr(self.S))
-
-    def test_rewrite_constptr(self):
-        ops = '''
-        []
-        p1 = getfield_gc(ConstPtr(struct_ref), descr=fielddescr)
-        finish(p1)
-        '''
-        self.interpret(ops, [])
-        assert not self.getptr(0, lltype.Ptr(self.S))
-
-    def test_bug_0(self):
-        ops = '''
-        [i0, i1, i2, i3, i4, i5, i6, i7, i8]
-        label(i0, i1, i2, i3, i4, i5, i6, i7, i8, descr=targettoken)
-        guard_value(i2, 1) [i2, i3, i4, i5, i6, i7, i0, i1, i8]
-        guard_class(i4, 138998336) [i4, i5, i6, i7, i0, i1, i8]
-        i11 = getfield_gc(i4, descr=descr0)
-        guard_nonnull(i11) [i4, i5, i6, i7, i0, i1, i11, i8]
-        i13 = getfield_gc(i11, descr=descr0)
-        guard_isnull(i13) [i4, i5, i6, i7, i0, i1, i11, i8]
-        i15 = getfield_gc(i4, descr=descr0)
-        i17 = int_lt(i15, 0)
-        guard_false(i17) [i4, i5, i6, i7, i0, i1, i11, i15, i8]
-        i18 = getfield_gc(i11, descr=descr0)
-        i19 = int_ge(i15, i18)
-        guard_false(i19) [i4, i5, i6, i7, i0, i1, i11, i15, i8]
-        i20 = int_lt(i15, 0)
-        guard_false(i20) [i4, i5, i6, i7, i0, i1, i11, i15, i8]
-        i21 = getfield_gc(i11, descr=descr0)
-        i22 = getfield_gc(i11, descr=descr0)
-        i23 = int_mul(i15, i22)
-        i24 = int_add(i21, i23)
-        i25 = getfield_gc(i4, descr=descr0)
-        i27 = int_add(i25, 1)
-        setfield_gc(i4, i27, descr=descr0)
-        i29 = getfield_raw(144839744, descr=descr0)
-        i31 = int_and(i29, -2141192192)
-        i32 = int_is_true(i31)
-        guard_false(i32) [i4, i6, i7, i0, i1, i24]
-        i33 = getfield_gc(i0, descr=descr0)
-        guard_value(i33, ConstPtr(ptr0)) [i4, i6, i7, i0, i1, i33, i24]
-        jump(i0, i1, 1, 17, i4, ConstPtr(ptr0), i6, i7, i24, descr=targettoken)
-        '''
-        self.interpret(ops, [0, 0, 0, 0, 0, 0, 0, 0, 0], run=False)
-
-NOT_INITIALIZED = chr(0xdd)
-
-class GCDescrFastpathMalloc(GcLLDescription):
-    gcrootmap = None
-    write_barrier_descr = None
-
-    def __init__(self):
-        GcLLDescription.__init__(self, None)
-        # create a nursery
-        NTP = rffi.CArray(lltype.Char)
-        self.nursery = lltype.malloc(NTP, 64, flavor='raw')
-        for i in range(64):
-            self.nursery[i] = NOT_INITIALIZED
-        self.addrs = lltype.malloc(rffi.CArray(lltype.Signed), 2,
-                                   flavor='raw')
-        self.addrs[0] = rffi.cast(lltype.Signed, self.nursery)
-        self.addrs[1] = self.addrs[0] + 64
-        self.calls = []
-        def malloc_slowpath(size):
-            if self.gcrootmap is not None:   # hook
-                self.gcrootmap.hook_malloc_slowpath()
-            self.calls.append(size)
-            # reset the nursery
-            nadr = rffi.cast(lltype.Signed, self.nursery)
-            self.addrs[0] = nadr + size
-            return nadr
-        self.generate_function('malloc_nursery', malloc_slowpath,
-                               [lltype.Signed], lltype.Signed)
-
-    def get_nursery_free_addr(self):
-        return rffi.cast(lltype.Signed, self.addrs)
-
-    def get_nursery_top_addr(self):
-        return rffi.cast(lltype.Signed, self.addrs) + WORD
-
-    def get_malloc_slowpath_addr(self):
-        return self.get_malloc_fn_addr('malloc_nursery')
-
-    def check_nothing_in_nursery(self):
-        # CALL_MALLOC_NURSERY should not write anything in the nursery
-        for i in range(64):
-            assert self.nursery[i] == NOT_INITIALIZED
-
-class TestMallocFastpath(BaseTestRegalloc):
-
-    def setup_method(self, method):
-        cpu = CPU(None, None)
-        cpu.gc_ll_descr = GCDescrFastpathMalloc()
-        cpu.setup_once()
-        self.cpu = cpu
-
-    def test_malloc_fastpath(self):
-        ops = '''
-        []
-        p0 = call_malloc_nursery(16)
-        p1 = call_malloc_nursery(32)
-        p2 = call_malloc_nursery(16)
-        finish(p0, p1, p2)
-        '''
-        self.interpret(ops, [])
-        # check the returned pointers
-        gc_ll_descr = self.cpu.gc_ll_descr
-        nurs_adr = rffi.cast(lltype.Signed, gc_ll_descr.nursery)
-        ref = self.cpu.get_latest_value_ref
-        assert rffi.cast(lltype.Signed, ref(0)) == nurs_adr + 0
-        assert rffi.cast(lltype.Signed, ref(1)) == nurs_adr + 16
-        assert rffi.cast(lltype.Signed, ref(2)) == nurs_adr + 48
-        # check the nursery content and state
-        gc_ll_descr.check_nothing_in_nursery()
-        assert gc_ll_descr.addrs[0] == nurs_adr + 64
-        # slowpath never called
-        assert gc_ll_descr.calls == []
-
-    def test_malloc_slowpath(self):
-        ops = '''
-        []
-        p0 = call_malloc_nursery(16)
-        p1 = call_malloc_nursery(32)
-        p2 = call_malloc_nursery(24)     # overflow
-        finish(p0, p1, p2)
-        '''
-        self.interpret(ops, [])
-        # check the returned pointers
-        gc_ll_descr = self.cpu.gc_ll_descr
-        nurs_adr = rffi.cast(lltype.Signed, gc_ll_descr.nursery)
-        ref = self.cpu.get_latest_value_ref
-        assert rffi.cast(lltype.Signed, ref(0)) == nurs_adr + 0
-        assert rffi.cast(lltype.Signed, ref(1)) == nurs_adr + 16
-        assert rffi.cast(lltype.Signed, ref(2)) == nurs_adr + 0
-        # check the nursery content and state
-        gc_ll_descr.check_nothing_in_nursery()
-        assert gc_ll_descr.addrs[0] == nurs_adr + 24
-        # this should call slow path once
-        assert gc_ll_descr.calls == [24]
-
-class MockShadowStackRootMap(MockGcRootMap):
-    is_shadow_stack = True
-    MARKER_FRAME = 88       # this marker follows the frame addr
-    S1 = lltype.GcStruct('S1')
-
-    def __init__(self):
-        self.addrs = lltype.malloc(rffi.CArray(lltype.Signed), 20,
-                                   flavor='raw')
-        # root_stack_top
-        self.addrs[0] = rffi.cast(lltype.Signed, self.addrs) + 3*WORD
-        # random stuff
-        self.addrs[1] = 123456
-        self.addrs[2] = 654321
-        self.check_initial_and_final_state()
-        self.callshapes = {}
-        self.should_see = []
-
-    def check_initial_and_final_state(self):
-        assert self.addrs[0] == rffi.cast(lltype.Signed, self.addrs) + 3*WORD
-        assert self.addrs[1] == 123456
-        assert self.addrs[2] == 654321
-
-    def get_root_stack_top_addr(self):
-        return rffi.cast(lltype.Signed, self.addrs)
-
-    def compress_callshape(self, shape, datablockwrapper):
-        assert shape[0] == 'shape'
-        return ['compressed'] + shape[1:]
-
-    def write_callshape(self, mark, force_index):
-        assert mark[0] == 'compressed'
-        assert force_index not in self.callshapes
-        assert force_index == 42 + len(self.callshapes)
-        self.callshapes[force_index] = mark
-
-    def hook_malloc_slowpath(self):
-        num_entries = self.addrs[0] - rffi.cast(lltype.Signed, self.addrs)
-        assert num_entries == 5*WORD    # 3 initially, plus 2 by the asm frame
-        assert self.addrs[1] == 123456  # unchanged
-        assert self.addrs[2] == 654321  # unchanged
-        frame_addr = self.addrs[3]                   # pushed by the asm frame
-        assert self.addrs[4] == self.MARKER_FRAME    # pushed by the asm frame
-        #
-        from pypy.jit.backend.ppc.arch import FORCE_INDEX_OFS
-        addr = rffi.cast(rffi.CArrayPtr(lltype.Signed),
-                         frame_addr + FORCE_INDEX_OFS)
-        force_index = addr[0]
-        assert force_index == 43    # in this test: the 2nd call_malloc_nursery
-        #
-        # The callshapes[43] saved above should list addresses both in the
-        # COPY_AREA and in the "normal" stack, where all the 16 values p1-p16
-        # of test_save_regs_at_correct_place should have been stored.  Here
-        # we replace them with new addresses, to emulate a moving GC.
-        shape = self.callshapes[force_index]
-        assert len(shape[1:]) == len(self.should_see)
-        new_objects = [None] * len(self.should_see)
-        for ofs in shape[1:]:
-            assert isinstance(ofs, int)    # not a register at all here
-            addr = rffi.cast(rffi.CArrayPtr(lltype.Signed), frame_addr + ofs)
-            contains = addr[0]
-            for j in range(len(self.should_see)):
-                obj = self.should_see[j]
-                if contains == rffi.cast(lltype.Signed, obj):
-                    assert new_objects[j] is None   # duplicate?
-                    break
-            else:
-                assert 0   # the value read from the stack looks random?
-            new_objects[j] = lltype.malloc(self.S1)
-            addr[0] = rffi.cast(lltype.Signed, new_objects[j])
-        self.should_see[:] = new_objects
-
-
-class TestMallocShadowStack(BaseTestRegalloc):
-
-    def setup_method(self, method):
-        cpu = CPU(None, None)
-        cpu.gc_ll_descr = GCDescrFastpathMalloc()
-        cpu.gc_ll_descr.gcrootmap = MockShadowStackRootMap()
-        cpu.setup_once()
-        for i in range(42):
-            cpu.reserve_some_free_fail_descr_number()
-        self.cpu = cpu
-
-    def test_save_regs_at_correct_place(self):
-        cpu = self.cpu
-        gc_ll_descr = cpu.gc_ll_descr
-        S1 = gc_ll_descr.gcrootmap.S1
-        S2 = lltype.GcStruct('S2', ('s0', lltype.Ptr(S1)),
-                                   ('s1', lltype.Ptr(S1)),
-                                   ('s2', lltype.Ptr(S1)),
-                                   ('s3', lltype.Ptr(S1)),
-                                   ('s4', lltype.Ptr(S1)),
-                                   ('s5', lltype.Ptr(S1)),
-                                   ('s6', lltype.Ptr(S1)),
-                                   ('s7', lltype.Ptr(S1)),
-                                   ('s8', lltype.Ptr(S1)),
-                                   ('s9', lltype.Ptr(S1)),
-                                   ('s10', lltype.Ptr(S1)),
-                                   ('s11', lltype.Ptr(S1)),
-                                   ('s12', lltype.Ptr(S1)),
-                                   ('s13', lltype.Ptr(S1)),
-                                   ('s14', lltype.Ptr(S1)),
-                                   ('s15', lltype.Ptr(S1)),
-                                   ('s16', lltype.Ptr(S1)),
-                                   ('s17', lltype.Ptr(S1)),
-                                   ('s18', lltype.Ptr(S1)),
-                                   ('s19', lltype.Ptr(S1)),
-                                   ('s20', lltype.Ptr(S1)),
-                                   ('s21', lltype.Ptr(S1)),
-                                   ('s22', lltype.Ptr(S1)),
-                                   ('s23', lltype.Ptr(S1)),
-                                   ('s24', lltype.Ptr(S1)),
-                                   ('s25', lltype.Ptr(S1)),
-                                   ('s26', lltype.Ptr(S1)),
-                                   ('s27', lltype.Ptr(S1)))
-        self.namespace = self.namespace.copy()
-        for i in range(28):
-            self.namespace['ds%i' % i] = cpu.fielddescrof(S2, 's%d' % i)
-        ops = '''
-        [p0]
-        p1 = getfield_gc(p0, descr=ds0)
-        p2 = getfield_gc(p0, descr=ds1)
-        p3 = getfield_gc(p0, descr=ds2)
-        p4 = getfield_gc(p0, descr=ds3)
-        p5 = getfield_gc(p0, descr=ds4)
-        p6 = getfield_gc(p0, descr=ds5)
-        p7 = getfield_gc(p0, descr=ds6)
-        p8 = getfield_gc(p0, descr=ds7)
-        p9 = getfield_gc(p0, descr=ds8)
-        p10 = getfield_gc(p0, descr=ds9)
-        p11 = getfield_gc(p0, descr=ds10)
-        p12 = getfield_gc(p0, descr=ds11)
-        p13 = getfield_gc(p0, descr=ds12)
-        p14 = getfield_gc(p0, descr=ds13)
-        p15 = getfield_gc(p0, descr=ds14)
-        p16 = getfield_gc(p0, descr=ds15)
-        p17 = getfield_gc(p0, descr=ds16)
-        p18 = getfield_gc(p0, descr=ds17)
-        p19 = getfield_gc(p0, descr=ds18)
-        p20 = getfield_gc(p0, descr=ds19)
-        p21 = getfield_gc(p0, descr=ds20)
-        p22 = getfield_gc(p0, descr=ds21)
-        p23 = getfield_gc(p0, descr=ds22)
-        p24 = getfield_gc(p0, descr=ds23)
-        p25 = getfield_gc(p0, descr=ds24)
-        p26 = getfield_gc(p0, descr=ds25)
-        p27 = getfield_gc(p0, descr=ds26)
-        p28 = getfield_gc(p0, descr=ds27)
-        #
-        # now all registers are in use
-        p29 = call_malloc_nursery(40)
-        p30 = call_malloc_nursery(40)     # overflow
-        #
-        finish(p1, p2, p3, p4, p5, p6, p7, p8,         \
-               p9, p10, p11, p12, p13, p14, p15, p16,  \
-               p17, p18, p19, p20, p21, p22, p23, p24, \
-               p25, p26, p27, p28)
-        '''
-        s2 = lltype.malloc(S2)
-        for i in range(28):
-            s1 = lltype.malloc(S1)
-            setattr(s2, 's%d' % i, s1)
-            gc_ll_descr.gcrootmap.should_see.append(s1)
-        s2ref = lltype.cast_opaque_ptr(llmemory.GCREF, s2)
-        #
-        self.interpret(ops, [s2ref])
-        gc_ll_descr.check_nothing_in_nursery()
-        assert gc_ll_descr.calls == [40]
-        gc_ll_descr.gcrootmap.check_initial_and_final_state()
-        # check the returned pointers
-        for i in range(28):
-            s1ref = self.cpu.get_latest_value_ref(i)
-            s1 = lltype.cast_opaque_ptr(lltype.Ptr(S1), s1ref)
-            for j in range(28):
-                assert s1 != getattr(s2, 's%d' % j)
-            assert s1 == gc_ll_descr.gcrootmap.should_see[i]
diff --git a/rpython/jit/backend/ppc/test/test_runner.py b/rpython/jit/backend/ppc/test/test_runner.py
--- a/rpython/jit/backend/ppc/test/test_runner.py
+++ b/rpython/jit/backend/ppc/test/test_runner.py
@@ -22,18 +22,16 @@
     # for the individual tests see
     # ====> ../../test/runner_test.py
 
-    if IS_PPC_32:
-        add_loop_instructions = ["ld", "add", "cmpwi", "beq", "b"]
-    else:
-        add_loop_instructions = ["ld", "add", "cmpdi", "beq", "b"]
-    bridge_loop_instructions = [
-        "ld", "cmpdi", "bge+",
-        "li", "lis", "ori", "mtctr", "bctrl",
-        "lis", "ori", "mtctr", "bctr"]
-    bridge_loop_instructions_alternative = [
-        "ld", "cmpdi", "bge+",
-        "li", "li", "rldicr", "oris", "ori", "mtctr", "bctrl",
-        "li", "rldicr", "oris", "ori", "mtctr", "bctr"]
+    assert not IS_PPC_32
+    load_imm_instructions = (
+        "(li|lis(; ori)?)(; rldicr(; oris)?(; ori)?)?")
+    add_loop_instructions = "ld; add; cmpdi; beq; b;$"
+    bridge_loop_instructions = (
+        "ld; cmpdi; bge.; "
+        "li; %s; mtctr; %s; bctrl; "
+        "%s; mtctr; bctr;$" % (
+            load_imm_instructions, load_imm_instructions,
+            load_imm_instructions))
 
     def get_cpu(self):
         cpu = PPC_CPU(rtyper=None, stats=FakeStats())
diff --git a/rpython/jit/backend/test/runner_test.py b/rpython/jit/backend/test/runner_test.py
--- a/rpython/jit/backend/test/runner_test.py
+++ b/rpython/jit/backend/test/runner_test.py
@@ -47,7 +47,6 @@
 
     add_loop_instructions = ['overload for a specific cpu']
     bridge_loop_instructions = ['overload for a specific cpu']
-    bridge_loop_instructions_alternative = None   # or another possible answer
 
     def execute_operation(self, opname, valueboxes, result_type, descr=None):
         inputargs, operations = self._get_single_operation_list(opname,
@@ -4380,14 +4379,11 @@
         # XXX we have to check the precise assembler, otherwise
         # we don't quite know if borders are correct
 
-        def checkops(mc, ops, alt_ops=None):
-            if len(mc) != len(ops) and alt_ops is not None:
-                ops = alt_ops
-            assert len(mc) == len(ops)
-            for i in range(len(mc)):
-                if ops[i] == '*':
-                    continue # ingore ops marked as '*', i.e. inline constants
-                assert mc[i].split("\t")[2].startswith(ops[i])
+        def checkops(mc, ops_regexp):
+            import re
+            words = [line.split("\t")[2].split()[0] + ';' for line in mc]
+            text = ' '.join(words)
+            assert re.compile(ops_regexp).match(text)
 
         data = ctypes.string_at(info.asmaddr, info.asmlen)
         try:
@@ -4397,8 +4393,7 @@
             data = ctypes.string_at(bridge_info.asmaddr, bridge_info.asmlen)
             mc = list(machine_code_dump(data, bridge_info.asmaddr, cpuname))
             lines = [line for line in mc if line.count('\t') >= 2]
-            checkops(lines, self.bridge_loop_instructions,
-                            self.bridge_loop_instructions_alternative)
+            checkops(lines, self.bridge_loop_instructions)
         except ObjdumpNotFound:
             py.test.skip("requires (g)objdump")
 
diff --git a/rpython/jit/backend/x86/assembler.py b/rpython/jit/backend/x86/assembler.py
--- a/rpython/jit/backend/x86/assembler.py
+++ b/rpython/jit/backend/x86/assembler.py
@@ -373,6 +373,14 @@
             else:
                 mc.MOV_rs(edi.value, WORD)
         else:
+            # This 'for_frame' version is called after a CALL.  It does not
+            # need to save many registers: the registers that are anyway
+            # destroyed by the call can be ignored (volatiles), and the
+            # non-volatile registers won't be changed here.  It only needs
+            # to save eax, maybe edx, and xmm0 (possible results of the call)
+            # and two more non-volatile registers (used to store the RPython
+            # exception that occurred in the CALL, if any).
+            assert not withcards
             # we have one word to align
             mc.SUB_ri(esp.value, 7 * WORD) # align and reserve some space
             mc.MOV_sr(WORD, eax.value) # save for later
@@ -387,7 +395,7 @@
                 exc0, exc1 = ebx, r12
             mc.MOV(RawEspLoc(WORD * 5, REF), exc0)
             mc.MOV(RawEspLoc(WORD * 6, INT), exc1)
-            # note that it's save to store the exception in register,
+            # note that it's safe to store the exception in register,
             # since the call to write barrier can't collect
             # (and this is assumed a bit left and right here, like lack
             # of _reload_frame_if_necessary)
diff --git a/rpython/jit/backend/x86/test/test_runner.py b/rpython/jit/backend/x86/test/test_runner.py
--- a/rpython/jit/backend/x86/test/test_runner.py
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit

Reply via email to