Author: fijal
Branch: 
Changeset: r97005:f3b7650ebfc0
Date: 2019-07-15 17:00 +0200
http://bitbucket.org/pypy/pypy/changeset/f3b7650ebfc0/
Log: merge arm64 support

diff too long, truncating to 2000 out of 6406 lines

diff --git a/rpython/config/translationoption.py b/rpython/config/translationoption.py
--- a/rpython/config/translationoption.py
+++ b/rpython/config/translationoption.py
@@ -1,5 +1,6 @@
 import sys
 import os
+import platform as _stdlib_platform
 from rpython.config.config import OptionDescription, BoolOption, IntOption, ArbitraryOption, FloatOption
 from rpython.config.config import ChoiceOption, StrOption, Config, ConflictConfigError
 from rpython.config.config import ConfigError
@@ -30,7 +31,9 @@
                False)   # Windows doesn't work.  Please
                         # add other platforms here if it works on them.
-
+MACHINE = _stdlib_platform.machine()
+if MACHINE == 'aarch64':
+    SUPPORT__THREAD = False
 # (*) NOTE: __thread on OS/X does not work together with
 # pthread_key_create(): when the destructor is called, the __thread is
 # already freed!
diff --git a/rpython/jit/backend/aarch64/TODO b/rpython/jit/backend/aarch64/TODO
new file mode 100644
--- /dev/null
+++ b/rpython/jit/backend/aarch64/TODO
@@ -0,0 +1,35 @@
+* cond_call and following guard_exception
+
+
+* We can try to make generate_quick_failure() emit two instructions less:
+  the two store_reg() [one in generate_quick_failure and the other in
+  push_gcmap].  Instead we'd load the values in ip2 and ip3, and the
+  store_regs would occur inside self.failure_recovery_code
+  (which 'target' points to).
+
+
+* use STP instead of STR in all long sequences of STR.  Same with LDR
+  (see the sketch at the end of this section)
+
+* use "STR xzr, [..]" instead of "gen_load_int(ip, 0); STR ip, [..]".
+  Search around for gen_load_int(...0): it occurs at least in pop_gcmap(),
+  _build_failure_recovery(), build_frame_realloc_slowpath(), etc.
+
+
+* malloc_cond() and malloc_cond_varsize_frame() hard-code forward jump
+  distances by guessing the number of instructions that follow.  Bad
+  idea because some of these instructions could easily be optimized in
+  the future to be a bit shorter.  Rewrite these two places to use the
+  proper way instead of a magic "40" (or at least assert that it was
+  really 40).
+
+
+* use "CBNZ register, offset" (compare-and-branch-if-not-zero)
+  instead of a CMP+BNE pair.  Same with CBZ instead of CMP+BEQ
+
+
+* when we need to save things on the stack, we typically push two words
+  and pop them later.  It would be cheaper if we reserved two locations
+  in the stack from _call_header, then we could just write there.
+  *OR*
+  maybe it's enough if we use the form "str x0, [sp, #offset]!", which
+  combines in a single instruction the "str" with the change of sp
diff --git a/rpython/jit/backend/aarch64/__init__.py b/rpython/jit/backend/aarch64/__init__.py
new file mode 100644
diff --git a/rpython/jit/backend/aarch64/arch.py b/rpython/jit/backend/aarch64/arch.py
new file mode 100644
--- /dev/null
+++ b/rpython/jit/backend/aarch64/arch.py
@@ -0,0 +1,14 @@
+
+WORD = 8
+
+# The stack contains the force_index, the callee-saved registers and
+# ABI-required information.
+# All the rest of the data is in a GC-managed variable-size "frame".
+# This jitframe object's address is always stored in the register FP +# A jitframe is a jit.backend.llsupport.llmodel.jitframe.JITFRAME +# Stack frame fixed area +# Currently only the force_index +NUM_MANAGED_REGS = 16 +NUM_VFP_REGS = 8 +JITFRAME_FIXED_SIZE = NUM_MANAGED_REGS + NUM_VFP_REGS +# 16 GPR + 8 VFP Regs, for now diff --git a/rpython/jit/backend/aarch64/assembler.py b/rpython/jit/backend/aarch64/assembler.py new file mode 100644 --- /dev/null +++ b/rpython/jit/backend/aarch64/assembler.py @@ -0,0 +1,1482 @@ + +from rpython.jit.backend.aarch64.arch import WORD, JITFRAME_FIXED_SIZE +from rpython.jit.backend.aarch64.codebuilder import InstrBuilder, OverwritingBuilder +from rpython.jit.backend.aarch64.locations import imm, StackLocation, get_fp_offset +#from rpython.jit.backend.arm.helper.regalloc import VMEM_imm_size +from rpython.jit.backend.aarch64.opassembler import ResOpAssembler +from rpython.jit.backend.aarch64.regalloc import (Regalloc, check_imm_arg, + operations as regalloc_operations, guard_operations, comp_operations, + CoreRegisterManager, VFPRegisterManager) +from rpython.jit.backend.aarch64 import registers as r +from rpython.jit.backend.arm import conditions as c +from rpython.jit.backend.llsupport import jitframe, rewrite +from rpython.jit.backend.llsupport.assembler import BaseAssembler +from rpython.jit.backend.llsupport.regalloc import get_scale, valid_addressing_size +from rpython.jit.backend.llsupport.asmmemmgr import MachineDataBlockWrapper +from rpython.jit.backend.model import CompiledLoopToken +from rpython.jit.codewriter.effectinfo import EffectInfo +from rpython.jit.metainterp.history import AbstractFailDescr, FLOAT, INT, VOID +from rpython.jit.metainterp.resoperation import rop +from rpython.rlib.debug import debug_print, debug_start, debug_stop +from rpython.rlib.jit import AsmInfo +from rpython.rlib.objectmodel import we_are_translated, specialize, compute_unique_id +from rpython.rlib.rarithmetic import r_uint +from rpython.rtyper.annlowlevel import llhelper, cast_instance_to_gcref +from rpython.rtyper.lltypesystem import lltype, rffi +from rpython.rtyper.lltypesystem.lloperation import llop +from rpython.rlib.rjitlog import rjitlog as jl + +class AssemblerARM64(ResOpAssembler): + def __init__(self, cpu, translate_support_code=False): + ResOpAssembler.__init__(self, cpu, translate_support_code) + self.failure_recovery_code = [0, 0, 0, 0] + self.wb_slowpath = [0, 0, 0, 0, 0] + + def assemble_loop(self, jd_id, unique_id, logger, loopname, inputargs, + operations, looptoken, log): + clt = CompiledLoopToken(self.cpu, looptoken.number) + clt._debug_nbargs = len(inputargs) + looptoken.compiled_loop_token = clt + + if not we_are_translated(): + # Arguments should be unique + assert len(set(inputargs)) == len(inputargs) + + self.setup(looptoken) + + frame_info = self.datablockwrapper.malloc_aligned( + jitframe.JITFRAMEINFO_SIZE, alignment=WORD) + clt.frame_info = rffi.cast(jitframe.JITFRAMEINFOPTR, frame_info) + clt.frame_info.clear() # for now + + if log: + operations = self._inject_debugging_code(looptoken, operations, + 'e', looptoken.number) + + regalloc = Regalloc(assembler=self) + allgcrefs = [] + operations = regalloc.prepare_loop(inputargs, operations, looptoken, + allgcrefs) + self.reserve_gcref_table(allgcrefs) + functionpos = self.mc.get_relative_pos() + + self._call_header_with_stack_check() + self._check_frame_depth_debug(self.mc) + + loop_head = self.mc.get_relative_pos() + looptoken._ll_loop_code = loop_head + # + frame_depth_no_fixed_size = 
self._assemble(regalloc, inputargs, operations) + self.update_frame_depth(frame_depth_no_fixed_size + JITFRAME_FIXED_SIZE) + # + size_excluding_failure_stuff = self.mc.get_relative_pos() + + self.write_pending_failure_recoveries() + + full_size = self.mc.get_relative_pos() + rawstart = self.materialize_loop(looptoken) + looptoken._ll_function_addr = rawstart + functionpos + + self.patch_gcref_table(looptoken, rawstart) + self.process_pending_guards(rawstart) + self.fixup_target_tokens(rawstart) + + if log and not we_are_translated(): + self.mc._dump_trace(rawstart, + 'loop.asm') + + ops_offset = self.mc.ops_offset + + if logger: + log = logger.log_trace(jl.MARK_TRACE_ASM, None, self.mc) + log.write(inputargs, operations, ops_offset=ops_offset) + + # legacy + if logger.logger_ops: + logger.logger_ops.log_loop(inputargs, operations, 0, + "rewritten", name=loopname, + ops_offset=ops_offset) + + self.teardown() + + debug_start("jit-backend-addr") + debug_print("Loop %d (%s) has address 0x%x to 0x%x (bootstrap 0x%x)" % ( + looptoken.number, loopname, + r_uint(rawstart + loop_head), + r_uint(rawstart + size_excluding_failure_stuff), + r_uint(rawstart + functionpos))) + debug_print(" gc table: 0x%x" % r_uint(rawstart)) + debug_print(" function: 0x%x" % r_uint(rawstart + functionpos)) + debug_print(" resops: 0x%x" % r_uint(rawstart + loop_head)) + debug_print(" failures: 0x%x" % r_uint(rawstart + + size_excluding_failure_stuff)) + debug_print(" end: 0x%x" % r_uint(rawstart + full_size)) + debug_stop("jit-backend-addr") + + return AsmInfo(ops_offset, rawstart + loop_head, + size_excluding_failure_stuff - loop_head) + + def assemble_bridge(self, logger, faildescr, inputargs, operations, + original_loop_token, log): + if not we_are_translated(): + # Arguments should be unique + assert len(set(inputargs)) == len(inputargs) + + self.setup(original_loop_token) + #self.codemap.inherit_code_from_position(faildescr.adr_jump_offset) + descr_number = compute_unique_id(faildescr) + if log: + operations = self._inject_debugging_code(faildescr, operations, + 'b', descr_number) + + assert isinstance(faildescr, AbstractFailDescr) + + arglocs = self.rebuild_faillocs_from_descr(faildescr, inputargs) + + regalloc = Regalloc(assembler=self) + allgcrefs = [] + operations = regalloc.prepare_bridge(inputargs, arglocs, + operations, + allgcrefs, + self.current_clt.frame_info) + self.reserve_gcref_table(allgcrefs) + startpos = self.mc.get_relative_pos() + + self._check_frame_depth(self.mc, regalloc.get_gcmap()) + + bridgestartpos = self.mc.get_relative_pos() + frame_depth_no_fixed_size = self._assemble(regalloc, inputargs, operations) + + codeendpos = self.mc.get_relative_pos() + + self.write_pending_failure_recoveries() + + fullsize = self.mc.get_relative_pos() + rawstart = self.materialize_loop(original_loop_token) + + self.patch_gcref_table(original_loop_token, rawstart) + self.process_pending_guards(rawstart) + + debug_start("jit-backend-addr") + debug_print("bridge out of Guard 0x%x has address 0x%x to 0x%x" % + (r_uint(descr_number), r_uint(rawstart + startpos), + r_uint(rawstart + codeendpos))) + debug_print(" gc table: 0x%x" % r_uint(rawstart)) + debug_print(" jump target: 0x%x" % r_uint(rawstart + startpos)) + debug_print(" resops: 0x%x" % r_uint(rawstart + bridgestartpos)) + debug_print(" failures: 0x%x" % r_uint(rawstart + codeendpos)) + debug_print(" end: 0x%x" % r_uint(rawstart + fullsize)) + debug_stop("jit-backend-addr") + + # patch the jump from original guard + self.patch_trace(faildescr, 
original_loop_token, + rawstart + startpos, regalloc) + + self.patch_stack_checks(frame_depth_no_fixed_size + JITFRAME_FIXED_SIZE, + rawstart) + if not we_are_translated(): + if log: + self.mc._dump_trace(rawstart, 'bridge.asm') + + ops_offset = self.mc.ops_offset + frame_depth = max(self.current_clt.frame_info.jfi_frame_depth, + frame_depth_no_fixed_size + JITFRAME_FIXED_SIZE) + self.fixup_target_tokens(rawstart) + self.update_frame_depth(frame_depth) + + if logger: + log = logger.log_trace(jl.MARK_TRACE_ASM, None, self.mc) + log.write(inputargs, operations, ops_offset) + # log that the already written bridge is stitched to a descr! + logger.log_patch_guard(descr_number, rawstart) + + # legacy + if logger.logger_ops: + logger.logger_ops.log_bridge(inputargs, operations, "rewritten", + faildescr, ops_offset=ops_offset) + + self.teardown() + + return AsmInfo(ops_offset, startpos + rawstart, codeendpos - startpos) + + def setup(self, looptoken): + BaseAssembler.setup(self, looptoken) + assert self.memcpy_addr != 0, 'setup_once() not called?' + if we_are_translated(): + self.debug = False + self.current_clt = looptoken.compiled_loop_token + self.mc = InstrBuilder() + self.pending_guards = [] + #assert self.datablockwrapper is None --- but obscure case + # possible, e.g. getting MemoryError and continuing + allblocks = self.get_asmmemmgr_blocks(looptoken) + self.datablockwrapper = MachineDataBlockWrapper(self.cpu.asmmemmgr, + allblocks) + self.mc.datablockwrapper = self.datablockwrapper + self.target_tokens_currently_compiling = {} + self.frame_depth_to_patch = [] + + def teardown(self): + self.current_clt = None + self._regalloc = None + self.mc = None + self.pending_guards = None + + def _push_all_regs_to_jitframe(self, mc, ignored_regs, withfloats, + callee_only=False): + # Push general purpose registers + base_ofs = self.cpu.get_baseofs_of_frame_field() + if callee_only: + regs = CoreRegisterManager.save_around_call_regs + else: + regs = CoreRegisterManager.all_regs + # XXX add special case if ignored_regs are a block at the start of regs + if not ignored_regs: # we want to push a contiguous block of regs + assert base_ofs < 0x100 + for i, reg in enumerate(regs): + mc.STR_ri(reg.value, r.fp.value, base_ofs + i * WORD) + else: + for reg in ignored_regs: + assert not reg.is_vfp_reg() # sanity check + # we can have holes in the list of regs + for i, gpr in enumerate(regs): + if gpr in ignored_regs: + continue + self.store_reg(mc, gpr, r.fp, base_ofs + i * WORD) + + if withfloats: + # Push VFP regs + regs = VFPRegisterManager.all_regs + ofs = len(CoreRegisterManager.all_regs) * WORD + for reg in regs: + mc.STR_di(reg.value, r.fp.value, ofs + base_ofs + reg.value * WORD) + + def _pop_all_regs_from_jitframe(self, mc, ignored_regs, withfloats, + callee_only=False): + # Pop general purpose registers + base_ofs = self.cpu.get_baseofs_of_frame_field() + if callee_only: + regs = CoreRegisterManager.save_around_call_regs + else: + regs = CoreRegisterManager.all_regs + # XXX add special case if ignored_regs are a block at the start of regs + if not ignored_regs: # we want to pop a contiguous block of regs + assert base_ofs < 0x100 + for i, reg in enumerate(regs): + mc.LDR_ri(reg.value, r.fp.value, base_ofs + i * WORD) + else: + for reg in ignored_regs: + assert not reg.is_vfp_reg() # sanity check + # we can have holes in the list of regs + for i, gpr in enumerate(regs): + if gpr in ignored_regs: + continue + ofs = i * WORD + base_ofs + self.load_reg(mc, gpr, r.fp, ofs) + if withfloats: + # Pop VFP regs 
+ regs = VFPRegisterManager.all_regs + ofs = len(CoreRegisterManager.all_regs) * WORD + for reg in regs: + mc.LDR_di(reg.value, r.fp.value, ofs + base_ofs + reg.value * WORD) + + def _build_failure_recovery(self, exc, withfloats=False): + mc = InstrBuilder() + self._push_all_regs_to_jitframe(mc, [], withfloats) + + if exc: + # We might have an exception pending. Load it into r4 + # (this is a register saved across calls) + mc.gen_load_int(r.x5.value, self.cpu.pos_exc_value()) + mc.LDR_ri(r.x4.value, r.x5.value, 0) + # clear the exc flags + mc.gen_load_int(r.x6.value, 0) + mc.STR_ri(r.x6.value, r.x5.value, 0) # pos_exc_value is still in r5 + mc.gen_load_int(r.x5.value, self.cpu.pos_exception()) + mc.STR_ri(r.x6.value, r.x5.value, 0) + # save r4 into 'jf_guard_exc' + offset = self.cpu.get_ofs_of_frame_field('jf_guard_exc') + assert check_imm_arg(abs(offset)) + mc.STR_ri(r.x4.value, r.fp.value, offset) + # now we return from the complete frame, which starts from + # _call_header_with_stack_check(). The LEA in _call_footer below + # throws away most of the frame, including all the PUSHes that we + # did just above. + + # set return value + mc.MOV_rr(r.x0.value, r.fp.value) + + self.gen_func_epilog(mc) + rawstart = mc.materialize(self.cpu, []) + self.failure_recovery_code[exc + 2 * withfloats] = rawstart + + def propagate_memoryerror_if_reg_is_null(self, reg_loc): + # see ../x86/assembler.py:genop_discard_check_memory_error() + self.mc.CMP_ri(reg_loc.value, 0) + self.mc.B_ofs_cond(6 * 4, c.NE) + self.mc.B(self.propagate_exception_path) + + def _build_wb_slowpath(self, withcards, withfloats=False, for_frame=False): + descr = self.cpu.gc_ll_descr.write_barrier_descr + if descr is None: + return + if not withcards: + func = descr.get_write_barrier_fn(self.cpu) + else: + if descr.jit_wb_cards_set == 0: + return + func = descr.get_write_barrier_from_array_fn(self.cpu) + if func == 0: + return + # + # This builds a helper function called from the slow path of + # write barriers. It must save all registers, and optionally + # all vfp registers. It takes a single argument which is in x0. + # It must keep stack alignment accordingly. + mc = InstrBuilder() + # + mc.SUB_ri(r.sp.value, r.sp.value, 2 * WORD) + mc.STR_ri(r.lr.value, r.sp.value, 0) + if not for_frame: + self._push_all_regs_to_jitframe(mc, [], withfloats, callee_only=True) + else: + # NOTE: don't save registers on the jitframe here! It might + # override already-saved values that will be restored + # later... 
+ # + # we're possibly called from the slowpath of malloc + # save the caller saved registers + # assuming we do not collect here + exc0, exc1 = r.x19, r.x20 + mc.SUB_ri(r.sp.value, r.sp.value, (len(r.caller_resp) + 2 + len(r.caller_vfp_resp)) * WORD) + cur_stack = 0 + for i in range(0, len(r.caller_resp), 2): + mc.STP_rri(r.caller_resp[i].value, r.caller_resp[i + 1].value, r.sp.value, i * WORD) + cur_stack = len(r.caller_resp) + mc.STP_rri(exc0.value, exc1.value, r.sp.value, cur_stack * WORD) + cur_stack += 2 + for i in range(len(r.caller_vfp_resp)): + mc.STR_di(r.caller_vfp_resp[i].value, r.sp.value, cur_stack * WORD) + cur_stack += 1 + + self._store_and_reset_exception(mc, exc0, exc1) + mc.BL(func) + # + if not for_frame: + self._pop_all_regs_from_jitframe(mc, [], withfloats, callee_only=True) + else: + exc0, exc1 = r.x19, r.x20 + self._restore_exception(mc, exc0, exc1) + + cur_stack = 0 + for i in range(0, len(r.caller_resp), 2): + mc.LDP_rri(r.caller_resp[i].value, r.caller_resp[i + 1].value, r.sp.value, i * WORD) + cur_stack = len(r.caller_resp) + mc.LDP_rri(exc0.value, exc1.value, r.sp.value, cur_stack * WORD) + cur_stack += 2 + for i in range(len(r.caller_vfp_resp)): + mc.LDR_di(r.caller_vfp_resp[i].value, r.sp.value, cur_stack * WORD) + cur_stack += 1 + + assert exc0 is not None + assert exc1 is not None + + mc.ADD_ri(r.sp.value, r.sp.value, (len(r.caller_resp) + 2 + len(r.caller_vfp_resp)) * WORD) + + # + if withcards: + # A final TEST8 before the RET, for the caller. Careful to + # not follow this instruction with another one that changes + # the status of the CPU flags! + mc.LDRB_ri(r.ip0.value, r.x0.value, descr.jit_wb_if_flag_byteofs) + mc.MOVZ_r_u16(r.ip1.value, 0x80, 0) + mc.TST_rr_shift(r.ip0.value, r.ip1.value, 0) + # + mc.LDR_ri(r.ip1.value, r.sp.value, 0) + mc.ADD_ri(r.sp.value, r.sp.value, 2 * WORD) + mc.RET_r(r.ip1.value) + # + rawstart = mc.materialize(self.cpu, []) + if for_frame: + self.wb_slowpath[4] = rawstart + else: + self.wb_slowpath[withcards + 2 * withfloats] = rawstart + + def build_frame_realloc_slowpath(self): + # this code should do the following steps + # a) store all registers in the jitframe + # b) fish for the arguments passed by the caller + # c) store the gcmap in the jitframe + # d) call realloc_frame + # e) set the fp to point to the new jitframe + # f) store the address of the new jitframe in the shadowstack + # c) set the gcmap field to 0 in the new jitframe + # g) restore registers and return + mc = InstrBuilder() + self._push_all_regs_to_jitframe(mc, [], self.cpu.supports_floats) + # this is the gcmap stored by push_gcmap(mov=True) in _check_stack_frame + # and the expected_size pushed in _check_stack_frame + # pop the values passed on the stack, gcmap -> r0, expected_size -> r1 + mc.LDP_rri(r.x0.value, r.x1.value, r.sp.value, 0) + + mc.STR_ri(r.lr.value, r.sp.value, 0) + + # store the current gcmap(r0) in the jitframe + gcmap_ofs = self.cpu.get_ofs_of_frame_field('jf_gcmap') + mc.STR_ri(r.x0.value, r.fp.value, gcmap_ofs) + + # set first arg, which is the old jitframe address + mc.MOV_rr(r.x0.value, r.fp.value) + + # store a possibly present exception + self._store_and_reset_exception(mc, None, r.x19, on_frame=True) + + # call realloc_frame, it takes two arguments + # arg0: the old jitframe + # arg1: the new size + # + mc.BL(self.cpu.realloc_frame) + + # set fp to the new jitframe returned from the previous call + mc.MOV_rr(r.fp.value, r.x0.value) + + # restore a possibly present exception + self._restore_exception(mc, None, r.x19) + + 
+        gcrootmap = self.cpu.gc_ll_descr.gcrootmap
+        if gcrootmap and gcrootmap.is_shadow_stack:
+            self._load_shadowstack_top(mc, r.x19, gcrootmap)
+            # store the new jitframe addr in the shadowstack
+            mc.SUB_ri(r.x19.value, r.x19.value, WORD)
+            mc.STR_ri(r.x0.value, r.x19.value, 0)
+
+        # reset the jf_gcmap field in the jitframe
+        mc.gen_load_int(r.ip0.value, 0)
+        mc.STR_ri(r.ip0.value, r.fp.value, gcmap_ofs)
+
+        # restore registers
+        self._pop_all_regs_from_jitframe(mc, [], self.cpu.supports_floats)
+
+        # return
+        mc.LDR_ri(r.lr.value, r.sp.value, 0)
+        mc.ADD_ri(r.sp.value, r.sp.value, 2*WORD)
+        mc.RET_r(r.lr.value)
+        self._frame_realloc_slowpath = mc.materialize(self.cpu, [])
+
+    def _load_shadowstack_top(self, mc, reg, gcrootmap):
+        rst = gcrootmap.get_root_stack_top_addr()
+        mc.gen_load_int(reg.value, rst)
+        self.load_reg(mc, reg, reg)
+        return rst
+
+    def _store_and_reset_exception(self, mc, excvalloc=None, exctploc=None,
+                                   on_frame=False):
+        """ Reset the exception.  If excvalloc is None, then store it on the
+        frame in jf_guard_exc
+        """
+        assert excvalloc is not r.ip0
+        assert exctploc is not r.ip0
+        tmpreg = r.ip1
+        mc.gen_load_int(r.ip0.value, self.cpu.pos_exc_value())
+        if excvalloc is not None: # store
+            assert excvalloc.is_core_reg()
+            self.load_reg(mc, excvalloc, r.ip0)
+        if on_frame:
+            # store exc_value in JITFRAME
+            ofs = self.cpu.get_ofs_of_frame_field('jf_guard_exc')
+            assert check_imm_arg(ofs)
+            #
+            self.load_reg(mc, r.ip0, r.ip0, helper=tmpreg)
+            #
+            self.store_reg(mc, r.ip0, r.fp, ofs, helper=tmpreg)
+        if exctploc is not None:
+            # store pos_exception in exctploc
+            assert exctploc.is_core_reg()
+            mc.gen_load_int(r.ip0.value, self.cpu.pos_exception())
+            self.load_reg(mc, exctploc, r.ip0, helper=tmpreg)
+
+        if on_frame or exctploc is not None:
+            mc.gen_load_int(r.ip0.value, self.cpu.pos_exc_value())
+
+        # reset exception
+        mc.gen_load_int(tmpreg.value, 0)
+
+        self.store_reg(mc, tmpreg, r.ip0, 0)
+
+        mc.gen_load_int(r.ip0.value, self.cpu.pos_exception())
+        self.store_reg(mc, tmpreg, r.ip0, 0)
+
+    def _restore_exception(self, mc, excvalloc, exctploc):
+        assert excvalloc is not r.ip0
+        assert exctploc is not r.ip0
+        mc.gen_load_int(r.ip0.value, self.cpu.pos_exc_value())
+        if excvalloc is not None:
+            assert excvalloc.is_core_reg()
+            self.store_reg(mc, excvalloc, r.ip0)
+        else:
+            assert exctploc is not r.fp
+            # load exc_value from JITFRAME and put it in pos_exc_value
+            ofs = self.cpu.get_ofs_of_frame_field('jf_guard_exc')
+            self.load_reg(mc, r.ip1, r.fp, ofs)
+            self.store_reg(mc, r.ip1, r.ip0)
+            # reset exc_value in the JITFRAME
+            mc.gen_load_int(r.ip1.value, 0)
+            self.store_reg(mc, r.ip1, r.fp, ofs)
+
+        # restore pos_exception from exctploc register
+        mc.gen_load_int(r.ip0.value, self.cpu.pos_exception())
+        self.store_reg(mc, exctploc, r.ip0)
+
+    def _build_propagate_exception_path(self):
+        mc = InstrBuilder()
+        self._store_and_reset_exception(mc, r.x0)
+        ofs = self.cpu.get_ofs_of_frame_field('jf_guard_exc')
+        # make sure ofs fits into a register
+        assert check_imm_arg(ofs)
+        self.store_reg(mc, r.x0, r.fp, ofs)
+        propagate_exception_descr = rffi.cast(lltype.Signed,
+                  cast_instance_to_gcref(self.cpu.propagate_exception_descr))
+        # put propagate_exception_descr into frame
+        ofs = self.cpu.get_ofs_of_frame_field('jf_descr')
+        # make sure ofs fits into a register
+        assert check_imm_arg(ofs)
+        mc.gen_load_int(r.x0.value, propagate_exception_descr)
+        self.store_reg(mc, r.x0, r.fp, ofs)
+        mc.MOV_rr(r.x0.value, r.fp.value)
+        self.gen_func_epilog(mc)
+        rawstart = mc.materialize(self.cpu, [])
+        self.propagate_exception_path = rawstart
+
+    def _build_cond_call_slowpath(self, supports_floats, callee_only):
+        """ This builds a general call slowpath, for whatever call happens to
+        come.
+
+        The address of the function to call comes in ip1.  The result is also
+        stored in ip1 or ivfp
+        """
+        mc = InstrBuilder()
+        #
+        self._push_all_regs_to_jitframe(mc, [], self.cpu.supports_floats, callee_only)
+        ## args are in their respective positions
+        mc.SUB_ri(r.sp.value, r.sp.value, 2 * WORD)
+        mc.STR_ri(r.ip0.value, r.sp.value, WORD)
+        mc.STR_ri(r.lr.value, r.sp.value, 0)
+        mc.BLR_r(r.ip1.value)
+        # callee saved
+        self._reload_frame_if_necessary(mc) # <- this will not touch x0
+        mc.MOV_rr(r.ip1.value, r.x0.value)
+        self._pop_all_regs_from_jitframe(mc, [], supports_floats,
+                                         callee_only) # <- this does not touch ip1
+        # return
+        mc.LDR_ri(r.ip0.value, r.sp.value, 0)
+        mc.ADD_ri(r.sp.value, r.sp.value, 2 * WORD)
+        mc.RET_r(r.ip0.value)
+        return mc.materialize(self.cpu, [])
+
+    def _build_malloc_slowpath(self, kind):
+        """ While arriving on slowpath, we have a gcpattern on stack 0.
+        The arguments are passed in r0 and r1, as follows:
+
+        kind == 'fixed': nursery_head in r0 and the size in r1 - r0.
+
+        kind == 'str/unicode': length of the string to allocate in r0.
+
+        kind == 'var': length to allocate in r1, tid in r0,
+        and itemsize on the stack.
+
+        This function must preserve all registers apart from r0 and r1.
+        """
+        assert kind in ['fixed', 'str', 'unicode', 'var']
+        mc = InstrBuilder()
+        #
+        self._push_all_regs_to_jitframe(mc, [r.x0, r.x1], True)
+        #
+        if kind == 'fixed':
+            addr = self.cpu.gc_ll_descr.get_malloc_slowpath_addr()
+        elif kind == 'str':
+            addr = self.cpu.gc_ll_descr.get_malloc_fn_addr('malloc_str')
+        elif kind == 'unicode':
+            addr = self.cpu.gc_ll_descr.get_malloc_fn_addr('malloc_unicode')
+        else:
+            addr = self.cpu.gc_ll_descr.get_malloc_slowpath_array_addr()
+        if kind == 'fixed':
+            # At this point we know that the values we need to compute the size
+            # are stored in x0 and x1.
+            mc.SUB_rr(r.x0.value, r.x1.value, r.x0.value) # compute the size we want
+
+            if hasattr(self.cpu.gc_ll_descr, 'passes_frame'):
+                mc.MOV_rr(r.x1.value, r.fp.value)
+        elif kind == 'str' or kind == 'unicode':
+            mc.MOV_rr(r.x0.value, r.x1.value)
+        else:  # var
+            # tid is in x0
+            # length is in x1
+            # gcmap in ip1
+            # itemsize in ip2
+            mc.MOV_rr(r.x2.value, r.x1.value)
+            mc.MOV_rr(r.x1.value, r.x0.value)
+            mc.MOV_rr(r.x0.value, r.ip2.value) # load itemsize, ip2 now free
+        # store the gc pattern
+        ofs = self.cpu.get_ofs_of_frame_field('jf_gcmap')
+        mc.STR_ri(r.ip1.value, r.fp.value, ofs)
+        #
+        mc.SUB_ri(r.sp.value, r.sp.value, 2 * WORD)
+        mc.STR_ri(r.lr.value, r.sp.value, 0)
+        #
+        mc.BL(addr)
+        #
+        # If the slowpath malloc failed, we raise a MemoryError that
+        # always interrupts the current loop, as a "good enough"
+        # approximation.
+        mc.CMP_ri(r.x0.value, 0)
+        mc.B_ofs_cond(4 * 6, c.NE)
+        mc.B(self.propagate_exception_path)
+        # jump here
+        self._reload_frame_if_necessary(mc)
+        self._pop_all_regs_from_jitframe(mc, [r.x0, r.x1], self.cpu.supports_floats)
+        #
+        nursery_free_adr = self.cpu.gc_ll_descr.get_nursery_free_addr()
+        mc.gen_load_int(r.x1.value, nursery_free_adr)
+        mc.LDR_ri(r.x1.value, r.x1.value, 0)
+        # clear the gc pattern
+        mc.gen_load_int(r.ip0.value, 0)
+        self.store_reg(mc, r.ip0, r.fp, ofs)
+        # return
+        mc.LDR_ri(r.lr.value, r.sp.value, 0)
+        mc.ADD_ri(r.sp.value, r.sp.value, 2 * WORD)
+        mc.RET_r(r.lr.value)
+
+        #
+        rawstart = mc.materialize(self.cpu, [])
+        return rawstart
+
+    def malloc_cond(self, nursery_free_adr, nursery_top_adr, size, gcmap):
+        assert size & (WORD-1) == 0
+
+        self.mc.gen_load_int(r.x0.value, nursery_free_adr)
+        self.mc.LDR_ri(r.x0.value, r.x0.value, 0)
+
+        if check_imm_arg(size):
+            self.mc.ADD_ri(r.x1.value, r.x0.value, size)
+        else:
+            self.mc.gen_load_int(r.x1.value, size)
+            self.mc.ADD_rr(r.x1.value, r.x0.value, r.x1.value)
+
+        self.mc.gen_load_int(r.ip0.value, nursery_top_adr)
+        self.mc.LDR_ri(r.ip0.value, r.ip0.value, 0)
+
+        self.mc.CMP_rr(r.x1.value, r.ip0.value)
+
+        # We load into r0 the address stored at nursery_free_adr.  We
+        # calculate the new value for nursery_free_adr and store it in r1.
+        # Then we load the address stored in nursery_top_adr into ip.  If
+        # the value in r1 is (unsigned) bigger than the one in ip, we
+        # conditionally call malloc_slowpath.  In that case malloc_slowpath
+        # returns the new value of nursery_free_adr in r1 and the address
+        # of the new object in r0.  (A Python model of this fast path is
+        # sketched at the end of this section.)
+
+        self.mc.B_ofs_cond(10 * 4, c.LO) # 4 for gcmap load, 5 for BL, 1 for B_ofs_cond
+        self.mc.gen_load_int_full(r.ip1.value, rffi.cast(lltype.Signed, gcmap))
+
+        self.mc.BL(self.malloc_slowpath)
+
+        self.mc.gen_load_int(r.ip0.value, nursery_free_adr)
+        self.mc.STR_ri(r.x1.value, r.ip0.value, 0)
+
+    def malloc_cond_varsize_frame(self, nursery_free_adr, nursery_top_adr,
+                                  sizeloc, gcmap):
+        if sizeloc is r.x0:
+            self.mc.MOV_rr(r.x1.value, r.x0.value)
+            sizeloc = r.x1
+        self.mc.gen_load_int(r.x0.value, nursery_free_adr)
+        self.mc.LDR_ri(r.x0.value, r.x0.value, 0)
+        #
+        self.mc.ADD_rr(r.x1.value, r.x0.value, sizeloc.value)
+        #
+        self.mc.gen_load_int(r.ip0.value, nursery_top_adr)
+        self.mc.LDR_ri(r.ip0.value, r.ip0.value, 0)
+
+        self.mc.CMP_rr(r.x1.value, r.ip0.value)
+        #
+        self.mc.B_ofs_cond(40, c.LO) # see calculations in malloc_cond
+        self.mc.gen_load_int_full(r.ip1.value, rffi.cast(lltype.Signed, gcmap))
+
+        self.mc.BL(self.malloc_slowpath)
+
+        self.mc.gen_load_int(r.ip0.value, nursery_free_adr)
+        self.mc.STR_ri(r.x1.value, r.ip0.value, 0)
+
+    def malloc_cond_varsize(self, kind, nursery_free_adr, nursery_top_adr,
+                            lengthloc, itemsize, maxlength, gcmap,
+                            arraydescr):
+        from rpython.jit.backend.llsupport.descr import ArrayDescr
+        assert isinstance(arraydescr, ArrayDescr)
+
+        # lengthloc is the length of the array, which we must not modify!
+ assert lengthloc is not r.x0 and lengthloc is not r.x1 + if lengthloc.is_core_reg(): + varsizeloc = lengthloc + else: + assert lengthloc.is_stack() + self.regalloc_mov(lengthloc, r.x1) + varsizeloc = r.x1 + # + if check_imm_arg(maxlength): + self.mc.CMP_ri(varsizeloc.value, maxlength) + else: + self.mc.gen_load_int(r.ip0.value, maxlength) + self.mc.CMP_rr(varsizeloc.value, r.ip0.value) + jmp_adr0 = self.mc.currpos() # jump to (large) + self.mc.BRK() + # + self.mc.gen_load_int(r.x0.value, nursery_free_adr) + self.mc.LDR_ri(r.x0.value, r.x0.value, 0) + + + if valid_addressing_size(itemsize): + shiftsize = get_scale(itemsize) + else: + shiftsize = self._mul_const_scaled(self.mc, r.lr, varsizeloc, + itemsize) + varsizeloc = r.lr + # now varsizeloc is a register != x0. The size of + # the variable part of the array is (varsizeloc << shiftsize) + assert arraydescr.basesize >= self.gc_minimal_size_in_nursery + constsize = arraydescr.basesize + self.gc_size_of_header + force_realignment = (itemsize % WORD) != 0 + if force_realignment: + constsize += WORD - 1 + self.mc.gen_load_int(r.ip0.value, constsize) + # constsize + (varsizeloc << shiftsize) + self.mc.ADD_rr_shifted(r.x1.value, r.ip0.value, varsizeloc.value, + shiftsize) + self.mc.ADD_rr(r.x1.value, r.x1.value, r.x0.value) + if force_realignment: + # -WORD = 0xfffffffffffffff8 + self.mc.gen_load_int(r.ip0.value, -WORD) + self.mc.AND_rr(r.x1.value, r.x1.value, r.ip0.value) + # now x1 contains the total size in bytes, rounded up to a multiple + # of WORD, plus nursery_free_adr + # + self.mc.gen_load_int(r.ip0.value, nursery_top_adr) + self.mc.LDR_ri(r.ip0.value, r.ip0.value, 0) + + self.mc.CMP_rr(r.x1.value, r.ip0.value) + jmp_adr1 = self.mc.currpos() # jump to (after-call) + self.mc.BRK() + # + # (large) + currpos = self.mc.currpos() + pmc = OverwritingBuilder(self.mc, jmp_adr0, WORD) + pmc.B_ofs_cond(currpos - jmp_adr0, c.GT) + # + # save the gcmap + self.mc.gen_load_int_full(r.ip1.value, rffi.cast(lltype.Signed, gcmap)) + # + + if kind == rewrite.FLAG_ARRAY: + self.mc.gen_load_int(r.x0.value, arraydescr.tid) + self.regalloc_mov(lengthloc, r.x1) + self.mc.gen_load_int(r.ip2.value, itemsize) + addr = self.malloc_slowpath_varsize + else: + if kind == rewrite.FLAG_STR: + addr = self.malloc_slowpath_str + else: + assert kind == rewrite.FLAG_UNICODE + addr = self.malloc_slowpath_unicode + self.regalloc_mov(lengthloc, r.x1) + self.mc.BL(addr) + # + jmp_location = self.mc.currpos() # jump to (done) + self.mc.BRK() + # (after-call) + currpos = self.mc.currpos() + pmc = OverwritingBuilder(self.mc, jmp_adr1, WORD) + pmc.B_ofs_cond(currpos - jmp_adr1, c.LS) + # + # write down the tid, but not if it's the result of the CALL + self.mc.gen_load_int(r.ip0.value, arraydescr.tid) + self.mc.STR_ri(r.ip0.value, r.x0.value, 0) + + # while we're at it, this line is not needed if we've done the CALL + self.mc.gen_load_int(r.ip0.value, nursery_free_adr) + self.mc.STR_ri(r.x1.value, r.ip0.value, 0) + # (done) + # skip instructions after call + currpos = self.mc.currpos() + pmc = OverwritingBuilder(self.mc, jmp_location, WORD) + pmc.B_ofs(currpos - jmp_location) + + def _mul_const_scaled(self, mc, targetreg, sourcereg, itemsize): + """Produce one operation to do roughly + targetreg = sourcereg * itemsize + except that the targetreg may still need shifting by 0,1,2,3. 
+ """ + if (itemsize & 7) == 0: + shiftsize = 3 + elif (itemsize & 3) == 0: + shiftsize = 2 + elif (itemsize & 1) == 0: + shiftsize = 1 + else: + shiftsize = 0 + itemsize >>= shiftsize + # + if valid_addressing_size(itemsize - 1): + self.mc.ADD_rr_shifted(targetreg.value, sourcereg.value, sourcereg.value, + get_scale(itemsize - 1)) + elif valid_addressing_size(itemsize): + self.mc.LSL_ri(targetreg.value, sourcereg.value, + get_scale(itemsize)) + else: + mc.gen_load_int(targetreg.value, itemsize) + mc.MUL_rr(targetreg.value, sourcereg.value, targetreg.value) + # + return shiftsize + + + def _build_stack_check_slowpath(self): + _, _, slowpathaddr = self.cpu.insert_stack_check() + if slowpathaddr == 0 or not self.cpu.propagate_exception_descr: + return # no stack check (for tests, or non-translated) + # + # make a "function" that is called immediately at the start of + # an assembler function. In particular, the stack looks like: + # + # | retaddr of caller | <-- aligned to a multiple of 16 + # | saved argument regs | + # | my own retaddr | <-- sp + # +-----------------------+ + # + mc = InstrBuilder() + # save argument registers and return address + mc.SUB_ri(r.sp.value, r.sp.value, (len(r.argument_regs) + 2) * WORD) + mc.STR_ri(r.lr.value, r.sp.value, 0) + for i in range(0, len(r.argument_regs), 2): + mc.STP_rri(r.argument_regs[i].value, r.argument_regs[i + 1].value, + r.sp.value, (i + 2) * WORD) + # stack is aligned here + # Pass current stack pointer as argument to the call + mc.SUB_ri(r.x0.value, r.sp.value, 0) + # + mc.BL(slowpathaddr) + + # check for an exception + mc.gen_load_int(r.x0.value, self.cpu.pos_exception()) + mc.LDR_ri(r.x0.value, r.x0.value, 0) + mc.TST_rr_shift(r.x0.value, r.x0.value, 0) + # + # restore registers and return + # We check for c.EQ here, meaning all bits zero in this case + + jmp = mc.currpos() + mc.BRK() + + for i in range(0, len(r.argument_regs), 2): + mc.LDP_rri(r.argument_regs[i].value, r.argument_regs[i + 1].value, + r.sp.value, (i + 2) * WORD) + mc.LDR_ri(r.ip0.value, r.sp.value, 0) + mc.ADD_ri(r.sp.value, r.sp.value, (len(r.argument_regs) + 2) * WORD) + mc.RET_r(r.ip0.value) + + # jump here + + pmc = OverwritingBuilder(mc, jmp, WORD) + pmc.B_ofs_cond(mc.currpos() - jmp, c.NE) + + mc.ADD_ri(r.sp.value, r.sp.value, (len(r.argument_regs) + 2) * WORD) + mc.B(self.propagate_exception_path) + # + + rawstart = mc.materialize(self.cpu, []) + self.stack_check_slowpath = rawstart + + def _check_frame_depth_debug(self, mc): + pass + + def _check_frame_depth(self, mc, gcmap, expected_size=-1): + """ check if the frame is of enough depth to follow this bridge. + Otherwise reallocate the frame in a helper. + There are other potential solutions + to that, but this one does not sound too bad. 
+ """ + descrs = self.cpu.gc_ll_descr.getframedescrs(self.cpu) + ofs = self.cpu.unpack_fielddescr(descrs.arraydescr.lendescr) + mc.LDR_ri(r.ip0.value, r.fp.value, ofs) + stack_check_cmp_ofs = mc.currpos() + if expected_size == -1: + for _ in range(mc.get_max_size_of_gen_load_int()): + mc.NOP() + else: + mc.gen_load_int(r.ip1.value, expected_size) + mc.CMP_rr(r.ip0.value, r.ip1.value) + + jg_location = mc.currpos() + mc.BRK() + + # the size value is still stored in ip1 + mc.SUB_ri(r.sp.value, r.sp.value, 2*WORD) + mc.STR_ri(r.ip1.value, r.sp.value, WORD) + + mc.gen_load_int(r.ip0.value, rffi.cast(lltype.Signed, gcmap)) + mc.STR_ri(r.ip0.value, r.sp.value, 0) + + mc.BL(self._frame_realloc_slowpath) + + # patch jg_location above + currpos = mc.currpos() + pmc = OverwritingBuilder(mc, jg_location, WORD) + pmc.B_ofs_cond(currpos - jg_location, c.GE) + + self.frame_depth_to_patch.append(stack_check_cmp_ofs) + + def update_frame_depth(self, frame_depth): + baseofs = self.cpu.get_baseofs_of_frame_field() + self.current_clt.frame_info.update_frame_depth(baseofs, frame_depth) + + def _reload_frame_if_necessary(self, mc): + gcrootmap = self.cpu.gc_ll_descr.gcrootmap + if gcrootmap and gcrootmap.is_shadow_stack: + rst = gcrootmap.get_root_stack_top_addr() + mc.gen_load_int(r.ip0.value, rst) + self.load_reg(mc, r.ip0, r.ip0) + mc.SUB_ri(r.ip0.value, r.ip0.value, WORD) + mc.LDR_ri(r.fp.value, r.ip0.value, 0) + wbdescr = self.cpu.gc_ll_descr.write_barrier_descr + if gcrootmap and wbdescr: + # frame never uses card marking, so we enforce this is not + # an array + self._write_barrier_fastpath(mc, wbdescr, [r.fp], array=False, + is_frame=True) + + def generate_quick_failure(self, guardtok): + startpos = self.mc.currpos() + faildescrindex, target = self.store_info_on_descr(startpos, guardtok) + self.load_from_gc_table(r.ip0.value, faildescrindex) + ofs = self.cpu.get_ofs_of_frame_field('jf_descr') + self.store_reg(self.mc, r.ip0, r.fp, ofs) + self.push_gcmap(self.mc, gcmap=guardtok.gcmap) + assert target + self.mc.BL(target) + return startpos + + def push_gcmap(self, mc, gcmap, store=True): + assert store + ofs = self.cpu.get_ofs_of_frame_field('jf_gcmap') + ptr = rffi.cast(lltype.Signed, gcmap) + mc.gen_load_int(r.ip0.value, ptr) + self.store_reg(mc, r.ip0, r.fp, ofs) + + def pop_gcmap(self, mc): + ofs = self.cpu.get_ofs_of_frame_field('jf_gcmap') + mc.gen_load_int(r.ip0.value, 0) + self.store_reg(mc, r.ip0, r.fp, ofs) + + def write_pending_failure_recoveries(self): + for tok in self.pending_guards: + #generate the exit stub and the encoded representation + tok.pos_recovery_stub = self.generate_quick_failure(tok) + + def reserve_gcref_table(self, allgcrefs): + gcref_table_size = len(allgcrefs) * WORD + # align to a multiple of 16 and reserve space at the beginning + # of the machine code for the gc table. This lets us write + # machine code with relative addressing (LDR literal). 
+ gcref_table_size = (gcref_table_size + 15) & ~15 + mc = self.mc + assert mc.get_relative_pos() == 0 + for i in range(gcref_table_size): + mc.writechar('\x00') + self.setup_gcrefs_list(allgcrefs) + + def patch_gcref_table(self, looptoken, rawstart): + # the gc table is at the start of the machine code + self.gc_table_addr = rawstart + tracer = self.cpu.gc_ll_descr.make_gcref_tracer(rawstart, + self._allgcrefs) + gcreftracers = self.get_asmmemmgr_gcreftracers(looptoken) + gcreftracers.append(tracer) # keepalive + self.teardown_gcrefs_list() + + def patch_stack_checks(self, framedepth, rawstart): + for ofs in self.frame_depth_to_patch: + mc = InstrBuilder() + mc.gen_load_int(r.ip1.value, framedepth) + mc.copy_to_raw_memory(ofs + rawstart) + + def load_from_gc_table(self, regnum, index): + address_in_buffer = index * WORD # at the start of the buffer + p_location = self.mc.get_relative_pos(break_basic_block=False) + offset = address_in_buffer - p_location + self.mc.LDR_r_literal(regnum, offset) + + def materialize_loop(self, looptoken): + self.datablockwrapper.done() # finish using cpu.asmmemmgr + self.datablockwrapper = None + allblocks = self.get_asmmemmgr_blocks(looptoken) + size = self.mc.get_relative_pos() + res = self.mc.materialize(self.cpu, allblocks, + self.cpu.gc_ll_descr.gcrootmap) + #self.cpu.codemap.register_codemap( + # self.codemap.get_final_bytecode(res, size)) + return res + + def patch_trace(self, faildescr, looptoken, bridge_addr, regalloc): + b = InstrBuilder() + patch_addr = faildescr.adr_jump_offset + assert patch_addr != 0 + b.BL(bridge_addr) + b.copy_to_raw_memory(patch_addr) + faildescr.adr_jump_offset = 0 + + def process_pending_guards(self, block_start): + clt = self.current_clt + for tok in self.pending_guards: + descr = tok.faildescr + assert isinstance(descr, AbstractFailDescr) + failure_recovery_pos = block_start + tok.pos_recovery_stub + descr.adr_jump_offset = failure_recovery_pos + relative_offset = tok.pos_recovery_stub - tok.offset + guard_pos = block_start + tok.offset + if not tok.guard_not_invalidated(): + # patch the guard jump to the stub + # overwrite the generate BRK with a B_offs to the pos of the + # stub + mc = InstrBuilder() + mc.B_ofs_cond(relative_offset, c.get_opposite_of(tok.fcond)) + mc.copy_to_raw_memory(guard_pos) + if tok.extra_offset != -1: + mc = InstrBuilder() + relative_offset = tok.pos_recovery_stub - tok.extra_offset + guard_pos = block_start + tok.extra_offset + mc.B_ofs_cond(relative_offset, c.get_opposite_of(tok.extra_cond)) + mc.copy_to_raw_memory(guard_pos) + else: + clt.invalidate_positions.append((guard_pos, relative_offset)) + + def fixup_target_tokens(self, rawstart): + for targettoken in self.target_tokens_currently_compiling: + targettoken._ll_loop_code += rawstart + self.target_tokens_currently_compiling = None + + def _call_header_with_stack_check(self): + self._call_header() + if self.stack_check_slowpath == 0: + pass # no stack check (e.g. 
not translated)
+        else:
+            endaddr, lengthaddr, _ = self.cpu.insert_stack_check()
+            # load stack end
+            self.mc.gen_load_int(r.lr.value, endaddr)           # load lr, [end]
+            self.mc.LDR_ri(r.lr.value, r.lr.value, 0)           # LDR lr, lr
+            # load stack length
+            self.mc.gen_load_int(r.ip1.value, lengthaddr)       # load ip1, length
+            self.mc.LDR_ri(r.ip1.value, r.ip1.value, 0)         # ldr ip1, *length
+            # calculate ofs
+            self.mc.SUB_ri(r.ip0.value, r.sp.value, 0)          # ip0 = sp
+            # otherwise we can't use sp
+            self.mc.SUB_rr(r.lr.value, r.lr.value, r.ip0.value) # lr = lr - ip0
+            # if ofs
+            self.mc.CMP_rr(r.lr.value, r.ip1.value)             # CMP lr, ip1
+            pos = self.mc.currpos()
+            self.mc.BRK()
+            self.mc.BL(self.stack_check_slowpath)               # call if lr > ip1
+            pmc = OverwritingBuilder(self.mc, pos, WORD)
+            pmc.B_ofs_cond(self.mc.currpos() - pos, c.LS)
+
+    def _call_header(self):
+        stack_size = (len(r.callee_saved_registers) + 4) * WORD
+        self.mc.STP_rr_preindex(r.lr.value, r.fp.value, r.sp.value, -stack_size)
+        for i in range(0, len(r.callee_saved_registers), 2):
+            self.mc.STP_rri(r.callee_saved_registers[i].value,
+                            r.callee_saved_registers[i + 1].value,
+                            r.sp.value,
+                            (i + 4) * WORD)
+
+        self.saved_threadlocal_addr = 3 * WORD   # at offset 3 * WORD from 'sp'
+        self.mc.STR_ri(r.x1.value, r.sp.value, 3 * WORD)
+
+        # set fp to point to the JITFRAME, passed in argument 'x0'
+        self.mc.MOV_rr(r.fp.value, r.x0.value)
+        #
+        gcrootmap = self.cpu.gc_ll_descr.gcrootmap
+        if gcrootmap and gcrootmap.is_shadow_stack:
+            self.gen_shadowstack_header(gcrootmap)
+
+    def _assemble(self, regalloc, inputargs, operations):
+        #self.guard_success_cc = c.cond_none
+        regalloc.compute_hint_frame_locations(operations)
+        self._walk_operations(inputargs, operations, regalloc)
+        #assert self.guard_success_cc == c.cond_none
+        frame_depth = regalloc.get_final_frame_depth()
+        jump_target_descr = regalloc.jump_target_descr
+        if jump_target_descr is not None:
+            tgt_depth = jump_target_descr._arm_clt.frame_info.jfi_frame_depth
+            target_frame_depth = tgt_depth - JITFRAME_FIXED_SIZE
+            frame_depth = max(frame_depth, target_frame_depth)
+        return frame_depth
+
+    def _walk_operations(self, inputargs, operations, regalloc):
+        self._regalloc = regalloc
+        regalloc.operations = operations
+        while regalloc.position() < len(operations) - 1:
+            regalloc.next_instruction()
+            i = regalloc.position()
+            op = operations[i]
+            self.mc.mark_op(op)
+            opnum = op.getopnum()
+            if rop.has_no_side_effect(opnum) and op not in regalloc.longevity:
+                regalloc.possibly_free_vars_for_op(op)
+            elif not we_are_translated() and op.getopnum() == rop.FORCE_SPILL:
+                regalloc.force_spill_var(op.getarg(0))
+            elif ((rop.returns_bool_result(opnum) or op.is_ovf()) and
+                  i < len(operations) - 1 and
+                  regalloc.next_op_can_accept_cc(operations, i) or
+                  operations[i].is_ovf()):
+                if operations[i].is_ovf():
+                    assert operations[i + 1].getopnum() in [rop.GUARD_OVERFLOW,
+                                                            rop.GUARD_NO_OVERFLOW]
+                guard_op = operations[i + 1]
+                guard_num = guard_op.getopnum()
+                arglocs, fcond = guard_operations[guard_num](regalloc, guard_op, op)
+                if arglocs is not None:
+                    asm_guard_operations[guard_num](self, op, guard_op, fcond, arglocs)
+                regalloc.next_instruction() # advance one more
+                if guard_op.is_guard(): # can be also cond_call
+                    regalloc.possibly_free_vars(guard_op.getfailargs())
+                regalloc.possibly_free_vars_for_op(guard_op)
+            elif (rop.is_call_may_force(op.getopnum()) or
+                  rop.is_call_release_gil(op.getopnum()) or
+                  rop.is_call_assembler(op.getopnum())):
+                guard_op = operations[i + 1] # has to exist
+                guard_num = guard_op.getopnum()
assert guard_num in (rop.GUARD_NOT_FORCED, rop.GUARD_NOT_FORCED_2) + arglocs, fcond = guard_operations[guard_num](regalloc, guard_op, op) + if arglocs is not None: + asm_guard_operations[guard_num](self, op, guard_op, fcond, arglocs) + # fcond is abused here to pass the number of args + regalloc.next_instruction() # advance one more + regalloc.possibly_free_vars(guard_op.getfailargs()) + regalloc.possibly_free_vars_for_op(guard_op) + else: + arglocs = regalloc_operations[opnum](regalloc, op) + if arglocs is not None: + asm_operations[opnum](self, op, arglocs) + if rop.is_guard(opnum): + regalloc.possibly_free_vars(op.getfailargs()) + if op.type != 'v': + regalloc.possibly_free_var(op) + regalloc.possibly_free_vars_for_op(op) + regalloc.free_temp_vars() + regalloc._check_invariants() + if not we_are_translated(): + self.mc.BRK() + self.mc.mark_op(None) # end of the loop + regalloc.operations = None + + def dispatch_comparison(self, op): + opnum = op.getopnum() + arglocs = comp_operations[opnum](self._regalloc, op, True) + assert arglocs is not None + return asm_comp_operations[opnum](self, op, arglocs) + + # regalloc support + def load(self, loc, value): + """load an immediate value into a register""" + assert (loc.is_core_reg() and value.is_imm() + or loc.is_vfp_reg() and value.is_imm_float()) + if value.is_imm(): + self.mc.gen_load_int(loc.value, value.getint()) + elif value.is_imm_float(): + self.mc.gen_load_int(r.ip0.value, value.getint()) + self.mc.LDR_di(loc.value, r.ip0.value, 0) + + def _mov_stack_to_loc(self, prev_loc, loc): + offset = prev_loc.value + if loc.is_core_reg(): + assert prev_loc.type != FLOAT, 'trying to load from an \ + incompatible location into a core register' + # unspill a core register + assert 0 <= offset <= (1<<15) - 1 + self.mc.LDR_ri(loc.value, r.fp.value, offset) + return + if loc.is_vfp_reg(): + assert prev_loc.type == FLOAT, 'trying to load from an \ + incompatible location into a float register' + assert 0 <= offset <= (1 << 15) - 1 + self.mc.LDR_di(loc.value, r.fp.value, offset) + return + assert False + # elif loc.is_vfp_reg(): + # assert prev_loc.type == FLOAT, 'trying to load from an \ + # incompatible location into a float register' + # # load spilled value into vfp reg + # is_imm = check_imm_arg(offset) + # helper, save = self.get_tmp_reg() + # save_helper = not is_imm and save + # elif loc.is_raw_sp(): + # assert (loc.type == prev_loc.type == FLOAT + # or (loc.type != FLOAT and prev_loc.type != FLOAT)) + # tmp = loc + # if loc.is_float(): + # loc = r.vfp_ip + # else: + # loc, save_helper = self.get_tmp_reg() + # assert not save_helper + # helper, save_helper = self.get_tmp_reg([loc]) + # assert not save_helper + # else: + # assert 0, 'unsupported case' + + # if save_helper: + # self.mc.PUSH([helper.value], cond=cond) + # self.load_reg(self.mc, loc, r.fp, offset, cond=cond, helper=helper) + # if save_helper: + # self.mc.POP([helper.value], cond=cond) + + def _mov_reg_to_loc(self, prev_loc, loc): + if loc.is_core_reg(): + self.mc.MOV_rr(loc.value, prev_loc.value) + elif loc.is_stack(): + self.mc.STR_ri(prev_loc.value, r.fp.value, loc.value) + else: + assert False + + def _mov_imm_to_loc(self, prev_loc, loc): + if loc.is_core_reg(): + self.mc.gen_load_int(loc.value, prev_loc.value) + elif loc.is_stack(): + self.mc.gen_load_int(r.ip0.value, prev_loc.value) + self.mc.STR_ri(r.ip0.value, r.fp.value, loc.value) + else: + assert False + + def new_stack_loc(self, i, tp): + base_ofs = self.cpu.get_baseofs_of_frame_field() + return StackLocation(i, 
get_fp_offset(base_ofs, i), tp) + + def mov_loc_to_raw_stack(self, loc, pos): + if loc.is_core_reg(): + self.mc.STR_ri(loc.value, r.sp.value, pos) + elif loc.is_stack(): + self.mc.LDR_ri(r.ip0.value, r.fp.value, loc.value) + self.mc.STR_ri(r.ip0.value, r.sp.value, pos) + elif loc.is_vfp_reg(): + self.mc.STR_di(loc.value, r.sp.value, pos) + elif loc.is_imm(): + self.mc.gen_load_int(r.ip0.value, loc.value) + self.mc.STR_ri(r.ip0.value, r.sp.value, pos) + else: + assert False, "wrong loc" + + def mov_raw_stack_to_loc(self, pos, loc): + if loc.is_core_reg(): + self.mc.LDR_ri(loc.value, r.sp.value, pos) + elif loc.is_stack(): + self.mc.LDR_ri(r.ip0.value, r.sp.value, pos) + self.mc.STR_ri(r.ip0.value, r.fp.value, loc.value) + elif loc.is_vfp_reg(): + self.mc.LDR_di(loc.value, r.sp.value, pos) + else: + assert False, "wrong loc" + + def _mov_imm_float_to_loc(self, prev_loc, loc): + if loc.is_vfp_reg(): + self.load(loc, prev_loc) + elif loc.is_stack(): + self.load(r.vfp_ip, prev_loc) + self._mov_vfp_reg_to_loc(r.vfp_ip, loc) + else: + assert False, "wrong loc" + + def _mov_vfp_reg_to_loc(self, prev_loc, loc): + if loc.is_stack(): + self.mc.STR_di(prev_loc.value, r.fp.value, loc.value) + elif loc.is_vfp_reg(): + self.mc.FMOV_dd(loc.value, prev_loc.value) + else: + assert False, "wrong loc" + + def push_locations(self, locs): + if not locs: + return + depth = len(locs) * WORD + depth += depth & WORD # align + self.mc.SUB_ri(r.sp.value, r.sp.value, depth) + for i, loc in enumerate(locs): + self.mov_loc_to_raw_stack(loc, i * WORD) + + def pop_locations(self, locs): + if not locs: + return + depth = len(locs) * WORD + depth += depth & WORD # align + for i, loc in enumerate(locs): + self.mov_raw_stack_to_loc(i * WORD, loc) + self.mc.ADD_ri(r.sp.value, r.sp.value, depth) + + def regalloc_mov(self, prev_loc, loc): + """Moves a value from a previous location to some other location""" + if prev_loc.is_imm(): + return self._mov_imm_to_loc(prev_loc, loc) + elif prev_loc.is_core_reg(): + self._mov_reg_to_loc(prev_loc, loc) + elif prev_loc.is_stack(): + self._mov_stack_to_loc(prev_loc, loc) + elif prev_loc.is_imm_float(): + self._mov_imm_float_to_loc(prev_loc, loc) + elif prev_loc.is_vfp_reg(): + self._mov_vfp_reg_to_loc(prev_loc, loc) + else: + assert 0, 'unsupported case' + mov_loc_loc = regalloc_mov + + def gen_func_epilog(self, mc=None): + gcrootmap = self.cpu.gc_ll_descr.gcrootmap + if mc is None: + mc = self.mc + if gcrootmap and gcrootmap.is_shadow_stack: + self.gen_footer_shadowstack(gcrootmap, mc) + + # pop all callee saved registers + + stack_size = (len(r.callee_saved_registers) + 4) * WORD + + for i in range(0, len(r.callee_saved_registers), 2): + mc.LDP_rri(r.callee_saved_registers[i].value, + r.callee_saved_registers[i + 1].value, + r.sp.value, + (i + 4) * WORD) + mc.LDP_rr_postindex(r.lr.value, r.fp.value, r.sp.value, stack_size) + + + mc.RET_r(r.lr.value) + + def gen_shadowstack_header(self, gcrootmap): + # we push two words, like the x86 backend does: + # the '1' is to benefit from the shadowstack 'is_minor' optimization + rst = gcrootmap.get_root_stack_top_addr() + self.mc.gen_load_int(r.ip1.value, rst) + # x8 = *ip1 + self.load_reg(self.mc, r.x8, r.ip1) + # x8[0] = 1 + self.mc.gen_load_int(r.ip0.value, 1) + self.store_reg(self.mc, r.ip0, r.x8) + # x8[1] = r.fp + self.store_reg(self.mc, r.fp, r.x8, WORD) + # *ip1 = x8 + 2 * WORD + self.mc.ADD_ri(r.x8.value, r.x8.value, 2 * WORD) + self.store_reg(self.mc, r.x8, r.ip1) + + def gen_footer_shadowstack(self, gcrootmap, mc): + rst = 
gcrootmap.get_root_stack_top_addr() + mc.gen_load_int(r.ip0.value, rst) + self.load_reg(mc, r.ip1, r.ip0) + mc.SUB_ri(r.ip1.value, r.ip1.value, 2 * WORD) # two words, see above + self.store_reg(mc, r.ip1, r.ip0) + + def store_reg(self, mc, source, base, ofs=0, helper=None): + if source.is_vfp_reg(): + return self._store_vfp_reg(mc, source, base, ofs) + else: + return self._store_core_reg(mc, source, base, ofs) + + def _store_vfp_reg(self, mc, source, base, ofs): + assert ofs <= (1 << 15) - 1 + mc.STR_di(source.value, base.value, ofs) + + def _store_core_reg(self, mc, source, base, ofs): + # XXX fix: + assert ofs & 0x7 == 0 + assert 0 <= ofs < 32768 + mc.STR_ri(source.value, base.value, ofs) + #if check_imm_arg(ofs): + # mc.STR_ri(source.value, base.value, imm=ofs) + #else: + # mc.gen_load_int(r.ip1, ofs) + # mc.STR_rr(source.value, base.value, r.ip1) + + def load_reg(self, mc, target, base, ofs=0, helper=r.ip0): + assert target.is_core_reg() + if check_imm_arg(abs(ofs)): + mc.LDR_ri(target.value, base.value, ofs) + else: + mc.gen_load_int(helper.value, ofs) + mc.LDR_rr(target.value, base.value, helper.value) + + def check_frame_before_jump(self, target_token): + if target_token in self.target_tokens_currently_compiling: + return + if target_token._arm_clt is self.current_clt: + return + # We can have a frame coming from god knows where that's + # passed to a jump to another loop. Make sure it has the + # correct depth + expected_size = target_token._arm_clt.frame_info.jfi_frame_depth + self._check_frame_depth(self.mc, self._regalloc.get_gcmap(), + expected_size=expected_size) + + # ../x86/assembler.py:668 + def redirect_call_assembler(self, oldlooptoken, newlooptoken): + # some minimal sanity checking + old_nbargs = oldlooptoken.compiled_loop_token._debug_nbargs + new_nbargs = newlooptoken.compiled_loop_token._debug_nbargs + assert old_nbargs == new_nbargs + # we overwrite the instructions at the old _ll_function_addr + # to start with a JMP to the new _ll_function_addr. + # Ideally we should rather patch all existing CALLs, but well. 
+ oldadr = oldlooptoken._ll_function_addr + target = newlooptoken._ll_function_addr + # copy frame-info data + baseofs = self.cpu.get_baseofs_of_frame_field() + newlooptoken.compiled_loop_token.update_frame_info( + oldlooptoken.compiled_loop_token, baseofs) + mc = InstrBuilder() + mc.B(target) + mc.copy_to_raw_memory(oldadr) + # + jl.redirect_assembler(oldlooptoken, newlooptoken, newlooptoken.number) + + + +def not_implemented(msg): + msg = '[ARM64/asm] %s\n' % msg + if we_are_translated(): + llop.debug_print(lltype.Void, msg) + raise NotImplementedError(msg) + + +def notimplemented_op(self, op, arglocs): + print "[ARM64/asm] %s not implemented" % op.getopname() + raise NotImplementedError(op) + +def notimplemented_comp_op(self, op, arglocs): + print "[ARM64/asm] %s not implemented" % op.getopname() + raise NotImplementedError(op) + +def notimplemented_guard_op(self, op, guard_op, fcond, arglocs): + print "[ARM64/asm] %s not implemented" % op.getopname() + raise NotImplementedError(op) + +asm_operations = [notimplemented_op] * (rop._LAST + 1) +asm_guard_operations = [notimplemented_guard_op] * (rop._LAST + 1) +asm_comp_operations = [notimplemented_comp_op] * (rop._LAST + 1) +asm_extra_operations = {} + +for name, value in ResOpAssembler.__dict__.iteritems(): + if name.startswith('emit_opx_'): + opname = name[len('emit_opx_'):] + num = getattr(EffectInfo, 'OS_' + opname.upper()) + asm_extra_operations[num] = value + elif name.startswith('emit_op_'): + opname = name[len('emit_op_'):] + num = getattr(rop, opname.upper()) + asm_operations[num] = value + elif name.startswith('emit_guard_op_'): + opname = name[len('emit_guard_op_'):] + num = getattr(rop, opname.upper()) + asm_guard_operations[num] = value + elif name.startswith('emit_comp_op_'): + opname = name[len('emit_comp_op_'):] + num = getattr(rop, opname.upper()) + asm_comp_operations[num] = value diff --git a/rpython/jit/backend/aarch64/callbuilder.py b/rpython/jit/backend/aarch64/callbuilder.py new file mode 100644 --- /dev/null +++ b/rpython/jit/backend/aarch64/callbuilder.py @@ -0,0 +1,291 @@ + +from rpython.jit.backend.llsupport.callbuilder import AbstractCallBuilder +from rpython.jit.backend.aarch64.arch import WORD +from rpython.jit.metainterp.history import INT, FLOAT, REF +from rpython.jit.backend.aarch64 import registers as r +from rpython.jit.backend.arm import conditions as c +from rpython.jit.backend.aarch64.jump import remap_frame_layout # we use arm algo +from rpython.jit.backend.llsupport import llerrno +from rpython.jit.backend.aarch64.codebuilder import OverwritingBuilder + +from rpython.rlib.objectmodel import we_are_translated +from rpython.rtyper.lltypesystem import rffi + +class Aarch64CallBuilder(AbstractCallBuilder): + def __init__(self, assembler, fnloc, arglocs, + resloc=r.x0, restype=INT, ressize=WORD, ressigned=True): + AbstractCallBuilder.__init__(self, assembler, fnloc, arglocs, + resloc, restype, ressize) + self.current_sp = 0 + + def prepare_arguments(self): + arglocs = self.arglocs + non_float_locs = [] + non_float_regs = [] + float_locs = [] + float_regs = [] + stack_locs = [] + free_regs = [r.x7, r.x6, r.x5, r.x4, r.x3, r.x2, r.x1, r.x0] + free_float_regs = [r.d7, r.d6, r.d5, r.d4, r.d3, r.d2, r.d1, r.d0] + for arg in arglocs: + if arg.type == FLOAT: + if free_float_regs: + float_locs.append(arg) + float_regs.append(free_float_regs.pop()) + else: + stack_locs.append(arg) + else: + if free_regs: + non_float_locs.append(arg) + non_float_regs.append(free_regs.pop()) + else: + stack_locs.append(arg) + + if 
+        if stack_locs:
+            adj = len(stack_locs) + (len(stack_locs) & 1)
+            self.mc.SUB_ri(r.sp.value, r.sp.value, adj * WORD)
+            self.current_sp = adj * WORD
+            c = 0
+            for loc in stack_locs:
+                self.asm.mov_loc_to_raw_stack(loc, c)
+                c += WORD
+
+        move_back = False
+        if not self.fnloc.is_imm():
+            if self.fnloc.is_core_reg():
+                self.mc.MOV_rr(r.ip1.value, self.fnloc.value)
+            else:
+                assert self.fnloc.is_stack()
+                self.mc.LDR_ri(r.ip1.value, r.fp.value, self.fnloc.value)
+            self.fnloc = r.x8
+            move_back = True
+
+        remap_frame_layout(self.asm, non_float_locs, non_float_regs, r.ip0)
+        if float_locs:
+            remap_frame_layout(self.asm, float_locs, float_regs, r.d8)
+
+        if move_back:
+            self.mc.MOV_rr(r.x8.value, r.ip1.value)
+
+    def push_gcmap(self):
+        noregs = self.asm.cpu.gc_ll_descr.is_shadow_stack()
+        gcmap = self.asm._regalloc.get_gcmap([r.x0], noregs=noregs)
+        self.asm.push_gcmap(self.mc, gcmap)
+
+    def pop_gcmap(self):
+        self.asm._reload_frame_if_necessary(self.mc)
+        self.asm.pop_gcmap(self.mc)
+
+    def emit_raw_call(self):
+        # the actual call
+        if self.fnloc.is_imm():
+            self.mc.BL(self.fnloc.value)
+            return
+        if self.fnloc.is_stack():
+            assert False, "we should never be here"
+        else:
+            assert self.fnloc.is_core_reg()
+            assert self.fnloc is r.x8
+            self.mc.BLR_r(self.fnloc.value)
+
+    def restore_stack_pointer(self):
+        assert self.current_sp & 1 == 0    # always adjusted to 16 bytes
+        if self.current_sp == 0:
+            return
+        self.mc.ADD_ri(r.sp.value, r.sp.value, self.current_sp)
+        self.current_sp = 0
+
+    def load_result(self):
+        resloc = self.resloc
+        if self.restype == 'S':
+            assert False, "not supported yet"
+            XXX
+            self.mc.VMOV_sc(resloc.value, r.s0.value)
+        elif self.restype == 'L':
+            assert False, "not possible on 64bit backend"
+            YYY
+            assert resloc.is_vfp_reg()
+            self.mc.FMDRR(resloc.value, r.r0.value, r.r1.value)
+        # ensure the result is well-formed and stored in the correct location
+        if resloc is not None and resloc.is_core_reg():
+            self._ensure_result_bit_extension(resloc,
+                                              self.ressize, self.ressign)
+
+    def _ensure_result_bit_extension(self, resloc, size, signed):
+        if size == WORD:
+            return
+        if size == 4:
+            if not signed:    # unsigned int
+                self.mc.LSL_ri(resloc.value, resloc.value, 32)
+                self.mc.LSR_ri(resloc.value, resloc.value, 32)
+            else:             # signed int
+                self.mc.LSL_ri(resloc.value, resloc.value, 32)
+                self.mc.ASR_ri(resloc.value, resloc.value, 32)
+        elif size == 2:
+            if not signed:
+                self.mc.LSL_ri(resloc.value, resloc.value, 48)
+                self.mc.LSR_ri(resloc.value, resloc.value, 48)
+            else:
+                self.mc.LSL_ri(resloc.value, resloc.value, 48)
+                self.mc.ASR_ri(resloc.value, resloc.value, 48)
+        elif size == 1:
+            if not signed:    # unsigned char
+                self.mc.AND_ri(resloc.value, resloc.value, 0xFF)
+            else:
+                self.mc.LSL_ri(resloc.value, resloc.value, 56)
+                self.mc.ASR_ri(resloc.value, resloc.value, 56)
+
+    def call_releasegil_addr_and_move_real_arguments(self, fastgil):
+        assert self.is_call_release_gil
+        assert not self.asm._is_asmgcc()
+        RFASTGILPTR = r.x19     # constant &rpy_fastgil
+        RSHADOWOLD = r.x20      # old value of the shadowstack pointer,
+                                # which we save here for later comparison
+
+        gcrootmap = self.asm.cpu.gc_ll_descr.gcrootmap
+        if gcrootmap:
+            rst = gcrootmap.get_root_stack_top_addr()
+            self.mc.gen_load_int(r.ip1.value, rst)
+            self.mc.LDR_ri(RSHADOWOLD.value, r.ip1.value, 0)
+
+        # change 'rpy_fastgil' to 0 (it should be non-zero right now)
+        self.mc.gen_load_int(RFASTGILPTR.value, fastgil)
+        self.mc.STLR(r.xzr.value, RFASTGILPTR.value)
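+        # (STLR is a store with release semantics: all memory accesses
+        # before it become visible before the zero reaches rpy_fastgil,
+        # so the thread that next acquires the GIL sees a consistent
+        # state.)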
+        if not we_are_translated():                    # for testing: we should not access
+            self.mc.ADD_ri(r.fp.value, r.fp.value, 1)  # fp any more
+
+    def write_real_errno(self, save_err):
+        if save_err & rffi.RFFI_READSAVED_ERRNO:
+            # Just before a call, read '*_errno' and write it into the
+            # real 'errno'.  The x0-x7 registers contain arguments to the
+            # future call;
+            # the x8-x10 registers contain various stuff.  XXX what?
+            # We still have x11 and up.
+            if save_err & rffi.RFFI_ALT_ERRNO:
+                rpy_errno = llerrno.get_alt_errno_offset(self.asm.cpu)
+            else:
+                rpy_errno = llerrno.get_rpy_errno_offset(self.asm.cpu)
+            p_errno = llerrno.get_p_errno_offset(self.asm.cpu)
+            self.mc.LDR_ri(r.x11.value, r.sp.value,
+                           self.asm.saved_threadlocal_addr + self.current_sp)
+            self.mc.LDR_ri(r.ip0.value, r.x11.value, p_errno)
+            self.mc.LDR_ri(r.x11.value, r.x11.value, rpy_errno)
+            self.mc.STR_ri(r.x11.value, r.ip0.value, 0)
+        elif save_err & rffi.RFFI_ZERO_ERRNO_BEFORE:
+            # Same, but write zero.
+            p_errno = llerrno.get_p_errno_offset(self.asm.cpu)
+            self.mc.LDR_ri(r.x11.value, r.sp.value,
+                           self.asm.saved_threadlocal_addr + self.current_sp)
+            self.mc.LDR_ri(r.ip0.value, r.x11.value, p_errno)
+            self.mc.MOVZ_r_u16(r.x11.value, 0, 0)
+            self.mc.STR_ri(r.x11.value, r.ip0.value, 0)
+
+    def read_real_errno(self, save_err):
+        if save_err & rffi.RFFI_SAVE_ERRNO:
+            # Just after a call, read the real 'errno' and save a copy of
+            # it inside our thread-local '*_errno'.  Registers x11 and up
+            # are unused here, and registers x2-x3 never contain anything
+            # after the call.
+            if save_err & rffi.RFFI_ALT_ERRNO:
+                rpy_errno = llerrno.get_alt_errno_offset(self.asm.cpu)
+            else:
+                rpy_errno = llerrno.get_rpy_errno_offset(self.asm.cpu)
+            p_errno = llerrno.get_p_errno_offset(self.asm.cpu)
+            self.mc.LDR_ri(r.x3.value, r.sp.value,
+                           self.asm.saved_threadlocal_addr)
+            self.mc.LDR_ri(r.ip0.value, r.x3.value, p_errno)
+            self.mc.LDR_ri(r.ip0.value, r.ip0.value, 0)
+            self.mc.STR_ri(r.ip0.value, r.x3.value, rpy_errno)
+
+    def move_real_result_and_call_reacqgil_addr(self, fastgil):
+        # try to reacquire the lock.  The following two values are saved
+        # across the call and are still alive now:
+        RFASTGILPTR = r.x19     # constant &rpy_fastgil
+        RSHADOWOLD = r.x20      # old value of the shadowstack pointer
+
+        # this comes from gcc compiling this code:
+        #    while (__atomic_test_and_set(&lock, __ATOMIC_ACQUIRE))
+        #        ;
+        self.mc.gen_load_int(r.x2.value, 1)
+        self.mc.LDXR(r.x1.value, RFASTGILPTR.value)
+        self.mc.STXR(r.x3.value, r.x2.value, RFASTGILPTR.value)
+        self.mc.CBNZ_w(r.x3.value, -8)
+        # now x1 is the old value of the lock, and the lock contains 1
+
+        b1_location = self.mc.currpos()
+        self.mc.BRK()       # boehm: patched with a CBZ (jump if x1 == 0)
+                            # shadowstack: patched with CBNZ instead
+
+        gcrootmap = self.asm.cpu.gc_ll_descr.gcrootmap
+        if gcrootmap:
+            # When doing a call_release_gil with shadowstack, there
+            # is the risk that the 'rpy_fastgil' was free but the
+            # current shadowstack can be the one of a different
+            # thread.  So here we check if the shadowstack pointer
+            # is still the same as before we released the GIL (saved
+            # in 'x20'), and if not, we fall back to 'reacqgil_addr'.
+            rst = gcrootmap.get_root_stack_top_addr()
+            self.mc.gen_load_int(r.ip1.value, rst)
+            self.mc.LDR_ri(r.ip0.value, r.ip1.value, 0)    # new shadowstack
+            self.mc.CMP_rr(r.ip0.value, RSHADOWOLD.value)
+            b3_location = self.mc.currpos()
+            self.mc.BRK()                  # B.eq forward
+
+            # revert the rpy_fastgil acquired above, so that the
+            # general 'reacqgil_addr' below can acquire it again...
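+            # (XZR reads as the constant zero, so this single STR clears
+            # the word without first materializing 0 in a scratch
+            # register via gen_load_int.)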
+            self.mc.STR_ri(r.xzr.value, RFASTGILPTR.value, 0)
+
+            # patch the b1_location above, with "CBNZ here"
+            pmc = OverwritingBuilder(self.mc, b1_location, WORD)
+            pmc.CBNZ(r.x1.value, self.mc.currpos() - b1_location)
+
+            open_location = b3_location
+        else:
+            open_location = b1_location
+
+        # Yes, we need to call the reacqgil() function.
+        # save the result we just got
+        RSAVEDRES = RFASTGILPTR    # can reuse this reg here to save things
+        reg = self.resloc
+        if reg is not None:
+            if reg.is_core_reg():
+                self.mc.MOV_rr(RSAVEDRES.value, reg.value)
+            elif reg.is_vfp_reg():
+                self.mc.SUB_ri(r.sp.value, r.sp.value, 2 * WORD)
+                self.mc.STR_di(reg.value, r.sp.value, 0)
+
+        # call the function
+        self.mc.BL(self.asm.reacqgil_addr)
+
+        # restore the saved register
+        if reg is not None:
+            if reg.is_core_reg():
+                self.mc.MOV_rr(reg.value, RSAVEDRES.value)
+            elif reg.is_vfp_reg():
+                self.mc.LDR_di(reg.value, r.sp.value, 0)
+                self.mc.ADD_ri(r.sp.value, r.sp.value, 2 * WORD)
+
+        # now patch the still-open jump above:
+        # boehm: patch b1_location with a CBZ(x1)
+        # shadowstack: patch b3_location with BEQ
+        pmc = OverwritingBuilder(self.mc, open_location, WORD)
+        offset = self.mc.currpos() - open_location
+        if gcrootmap:
+            pmc.B_ofs_cond(offset, c.EQ)
+        else:
+            pmc.CBZ(r.x1.value, offset)
+
+        if not we_are_translated():                    # for testing: now we can access
+            self.mc.SUB_ri(r.fp.value, r.fp.value, 1)  # fp again
+
+    def get_result_locs(self):
+        if self.resloc is None:
+            return [], []
+        if self.resloc.is_vfp_reg():
+            if self.restype == 'L':      # long long
+                return [r.r0], []
+            else:
+                return [], [r.d0]
+        assert self.resloc.is_core_reg()
+        return [r.r0], []
diff --git a/rpython/jit/backend/aarch64/codebuilder.py b/rpython/jit/backend/aarch64/codebuilder.py
new file mode 100644
--- /dev/null
+++ b/rpython/jit/backend/aarch64/codebuilder.py
@@ -0,0 +1,591 @@
+
+from rpython.rlib.objectmodel import we_are_translated
+from rpython.jit.backend.llsupport.asmmemmgr import BlockBuilderMixin
+from rpython.jit.backend.aarch64.locations import RegisterLocation
+from rpython.jit.backend.aarch64 import registers as r
+from rpython.rlib.rarithmetic import intmask
+from rpython.rtyper.lltypesystem import lltype, rffi, llmemory
+from rpython.tool.udir import udir
+
+clear_cache = rffi.llexternal(
+    "__clear_cache",
+    [llmemory.Address, llmemory.Address],
+    lltype.Void,
+    _nowrapper=True,
+    sandboxsafe=True)
+
+
+class AbstractAarch64Builder(object):
+    def write32(self, word):
+        self.writechar(chr(word & 0xFF))
+        self.writechar(chr((word >> 8) & 0xFF))
+        self.writechar(chr((word >> 16) & 0xFF))
+        self.writechar(chr((word >> 24) & 0xFF))
+
+    def RET_r(self, arg):
+        self.write32((0b1101011001011111 << 16) | (arg << 5))
+
+    def STR_ri(self, rt, rn, offset):
+        base = 0b1111100100
+        assert offset & 0x7 == 0
+        assert 0 <= offset < 32768
+        self.write32((base << 22) | ((offset >> 3) << 10) |
+                     (rn << 5) | rt)
+
+    def STR_di(self, rt, rn, offset):
+        base = 0b1111110100
+        assert offset & 0x7 == 0
+        assert 0 <= offset < 32768
+        self.write32((base << 22) | ((offset >> 3) << 10) | (rn << 5) | rt)
+
+    def STR_dd(self, rt, rn, rm):
+        base = 0b11111100001
+        self.write32((base << 21) | (rm << 16) | (0b011010 << 10) | (rn << 5) | rt)
+
+    def STP_rr_preindex(self, reg1, reg2, rn, offset):
+        base = 0b1010100110
+        assert -512 <= offset < 512
+        assert offset & 0x7 == 0
+        self.write32((base << 22) | ((0x7F & (offset >> 3)) << 15) |
+                     (reg2 << 10) | (rn << 5) | reg1)
+
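+    # (The STP/LDP pair forms encode the offset as a 7-bit signed count
+    # of 8-byte words, hence the >> 3 scaling and the asserted range:
+    # the largest encodable offsets are -512 and +504 bytes.)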
+    def STP_rri(self, reg1, reg2, rn, offset):
+        base = 0b1010100100
+        assert -512 <= offset < 512
+        assert offset & 0x7 == 0
+        self.write32((base << 22) | ((0x7F & (offset >> 3)) << 15) |
+                     (reg2 << 10) | (rn << 5) | reg1)
+
+    def STR_size_rr(self, scale, rt, rn, rm):
+        base = 0b111000001
+        assert 0 <= scale <= 3
+        self.write32((scale << 30) | (base << 21) | (rm << 16) | (0b11 << 13) |
+                     (0b010 << 10) | (rn << 5) | rt)
+
+    def STR_size_ri(self, scale, rt, rn, imm):
+        assert 0 <= imm < 4096
+        assert 0 <= scale <= 3
+        base = 0b11100100
+        self.write32((scale << 30) | (base << 22) | (imm >> scale << 10) | (rn << 5) | rt)
+
+    def STRB_ri(self, rt, rn, imm):
+        self.STR_size_ri(0, rt, rn, imm)
+
+    def STRH_ri(self, rt, rn, imm):
+        self.STR_size_ri(1, rt, rn, imm)
+
+    def STRW_ri(self, rt, rn, imm):
+        self.STR_size_ri(2, rt, rn, imm)
+
+    def MOV_rr(self, rd, rn):
+        self.ORR_rr(rd, r.xzr.value, rn)
+
+    def UMOV_rd(self, rd, rn):
+        base = 0b0100111000001000001111
+        self.write32((base << 10) | (rn << 5) | rd)
+
+    def INS_dr(self, rd, rn):
+        base = 0b0100111000001000000111
+        self.write32((base << 10) | (rn << 5) | rd)
+
+    def ORR_rr(self, rd, rn, rm):
+        base = 0b10101010000
+        self.write32((base << 21) | (rm << 16) |
+                     (rn << 5) | rd)
+
+    def MOVK_r_u16(self, rd, immed, shift):
+        base = 0b111100101
+        assert 0 <= immed < 1 << 16
+        assert shift in (0, 16, 32, 48)
+        self.write32((base << 23) | (shift >> 4 << 21) | (immed << 5) | rd)
+
+    def MOVZ_r_u16(self, rd, immed, shift):
+        base = 0b110100101
+        assert 0 <= immed < 1 << 16
+        assert shift in (0, 16, 32, 48)
+        self.write32((base << 23) | (shift >> 4 << 21) | (immed << 5) | rd)
+
+    def MOVN_r_u16(self, rd, immed):
+        base = 0b10010010100
+        assert 0 <= immed < 1 << 16
+        self.write32((base << 21) | (immed << 5) | rd)
+
+    def ADD_ri(self, rd, rn, constant, s=0):
+        base = 0b1001000100 | (s << 7)
+        assert 0 <= constant < 4096
+        self.write32((base << 22) | (constant << 10) |
+                     (rn << 5) | rd)
+
+    def SUB_ri(self, rd, rn, constant, s=0):
+        base = 0b1101000100 | (s << 7)
+        assert 0 <= constant < 4096
+        self.write32((base << 22) | (constant << 10) | (rn << 5) | rd)
+
+    def LDP_rri(self, reg1, reg2, rn, offset):
+        base = 0b1010100101
+        assert -512 <= offset < 512
+        assert offset & 0x7 == 0
+        assert reg1 != reg2
+        self.write32((base << 22) | ((0x7F & (offset >> 3)) << 15) |
+                     (reg2 << 10) | (rn << 5) | reg1)
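A quick way to sanity-check pair encodings like LDP_rri above is to mirror the bit packing in a standalone helper and compare the result against a disassembler. The sketch below (encode_ldp is a hypothetical helper, not part of the changeset) reproduces the 64-bit signed-offset LDP layout: a 10-bit opcode, the 7-bit signed word-scaled immediate, then the rt2, rn and rt register fields.

    # Standalone mirror of the LDP_rri packing above (64-bit, signed offset).
    def encode_ldp(reg1, reg2, rn, offset):
        base = 0b1010100101
        assert -512 <= offset < 512 and offset & 0x7 == 0
        return ((base << 22) | ((0x7F & (offset >> 3)) << 15) |
                (reg2 << 10) | (rn << 5) | reg1)

    # "ldp x0, x1, [sp, #16]" -- sp is encoded as register number 31;
    # a disassembler should render 0xa94107e0 as exactly this instruction.
    assert encode_ldp(0, 1, 31, 16) == 0xa94107e0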