Update of /cvsroot/arcem/arcem/jit
In directory sfp-cvs-1.v30.ch3.sourceforge.com:/tmp/cvs-serv26947/jit

Added Files:
      Tag: jit
        codeblocks.c codeblocks.h codegen.h decoder.c decoder.h 
        dirtyranges.c dirtyranges.h emuinterf.c emuinterf.h 
        emuinterf2.h jit.c jit.h jitpage.c jitpage.h jitstate.h 
        jitstate2.h memattr.c memattr.h metrics.c metrics.h regalloc.c 
        regalloc.h 
Log Message:
WIP ARM-on-ARM JIT engine
This is the beginnings of an ARM-on-ARM JIT engine, designed to be used by 
emulators like ArcEm and RPCEmu
Main functionality issues to resolve:
* Currently it's only functional for RISC OS hosts. But it should be fairly 
straightforward to get it working on other host OSes
* Not all instructions are supported yet; unsupported instructions will be 
interpreted
* The simplified interpreter loop which the JIT is invoked from doesn't 
implement the instruction prefetch pipeline; this will eventually need fixing 
(e.g. make the loop smart enough to stay in interpreter mode until the 
prefetched instructions match what's in memory, i.e. the CPU has left the 
self-modifying code sequence)
* The JIT will update the cycle counter but won't actually trigger any events 
until the end of the JIT code block is reached, this may cause issues with some 
software
* However the biggest problem is likely to be that the single-pass code 
generation results in sub-optimal handling of complex instructions like 
LDR/STR. So future development is likely to focus on experimenting with more 
complex code generation techniques, e.g. compiler-style code graphs



--- NEW FILE: codegen.h ---
#ifndef CODEGEN_HEADER
#define CODEGEN_HEADER

#include "emuinterf.h"
#include <assert.h>

/* Encode LDR<cc> hostreg,[basereg,#offset] (word load, positive immediate 
offset, no writeback).
   cc is OR'd in as-is, so it must already be in its final bit position.
   offset is a byte offset and must fit in 12 bits. */
static inline uint32_t JITCodeGen_LoadImm(uint32_t cc, uint32_t hostreg, 
uint32_t basereg, uint32_t offset)
{
  uint32_t instr = cc | 0x05900000;
  assert(offset < 4096);
  instr |= basereg << 16; /* Rn field */
  instr |= hostreg << 12; /* Rd field */
  instr |= offset;        /* imm12 field */
  return instr;
}

/* Encode LDR<cc> hostreg,[basereg,Rm] (word load, positive register offset, 
no writeback).
   Rm fills the whole 12-bit offset field, so the caller may also fold a shift 
specification into it; it must be below 4096. */
static inline uint32_t JITCodeGen_Load_Rm(uint32_t cc, uint32_t hostreg, 
uint32_t basereg, uint32_t Rm)
{
  uint32_t instr = cc | 0x07900000;
  assert(Rm < 4096);
  instr |= basereg << 16; /* Rn field */
  instr |= hostreg << 12; /* Rd field */
  instr |= Rm;            /* register offset field */
  return instr;
}

/* Encode STR<cc> hostreg,[basereg,#offset] (word store, positive immediate 
offset, no writeback).
   offset is a byte offset and must fit in 12 bits. */
static inline uint32_t JITCodeGen_StoreImm(uint32_t cc, uint32_t hostreg, 
uint32_t basereg, uint32_t offset)
{
  uint32_t instr = cc | 0x05800000;
  assert(offset < 4096);
  instr |= basereg << 16; /* Rn field */
  instr |= hostreg << 12; /* Rd field */
  instr |= offset;        /* imm12 field */
  return instr;
}

/* Convert a 32-bit constant into the 12-bit "8-bit value + 4-bit rotate" 
immediate field used by ARM data processing instructions.
   The value is rotated left two bits at a time until it fits in eight bits 
with at least one of its two lowest bits set; each step bumps the rotate field 
(bits 8-11) by one. Zero encodes as zero.
   Asserts if the constant cannot be expressed in this form. */
static inline uint32_t JITCodeGen_EncodeImm12(uint32_t val)
{
  uint32_t rot = 0;
  if (val == 0) {
    return 0;
  }
  while ((val > 0xff) || ((val & 3) == 0))
  {
    val = (val << 2) | (val >> 30);
    rot++;
    assert(rot != 16); /* tried every rotation without success */
  }
  return (rot << 8) | val;
}

/* Encode a data processing instruction with a destination register and an 
immediate operand (bit 25 set), e.g. MOV Rd,#imm.
   op supplies the opcode bits; imm must be encodable by 
JITCodeGen_EncodeImm12. */
static inline uint32_t JITCodeGen_DataProc_Rd_Imm(uint32_t cc, uint32_t op, 
uint32_t Rd, int imm)
{
  uint32_t operand2 = JITCodeGen_EncodeImm12(imm);
  uint32_t instr = cc | (1<<25) | op;
  instr |= Rd<<12;
  return instr | operand2;
}

/* Encode a data processing instruction with destination, first operand 
register and an immediate second operand (bit 25 set), e.g. ADD Rd,Rn,#imm.
   imm must be encodable by JITCodeGen_EncodeImm12. */
static inline uint32_t JITCodeGen_DataProc_Rd_Rn_Imm(uint32_t cc, uint32_t op, 
uint32_t Rd, uint32_t Rn, int imm)
{
  uint32_t operand2 = JITCodeGen_EncodeImm12(imm);
  uint32_t instr = cc | (1<<25) | op;
  instr |= Rn<<16;
  instr |= Rd<<12;
  return instr | operand2;
}

/* Encode a flag-setting (bit 20 set) data processing instruction with no 
destination register: first operand Rn, immediate second operand, e.g. 
CMP Rn,#imm.
   imm must be encodable by JITCodeGen_EncodeImm12. */
static inline uint32_t JITCodeGen_DataProc_Rn_Imm(uint32_t cc, uint32_t op, 
uint32_t Rn, int imm)
{
  uint32_t operand2 = JITCodeGen_EncodeImm12(imm);
  uint32_t instr = cc | (1<<25) | op | (1<<20);
  instr |= Rn<<16;
  return instr | operand2;
}

/* Encode a flag-setting (bit 20 set) data processing instruction with no 
destination register and two register operands, e.g. CMP Rn,Rm. */
static inline uint32_t JITCodeGen_DataProc_Rn_Rm(uint32_t cc, uint32_t op, 
uint32_t Rn, uint32_t Rm)
{
  uint32_t instr = cc | op | (1<<20);
  instr |= Rn<<16;
  instr |= Rm;
  return instr;
}

/* Encode a data processing instruction with a destination register and a 
single register operand, e.g. MOV Rd,Rm. */
static inline uint32_t JITCodeGen_DataProc_Rd_Rm(uint32_t cc, uint32_t op, 
uint32_t Rd, uint32_t Rm)
{
  uint32_t instr = cc | op;
  instr |= Rd<<12;
  instr |= Rm;
  return instr;
}

/* Encode a three-register data processing instruction, e.g. ADD Rd,Rn,Rm. */
static inline uint32_t JITCodeGen_DataProc_Rd_Rn_Rm(uint32_t cc, uint32_t op, 
uint32_t Rd, uint32_t Rn, uint32_t Rm)
{
  uint32_t instr = cc | op;
  instr |= Rn<<16;
  instr |= Rd<<12;
  instr |= Rm;
  return instr;
}

/* Encode MSR<cc> CPSR_f,hostreg: copy the flag bits of hostreg into the host 
PSR. */
static inline uint32_t JITCodeGen_LoadNZCV(uint32_t cc, int hostreg)
{
  uint32_t instr = cc | 0x0128f000;
  instr |= hostreg; /* Rm field */
  return instr;
}

/* Encode MRS<cc> hostreg,CPSR: copy the host PSR into hostreg. */
static inline uint32_t JITCodeGen_SavePSR(uint32_t cc, int hostreg)
{
  uint32_t instr = cc | 0x010f0000;
  instr |= hostreg << 12; /* Rd field */
  return instr;
}

/* Encode B<cc> to a target 'offset' bytes away from the address of the branch 
instruction itself.
   The -8 accounts for the ARM pipeline (PC reads as instruction address + 8); 
the result is stored as a 24-bit word offset.
   The arithmetic is done on unsigned values so that backward branches don't 
rely on implementation-defined right shifts of negative numbers, and so that 
offset-8 cannot overflow; the low 24 bits are the same either way. */
static inline uint32_t JITCodeGen_Branch(uint32_t cc, int32_t offset)
{
  uint32_t rel = ((uint32_t) offset) - 8u;
  return cc | 0x0a000000 | ((rel >> 2) & 0xffffff);
}

#endif

--- NEW FILE: dirtyranges.h ---
#ifndef DIRTYRANGES_HEADER
#define DIRTYRANGES_HEADER

#include "emuinterf.h"

/* A span of newly written code which DirtyRanges_Flush still has to process */
typedef struct {
  uintptr_t start; /* Address of the first word in the range */
  uintptr_t end; /* Address just past the last word in the range */
} DirtyRange;

/* Claim a new, initially empty range (start == end == addr) from the list of 
pending ranges. The caller grows it by updating start/end directly. */
extern DirtyRange *DirtyRanges_Claim(uintptr_t addr);

/* Process and release every pending range. On RISC OS hosts each non-empty 
range is passed to OS_SynchroniseCodeAreas. */
extern void DirtyRanges_Flush(void);

#endif

--- NEW FILE: codeblocks.h ---
#ifndef CODEBLOCKS_HEADER
#define CODEBLOCKS_HEADER

#include <stdio.h>
#include "dirtyranges.h"
#include "jitpage.h"

/* ForwardCodeBlock writes code going forwards, BackwardCodeBlock writes code 
going backwards */
typedef struct {
  DirtyRange *dirty; /* Dirty range tracking the code written to the current 
block */
  uintptr_t nextinstr; /* Address next instruction will be written to */
  uintptr_t data_start; /* Start of data section, inclusive */
  uintptr_t data_end; /* End of data section, exclusive */
} ForwardCodeBlock, BackwardCodeBlock;

/* Get next ID that will be allocated; marks start of a code generation pass */
extern int CodeBlock_NextID(void);

/* Claim the ID that was returned by NextID; marks end of a code generation 
pass */
extern void CodeBlock_ClaimID(int id, JITPage *page);

/* Mark the chain starting with 'ID' as invalid */
extern void CodeBlock_InvalidateID(int id);

/* Point 'out' at a freshly claimed code block, with an empty data section.
   If 'chain' is true then 'out' must already describe a block, and a branch 
linking the old and new blocks is written before 'out' is updated. */
extern void ForwardCodeBlock_New(ForwardCodeBlock *out, bool chain);
extern void BackwardCodeBlock_New(BackwardCodeBlock *out, bool chain);

/* Append one instruction word to a forward-growing code block.
   The block's dirty range is extended to cover the new word. If only one free 
word is then left before the data section, a new block is started and that 
last word is used for the branch into it.
   Returns the address the instruction was written to. */
static inline uintptr_t ForwardCodeBlock_WriteCode(ForwardCodeBlock *block, 
uint32_t word)
{
  uint32_t *ptr = (uint32_t *) block->nextinstr;
#ifdef JIT_DEBUG
  /* %x expects unsigned int; passing a pointer (or a uint32_t that is really 
unsigned long) is undefined, so convert explicitly */
  fprintf(stderr,"%08x: %08x\n",(unsigned int) (uintptr_t) ptr,(unsigned int) word);
#endif
  *ptr++ = word;
  block->dirty->end = block->nextinstr = (uintptr_t) ptr;
  if (block->data_start == (uintptr_t) (ptr+1))
  {
    /* Ran out of space in this block, generate branch to new block */
    ForwardCodeBlock_New(block, true);
  }
  return (uintptr_t) (ptr-1);
}


/* Add one data word to a forward code block. The data section grows downwards 
from the end of the block towards the code.
   If that leaves just one free word after the code, a new block is started 
(the free word takes the chaining branch).
   Returns the address the data was written to. */
static inline uintptr_t ForwardCodeBlock_WriteData(ForwardCodeBlock *block, 
uint32_t data)
{
  /* TODO reuse existing values */
  uint32_t *slot = ((uint32_t *) block->data_start) - 1;
  *slot = data;
  block->data_start = (uintptr_t) slot;
  if (block->nextinstr+4 == (uintptr_t) slot)
  {
    /* Ran out of space in this block, generate branch to new block */
    ForwardCodeBlock_New(block, true);
  }
  return (uintptr_t) slot;
}

/* Prepend one instruction word to a backward-growing code block.
   The block's dirty range start is moved down to the new word. If the next 
free slot then coincides with data_end, a new block is started.
   Returns the address the instruction was written to. */
static inline uintptr_t BackwardCodeBlock_WriteCode(BackwardCodeBlock *block, 
uint32_t word)
{
  uint32_t *slot = (uint32_t *) block->nextinstr;
  block->dirty->start = (uintptr_t) slot;
  *slot = word;
  block->nextinstr = (uintptr_t) (slot-1);
  if (block->data_end == block->nextinstr)
  {
    /* Ran out of space in this block, generate branch from new block */
    BackwardCodeBlock_New(block, true);
  }
  return (uintptr_t) slot;
}

/* Add one data word to a backward code block. The data section grows upwards 
from the start of the block towards the code.
   The word is stored one slot above the current data_end, and data_end is 
moved up to it. If that slot is the one the next instruction would have used, 
a new block is started.
   Returns the address the data was written to (previously this returned the 
slot below the one that was actually written).
   NOTE(review): this treats data_end as the last occupied word, which matches 
the full-block checks here and in BackwardCodeBlock_WriteCode but not the 
"exclusive" wording on the struct field - confirm which is intended. */
static inline uintptr_t BackwardCodeBlock_WriteData(BackwardCodeBlock *block, 
uint32_t data)
{
  /* TODO reuse existing values */
  uint32_t *ptr = (uint32_t *) block->data_end;
  *(++ptr) = data;
  block->data_end = (uintptr_t) ptr;
  if ((uintptr_t) ptr == block->nextinstr)
  {
    /* Ran out of space in this block, generate branch from new block */
    BackwardCodeBlock_New(block, true);
  }
  return (uintptr_t) ptr;
}

#endif

--- NEW FILE: codeblocks.c ---
#include "codeblocks.h"
#include "emuinterf.h"
#include "jitstate2.h"
#include <assert.h>
#include <stdio.h>

/* Bookkeeping stored at the start of every code block */
typedef struct {
  JITPage *owner; /* Page recorded by CodeBlock_ClaimID, or NULL if the block 
isn't the head of a valid chain */
} codeblock_header;

#define CODEBLOCK_SIZE (512-sizeof(codeblock_header)) /* Size in bytes */
#define CODEBLOCK_COUNT 2048

/* One fixed-size unit of JIT output memory: header followed by the bytes 
available for generated code and data */
typedef struct {
  codeblock_header header;
  uint8_t data[CODEBLOCK_SIZE];
} codeblock;

static codeblock codeblocks[CODEBLOCK_COUNT];
static int rr_next;

/* Hand out the next code block in round-robin order.
   If the block still has an owner, JITPage_ForgetCode is called on that page 
before the block is reused. */
static codeblock *codeblock_claim(const JITState *jit)
{
  codeblock *claimed = codeblocks + rr_next;
  rr_next++;
  if (rr_next == CODEBLOCK_COUNT)
  {
    /* Wrap around to the first block */
    rr_next = 0;
  }
  if (claimed->header.owner != NULL)
  {
    JITPage_ForgetCode(jit,claimed->header.owner);
  }
  return claimed;
}

/* Store an unconditional branch (B) at address 'src' which jumps to 'dest'.
   The -8 accounts for the ARM pipeline; the distance is stored as a 24-bit 
word offset. */
static void writebranch(uintptr_t src, uintptr_t dest)
{
  uint32_t *instr = (uint32_t *) src;
  uint32_t delta = dest-8-src;
  uint32_t field = (delta>>2) & 0xffffff;
  *instr = field | 0xEA000000;
}

/* Return the index of the block that the next codeblock_claim will hand out; 
the index doubles as the block's ID */
int CodeBlock_NextID(void)
{
  return rr_next;
}

/* Record 'page' as the owner of block 'id'. id is used directly as an array 
index and is not range-checked. */
void CodeBlock_ClaimID(int id, JITPage *page)
{
  codeblocks[id].header.owner = page;
}

/* Clear the owner of block 'id' so that reusing the block won't notify any 
page. An id of -1 means "no block" and is ignored. */
void CodeBlock_InvalidateID(int id)
{
  if (id == -1) {
    return;
  }
  codeblocks[id].header.owner = NULL;
}

/* Claim a fresh code block and point 'out' at it.
   If 'chain' is true, a branch to the new block is first written at the old 
block's next instruction slot and the old dirty range is extended to cover it.
   Afterwards 'out' has a new dirty range, code starts at the beginning of the 
new block and the (empty) data section sits at its end. */
void ForwardCodeBlock_New(ForwardCodeBlock *out, bool chain)
{
  const JITState *jit = JIT_GetState(JITEmuInterf_GetState());
  codeblock *block = codeblock_claim(jit);
#ifdef JIT_DEBUG
  /* %x expects unsigned int; passing a pointer is undefined, so convert 
explicitly */
  fprintf(stderr,"ForwardCodeBlock_New: chain %d -> %08x\n",chain,(unsigned int) (uintptr_t) block->data);
#endif
  /* If we're chaining blocks, insert a branch at the end of 'out' */
  if (chain)
  {
    uintptr_t loc = out->nextinstr;
    writebranch(loc, (uintptr_t) block->data);
    out->dirty->end = loc+4;
  }
  out->dirty = DirtyRanges_Claim((uintptr_t) block->data);
  out->nextinstr = (uintptr_t) block->data;
  out->data_start = out->data_end = (uintptr_t) (block->data + CODEBLOCK_SIZE);
}

/* Claim a fresh code block and point 'out' at it for backward code generation.
   Code is written downwards from the end of the new block; the (empty) data 
section starts at the beginning of the block.
   When chaining, one word at the very end of the new block is included in the 
dirty range and skipped by nextinstr.
   NOTE(review): the chaining branch is written to block->data (the start of 
the new block), not to the word reserved at its end - confirm which location 
is intended. */
void BackwardCodeBlock_New(BackwardCodeBlock *out, bool chain)
{
  const JITState *jit = JIT_GetState(JITEmuInterf_GetState());
  codeblock *block = codeblock_claim(jit);
  /* If we're chaining blocks, insert a branch at the end of 'block' */
  if (chain)
  {
    /* Target is the most recently written instruction of the old block */
    writebranch((uintptr_t) block->data, out->nextinstr+4);
  }
  out->dirty = DirtyRanges_Claim((uintptr_t) (block->data + CODEBLOCK_SIZE));
  if (chain)
  {
    out->dirty->start -= 4;
  }
  out->nextinstr = out->dirty->start - 4;
  out->data_start = out->data_end = (uintptr_t) block->data;
}

--- NEW FILE: dirtyranges.c ---
#include "dirtyranges.h"
#include "metrics.h"
#include <assert.h>
#include <stdio.h>

#ifdef __riscos__
#include <kernel.h>
#include <swis.h>
#endif

#define MAX_DIRTY_RANGES 256

/* Fixed-size stack of ranges waiting to be flushed */
typedef struct {
  DirtyRange ranges[MAX_DIRTY_RANGES];
  int index; /* Number of entries of 'ranges' currently in use */
} DirtyRanges;

static DirtyRanges dirty;

/* Take the next unused entry from the pending list and initialise it as an 
empty range at 'addr'.
   Asserts if all MAX_DIRTY_RANGES entries are already in use. */
DirtyRange *DirtyRanges_Claim(uintptr_t addr)
{
  DirtyRange *slot;
  assert(dirty.index < MAX_DIRTY_RANGES);
  slot = dirty.ranges + dirty.index;
  dirty.index++;
  slot->start = addr;
  slot->end = addr;
  return slot;
}

/* Process every pending dirty range, newest first, leaving the list empty.
   For each range: optionally count its instructions (DEBUG_JIT_METRICS), 
optionally disassemble it to stderr (DEBUG_JIT_DUMP), and on RISC OS hosts 
call OS_SynchroniseCodeAreas on it if it is non-empty. */
void DirtyRanges_Flush(void)
{
#ifdef DEBUG_JIT_DUMP
  uint32_t *addr;
  fprintf(stderr,"JIT:\n");
#endif
  /* TODO optimise (can IMB_List be called in user mode?) */
  while (dirty.index > 0)
  {
    DirtyRange *r = &dirty.ranges[--dirty.index];
#ifdef DEBUG_JIT_METRICS
    jitmetrics.instructions_out += (r->end - r->start)>>2;
#endif
#ifdef DEBUG_JIT_DUMP
    for(addr=(uint32_t *)r->start;addr!=(uint32_t *)r->end;addr++) {        
      char *str;
      _swix(Debugger_Disassemble,_INR(0,1)|_OUT(1),*addr,addr,&str);
      /* %x expects unsigned int; passing a pointer (or a uint32_t that is 
really unsigned long) is undefined, so convert explicitly */
      fprintf(stderr,"%08x %08x %s\n",(unsigned int) (uintptr_t) addr,(unsigned int) *addr,str);
    }
#endif
#ifdef __riscos__
    if (r->start != r->end) {
      /* The SWI is given the address of the last word, hence end-4 */
      _swix(OS_SynchroniseCodeAreas,_INR(0,2),1,r->start,r->end-4);
    }
#endif
  }
}

--- NEW FILE: memattr.h ---
#ifndef MEMATTR_HEADER
#define MEMATTR_HEADER

#include "jitstate.h"
#include <string.h>

#define JIT_MEMFLAG_CODE 1
#define JIT_MEMFLAG_ENTRYPOINT 2

/* Allocate the zeroed per-word flag table (one byte per 4 bytes of 
romramchunksize) and set up jit->addr2flags. jit->romramchunk must already be 
set. */
extern void MemAttr_Init(JITState *jit,uint32_t romramchunksize);

/* Return a pointer to the flag byte for the word containing 'phy'.
   There is one flag byte per 4 bytes of memory, located by adding the 
precomputed addr2flags offset to the word index of the address. */
static inline uint8_t *MemAttr_Get(const JITState *jit,void *phy)
{
  uintptr_t word_index = ((uintptr_t) phy)>>2;
  uintptr_t flag_addr = jit->addr2flags + word_index;
  return (uint8_t *) flag_addr;
}

/* Set JIT_MEMFLAG_CODE on the word containing 'phy' */
static inline void MemAttr_SetCodeFlag(const JITState *jit,void *phy)
{
  uint8_t *flags = MemAttr_Get(jit,phy);
  *flags = *flags | JIT_MEMFLAG_CODE;
}

/* Set JIT_MEMFLAG_ENTRYPOINT on the word containing 'phy' */
static inline void MemAttr_SetEntryPointFlag(const JITState *jit,void *phy)
{
  uint8_t *flags = MemAttr_Get(jit,phy);
  *flags = *flags | JIT_MEMFLAG_ENTRYPOINT;
}

/* Clear every flag on the word containing 'phy' */
static inline void MemAttr_ClearFlags(const JITState *jit,void *phy)
{
  uint8_t *flags = MemAttr_Get(jit,phy);
  *flags = 0;
}

/* Clear every flag for the words in [phy_begin, phy_end). The flag byte for 
phy_end itself is not touched. */
static inline void MemAttr_ClearFlagsRange(const JITState *jit,void *phy_begin, 
void *phy_end)
{
  uint8_t *first = MemAttr_Get(jit,phy_begin);
  uint8_t *limit = MemAttr_Get(jit,phy_end);
  size_t count = limit-first;
  memset(first, 0, count);
}

/* Test whether JIT_MEMFLAG_CODE is set on the word containing 'phy' */
static inline bool MemAttr_GetCodeFlag(const JITState *jit,void *phy)
{
  uint8_t flags = *MemAttr_Get(jit,phy);
  return flags & JIT_MEMFLAG_CODE;
}

/* Test whether JIT_MEMFLAG_ENTRYPOINT is set on the word containing 'phy' */
static inline bool MemAttr_GetEntryPointFlag(const JITState *jit,void *phy)
{
  uint8_t flags = *MemAttr_Get(jit,phy);
  return flags & JIT_MEMFLAG_ENTRYPOINT;
}

#endif

--- NEW FILE: memattr.c ---
#include <stdlib.h>
#include "memattr.h"

/* Allocate the zero-initialised flag table, one byte per 4 bytes of the 
ROM/RAM chunk, and precompute addr2flags so that MemAttr_Get can turn a 
(shifted) address straight into a flag pointer.
   jit->romramchunk must already have been set.
   Aborts if the table cannot be allocated, rather than leaving a NULL table 
to be dereferenced later. */
void MemAttr_Init(JITState *jit,uint32_t romramchunksize)
{
  uint32_t count = romramchunksize>>2;
  jit->memflags = calloc(count,1);
  if ((count > 0) && !jit->memflags)
  {
    abort();
  }
  jit->addr2flags = ((uintptr_t) jit->memflags)-(jit->romramchunk>>2);
}

--- NEW FILE: jitstate.h ---
#ifndef JITSTATE_HEADER
#define JITSTATE_HEADER

#include "emuinterf.h"

/* Value returned by a JITFunc, telling the caller how to proceed */
typedef enum {
  JITResult_Interpret, /* Interpret the instruction at PC-8 */
  JITResult_Normal, /* Continue normal execution (try JITing the instruction at 
the PC-8) */
} JITResult;

typedef JITResult (*JITFunc)(JITEmuState *state,void *addr);

typedef struct JITPage JITPage;

/* Private JIT state struct */
typedef struct {
  uintptr_t addr2func; /* Offset to apply to address pointers to convert to 
func pointers */
  uintptr_t addr2flags; /* Offset to apply to convert (shifted) address 
pointers to memory flag pointers */

  JITFunc *phy2func; /* Table of one function pointer per word of the ROM/RAM 
chunk; every entry starts out as JIT_Generate */
  uint8_t *memflags; /* Table of one JIT_MEMFLAG_* byte per word of the ROM/RAM 
chunk */
  JITPage *pages; /* Presumably set up by JITPage_Init - not visible here */
  uintptr_t romramchunk; /* Base address of the ROM/RAM chunk, as passed to 
JIT_Init */
#ifdef DEBUG_JIT_METRICS_EXEC
  uint32_t exec_count; /* Running count of instructions, updated by generated 
code */
#endif
} JITState;

#endif

--- NEW FILE: regalloc.c ---
#include <limits.h>
#include <assert.h>
#include <string.h>
#include "regalloc.h"
#include "emuinterf2.h"
#include "decoder.h"

/* Reset a register allocator to its starting state and attach it to 'block'.
   Nothing is mapped except the emulator state pointer, which is pinned in 
host R0. R1-R3 and R12 are free for immediate use; R4-R11 and R14 can be 
obtained via the callee-save mechanism. */
void JITRegAlloc_Init(JITRegAlloc *ra, ForwardCodeBlock *block)
{
  int hr, er;
  memset(ra, 0, sizeof(*ra));
  /* Break every emu<->host association */
  for (er=0; er<JIT_E_REG_NUM; er++)
  {
    ra->emu_regs[er].hostreg = JIT_H_REG_NONE;
  }
  for (hr=0; hr<JIT_H_REG_NUM; hr++)
  {
    ra->host_regs[hr].emureg = JIT_E_REG_NONE;
  }
  ra->block = block;
  ra->callee_save = 0x4ff0; /* r4-r11, r14 */
  ra->free_host_regs = 0x100e; /* r1-r3, r12; r0 is taken by the state ptr */
  /* Init the state reg */
  ra->host_regs[JIT_H_R0].emureg = JIT_E_StateReg;
  ra->host_regs[JIT_H_R0].required = true;
  ra->emu_regs[JIT_E_StateReg].hostreg = JIT_H_R0;
}

/* Duplicate 'src' into 'dest', attaching the duplicate to 'block'.
   The callee-save instruction pointer is dropped from both allocators first: 
if either went on to extend that instruction the other would no longer agree 
with it. */
void JITRegAlloc_Copy(JITRegAlloc *dest, JITRegAlloc *src, ForwardCodeBlock 
*block)
{
  src->callee_save_instr = NULL;
  memcpy(dest, src, sizeof(*dest));
  dest->block = block;
}

/* Start a temporary split in execution: 'child' becomes a snapshot of 
'parent', to be merged back later with JITRegAlloc_Join.
   All dirty registers are written back first, as there's no guarantee they'll 
still be mapped at the join. Unlike JITRegAlloc_Copy, the callee-save 
instruction pointer is kept. */
void JITRegAlloc_Fork(JITRegAlloc *parent, JITRegAlloc *child)
{
  JITRegAlloc_WriteBackDirty(parent);
  memcpy(child, parent, sizeof(*child));
}

/* Merge a forked 'child' allocator back into 'parent' at the point where the 
two execution paths meet.
   - Host registers which map to the same emu register on both sides stay 
mapped, with their dirty flags combined; any others are flushed on both sides.
   - Registers which the parent's callee-save instruction now saves (the child 
may have added to it) are moved from parent's callee_save list to its 
callee_saved and free lists.
   - Anything else the child saved is restored by code emitted for the child.
   - The parent continues writing into the child's code block.
   The child allocator is reset by this call (via JITRegAlloc_CalleeRestore) 
and must not be used afterwards without re-forking. */
void JITRegAlloc_Join(JITRegAlloc *parent, JITRegAlloc *child)
{
  int i;
  /* Now resolve the register mappings; if host reg has same emu reg for both 
paths, merge dirty flags
     Else force any writeback */
  for(i=0;i<JIT_H_REG_NUM;i++) {
    if (parent->host_regs[i].emureg == child->host_regs[i].emureg) {
      parent->host_regs[i].dirty |= child->host_regs[i].dirty;
    } else {
      JITRegAlloc_FlushHostReg(child, (JITHostReg) i);
      JITRegAlloc_FlushHostReg(parent, (JITHostReg) i); /* Shouldn't trigger a 
store */
    }
  }
  if (parent->callee_save_instr)
  {
    /* Pull apart this instruction to see if there are any saved registers that 
were added by the child (it's safest for us to work it out this way, rather 
than examine the child) */
    uint32_t saved = 0;
    /* Single STR: saved register is the Rd field. Otherwise STM: register 
list is the low 16 bits. */
    if (((*parent->callee_save_instr) & 0x0f000000) == 0x05000000) {
      saved = (1<<(((*parent->callee_save_instr) >> 12) & 15));
    } else {
      saved = (*parent->callee_save_instr) & 0xffff;
    }
    /* Only interested in ones the parent doesn't already know about */
    saved &= parent->callee_save;
    parent->callee_save &= ~saved;
    parent->callee_saved |= saved;
    parent->free_host_regs |= saved;
  }
  /* If there are any registers the child pushed which we can't locate the save 
instr for, get the child to restore them itself */
  child->callee_saved &= ~parent->callee_saved;
  JITRegAlloc_CalleeRestore(child, false);
  /* Update block reference */
  parent->block = child->block;
  /* We must forget any callee-save instr, since any update we make to it will 
put it out of sync with any restore the child performed
     TODO - Have a flag in JITRegAlloc for whether the child has restored? */
  parent->callee_save_instr = NULL;
}

/* Make callee-save host register 'hr' available by arranging for it to be 
pushed onto the host stack.
   If no save instruction is being tracked, a new STR is emitted and 
remembered. Otherwise the remembered instruction is patched in place: a 
single-register STR is first converted to an STM of that register, then 'hr' 
is added to the STM's register list.
   'hr' is then moved from the callee_save list to the free list and recorded 
in callee_saved. */
void JITRegAlloc_CalleeSave(JITRegAlloc *ra, JITHostReg hr)
{
  /* XXX should be in EmuInterf */
  ra->callee_saved |= (1<<hr);
  if (!ra->callee_save_instr) {
    ra->callee_save_instr = (uint32_t *) ForwardCodeBlock_WriteCode(ra->block, 
0xe52d0004 | (hr<<12)); /* STR Rn,[R13,#-4]! */
  } else {
    if (((*ra->callee_save_instr) & 0x0f000000) == 0x05000000) { /* STR? */
      *ra->callee_save_instr = 0xe92d0000 | (1<<(((*ra->callee_save_instr) >> 
12) & 15)); /* Convert to STM */
    }
    *ra->callee_save_instr |= (1<<hr); /* n.b. can't use callee_saved since we 
might be chaining STR/STM */
  }
  /* Update register lists */
  ra->free_host_regs |= 1<<hr;
  ra->callee_save &= ~(1<<hr);
}

/* Make emu register 'r' resident in a host register and return that register.
   hostreg: specific host register to use, or JIT_H_REG_NONE to reuse the 
current mapping / pick a free register.
   required: added to the host register's required count.
   cc: condition for any instruction emitted to fetch the value; with JIT_NV 
no load or move is emitted at all.
   If 'r' is already in the requested register only the required count 
changes. If it lives in a different host register its state is moved across 
(with a MOV unless cc is JIT_NV) and the old register is freed. Whatever was 
previously in the chosen host register is flushed first.
   Nothing is loaded for emu registers above JIT_E_CycleCount (other than 
JIT_E_Exec when DEBUG_JIT_METRICS_EXEC is defined). */
JITHostReg JITRegAlloc_MapReg(JITRegAlloc *ra, JITEmuReg r, int required, 
JITHostReg hostreg, uint32_t cc)
{
  if (hostreg == JIT_H_REG_NONE) {
    if (ra->emu_regs[r].hostreg == JIT_H_REG_NONE) {
      hostreg = JITRegAlloc_GetFreeHostReg(ra);
    } else {
      hostreg = ra->emu_regs[r].hostreg;
      ra->host_regs[hostreg].required += required;
      return hostreg; /* already resident */
    }
  }
  if (ra->emu_regs[r].hostreg == hostreg) {
    ra->host_regs[hostreg].required += required;
    return hostreg;
  }
  /* Evict the current occupant of the target register */
  JITRegAlloc_FlushHostReg(ra, hostreg);
  if (ra->emu_regs[r].hostreg != JIT_H_REG_NONE) {
    /* Value is live in another host register: move it rather than reload */
    if (cc != JIT_NV)
    {
      ForwardCodeBlock_WriteCode(ra->block, JITCodeGen_DataProc_Rd_Rm(cc, 
JIT_MOV, hostreg, ra->emu_regs[r].hostreg)); /* needs to deal with move to/from 
PSR? */
    }
    ra->host_regs[hostreg] = ra->host_regs[ra->emu_regs[r].hostreg];
    ra->free_host_regs |= 1 << ra->emu_regs[r].hostreg;
    ra->host_regs[ra->emu_regs[r].hostreg] = (JITHostRegState) { 
JIT_E_REG_NONE, false, 0 };
  } else if (cc != JIT_NV) {
    /* XXX make general */
    if (r < JIT_E_CycleCount) {
      ForwardCodeBlock_WriteCode(ra->block, JITEmuInterf2_LoadReg(cc, hostreg, 
ra->emu_regs[JIT_E_StateReg].hostreg, r));
    } else if (r == JIT_E_CycleCount) {
      ForwardCodeBlock_WriteCode(ra->block, JITEmuInterf2_LoadCycleCount(cc, 
hostreg, ra->emu_regs[JIT_E_StateReg].hostreg));
    }
#ifdef DEBUG_JIT_METRICS_EXEC
    else if (r == JIT_E_Exec) {
      ForwardCodeBlock_WriteCode(ra->block, 
JITCodeGen_LoadImm(cc,hostreg,ra->emu_regs[JIT_E_StateReg].hostreg,offsetof(ARMul_State,jit)+offsetof(JITState,exec_count)));
    }
#endif
  }
  /* Record the new mapping in both directions */
  ra->emu_regs[r].hostreg = hostreg;
  ra->host_regs[hostreg].emureg = r;
  ra->host_regs[hostreg].required += required;
  ra->free_host_regs &= ~(1<<hostreg);
  return hostreg;
}

/* Evict emu register 'r' from its host register, writing the value back first 
if it's dirty, and return the host register to the free list.
   Does nothing if r is JIT_E_REG_NONE or isn't currently mapped. The host 
register must not be marked as required. */
void JITRegAlloc_UnmapReg(JITRegAlloc *ra, JITEmuReg r)
{
  JITHostReg host;
  if (r == JIT_E_REG_NONE) {
    return;
  }
  host = ra->emu_regs[r].hostreg;
  if (host == JIT_H_REG_NONE) {
    return;
  }
  assert(!ra->host_regs[host].required);
  JITRegAlloc_WriteBack(ra, host);
  ra->free_host_regs |= 1 << host;
  /* Break the association in both directions */
  ra->host_regs[host].emureg = JIT_E_REG_NONE;
  ra->host_regs[host].dirty = false;
  ra->host_regs[host].required = 0;
  ra->emu_regs[r].hostreg = JIT_H_REG_NONE;
}

/* Pick the cheapest host register (R0-R15) to hand out, make it usable and 
return it.
   The search starts just after the previously returned register and visits 
each register once, scoring it by the rules below; on equal cost the first 
register visited wins. A callee-save register is pushed via 
JITRegAlloc_CalleeSave; an in-use register is flushed.
   Asserts if no register is eligible. */
JITHostReg JITRegAlloc_GetFreeHostReg(JITRegAlloc *ra)
{
  int start = ra->next_idx;
  /* weight the registers by the associated cost:
     1. any from free_host_regs
     2. any where !required && !dirty
        -> later on, might want to associate an explicit cost with 
reloading/recalculating values?
     3. highest numbered reg from callee_save
        (must be highest numbered so we can have multiple STR/STM but only one 
LDR/LDM)
        -> later on, might want to give these explicit costs as well? ldm/stm 
can transfer quickly, need to be able to represent if the next register to 
claim is going to be quick or slow
     4. any where !required && dirty
        -> later on, might want to associate an explicit cost; if the register 
is no longer needed then writing it back will be cheaper than taking from the 
callee-save list
     could do this in one pass, tracking the cost associated with using a given 
register
  */
  int best_cost = INT_MAX;
  int best_idx = 0;
  /* Record ra->next_idx as the best choice so far if 'cost' beats the current 
best. Wrapped in do/while so it behaves as a single statement. */
#define CANDIDATE(cost) \
  do { \
    if (best_cost > (cost)) { \
      best_idx = ra->next_idx; \
      best_cost = (cost); \
    } \
  } while (0)
  do {
    ra->next_idx = (ra->next_idx + 1) & 15;
    if (ra->free_host_regs & (1<<ra->next_idx)) {
      CANDIDATE(1); /* rule 1 */
      break; /* always going to win, so stop here */
    } else if (ra->callee_save & (1<<ra->next_idx)) {
      CANDIDATE(3 + 16 - ra->next_idx); /* rule 3 */
    } else if ((ra->host_regs[ra->next_idx].emureg != JIT_E_REG_NONE) && 
!ra->host_regs[ra->next_idx].required) {
      if (ra->host_regs[ra->next_idx].dirty) {
        CANDIDATE(4 + 20); /* rule 4 */
      } else {
        CANDIDATE(2); /* rule 2 */
        if (!ra->free_host_regs) {
          break; /* not going to get any better than this */
        }
      }
    }
  } while(start != ra->next_idx);
#undef CANDIDATE
  assert(best_cost != INT_MAX);
  ra->next_idx = best_idx;
  if (ra->callee_save & (1<<best_idx)) {
    /* preserve contents of callee-save register */
    JITRegAlloc_CalleeSave(ra, (JITHostReg) best_idx); /* assume this will track which 
registers need restoring, and will update ra->callee_save & ra->free_host_regs 
to indicate which register(s) have become available */
  } else if (!(ra->free_host_regs & (1<<best_idx))) {
    /* reclaim register that's already in use */
    JITRegAlloc_FlushHostReg(ra, (JITHostReg) best_idx);
  } 
  return (JITHostReg) best_idx;
}

/* Flush every host register in turn, so that no emu register remains mapped.
   Dirty values are written back as part of the flush. */
void JITRegAlloc_UnmapAll(JITRegAlloc *ra)
{
  int hr = 0;
  while (hr < JIT_H_REG_NUM)
  {
    JITRegAlloc_FlushHostReg(ra, hr);
    hr++;
  }
}

/* If host register 'hr' holds a dirty value, emit the store which writes it 
back to the emulator state, then mark it clean. The register stays mapped.
   Only the emulated ARM registers and the cycle count (plus the exec counter 
when DEBUG_JIT_METRICS_EXEC is defined) have somewhere to be stored; any other 
emu register is simply marked clean. */
void JITRegAlloc_WriteBack(JITRegAlloc *ra, JITHostReg hr)
{
  JITEmuReg r;
  if (hr == JIT_H_REG_NONE) {
    return;
  }
  if (!ra->host_regs[hr].dirty) {
    return;
  }
  /* XXX make general */
  r = ra->host_regs[hr].emureg;
  if (r == JIT_E_CycleCount) {
    ForwardCodeBlock_WriteCode(ra->block, JITEmuInterf2_StoreCycleCount(JIT_AL, 
hr, ra->emu_regs[JIT_E_StateReg].hostreg));
  } else if (r < JIT_E_CycleCount) {
    ForwardCodeBlock_WriteCode(ra->block, JITEmuInterf2_StoreReg(JIT_AL, hr, 
ra->emu_regs[JIT_E_StateReg].hostreg, r));
  }
#ifdef DEBUG_JIT_METRICS_EXEC
  else if (r == JIT_E_Exec) {
    ForwardCodeBlock_WriteCode(ra->block, 
JITCodeGen_StoreImm(JIT_AL,hr,ra->emu_regs[JIT_E_StateReg].hostreg,offsetof(ARMul_State,jit)+offsetof(JITState,exec_count)));
  }
#endif
  ra->host_regs[hr].dirty = false;
}

/* Write back every dirty host register whose required count is zero. The 
registers stay mapped. */
void JITRegAlloc_WriteBackUnrequired(JITRegAlloc *ra)
{
  int hr;
  for (hr=0; hr<JIT_H_REG_NUM; hr++)
  {
    if (ra->host_regs[hr].required)
    {
      continue; /* in use by the caller, leave alone */
    }
    JITRegAlloc_WriteBack(ra, hr);
  }
}

/* Write back every dirty host register, whether required or not. The 
registers stay mapped. */
void JITRegAlloc_WriteBackDirty(JITRegAlloc *ra)
{
  int hr;
  for (hr=0; hr<JIT_H_REG_NUM; hr++)
  {
    if (!ra->host_regs[hr].dirty)
    {
      continue;
    }
    JITRegAlloc_WriteBack(ra, hr);
  }
}

/* Emit the code which ends use of this allocator: write back all dirty 
registers, pop any callee-saved registers from the host stack and, if 'ret' is 
true, return to the caller. The allocator is then reset via JITRegAlloc_Init.
   When returning, the saved R14 is popped directly into the PC. */
void JITRegAlloc_CalleeRestore(JITRegAlloc *ra, bool ret)
{
  uint32_t saved;
  /* Write back everything */
  JITRegAlloc_WriteBackDirty(ra);
  /* Restore callee-save regs and/or return */
  saved = ra->callee_saved;
  if (!saved)
  {
    /* Nothing on the stack; a plain MOV PC,R14 is enough to return */
    if (ret)
    {
      ForwardCodeBlock_WriteCode(ra->block, 
JITCodeGen_DataProc_Rd_Rm(JIT_AL,JIT_MOV,15,14));
    }
  }
  else if (saved == (1<<14))
  {
    /* Only R14 was saved: single-register pop */
    uint32_t ldr = 0xe49de004; /* LDR R14 */
    if (ret)
    {
      ldr = 0xe49df004; /* LDR PC */
    }
    ForwardCodeBlock_WriteCode(ra->block, ldr);
  }
  else
  {
    uint32_t ldm = 0xe8bd0000 | saved;
    if (ret)
    {
      ldm ^= 3<<14; /* flips the R14 and PC bits of the register list */
    }
    ForwardCodeBlock_WriteCode(ra->block,ldm); 
  }
  /* Reset our state */
  JITRegAlloc_Init(ra, ra->block);
}

--- NEW FILE: emuinterf2.h ---
#ifndef JITEMUINTERF2_HEADER
#define JITEMUINTERF2_HEADER

#include <stddef.h>
#include "codegen.h"
#include "jit.h"
#include "../armdefs.h"
#include "regalloc.h" /* YUCK */
#include "decoder.h"

/* Extra emulator interfaces */

/* Encode a load of emulated register 'emureg' from the ARMul_State pointed to 
by host register 'statereg' into 'hostreg' */
static inline uint32_t JITEmuInterf2_LoadReg(uint32_t cc, uint32_t hostreg, 
uint32_t statereg, uint32_t emureg)
{
  uint32_t offset = offsetof(ARMul_State,Reg[emureg]);
  return JITCodeGen_LoadImm(cc,hostreg,statereg,offset);
}

/* Encode a store of 'hostreg' to emulated register 'emureg' in the 
ARMul_State pointed to by host register 'statereg' */
static inline uint32_t JITEmuInterf2_StoreReg(uint32_t cc, uint32_t hostreg, 
uint32_t statereg, uint32_t emureg)
{
  uint32_t offset = offsetof(ARMul_State,Reg[emureg]);
  return JITCodeGen_StoreImm(cc,hostreg,statereg,offset);
}

/* Encode a load of ARMul_State.NumCycles, via the state pointer in 
'statereg', into 'hostreg' */
static inline uint32_t JITEmuInterf2_LoadCycleCount(uint32_t cc, uint32_t 
hostreg, uint32_t statereg)
{
  uint32_t offset = offsetof(ARMul_State,NumCycles);
  return JITCodeGen_LoadImm(cc,hostreg,statereg,offset);
}

/* Encode a store of 'hostreg' to ARMul_State.NumCycles, via the state pointer 
in 'statereg' */
static inline uint32_t JITEmuInterf2_StoreCycleCount(uint32_t cc, uint32_t 
hostreg, uint32_t statereg)
{
  uint32_t offset = offsetof(ARMul_State,NumCycles);
  return JITCodeGen_StoreImm(cc,hostreg,statereg,offset);
}

/* NOTE(review): the implementations of these three functions are not visible 
from this header; the summaries below are inferred from names and signatures 
and should be confirmed. */

/* Presumably emits the code which ends a generated function, returning 
'result' to the caller */
extern void JITEmuInterf2_WriteEpilogue(JITRegAlloc *ra, JITResult result);

/* Presumably emits a word or byte load from the emulated address in Raddr 
into Rd; 'abort' looks like the address of code to branch to on failure */
extern void JITEmuInterf2_WriteLoad(JITRegAlloc *ra, JITHostReg Rd, JITHostReg 
Raddr, bool byte, uintptr_t abort);

/* Presumably emits the loads for the LDM described by 'instr' */
extern void JITEmuInterf2_WriteLDM(JITRegAlloc *ra, JITHostReg Rn, JITHostReg 
Raddr, const Instruction *instr, uintptr_t abort);

#endif

--- NEW FILE: regalloc.h ---
#ifndef REGALLOC_HEADER
#define REGALLOC_HEADER

#include "emuinterf.h"
#include "codeblocks.h" /* YUCK */

typedef enum {
  JIT_H_R0 = 0,
  /* JIT_H_R1 ... JIT_H_R15 as expected */
  JIT_H_PC = 15,
  JIT_H_PSR = 16,
  JIT_H_REG_NUM,
  JIT_H_REG_NONE = -1
} JITHostReg;

typedef enum {
  JIT_E_R0 = 0,
  /* JIT_E_R1 ... JIT_E_R15 as expected */
  JIT_E_PC = 15,
  JIT_E_CycleCount,
  JIT_E_Temp,
  JIT_E_Temp2,
  JIT_E_Temp3,
  JIT_E_Temp4,
  JIT_E_StateReg,
#ifdef DEBUG_JIT_METRICS_EXEC
  JIT_E_Exec,
#endif
  JIT_E_REG_NUM,
  JIT_E_REG_NONE = -1
} JITEmuReg;

#define JIT_HR(x) ((JITHostReg) (JIT_H_R0 + (x)))
#define JIT_ER(x) ((JITEmuReg) (JIT_E_R0 + (x)))

/* Allocator state for one emu register */
typedef struct {
  JITHostReg hostreg; /* JIT_H_REG_NONE if not loaded */
} JITEmuRegState;

/* Allocator state for one host register */
typedef struct {
  JITEmuReg emureg; /* JIT_E_REG_NONE if not in use */
  bool dirty; /* true if value needs writing back */
  uint8_t required; /* >0 if required */
} JITHostRegState;

/* Register allocator: tracks which emu registers currently live in which host 
registers while code is being generated.
   free_host_regs, callee_save and callee_saved are bitmasks indexed by host 
register number. */
typedef struct {
  JITEmuRegState emu_regs[JIT_E_REG_NUM]; /* Which host reg an emu reg maps to 
*/
  JITHostRegState host_regs[JIT_H_REG_NUM]; /* Which emu reg a host reg maps to 
*/
  uint32_t free_host_regs; /* Host registers which are free for immediate use */
  uint32_t callee_save; /* Host registers which can be made free for use by 
callee-save mechanism */
  int next_idx; /* Host register most recently returned by 
JITRegAlloc_GetFreeHostReg; the next search starts after it */
  uint32_t *callee_save_instr; /* STR/STM that implements the callee-save */
  uint32_t callee_saved; /* Which callee-save regs have been saved */
  ForwardCodeBlock *block; /* Code block to use for writing any instructions */
} JITRegAlloc;


/* Initialise a register allocator */
extern void JITRegAlloc_Init(JITRegAlloc *ra, ForwardCodeBlock *block);

/* Copy a register allocator, changing the associated code block
   Use when permanently splitting the path of execution */
extern void JITRegAlloc_Copy(JITRegAlloc *dest, JITRegAlloc *src, 
ForwardCodeBlock *block);

/* Copy a register allocator
   Use when temporarily splitting the path of execution
   Parent must not be used until joined with child */
extern void JITRegAlloc_Fork(JITRegAlloc *parent, JITRegAlloc *child);

/* Join the two paths so only parent remains */
extern void JITRegAlloc_Join(JITRegAlloc *parent, JITRegAlloc *child);

/* Trigger callee-saving of hr */
extern void JITRegAlloc_CalleeSave(JITRegAlloc *ra, JITHostReg hr);

/* Request that r is made available
   Specify hostreg of JIT_H_REG_NONE for automatic register assignment
   required should be 1 or 0
   cc specifies the condition under which the value should be loaded (JIT_NV if 
we always overwrite) */
extern JITHostReg JITRegAlloc_MapReg(JITRegAlloc *ra, JITEmuReg r, int 
required, JITHostReg hostreg, uint32_t cc);

/* Lock/unlock a register by incrementing/decrementing the required count */
/* Adjust the required count of host register 'hr' by 'required' (which may be 
negative to unlock), and mark the register dirty if 'dirty' is true. An 
existing dirty flag is never cleared.
   Does nothing if hr is JIT_H_REG_NONE. */
static inline void JITRegAlloc_LockReg(JITRegAlloc *ra, JITHostReg hr, int 
required, bool dirty)
{
  if (hr == JIT_H_REG_NONE)
  {
    return;
  }
  ra->host_regs[hr].required += required;
  ra->host_regs[hr].dirty |= dirty;
}

/* Unmap a given emu register */
extern void JITRegAlloc_UnmapReg(JITRegAlloc *ra, JITEmuReg r);

/* Return a free host reg */
extern JITHostReg JITRegAlloc_GetFreeHostReg(JITRegAlloc *ra);

/* Unmap a given host register */
/* Evict whatever emu register currently occupies 'hostreg' (no effect if the 
host register isn't in use) */
static inline void JITRegAlloc_FlushHostReg(JITRegAlloc *ra, JITHostReg hostreg)
{
  JITEmuReg occupant = ra->host_regs[hostreg].emureg;
  JITRegAlloc_UnmapReg(ra, occupant);
}

/* Unmap all registers */
extern void JITRegAlloc_UnmapAll(JITRegAlloc *ra);

/* Lock and return an emu reg if it's present */
/* If emu register 'r' is resident, bump its host register's required count 
and return that host register; otherwise return JIT_H_REG_NONE */
static inline JITHostReg JITRegAlloc_LockEmuReg(JITRegAlloc *ra, JITEmuReg r)
{
  JITHostReg host = ra->emu_regs[r].hostreg;
  if (host == JIT_H_REG_NONE)
  {
    return host;
  }
  ra->host_regs[host].required++;
  return host;
}

/* Write back host reg if dirty (but keep resident) */
extern void JITRegAlloc_WriteBack(JITRegAlloc *ra, JITHostReg hr);

/* Write back all dirty, non-required regs (but keep resident) */
extern void JITRegAlloc_WriteBackUnrequired(JITRegAlloc *ra);

/* Write back all dirty regs (but keep resident) */
extern void JITRegAlloc_WriteBackDirty(JITRegAlloc *ra);

/* Restore callee-save regs, and/or return from function. Resets state. */
extern void JITRegAlloc_CalleeRestore(JITRegAlloc *ra, bool ret);

#endif

--- NEW FILE: jit.c ---
#include <assert.h>
#include <stdlib.h>
#include "jit.h"
#include "jitpage.h"
#include "memattr.h"
#include "decoder.h"
#include "codeblocks.h"
#include "emuinterf2.h"
#include "metrics.h"
#include "regalloc.h"

#ifdef __riscos__
#include <kernel.h>
#include <swis.h>
#endif

/* Initialise the JIT state for a ROM/RAM chunk of 'romramchunksize' bytes 
starting at host address 'romramchunk'.
   A table of one JITFunc per word of the chunk is allocated, with every entry 
pointing at JIT_Generate, and addr2func is set so that adding it to an address 
inside the chunk gives the address of that word's table entry. The page and 
memory attribute tables are then initialised.
   Aborts if the function table cannot be allocated, rather than writing 
through a NULL pointer. */
void JIT_Init(JITState *jit,uintptr_t romramchunk,uint32_t romramchunksize)
{
  int i;
  int count = romramchunksize/4;
  memset(jit,0,sizeof(JITState));
  jit->romramchunk = romramchunk;
  jit->phy2func = (JITFunc *) malloc(sizeof(JITFunc)*count);
  if ((count > 0) && !jit->phy2func)
  {
    abort();
  }
  for (i=0;i<count;i++)
  {
    jit->phy2func[i] = &JIT_Generate;
  }
  jit->addr2func = ((uintptr_t) jit->phy2func)-romramchunk;
  JITPage_Init(jit,romramchunksize);
  MemAttr_Init(jit,romramchunksize);
}

/* Decide whether the code generator can handle 'instr'. Anything rejected 
here is left to the interpreter.
   Accepted: NOPs; data processing which doesn't write R15; branches (unless 
DEBUG_JIT_TEST_EXEC is defined); and a restricted set of single and multiple 
loads. */
static inline bool JITable(const Instruction *instr)
{
  switch (instr->type)
  {
  case InstrType_NOP:
    return true;
  case InstrType_DataProc:
    /* Don't allow R15 as dest */
    return !(Decoder_DataProc_UsesRd(instr) && (Decoder_Rd(instr) == 15));
#ifndef DEBUG_JIT_TEST_EXEC
  case InstrType_Branch:
    return true;
#endif
  case InstrType_LDRSTR:
    /* Only load, non-T instructions which don't write R15 */
    if (Decoder_LDRSTR_WritebackFlag(instr) && (Decoder_Rn(instr) == 15)) {
      return false; /* writeback to R15 */
    }
    if (!Decoder_LDRSTR_LoadFlag(instr)) {
      return false; /* store */
    }
    if (Decoder_Rd(instr) == 15) {
      return false; /* load to R15 */
    }
    return !Decoder_LDRSTR_TFlag(instr);
  case InstrType_LDMSTM:
    /* Only load, non-hat, non-PC using */
    return !(Decoder_LDMSTM_HatFlag(instr) || (Decoder_Rn(instr) == 15) || 
(instr->instr & 0x8000) || !Decoder_LDMSTM_LoadFlag(instr));
  default:
    return false;
  }
}

/* Work which has been done by generated code but not yet applied to the 
emulated PC, cycle counter and flags; see codegen_sync */
typedef struct {
  bool psr_loaded; /* Cleared by codegen_sync; presumably set once the emulated 
flags have been loaded into the host PSR - confirm against the code generator */
  bool psr_dirty; /* true if the host PSR flags need copying back into the 
emulated PC */
  int cycles; /* Cycles still to be added to the cycle counter */
  int instrs; /* Instructions still to be added to the PC (4 bytes each) */
} CodeGenState;

/* Emit code which applies the pending state in 'cgstate' to the emulated CPU, 
then reset cgstate. Nothing is emitted if nothing is pending.
   The generated code:
   - adds cgstate->cycles to the cycle counter
   - adds cgstate->instrs*4 to the PC
   - if psr_dirty, reads the host PSR and replaces the top four bits of the PC 
with the PSR's top four bits
   - with DEBUG_JIT_METRICS_EXEC, adds cgstate->instrs to the exec counter
   The cycle counter and PC are left mapped and marked dirty rather than being 
stored immediately.
   NOTE(review): the counts are passed through JITCodeGen_EncodeImm12, which 
asserts on values that can't be expressed as a rotated 8-bit immediate. */
static void codegen_sync(JITRegAlloc *ra,CodeGenState *cgstate)
{
  if (cgstate->instrs || cgstate->cycles || cgstate->psr_dirty)
  {
    /* Lock anything that's already resident so it can't be evicted while we 
map the rest */
    JITHostReg cyc = JITRegAlloc_LockEmuReg(ra, JIT_E_CycleCount);
    JITHostReg pc = JITRegAlloc_LockEmuReg(ra, JIT_E_PC);
    JITHostReg temp = (cgstate->psr_dirty ? JITRegAlloc_LockEmuReg(ra, 
JIT_E_Temp) : JIT_H_REG_NONE);
#ifdef DEBUG_JIT_METRICS_EXEC
    JITHostReg exec = JITRegAlloc_LockEmuReg(ra, JIT_E_Exec);
#endif
    JITRegAlloc_WriteBackUnrequired(ra); /* Free up some regs */
    if (cyc == JIT_H_REG_NONE) {
      cyc = JITRegAlloc_MapReg(ra, JIT_E_CycleCount, 1, JIT_H_REG_NONE, JIT_AL);
    }
    if (pc == JIT_H_REG_NONE) {
      pc = JITRegAlloc_MapReg(ra, JIT_E_PC, 1, JIT_H_REG_NONE, JIT_AL);
    }
    if (cgstate->psr_dirty && (temp == JIT_H_REG_NONE)) {
      /* JIT_NV: temp is about to be overwritten, so nothing needs loading */
      temp = JITRegAlloc_MapReg(ra, JIT_E_Temp, 1, JIT_H_REG_NONE, JIT_NV);
    }
#ifdef DEBUG_JIT_METRICS_EXEC
    if (exec == JIT_H_REG_NONE) {
      exec = JITRegAlloc_MapReg(ra, JIT_E_Exec, 1, JIT_H_REG_NONE, JIT_AL);
    }
    ForwardCodeBlock_WriteCode(ra->block, 
JITCodeGen_DataProc_Rd_Rn_Imm(JIT_AL,JIT_ADD,exec,exec,cgstate->instrs));
    JITRegAlloc_LockReg(ra, exec, -1, true);
#endif
    if (cgstate->psr_dirty) {
      /* Capture the flags before the instructions below are emitted */
      ForwardCodeBlock_WriteCode(ra->block, JITCodeGen_SavePSR(JIT_AL,temp));
    }
    ForwardCodeBlock_WriteCode(ra->block, 
JITCodeGen_DataProc_Rd_Rn_Imm(JIT_AL,JIT_ADD,cyc,cyc,cgstate->cycles));
    ForwardCodeBlock_WriteCode(ra->block, 
JITCodeGen_DataProc_Rd_Rn_Imm(JIT_AL,JIT_ADD,pc,pc,cgstate->instrs<<2));
    if (cgstate->psr_dirty) { /* TODO prime candidate for BFI */
      ForwardCodeBlock_WriteCode(ra->block, 
JITCodeGen_DataProc_Rd_Rn_Imm(JIT_AL,JIT_AND,temp,temp,0xF0000000));
      ForwardCodeBlock_WriteCode(ra->block, 
JITCodeGen_DataProc_Rd_Rn_Imm(JIT_AL,JIT_BIC,pc,pc,0xF0000000));
      ForwardCodeBlock_WriteCode(ra->block, 
JITCodeGen_DataProc_Rd_Rn_Rm(JIT_AL,JIT_ORR,pc,pc,temp));
    }
    /* Release the locks; cyc and pc now hold values newer than the emulator 
state */
    JITRegAlloc_LockReg(ra, cyc, -1, true);
    JITRegAlloc_LockReg(ra, pc, -1, true);
    JITRegAlloc_LockReg(ra, temp, -1, false);
    cgstate->psr_loaded = cgstate->psr_dirty = false;
    cgstate->cycles = cgstate->instrs = 0;
  }
}

/* Lighter-weight relative of codegen_sync: emit code which brings only the
   emulated PC up to date. The pending instruction count is added (as
   instrs<<2) and, if 'psr' is true and the PSR is dirty, the saved PSR is
   merged into the top four bits of the PC. The pending cycle count and the
   psr_loaded flag are left untouched. */
static void codegen_sync_pc(JITRegAlloc *ra,CodeGenState *cgstate,bool psr)
{
  const bool save_psr = psr && cgstate->psr_dirty;
  JITHostReg pc_reg, flags_reg = JIT_H_REG_NONE;
#ifdef DEBUG_JIT_METRICS_EXEC
  JITHostReg exec_reg;
#endif

  if (!cgstate->instrs && !save_psr) {
    return;
  }

  /* Lock first; anything that comes back as JIT_H_REG_NONE is mapped below */
  pc_reg = JITRegAlloc_LockEmuReg(ra, JIT_E_PC);
  if (save_psr) {
    flags_reg = JITRegAlloc_LockEmuReg(ra, JIT_E_Temp);
  }
#ifdef DEBUG_JIT_METRICS_EXEC
  exec_reg = JITRegAlloc_LockEmuReg(ra, JIT_E_Exec);
#endif

  if (pc_reg == JIT_H_REG_NONE) {
    pc_reg = JITRegAlloc_MapReg(ra, JIT_E_PC, 1, JIT_H_REG_NONE, JIT_AL);
  }
  if (save_psr && (flags_reg == JIT_H_REG_NONE)) {
    flags_reg = JITRegAlloc_MapReg(ra, JIT_E_Temp, 1, JIT_H_REG_NONE, JIT_NV);
  }

#ifdef DEBUG_JIT_METRICS_EXEC
  if (exec_reg == JIT_H_REG_NONE) {
    exec_reg = JITRegAlloc_MapReg(ra, JIT_E_Exec, 1, JIT_H_REG_NONE, JIT_AL);
  }
  ForwardCodeBlock_WriteCode(ra->block, JITCodeGen_DataProc_Rd_Rn_Imm(JIT_AL,JIT_ADD,exec_reg,exec_reg,cgstate->instrs));
  JITRegAlloc_LockReg(ra, exec_reg, -1, true);
#endif

  if (save_psr) {
    ForwardCodeBlock_WriteCode(ra->block, JITCodeGen_SavePSR(JIT_AL,flags_reg));
  }

  /* No point emitting an ADD #0 when only the PSR needed saving */
  if (cgstate->instrs) {
    ForwardCodeBlock_WriteCode(ra->block, JITCodeGen_DataProc_Rd_Rn_Imm(JIT_AL,JIT_ADD,pc_reg,pc_reg,cgstate->instrs<<2));
  }

  if (save_psr) {
    /* TODO prime candidate for BFI */
    ForwardCodeBlock_WriteCode(ra->block, JITCodeGen_DataProc_Rd_Rn_Imm(JIT_AL,JIT_AND,flags_reg,flags_reg,0xF0000000));
    ForwardCodeBlock_WriteCode(ra->block, JITCodeGen_DataProc_Rd_Rn_Imm(JIT_AL,JIT_BIC,pc_reg,pc_reg,0xF0000000));
    ForwardCodeBlock_WriteCode(ra->block, JITCodeGen_DataProc_Rd_Rn_Rm(JIT_AL,JIT_ORR,pc_reg,pc_reg,flags_reg));
    cgstate->psr_dirty = false;
  }

  JITRegAlloc_LockReg(ra, pc_reg, -1, true);
  JITRegAlloc_LockReg(ra, flags_reg, -1, false);
  cgstate->instrs = 0;
}

/* Account for 'count' extra cycles which are only consumed if condition
   'cc' passes at run time.
   * JIT_NV: the cycles are never consumed, so nothing happens
   * JIT_AL: the cycles are always consumed, so they are simply added to the
     pending count in cgstate (flushed later by codegen_sync)
   * anything else: a conditional ADD of 'count' to the cycle counter is
     emitted. The pending count in cgstate is deliberately not involved:
     it is unconditional and will be added by codegen_sync. (The previous
     code added cgstate->cycles here instead of 'count', which dropped the
     extra cycles and double-counted the pending ones.) */
static void codegen_conditional_cycles(JITRegAlloc *ra,CodeGenState *cgstate,uint32_t cc,int count)
{
  if (cc == JIT_NV)
  {
    return;
  }
  if (cc == JIT_AL)
  {
    cgstate->cycles += count;
  }
  else if (count)
  {
    JITHostReg cyc = JITRegAlloc_MapReg(ra, JIT_E_CycleCount, 1, JIT_H_REG_NONE, JIT_AL);
    ForwardCodeBlock_WriteCode(ra->block, JITCodeGen_DataProc_Rd_Rn_Imm(cc,JIT_ADD,cyc,cyc,count));
    JITRegAlloc_LockReg(ra, cyc, -1, true);
  }
}

/* Emit code for Rd = Rn + val, for an arbitrary constant val.
   The constant is split into chunks of up to eight bits starting at an even
   bit position, and one ADD (or SUB, for negative val) is emitted per
   chunk. The first instruction reads Rn, later ones accumulate in Rd.
   For val == 0 a MOV is emitted if Rd and Rn differ, otherwise nothing.

   The magnitude is handled as an unsigned value so that negating INT_MIN
   and shifting the chunk mask up to the top bits are both well defined. */
static void codegen_bigadd(ForwardCodeBlock *block, JITHostReg Rd, JITHostReg Rn, int val)
{
  int op = JIT_ADD;
  int shift = 0;
  unsigned int remaining = (unsigned int) val;
  if (val < 0) {
    op = JIT_SUB;
    remaining = 0u - remaining; /* Magnitude, without signed overflow */
  } else if (!val) {
    if (Rd != Rn) {
      ForwardCodeBlock_WriteCode(block, JITCodeGen_DataProc_Rd_Rm(JIT_AL, JIT_MOV, Rd, Rn));
    }
    return;
  }
  while (remaining) {
    unsigned int chunk;
    /* XX CLZ */
    /* Skip to the lowest non-zero bit pair. All bits below 'shift' are
       already clear, so this terminates with shift <= 30 */
    while (!((remaining >> shift) & 0x3u)) {
      shift += 2;
    }
    chunk = remaining & (0xffu << shift);
    ForwardCodeBlock_WriteCode(block, JITCodeGen_DataProc_Rd_Rn_Imm(JIT_AL,op,Rd,Rn,chunk));
    Rn = Rd;
    remaining &= ~chunk;
    shift += 8;
  }
}

/* Emit the code for a taken B/BL followed by an exit from the JIT code:
   * two extra cycles are charged and all pending state is synced
   * for BL, R14 is set to the synced PC minus 8
   * the branch offset (less 4) is added to the PC
   * the epilogue is written with JITResult_Normal
   The caller is expected to have already counted the branch instruction
   itself in cgstate. */
static void codegen_handle_branch(JITRegAlloc *ra, CodeGenState *cgstate, const Instruction *instr)
{
  int32_t offset;
  /* Prep PC */
  JITHostReg pc = JITRegAlloc_MapReg(ra, JIT_E_PC, 1, JIT_H_REG_NONE, JIT_AL);
  cgstate->cycles += 2;
  codegen_sync(ra,cgstate);
  if (Decoder_Branch_BLFlag(instr)) {
    /* Copy PC to R14 */
    JITHostReg r14 = JITRegAlloc_MapReg(ra, JIT_ER(14), 1, JIT_H_REG_NONE, JIT_NV);
    ForwardCodeBlock_WriteCode(ra->block, JITCodeGen_DataProc_Rd_Rn_Imm(JIT_AL,JIT_SUB,r14,pc,8));
    JITRegAlloc_LockReg(ra, r14, -1, true);
  }
  /* Add branch offset */
  offset = Decoder_Branch_Offset(instr)-4;
  if (offset) {
    /* To avoid needing a temp reg to deal with overflow, just rotate the PC +
       offset: with the PC rotated right by 26 its low 26 bits sit in bits
       6-31, so adding (offset << 6) lets any carry fall off the top without
       disturbing the six bits now held in bits 0-5 */
    /* XXX temp reg could be better if e.g. PSR is already loaded */
    /* Shift as unsigned: the offset may be negative, and the wrap-around
       of the top bits is exactly what is wanted here */
    offset = (int32_t) (((uint32_t) offset) << 6);
    /* pc*0x1001 places the host register number in both the Rd and Rm fields */
    ForwardCodeBlock_WriteCode(ra->block, 0xe1a00d60 + (pc*0x1001)); /* MOV pc,pc,ROR #26 */
    codegen_bigadd(ra->block, pc, pc, offset);
    ForwardCodeBlock_WriteCode(ra->block, 0xe1a00360 + (pc*0x1001)); /* MOV pc,pc,ROR #6 */
  }
  JITRegAlloc_LockReg(ra, pc, -1, true);
  /* Now exit */
  JITEmuInterf2_WriteEpilogue(ra,JITResult_Normal);
}

/* Generate the "branch taken" path of a conditional branch into 'block'.
   Works on private copies of the register allocator and generator state,
   so the caller's state (which describes the fall-through path) is left
   untouched. */
static void codegen_handle_branch_cc(JITRegAlloc *ra1, const CodeGenState *cgstate1, ForwardCodeBlock *block, const Instruction *instr)
{
  CodeGenState taken_state = *cgstate1;
  JITRegAlloc taken_ra;

  JITRegAlloc_Copy(&taken_ra,ra1,block);
  codegen_handle_branch(&taken_ra,&taken_state,instr);
}

/* Emit the code for an LDR/LDRB whose condition (if any) has already been
   dealt with by the caller.
     ra      - register allocator for the main code stream
     cgstate - pending generator state; only read, a copy is synced in the
               exit handler
     instr   - the decoded load instruction
     block2  - code block which receives the out-of-line exit handler
   The address is calculated into a temporary so that base register
   writeback only happens after the load code has been emitted. */
static void codegen_handle_ldr(JITRegAlloc *ra,const CodeGenState 
*cgstate,const Instruction *instr, ForwardCodeBlock *block2)
{
  JITHostReg Rn, Rd, Rm = JIT_H_REG_NONE, Raddr, Ralu, Roffset;

  /* Pre-lock registers */
  Rn = JITRegAlloc_LockEmuReg(ra, Decoder_Rn(instr));
  Rd = JITRegAlloc_LockEmuReg(ra, Decoder_Rd(instr));

  /* Map registers */
  if (!Decoder_LDRSTR_ImmFlag(instr)) {
    Rm = JITRegAlloc_MapReg(ra, Decoder_Rm(instr), 1, JIT_H_REG_NONE, JIT_AL);
  }
  if (Rn == JIT_H_REG_NONE) {
    Rn = JITRegAlloc_MapReg(ra, Decoder_Rn(instr), 1, JIT_H_REG_NONE, JIT_AL);
  }
  if (Decoder_Rn(instr) == JIT_E_PC) {
    /* Mask out the flags into a temp reg */
    /* 0xFC000003 clears bits 26-31 and bits 0-1 of the PC value */
    JITHostReg Rpc = JITRegAlloc_MapReg(ra, JIT_E_Temp2, 1, JIT_H_REG_NONE, 
JIT_NV);
    ForwardCodeBlock_WriteCode(ra->block, 
JITCodeGen_DataProc_Rd_Rn_Imm(JIT_AL,JIT_BIC,Rpc,Rn,0xFC000003));
    JITRegAlloc_LockReg(ra, Rn, -1, false);
    Rn = Rpc;
  }

  /* Calculate ALU output in a temporary register, so we can avoid write-back 
on abort */
  Ralu = Rn;
  if (Decoder_LDRSTR_HasOffset(instr)) {
    Ralu = JITRegAlloc_MapReg(ra, JIT_E_Temp, 1, JIT_H_REG_NONE, JIT_NV);
    if (Decoder_LDRSTR_ImmFlag(instr)) {
      /* 12 bit immediate offset, negated for a down transfer */
      codegen_bigadd(ra->block, Ralu, Rn, (instr->instr & 0xfff) * 
(Decoder_LDRSTR_UpFlag(instr) ? 1 : -1));
    } else {
      uint32_t offset = (instr->instr & 0xfe0) | Rm; /* Preserve shift, knock 
out register-shifted-register flag */
      ForwardCodeBlock_WriteCode(ra->block, 
JITCodeGen_DataProc_Rd_Rn_Rm(JIT_AL, Decoder_LDRSTR_UpFlag(instr) ? JIT_ADD : 
JIT_SUB, Ralu, Rn, offset));
      /* Rm no longer needed */
      JITRegAlloc_LockReg(ra, Rm, -1, false);
    }
  }

  /* Pre-indexed loads use the offset address, post-indexed ones the base */
  Raddr = (Decoder_LDRSTR_PreFlag(instr) ? Ralu : Rn);

  /* Now perform load */
  if (Rd == JIT_H_REG_NONE) {
    Rd = JITRegAlloc_MapReg(ra, Decoder_Rd(instr), 1, JIT_H_REG_NONE, JIT_NV);
  }
  JITEmuInterf2_WriteLoad(ra, Rd, Raddr, Decoder_LDRSTR_ByteFlag(instr), 
block2->nextinstr);

  /* Write the exit handler (WriteLoad will have inserted any necessary 
branches) */
  /* The handler works on copies of the allocator and generator state: it
     syncs the pending state and returns JITResult_Interpret */
  {
    JITRegAlloc ra2;
    CodeGenState cgstate2;
    memcpy(&cgstate2,cgstate,sizeof(CodeGenState));
    JITRegAlloc_Copy(&ra2,ra,block2);
    codegen_sync(&ra2,&cgstate2);
    JITEmuInterf2_WriteEpilogue(&ra2,JITResult_Interpret);
  }

  /* Apply writeback */
  if (Decoder_LDRSTR_WritebackFlag(instr) && (Ralu != Rn)) {
    ForwardCodeBlock_WriteCode(ra->block, JITCodeGen_DataProc_Rd_Rm(JIT_AL, 
JIT_MOV, Rn, Ralu));
    JITRegAlloc_LockReg(ra, Rn, -1, true);
    JITRegAlloc_LockReg(ra, Ralu, -1, false);
    /* Both have been released; stop the code below releasing them again */
    Rn = JIT_H_REG_NONE;
    Ralu = JIT_H_REG_NONE;
  }

  /* Release remaining regs */
  JITRegAlloc_LockReg(ra, Rd, -1, true);
  if (Ralu != Rn) {
    JITRegAlloc_LockReg(ra, Ralu, -1, false);
  }
  if (Rn != JIT_H_REG_NONE) {
    JITRegAlloc_LockReg(ra, Rn, -1, false);
  }
}

/* Conditional form of codegen_handle_ldr. The load code is generated on a
   forked register state and preceded by a branch, filled in at the end,
   which is taken on (cc ^ (1<<28)) and lands just after the load code. The
   one extra cycle is added inside the skipped region, so it is only
   charged when the load is executed. */
static void codegen_handle_ldr_cc(JITRegAlloc *ra1, const CodeGenState *cgstate, uint32_t cc, const Instruction *instr, ForwardCodeBlock *block2)
{
  JITRegAlloc forked;
  uintptr_t skip_slot;
  /* Map the cycle counter up front, on the assumption it'll be consumed */
  JITHostReg cycle_reg = JITRegAlloc_MapReg(ra1, JIT_E_CycleCount, 1, JIT_H_REG_NONE, JIT_AL);

  /* Everything up to the join is generated on a fork of the register state */
  JITRegAlloc_Fork(ra1,&forked);

  /* Placeholder word; overwritten with the real branch below */
  skip_slot = ForwardCodeBlock_WriteCode(forked.block, 0);

  /* The load itself, then the cycle it consumes */
  codegen_handle_ldr(&forked,cgstate,instr,block2);
  ForwardCodeBlock_WriteCode(forked.block, JITCodeGen_DataProc_Rd_Rn_Imm(JIT_AL,JIT_ADD,cycle_reg,cycle_reg,1));

  /* Merge the forked state back into the caller's */
  JITRegAlloc_Join(ra1,&forked);

  /* Now the end of the skipped region is known, fill in the branch */
  *((uint32_t *) skip_slot) = JITCodeGen_Branch(cc ^ (1<<28),forked.block->nextinstr-skip_slot);

  JITRegAlloc_LockReg(ra1, cycle_reg, -1, true);
}

/* Emit the code for an LDM whose condition (if any) has already been dealt
   with by the caller.
     ra      - register allocator for the main code stream
     cgstate - pending generator state; only read, a copy is synced in the
               exit handler
     instr   - the decoded LDM instruction
     block2  - code block which receives the out-of-line exit handler
   The lowest transfer address is calculated into a temporary; the base
   register is only updated (if writeback applies) after the load code. */
static void codegen_handle_ldm(JITRegAlloc *ra,const CodeGenState *cgstate,const Instruction *instr, ForwardCodeBlock *block2)
{
  int reg_count = Decoder_LDMSTM_NumRegs(instr);
  int delta = 0;
  JITHostReg base_reg, addr_reg;

  /* Base register, plus a temporary to hold the start address */
  base_reg = JITRegAlloc_MapReg(ra, Decoder_Rn(instr), 1, JIT_H_REG_NONE, JIT_AL);
  addr_reg = JITRegAlloc_MapReg(ra, JIT_E_Temp, 1, JIT_H_REG_NONE, JIT_NV);

  /* Start from the word-aligned base... */
  ForwardCodeBlock_WriteCode(ra->block, JITCodeGen_DataProc_Rd_Rn_Imm(JIT_AL,JIT_BIC,addr_reg,base_reg,3));

  /* ...and step to the first word transferred */
  if (Decoder_LDMSTM_PreFlag(instr)) {
    delta = 4;
  }
  if (!Decoder_LDMSTM_UpFlag(instr)) {
    delta = -delta - ((reg_count-1)<<2);
  }
  codegen_bigadd(ra->block, addr_reg, addr_reg, delta);

  /* HACK - making registers dirty in WriteLDM could break the exit handler,
     so write back everything here */
  JITRegAlloc_WriteBackDirty(ra);

  /* Now perform load */
  JITEmuInterf2_WriteLDM(ra, base_reg, addr_reg, instr, block2->nextinstr);

  /* Write the exit handler (WriteLDM will have inserted any necessary
     branches). It syncs a copy of the pending state and returns
     JITResult_Interpret */
  {
    CodeGenState exit_state = *cgstate;
    JITRegAlloc exit_ra;
    JITRegAlloc_Copy(&exit_ra,ra,block2);
    codegen_sync(&exit_ra,&exit_state);
    JITEmuInterf2_WriteEpilogue(&exit_ra,JITResult_Interpret);
  }

  /* Apply writeback, unless the base register is in the register list */
  if (Decoder_LDMSTM_WritebackFlag(instr) && !(instr->instr & (1<<Decoder_Rn(instr)))) {
    delta = (Decoder_LDMSTM_UpFlag(instr) ? 1 : -1) * (reg_count<<2);
    codegen_bigadd(ra->block, base_reg, base_reg, delta);
    JITRegAlloc_LockReg(ra, base_reg, -1, true);
  } else {
    JITRegAlloc_LockReg(ra, base_reg, -1, false);
  }

  JITRegAlloc_LockReg(ra, addr_reg, -1, false);
}

/* Conditional form of codegen_handle_ldm. The load code is generated on a
   forked register state and preceded by a branch, filled in at the end,
   which is taken on (cc ^ (1<<28)) and lands just after the load code. The
   extra cycles (Decoder_LDMSTM_Cycles) are added inside the skipped region,
   so they are only charged when the load is executed. */
static void codegen_handle_ldm_cc(JITRegAlloc *ra1, const CodeGenState *cgstate, uint32_t cc, const Instruction *instr, ForwardCodeBlock *block2)
{
  JITRegAlloc forked;
  uintptr_t skip_slot;
  /* Map the cycle counter up front, on the assumption it'll be consumed */
  JITHostReg cycle_reg = JITRegAlloc_MapReg(ra1, JIT_E_CycleCount, 1, JIT_H_REG_NONE, JIT_AL);

  /* Everything up to the join is generated on a fork of the register state */
  JITRegAlloc_Fork(ra1,&forked);

  /* Placeholder word; overwritten with the real branch below */
  skip_slot = ForwardCodeBlock_WriteCode(forked.block, 0);

  /* The load itself, then the cycles it consumes */
  codegen_handle_ldm(&forked,cgstate,instr,block2);
  ForwardCodeBlock_WriteCode(forked.block, JITCodeGen_DataProc_Rd_Rn_Imm(JIT_AL,JIT_ADD,cycle_reg,cycle_reg,Decoder_LDMSTM_Cycles(instr)));

  /* Merge the forked state back into the caller's */
  JITRegAlloc_Join(ra1,&forked);

  /* Now the end of the skipped region is known, fill in the branch */
  *((uint32_t *) skip_slot) = JITCodeGen_Branch(cc ^ (1<<28),forked.block->nextinstr-skip_slot);

  JITRegAlloc_LockReg(ra1, cycle_reg, -1, true);
}

#ifndef __riscos__
/* Stand-in used as the function pointer for entry points on non-RISC OS
   hosts: ignores its arguments and always asks for the instruction to be
   interpreted */
static JITResult JIT_Hack(JITEmuState *state,void *addr)
{
  (void) state;
  (void) addr;
  return JITResult_Interpret;
}
#endif

/* (Re)generate the code for one page.
   Any code the page already has is forgotten first. Every word of the page
   is then visited in address order:
   * words flagged as entry points get the address of the code generated
     for them stored in their JITFunc slot (on non-RISC OS hosts, JIT_Hack
     is stored instead); all other words get JIT_Generate
   * from an entry point onwards each instruction is decoded and translated,
     until a non-JITable instruction or an unconditional branch ends the run
   'block' receives the main code stream. 'block2' is created on demand and
   receives out-of-line code: taken conditional branches and the LDR/LDM
   exit handlers. */
static void JIT_GeneratePage(JITEmuState *state,JITPage *page)
{
  const JITState *jit = JIT_GetState(state);
  ForwardCodeBlock block,block2;
  void *addr = JITPage_StartOfPage(jit,page);
  int remaining = JITPAGE_SIZE;
  JITFunc *funcs = JIT_Phy2Func(jit,addr);
  /* All members start out false/zero */
  CodeGenState cgstate = {false};
  bool incode = false, block2_init = false;
  JITRegAlloc ra;
#ifdef DEBUG_JIT_METRICS
  /* NOTE(review): entry_points starts at ~0 (wraps to 0 on the first entry
     point) but exit_points starts at -~0, which evaluates to 1 - confirm
     whether the differing start values are intended */
  uint32_t entry_points = ~0, exit_points = -~0;
#endif
  /* Release any code that currently exists */
  JITPage_ForgetCode(jit,page);
  /* Start allocating new code */
  page->codeblock = CodeBlock_NextID();
  ForwardCodeBlock_New(&block, false);
  JITRegAlloc_Init(&ra, &block);

  /* One iteration per word in the page */
  while (remaining > 0)
  {
    remaining -= 4;
    if (MemAttr_GetEntryPointFlag(jit,addr))
    {
      /* Write back state */
      codegen_sync(&ra,&cgstate);
      JITRegAlloc_CalleeRestore(&ra,false);
      /* Record current location as an entry point */
#ifdef DEBUG_JIT_DUMP
      fprintf(stderr,"entry point: %08x\n",block.nextinstr);
#endif
#ifdef __riscos__
      *funcs++ = (JITFunc) block.nextinstr;
#else
      *funcs++ = &JIT_Hack;
#endif
      incode = true;
#ifdef DEBUG_JIT_METRICS
      entry_points++;
#endif
    }
    else
    {
      /* Not an entry point */
      *funcs++ = &JIT_Generate;
    }
    if (incode)
    {
      /* Decode instruction */
      Instruction instr;
      Decoder_Decode(&instr,*((uint32_t *) addr));
      if (!JITable(&instr))
      {
        /* Terminate this block */
        codegen_sync(&ra,&cgstate);
        JITEmuInterf2_WriteEpilogue(&ra,JITResult_Interpret);
        incode = false;
#ifdef DEBUG_JIT_METRICS
        exit_points++;
#endif
      }
      else
      {
        /* temp holds the host instruction word being built for data
           processing ops; cc is the condition code of the instruction */
        uint32_t temp,cc;
        JITHostReg Rd,Rn,Rm,Rs,Rpc;
        uintptr_t branch;
#ifdef DEBUG_JIT_DUMP
        const char *str;
        uint32_t laddr = 
((state->Reg[15]-8)&0x03fff000)+JITPAGE_SIZE-4-remaining;
        _swix(Debugger_Disassemble,_INR(0,1)|_OUT(1),instr.instr,laddr,&str);
        fprintf(stderr,"%08x %08x %s\n",laddr,instr.instr,str);
#endif
        /* Sync if we've been going for a long time (hack to avoid impossible 
imm12) */
        if ((cgstate.cycles > 128) || (cgstate.instrs > 128)) {
          codegen_sync(&ra,&cgstate);
        }
        /* Mark this as code */
        MemAttr_SetCodeFlag(jit,addr);
#ifdef DEBUG_JIT_METRICS
        jitmetrics.instructions_in++;
#endif
        cc = Decoder_CC(&instr);
        /* Work out how to generate code */
        switch (instr.type) {
        case InstrType_NOP:
          break;

        case InstrType_DataProc:
          if (!cgstate.psr_loaded && (Decoder_Conditional(&instr) || 
Decoder_DataProc_SFlag(&instr) || Decoder_DataProc_CarryIn(&instr))) {
            /* PSR needed but not loaded yet */
            JITHostReg psrreg = JITRegAlloc_MapReg(&ra, JIT_E_PC, 0, 
JIT_H_REG_NONE, JIT_AL);
            ForwardCodeBlock_WriteCode(&block, 
JITCodeGen_LoadNZCV(JIT_AL,psrreg));
            cgstate.psr_loaded = true;
          }
          /* Load arguments, update instruction */
          temp = instr.instr & ~0xff000; /* Zap Rd, Rn fields */

          Rn = Rd = Rm = Rs = Rpc = JIT_H_REG_NONE;

          /* Pre-lock registers */
          if (Decoder_DataProc_IsShiftedReg(&instr)) {
            /* Add the extra cycle */
            codegen_conditional_cycles(&ra, &cgstate, cc, 1);
            Rs = JITRegAlloc_LockEmuReg(&ra, Decoder_Rs(&instr));
          }          
          if (Decoder_DataProc_UsesRn(&instr)) {
            Rn = JITRegAlloc_LockEmuReg(&ra, Decoder_Rn(&instr));
          }
          if (!Decoder_DataProc_ImmFlag(&instr)) {
            Rm = JITRegAlloc_LockEmuReg(&ra, Decoder_Rm(&instr));
          }
          if (Decoder_DataProc_UsesRd(&instr)) {
            Rd = JITRegAlloc_LockEmuReg(&ra, Decoder_Rd(&instr));
          }

          /* Now actually load them */          
          if (!Decoder_DataProc_ImmFlag(&instr)) {
            if (Rm == JIT_H_REG_NONE) {
              Rm = JITRegAlloc_MapReg(&ra, Decoder_Rm(&instr), 1, 
JIT_H_REG_NONE, JIT_AL);
            }
            if (Decoder_Rm(&instr) == JIT_E_PC) {
              codegen_sync_pc(&ra,&cgstate,true);
            }
            temp = (temp & ~0xf) | Rm;
          }

          if (Decoder_DataProc_UsesRn(&instr)) {
            if (Rn == JIT_H_REG_NONE) {
              Rn = JITRegAlloc_MapReg(&ra, Decoder_Rn(&instr), 1, 
JIT_H_REG_NONE, JIT_AL);
            }
            if (Decoder_Rn(&instr) == JIT_E_PC) {
              codegen_sync_pc(&ra,&cgstate,true);
              /* Mask out the flags into a temp reg */
              Rpc = JITRegAlloc_MapReg(&ra, JIT_E_Temp, 1, JIT_H_REG_NONE, 
JIT_NV);
              ForwardCodeBlock_WriteCode(&block, 
JITCodeGen_DataProc_Rd_Rn_Imm(cc,JIT_BIC,Rpc,Rn,0xFC000003));
              JITRegAlloc_LockReg(&ra, Rn, -1, false);
              Rn = Rpc;
            }
            temp |= (Rn<<16);
          }
          /* If Rn isn't used, leave as zero (preferred MOV/MVN encoding) */

          if (Decoder_DataProc_IsShiftedReg(&instr)) {
            /* NOTE(review): everything up to the matching brace, including
               the PC+4 fix-ups for Rn and Rm, is skipped when the pre-lock
               of Rs above succeeded - confirm that is intended */
            if (Rs == JIT_H_REG_NONE) {
              Rs = JITRegAlloc_MapReg(&ra, Decoder_Rs(&instr), 1, 
JIT_H_REG_NONE, JIT_AL);
              if (Decoder_Rs(&instr) == JIT_E_PC) {
                /* Mask out the flags into a temp reg */
                Rpc = JITRegAlloc_MapReg(&ra, JIT_E_Temp2, 1, JIT_H_REG_NONE, 
JIT_NV);
                ForwardCodeBlock_WriteCode(&block, 
JITCodeGen_DataProc_Rd_Rn_Imm(cc,JIT_BIC,Rpc,Rs,0xFC000003));
                JITRegAlloc_LockReg(&ra, Rs, -1, false);
                Rs = Rpc;
              }
              /* If Rn or Rm are the PC, we also need to add 4 to them */
              if ((Rn != JIT_H_REG_NONE) && (Decoder_Rn(&instr) == JIT_E_PC)) {
                ForwardCodeBlock_WriteCode(&block, 
JITCodeGen_DataProc_Rd_Rn_Imm(cc,JIT_ADD,Rpc,Rn,4));                  
              }
              if ((Rm != JIT_H_REG_NONE) && (Decoder_Rm(&instr) == JIT_E_PC)) {
                /* Yuck, this will be the full PC
                   Transfer into another temp */
                Rpc = JITRegAlloc_MapReg(&ra, JIT_E_Temp3, 1, JIT_H_REG_NONE, 
JIT_NV);
                ForwardCodeBlock_WriteCode(&block, 
JITCodeGen_DataProc_Rd_Rn_Imm(cc,JIT_ADD,Rpc,Rm,4));
                JITRegAlloc_LockReg(&ra, Rm, -1, false);
                Rm = Rpc;
                temp = (temp & ~0xf) | Rm;
              }
            }
            temp = (temp & ~0xf00) | (Rs<<8);
          }          

          /* Map Rd last so we can conditionally load it based on CC */
          if (Decoder_DataProc_UsesRd(&instr)) {
            if (Rd == JIT_H_REG_NONE) {
              Rd = JITRegAlloc_MapReg(&ra, Decoder_Rd(&instr), 1, 
JIT_H_REG_NONE, cc ^ (1<<28));
            }
            temp |= (Rd<<12);
          }
          /* If Rd isn't used leave as zero (preferred CMP, etc. encoding)
             N.B. this will be wrong when we start supporting CMPP (need to 
decode them as a different InstrType?) */

          /* Perform the op */
          ForwardCodeBlock_WriteCode(&block, temp);

          /* Write back any result */
          if (Decoder_DataProc_SFlag(&instr)) {
            cgstate.psr_dirty = true;
          }
          JITRegAlloc_LockReg(&ra, Rn, -1, false);
          JITRegAlloc_LockReg(&ra, Rd, -1, true);
          JITRegAlloc_LockReg(&ra, Rm, -1, false);
          JITRegAlloc_LockReg(&ra, Rs, -1, false);
          break;

        case InstrType_Branch:
          if (!cgstate.psr_loaded && Decoder_Conditional(&instr)) {
            /* PSR needed but not loaded yet */
            JITHostReg psrreg = JITRegAlloc_MapReg(&ra, JIT_E_PC, 0, 
JIT_H_REG_NONE, JIT_AL);
            ForwardCodeBlock_WriteCode(&block, 
JITCodeGen_LoadNZCV(JIT_AL,psrreg));
            cgstate.psr_loaded = true;
          }

#ifdef DEBUG_JIT_METRICS
          exit_points++;
#endif
          /* Pre-increment these? */
          cgstate.instrs++;
          cgstate.cycles++;
          if (cc != JIT_AL) {
            /* Branch to an exit on CC */
            if (!block2_init) {
              block2_init = true;
              ForwardCodeBlock_New(&block2, false);
            }
            ForwardCodeBlock_WriteCode(&block, 
JITCodeGen_Branch(cc,block2.nextinstr-block.nextinstr));
            codegen_handle_branch_cc(&ra,&cgstate,&block2,&instr);
          } else {
            codegen_handle_branch(&ra,&cgstate,&instr);
            incode = false; /* Don't assume it will return */
          }
          /* The counts were incremented above, so skip the shared
             increments after the switch */
          goto next_instr;
          break;

        case InstrType_LDRSTR:
          if (!cgstate.psr_loaded && (Decoder_Conditional(&instr) || 
Decoder_LDRSTR_CarryIn(&instr))) {
            /* PSR needed but not loaded yet */
            JITHostReg psrreg = JITRegAlloc_MapReg(&ra, JIT_E_PC, 0, 
JIT_H_REG_NONE, JIT_AL);
            ForwardCodeBlock_WriteCode(&block, 
JITCodeGen_LoadNZCV(JIT_AL,psrreg));
            cgstate.psr_loaded = true;
          }

          if (cgstate.psr_dirty /* Force PSR writeback (fastmap will clobber 
it) */
          || (Decoder_Rn(&instr) == JIT_E_PC) /* Force PC write-back if needed 
as input */
          || (!Decoder_LDRSTR_ImmFlag(&instr) && (Decoder_Rm(&instr) == 
JIT_E_PC))) {
            codegen_sync_pc(&ra,&cgstate,true);
          }
          cgstate.psr_loaded = false; /* About to be clobbered */

          if (!block2_init) {
            block2_init = true;
            ForwardCodeBlock_New(&block2, false);
          }
          if (cc != JIT_AL) {
            /* The conditional variant emits its own cycle accounting */
            codegen_handle_ldr_cc(&ra,&cgstate,cc,&instr,&block2);
          } else {
            codegen_handle_ldr(&ra,&cgstate,&instr,&block2);
            cgstate.cycles++;
          }
          break;

        case InstrType_LDMSTM:
          if (!cgstate.psr_loaded && Decoder_Conditional(&instr)) {
            /* PSR needed but not loaded yet */
            JITHostReg psrreg = JITRegAlloc_MapReg(&ra, JIT_E_PC, 0, 
JIT_H_REG_NONE, JIT_AL);
            ForwardCodeBlock_WriteCode(&block, 
JITCodeGen_LoadNZCV(JIT_AL,psrreg));
            cgstate.psr_loaded = true;
          }

          if (cgstate.psr_dirty) { /* Force PSR writeback (fastmap will clobber 
it) */
            codegen_sync_pc(&ra,&cgstate,true);
          }
          cgstate.psr_loaded = false; /* About to be clobbered */

          if (!block2_init) {
            block2_init = true;
            ForwardCodeBlock_New(&block2, false);
          }
          if (cc != JIT_AL) {
            /* The conditional variant emits its own cycle accounting */
            codegen_handle_ldm_cc(&ra,&cgstate,cc,&instr,&block2);
          } else {
            codegen_handle_ldm(&ra,&cgstate,&instr,&block2);
            cgstate.cycles+=Decoder_LDMSTM_Cycles(&instr);
          }
          break;

        default:
          /* JITable() accepted an instruction type with no handler here */
          assert(0);
          break;
        }
        /* Count the instruction just translated, and its base cycle */
        cgstate.instrs++;
        cgstate.cycles++;
next_instr:;
#ifdef DEBUG_JIT_SINGLE_INSTR
        if (incode) {
          codegen_sync(&ra,&cgstate);
          JITEmuInterf2_WriteEpilogue(&ra,JITResult_Normal);
          incode = false;
#ifdef DEBUG_JIT_METRICS
          exit_points++;
#endif
        }
#endif
      }
    }
    addr = (void *) (((uintptr_t) addr)+4);
  }
  /* If we're still in code at the end, terminate the block with Normal result 
*/
  if (incode)
  {
    codegen_sync(&ra,&cgstate);
    JITEmuInterf2_WriteEpilogue(&ra,JITResult_Normal);
#ifdef DEBUG_JIT_METRICS
    exit_points++;
#endif
  }
  /* Finish generating code */
  CodeBlock_ClaimID(page->codeblock, page);
  DirtyRanges_Flush();
#ifdef JIT_DEBUG
  fprintf(stderr,"JIT done\n");
#endif
#ifdef DEBUG_JIT_METRICS
  /* Counts too large for the histograms are silently dropped */
  if (entry_points < JITPAGE_SIZE/4) jitmetrics.entry_points[entry_points]++;
  if (exit_points < JITPAGE_SIZE/4) jitmetrics.exit_points[exit_points]++;
#endif
}

#ifdef DEBUG_JIT_TEST_EXEC
#include "../armemu.h"
extern void extern_execute_instruction(ARMul_State *state,ARMword instr,ARMword 
r15);

extern JITResult test_exec(JITEmuState *state,ARMword *addr);

/* Debug harness (DEBUG_JIT_TEST_EXEC builds only) used by JIT_Generate in
   place of calling the generated code directly.
   The instructions starting at 'addr' are first single-stepped with
   extern_execute_instruction, for as long as they are flagged as code and
   the end of the page hasn't been reached, stopping early at any LDRSTR
   whose condition passes.
   * Without DEBUG_JIT_FAKE: the registers and cycle count are then
     restored, the JIT function for 'addr' is run, and the resulting
     registers are compared against the single-stepped ones. A mismatch
     dumps both register sets plus the source instructions and asserts.
     Only state->Reg is compared. Returns the JIT function's result.
   * With DEBUG_JIT_FAKE: the JIT code is never run; the single-stepped
     state is kept and the result depends on whether the end of the page
     was reached. */
JITResult test_exec(JITEmuState *state,ARMword *addr)
{
  /* Make a backup of the state */
  const JITState *jit = JIT_GetState(state);
#ifndef DEBUG_JIT_FAKE
  /* r: registers on entry, r2: registers after single-stepping */
  ARMword r[16],r2[16];
  CycleCount cyc = state->NumCycles;
#endif
  JITResult res;
  JITPage *page = JITPage_Get(jit,addr);
  ARMword *page_end = (ARMword *) JITPage_StartOfPage(jit,page);
  ARMword *initial = addr;
  JITFunc *func = JIT_Phy2Func(jit,addr);
  page_end += JITPAGE_SIZE/4;
#ifndef DEBUG_JIT_FAKE
  memcpy(r,state->Reg,sizeof(r));
#endif
  /* Single-step over all code */
  while ((addr != page_end) && MemAttr_GetCodeFlag(jit,addr)) {
    ARMword oldr15 = state->Reg[15];
    /* Stop here for any taken load instruction */
    {
      Instruction instr;
      Decoder_Decode(&instr, *addr);
      if ((instr.type == InstrType_LDRSTR) && ARMul_CCCheck(instr.instr,(oldr15 
& CCBITS))) {
        break;
      }
    }
    state->NextInstr = NORMAL;
    state->NumCycles++;
    extern_execute_instruction(state,*addr,state->Reg[15]);
    if (state->NextInstr == NORMAL)
    {
      /* The instruction left the pipeline state alone, so the low 28 bits
         of R15 shouldn't have changed; advance the PC by hand */
      if ((oldr15 & 0xfffffff) != (state->Reg[15] & 0xfffffff))
      {
        fprintf(stderr,"Unexpected PC modification %08x -> %08x @ 
%08x\n",oldr15,state->Reg[15],*addr);
      }
      state->Reg[15]+=4;
    }
    else if (state->NextInstr != PCINCED)
    {
      fprintf(stderr,"Unexpected pipeline state %d @ 
%08x\n",state->NextInstr,*addr);
    }
    addr++;
#ifdef DEBUG_JIT_SINGLE_INSTR
    break;
#endif
  }
#ifndef DEBUG_JIT_FAKE
  /* Copy state again */
  memcpy(r2,state->Reg,sizeof(r));
  /* Restore original state */
  memcpy(state->Reg,r,sizeof(r));
  state->NumCycles = cyc;
  /* Run the code */
  res = (*func)(state,initial);
  /* Check for consistency */
  if (memcmp(state->Reg,r2,sizeof(r))) {
    int i;
    fprintf(stderr,"JIT inconsistency!\n");
    fprintf(stderr,"Orig:");
    for(i=0;i<16;i++)
    {
      fprintf(stderr," %08x",r[i]);
    }
    fprintf(stderr,"\nStep:");
    for(i=0;i<16;i++)
    {
      fprintf(stderr," %08x",r2[i]);
    }
    fprintf(stderr,"\nJIT :");
    for(i=0;i<16;i++)
    {
      fprintf(stderr," %08x",state->Reg[i]);
    }
    fprintf(stderr,"\nCode:\n");
    /* Lists the single-stepped instructions plus the one at which
       stepping stopped */
    for(i=0;initial+i != (addr+1);i++) {
      char *str;
      
_swix(Debugger_Disassemble,_INR(0,1)|_OUT(1),initial[i],(r[15]&0x03fffffc)-8+(i*4),&str);
      fprintf(stderr,"%08x %08x 
%s\n",(r[15]&0x03fffffc)-8+(i*4),initial[i],str);
    }
#if 0 /* Broken, MOV PC, R14 is no longer the only terminating instruction 
(plus there could be branches) */
    fprintf(stderr,"\nJIT:\n");
    addr = (ARMword *) *func;
    do {
      char *str;
      _swix(Debugger_Disassemble,_INR(0,1)|_OUT(1),*addr,addr,&str);
      fprintf(stderr,"%08x %08x %s\n",addr,*addr,str);
    } while (*(addr++) != 0xe1a0f00e);
#else
    /* NOTE(review): a function pointer is being printed with %08x here -
       confirm this is acceptable on all targeted hosts */
    fprintf(stderr,"\nJIT: %08x\n",*func);
#endif
    assert(0);
  }
  return res;
#else
  /* Fake mode: keep the interpreted result */
  state->NextInstr = (addr==page_end ? PCINCED : NORMAL);
  return (addr==page_end ? JITResult_Normal : JITResult_Interpret);
#endif
}
#endif

/* Default JITFunc for every word which has no generated code.
   If the instruction at 'addr' can't be JITed, JITResult_Interpret is
   returned straight away. Otherwise 'addr' is flagged as an entry point,
   the whole containing page is (re)generated, and the function now
   registered for 'addr' is executed (or, in DEBUG_JIT_TEST_EXEC builds,
   run under test_exec) and its result returned. */
JITResult JIT_Generate(JITEmuState *state,void *addr)
{
  const JITState *jit = JIT_GetState(state);
  Instruction first;

  /* Don't bother generating a page for something we can't translate */
  Decoder_Decode(&first,*((uint32_t *) addr));
  if (!JITable(&first))
  {
    return JITResult_Interpret;
  }

#ifdef DEBUG_JIT_METRICS
  jitmetrics.generate_count++;
#endif

  /* The page generator only emits code from flagged entry points */
  MemAttr_SetEntryPointFlag(jit,addr);
  JIT_GeneratePage(state,JITPage_Get(jit,addr));

  /* Re-fetch and execute the JIT function (mustn't be JIT_Generate!) */
#ifdef DEBUG_JIT_TEST_EXEC
  return test_exec(state,(ARMword *) addr);
#else
  {
    JITFunc *slot = JIT_Phy2Func(jit,addr);
    return (*slot)(state,addr);
  }
#endif
}

--- NEW FILE: jitpage.h ---
#ifndef JITPAGE_HEADER
#define JITPAGE_HEADER

#include "jitstate.h"

#define JITPAGE_SIZE 4096

/* Per-page JIT bookkeeping; one entry exists for each JITPAGE_SIZE bytes of
   the ROM/RAM chunk (see JITPage_Init) */
struct JITPage {
  int codeblock; /* ID of first code block in chain, -1 if no code exists for 
this page */
  /* ??? do we need anything here? */
};

extern void JITPage_Init(JITState *jit,uint32_t romramchunksize);

/* Return the address of the first byte of the memory covered by 'page',
   derived from the page's index within jit->pages */
static inline void *JITPage_StartOfPage(const JITState *jit,JITPage *page)
{
  int page_index = page - jit->pages;
  int byte_offset = page_index*JITPAGE_SIZE;
  return (void *) (jit->romramchunk + byte_offset);
}

/* Forget any generated code for this page (but retain memory attributes) */
extern void JITPage_ForgetCode(const JITState *jit,JITPage *page);

/* Return the JITPage entry for the page containing 'addr', which must lie
   within the ROM/RAM chunk (no range check is made) */
static inline JITPage *JITPage_Get(const JITState *jit,void *addr)
{
  uintptr_t byte_offset = ((uintptr_t) addr) - jit->romramchunk;
  uint32_t page_index = byte_offset/JITPAGE_SIZE;
  return jit->pages + page_index;
}

/* Called when code is overwritten by data: Forget generated code, reset memory 
attributes */
extern void JITPage_ClobberCode(const JITState *jit,JITPage *page);

/* Clobber code by address */
extern void JITPage_ClobberCodeByAddr(const JITState *jit,void *addr);

#endif

--- NEW FILE: jitpage.c ---
#include <stdlib.h>
#include "jitpage.h"
#include "jit.h"
#include "codeblocks.h"
#include "memattr.h"

/* Allocate the page table for a ROM/RAM chunk of 'romramchunksize' bytes
   (one JITPage per JITPAGE_SIZE bytes) and mark every page as having no
   generated code.
   There is no way to report failure to the caller, and every other JITPage
   routine dereferences jit->pages unconditionally, so running out of memory
   here is treated as fatal. */
void JITPage_Init(JITState *jit,uint32_t romramchunksize)
{
  uint32_t numpages = romramchunksize/JITPAGE_SIZE;
  uint32_t i;
  jit->pages = (JITPage *) calloc(numpages,sizeof(JITPage));
  /* calloc may legitimately return NULL for a zero-sized request */
  if ((jit->pages == NULL) && (numpages != 0))
  {
    abort();
  }
  for(i=0;i<numpages;i++)
  {
    jit->pages[i].codeblock = -1;
  }
}

/* Drop the generated code for 'page', if it has any: the page's code block
   ID is invalidated and every JITFunc slot belonging to the page is pointed
   back at JIT_Generate. Memory attributes are not touched. */
void JITPage_ForgetCode(const JITState *jit,JITPage *page)
{
  if (page->codeblock != -1)
  {
    JITFunc *slot, *end;

    /* Unlink with the generated code */
    CodeBlock_InvalidateID(page->codeblock);
    page->codeblock = -1;

    /* One function pointer per word in the page */
    slot = JIT_Phy2Func(jit, JITPage_StartOfPage(jit,page));
    end = slot + (JITPAGE_SIZE/4);
    while (slot != end)
    {
      *slot++ = &JIT_Generate;
    }
  }
}

/* Forget the generated code for 'page' and then clear the memory attribute
   flags for the whole of the page's address range */
void JITPage_ClobberCode(const JITState *jit,JITPage *page)
{
  uint8_t *start;

  JITPage_ForgetCode(jit,page);

  start = (uint8_t *) JITPage_StartOfPage(jit,page);
  MemAttr_ClearFlagsRange(jit, (void *) start, start + JITPAGE_SIZE);
}

void JITPage_ClobberCodeByAddr(const JITState *jit,void *addr)
{
  JITPage_ClobberCode(jit,JITPage_Get(jit,addr));
}

--- NEW FILE: jit.h ---
/* Manually include jitstate2.h first (necessary for everything to be defined 
in the right order with ArcEm) */
#include "jitstate2.h"

#ifndef JIT_HEADER
#define JIT_HEADER

#include "memattr.h"
#include "jitpage.h"

/* Initialise the JIT
   * romramchunk must be a pointer to the memory block containing all 
ROM/RAM/VRAM etc.
     Each sub-section must start at a 4K offset into the block
   * romramchunksize must be the size of romramchunk (4K multiple)
 */
extern void JIT_Init(JITState *jit,uintptr_t romramchunk,uint32_t 
romramchunksize);

/* Try to generate (and execute) code for the given address */
extern JITResult JIT_Generate(JITEmuState *state,void *addr);

/* Return a pointer to the JITFunc slot associated with 'addr', found by
   adding the fixed displacement jit->addr2func to the address.
   addr must be word aligned! */
static inline JITFunc *JIT_Phy2Func(const JITState *jit,void *addr)
{
  uintptr_t phys = (uintptr_t) addr;
  return (JITFunc *) (phys + jit->addr2func);
}

/* Discard generated code for the page containing 'addr', but only if
   'addr' itself is flagged as code; otherwise this is a no-op */
static inline void JIT_ClobberCode(JITEmuState *state,void *addr)
{
  JITState *jit = JIT_GetState(state);
  if (MemAttr_GetCodeFlag(jit,addr))
  {
    JITPage_ClobberCodeByAddr(jit,addr);
  }
}

#endif

--- NEW FILE: metrics.h ---
#ifndef JITMETRICS_HEADER
#define JITMETRICS_HEADER

#ifdef DEBUG_JIT_METRICS
#include "jitpage.h"
#include "decoder.h"

/* Index into JITMetrics.terminate_reason[] */
typedef enum {
  /* Values below InstrType_Count represent specific instruction types that
     caused termination */
  TerminateReason_Normal = InstrType_Count, /* Termination due to 
JITResult_Normal */
  TerminateReason_Special, /* Execution has entered special memory region (or 
prefetch abort) */

  /* Number of reasons; used to size the terminate_reason array */
  TerminateReason_Count,
} TerminateReason;

/* Counters collected in DEBUG_JIT_METRICS builds */
typedef struct {
  /* generate_count gives the number of blocks that have been generated
     (incremented by JIT_Generate each time it goes on to generate a page) */
  uint32_t generate_count;
  /* instructions_in counts the number of instructions that have been JITed */
  uint32_t instructions_in;
  /* instructions_out counts the number of instructions output by the JIT */
  uint32_t instructions_out;
  /* interpret_count counts the number of instructions that have been 
interpreted */
  uint32_t interpret_count;
  /* terminate_reason[N] gives the number of times that a code block has exited 
due to reason N */
  uint32_t terminate_reason[TerminateReason_Count];
#ifdef DEBUG_JIT_METRICS_EXEC
  /* execute_histogram[N] gives the number of times that a block of length N+1
     instructions has been executed (JITMetrics_Dump reports index N as
     length N+1; the code which fills this in isn't in this file - TODO
     confirm) */
  uint32_t execute_histogram[JITPAGE_SIZE/4];
#endif
  /* entry_points[N] gives the number of times that a page has been generated
     with N+1 entry points (JIT_GeneratePage starts its counter at ~0, so the
     first entry point takes it to 0) */
  uint32_t entry_points[JITPAGE_SIZE/4];
  /* exit_points[N] gives the number of times that a page has been generated
     with N-1 exit points (JIT_GeneratePage starts its counter at -~0, i.e. 1).
     NOTE(review): this is offset differently to entry_points - confirm */
  uint32_t exit_points[JITPAGE_SIZE/4];
} JITMetrics;

extern JITMetrics jitmetrics;

extern void JITMetrics_Dump(void);

#endif /* DEBUG_JIT_METRICS */

#endif

--- NEW FILE: jitstate2.h ---
/* Manually include armdefs.h first (necessary for everything to be defined in 
the right order with ArcEm) */
#include "../armdefs.h"

#ifndef JITSTATE2_HEADER
#define JITSTATE2_HEADER

/* Accessor: the JITState lives inside the emulator state as its 'jit'
   member; hand back a pointer to it */
static inline JITState *JIT_GetState(JITEmuState *state)
{
  JITState *jit = &(state->jit);
  return jit;
}

#endif

--- NEW FILE: metrics.c ---
#ifdef DEBUG_JIT_METRICS

#include <stdio.h>
#include <string.h>
#include <inttypes.h>
#include "metrics.h"

JITMetrics jitmetrics;

static const char *terminate_reasons[] = {
"TerminateReason_NOP",
"TerminateReason_Branch",
"TerminateReason_DataProc",
"TerminateReason_Multiply",
"TerminateReason_LDRSTR",
"TerminateReason_LDMSTM",
"TerminateReason_SWI",
"TerminateReason_OtherInstr",
"TerminateReason_Normal",
"TerminateReason_Special",
};

void JITMetrics_Dump(void)
{
  uint64_t execute_total = 0;
  int i;
  fprintf(stderr,"%12u generate_count\n",jitmetrics.generate_count);
  fprintf(stderr,"%12u instructions_in\n",jitmetrics.instructions_in);
  fprintf(stderr,"%12u instructions_out\n",jitmetrics.instructions_out);
#ifdef DEBUG_JIT_METRICS_EXEC
  for(i=0;i<JITPAGE_SIZE/4;i++)
  {
    if (jitmetrics.execute_histogram[i])
    {
      fprintf(stderr,"%12u %4d 
execute_histogram\n",jitmetrics.execute_histogram[i],i+1);
      execute_total += ((uint64_t) jitmetrics.execute_histogram[i])*(i+1);
    }
  }
  fprintf(stderr,"%12" PRIu64 " execute_total\n",execute_total);
#endif
  fprintf(stderr,"%12u interpret_count\n",jitmetrics.interpret_count);
  for(i=0;i<TerminateReason_Count;i++)
  {
    fprintf(stderr,"%12u 
%s\n",jitmetrics.terminate_reason[i],terminate_reasons[i]);
  }
  for(i=0;i<JITPAGE_SIZE/4;i++)
  {
    if (jitmetrics.entry_points[i])
    {
      fprintf(stderr,"%12u %4d entry_points\n",jitmetrics.entry_points[i],i+1);
    }
  }
  for(i=0;i<JITPAGE_SIZE/4;i++)
  {
    if (jitmetrics.exit_points[i])
    {
      fprintf(stderr,"%12u %4d exit_points\n",jitmetrics.exit_points[i],i+1);
    }
  }
  memset(&jitmetrics,0,sizeof(jitmetrics));
}

#endif

--- NEW FILE: decoder.c ---
#include "decoder.h"

/* Classify a raw instruction word.
   out->instr receives the raw word and out->type the instruction class.
   Classification is driven by bits 24-27, after instructions with the NV
   condition have been filtered out as NOPs. */
void Decoder_Decode(Instruction *out, uint32_t instr)
{
  out->instr = instr;
  if (instr >= 0xF0000000)
  {
    /* NV */
    out->type = InstrType_NOP;
    return;
  }
  switch ((instr >> 24) & 0xf)
  {
  case 0x0:
    if ((instr & 0x0fc000f0) == 0x90) {
      out->type = InstrType_Multiply;
      break;
    }
    /* Not a multiply, so treat as data processing */
    /* fallthrough */
  case 0x1:
  case 0x2:
  case 0x3:
    if (Decoder_DataProc_IsCompare(out) && !Decoder_DataProc_SFlag(out)) {
      /* Detect MRS, MSR, etc. at this decode phase */
      /* XXX cycle counting will be wrong here? */
      out->type = InstrType_NOP;
      break;
    }
    /* Non-immediate with bit 7 & 4 set isn't valid */
    if (!Decoder_DataProc_ImmFlag(out) && ((instr & 0x90) == 0x90)) {
      out->type = InstrType_Other;
      break;
    }
    out->type = InstrType_DataProc;
    break;
  case 0x4:
  case 0x5:
  case 0x6:
  case 0x7:
    out->type = InstrType_LDRSTR;
    break;
  case 0x8:
  case 0x9:
    out->type = InstrType_LDMSTM;
    break;
  case 0xa:
  case 0xb:
    out->type = InstrType_Branch;
    break;
  case 0xc: /* LDC/STC */
  case 0xd: /* LDC/STC */
  case 0xe: /* CDP */
    out->type = InstrType_Other;
    /* Must not fall into the SWI case below */
    break;
  case 0xf:
    out->type = InstrType_SWI;
    break;
  /* TODO: SWP */
  }
}

/* Count the registers named in the register list (bits 0-15) of an LDM/STM */
int Decoder_LDMSTM_NumRegs(const Instruction *instr)
{
  uint32_t reglist = instr->instr & 0xffff;
  int count = 0;
  /* Each pass clears the lowest set bit, so the loop runs once per register */
  while (reglist != 0) {
    reglist &= reglist - 1;
    count++;
  }
  return count;
}

/* Estimate the cycle cost of an LDM/STM: one per transferred register plus
   a fixed overhead of two */
int Decoder_LDMSTM_Cycles(const Instruction *instr)
{
  /* XXX fix this to be correct for all cases */
  const int overhead = 2;
  int regs = Decoder_LDMSTM_NumRegs(instr);
  return regs + overhead;
}

--- NEW FILE: emuinterf.c ---
#include "emuinterf.h"
#include "../arch/armarc.h"
#include "emuinterf2.h"
#include "decoder.h"
#include "codeblocks.h"

/* Return a pointer to the emulator's global state object (statestr) */
JITEmuState *JITEmuInterf_GetState(void)
{
  JITEmuState *global_state = &statestr;
  return global_state;
}

/* Emit the code that ends a generated block and returns 'result'.
   The emitted sequence stores the pipeline state into
   ARMul_State.NextInstr, writes back dirty registers, places the result code
   in host R0 and restores the callee-saved registers.
   ra:     register allocator whose code block receives the instructions
   result: JITResult value that the generated code hands back
           (forced to JITResult_Normal when DEBUG_JIT_FORCE_NORMAL is set) */
void JITEmuInterf2_WriteEpilogue(JITRegAlloc *ra, JITResult result)
{
#ifdef DEBUG_JIT_FORCE_NORMAL
  result = JITResult_Normal;
#endif
  /* Scratch host register used to hold the new NextInstr value */
  JITHostReg tempreg = JITRegAlloc_MapReg(ra, JIT_E_Temp, 1, JIT_H_REG_NONE, 
JIT_NV);
  /* Set correct pipeline state
     The interpreter assumes NORMAL when executing instructions, so if we're 
returning with JITResult_Interpret then we must be NORMAL
     Otherwise, set to PCINCED since we have fully advanced the PC */
  ForwardCodeBlock_WriteCode(ra->block, 
JITCodeGen_DataProc_Rd_Imm(JIT_AL,JIT_MOV,tempreg,(result==JITResult_Normal?PCINCED:NORMAL)));
  /* Store it via the host register that holds the emulator state pointer */
  ForwardCodeBlock_WriteCode(ra->block, 
JITCodeGen_StoreImm(JIT_AL,tempreg,ra->emu_regs[JIT_E_StateReg].hostreg,offsetof(ARMul_State,NextInstr)));
  /* XXX Aborted, AbortSig? */
  /* Return */
  /* Finished with the scratch register */
  JITRegAlloc_LockReg(ra,tempreg,-1,false);
  JITRegAlloc_WriteBackDirty(ra);
  /* XXX hack, using R0 without claiming */
  /* MOV R0,#result supplies the return value of the generated code */
  ForwardCodeBlock_WriteCode(ra->block, 
JITCodeGen_DataProc_Rd_Imm(JIT_AL,JIT_MOV,0,result));
  JITRegAlloc_CalleeRestore(ra, true);
}

/* Emit code that loads a word or byte from an emulated address.
   Only the directly accessible memory case is handled inline; everything
   else branches to 'abort'.
   ra:    register allocator whose code block receives the instructions
   Rd:    host register that receives the loaded value
   Raddr: host register holding the emulated address
   byte:  true for a byte load, false for a word load
   abort: address of the code to branch to on an address exception or when
          the FastMap entry does not allow direct access */
void JITEmuInterf2_WriteLoad(JITRegAlloc *ra, JITHostReg Rd, JITHostReg Raddr, 
bool byte, uintptr_t abort)
{
  JITHostReg fastmap, fastmapmode, temp;
  /* Check for address exceptions */
  ForwardCodeBlock_WriteCode(ra->block, JITCodeGen_DataProc_Rn_Imm(JIT_AL, 
JIT_CMP, Raddr, 0x04000000)); /* CMP to clear V */
  /* Load the fastmap regs */
  fastmap = JITRegAlloc_MapReg(ra, JIT_E_Temp2, 1, JIT_H_REG_NONE, JIT_NV);
  fastmapmode = JITRegAlloc_MapReg(ra, JIT_E_Temp3, 1, JIT_H_REG_NONE, JIT_NV);
  temp = JITRegAlloc_MapReg(ra, JIT_E_Temp4, 1, JIT_H_REG_NONE, JIT_NV);
  /* Now that regs are allocated, we can safely branch away */
  ForwardCodeBlock_WriteCode(ra->block, 
JITCodeGen_LoadImm(JIT_AL,fastmap,ra->emu_regs[JIT_E_StateReg].hostreg,offsetof(ARMul_State,FastMap)));
  ForwardCodeBlock_WriteCode(ra->block, 
JITCodeGen_LoadImm(JIT_AL,fastmapmode,ra->emu_regs[JIT_E_StateReg].hostreg,offsetof(ARMul_State,FastMapMode)));
  /* Trigger address exception */
  /* CS comes from the CMP above, i.e. Raddr >= 0x04000000.
     The branch operand is the distance from the instruction being written
     to the abort handler. */
  ForwardCodeBlock_WriteCode(ra->block, JITCodeGen_Branch(JIT_CS, 
abort-ra->block->nextinstr));
  /* Load FlagsAndData */
  /* Bits 8-11 of the address are cleared first so that the LSR #9 below
     yields a clean table offset (presumably 8 bytes per 4K page - confirm
     against the FastMap definition) */
  ForwardCodeBlock_WriteCode(ra->block, JITCodeGen_DataProc_Rd_Rn_Imm(JIT_AL, 
JIT_BIC, temp, Raddr, 0xf00));
  ForwardCodeBlock_WriteCode(ra->block, 
JITCodeGen_Load_Rm(JIT_AL,temp,fastmap,temp | 0x4a0)); /* LSR #9 */
  /* Check type & flags */
  ForwardCodeBlock_WriteCode(ra->block, JITCodeGen_DataProc_Rn_Rm(JIT_AL, 
JIT_TST, temp, fastmapmode));
  /* Reduced decoding: normally if all of bits 24-30 are clear it signifies an 
abort, with bit 31 signifying whether an access function is required
     But we only deal with the direct access case, so if N is set or 24-30 are 
clear we need to bail
     V=0 from CMP, so we can use LE condition code to detect failure */
  ForwardCodeBlock_WriteCode(ra->block, JITCodeGen_Branch(JIT_LE, 
abort-ra->block->nextinstr));
  /* Now we can load the data! */
  if (byte) {
    /* Bit 22 turns the load into a byte load */
    ForwardCodeBlock_WriteCode(ra->block, JITCodeGen_Load_Rm(JIT_AL | 
(1<<22),Rd,Raddr,temp | 0x400)); /* LSL #8 */
  } else {
    /* TODO do rotated load if host supports it */
    /* Word load: fetch from the word aligned address, then rotate the result
       according to the bottom two address bits. fastmapmode and fastmap are
       no longer needed at this point and get reused as scratch registers. */
    ForwardCodeBlock_WriteCode(ra->block, JITCodeGen_DataProc_Rd_Rn_Imm(JIT_AL, 
JIT_BIC, fastmapmode, Raddr, 3));
    /* MOVS with LSL #31 leaves address bit 1 in C and address bit 0 in N */
    ForwardCodeBlock_WriteCode(ra->block, JITCodeGen_DataProc_Rd_Rm(JIT_AL, 
JIT_MOV | JIT_S, fastmap, Raddr | 0xf80)); /* LSL #31 */
    ForwardCodeBlock_WriteCode(ra->block, 
JITCodeGen_Load_Rm(JIT_AL,Rd,fastmapmode,temp | 0x400)); /* LSL #8 */
    ForwardCodeBlock_WriteCode(ra->block, JITCodeGen_DataProc_Rd_Rm(JIT_CS, 
JIT_MOV, Rd, Rd | 0x860)); /* ROR #16 */    
    ForwardCodeBlock_WriteCode(ra->block, JITCodeGen_DataProc_Rd_Rm(JIT_MI, 
JIT_MOV, Rd, Rd | 0x460)); /* ROR #8 */
  }
  /* Release regs */
  JITRegAlloc_LockReg(ra, fastmap, -1, false);
  JITRegAlloc_LockReg(ra, fastmapmode, -1, false);
  JITRegAlloc_LockReg(ra, temp, -1, false);
}

/* Emit code that performs the loads of an LDM instruction.
   Only the directly accessible memory case is handled inline; everything
   else branches to 'abort'.
   ra:    register allocator whose code block receives the instructions
   Rn:    not referenced by this function
   Raddr: host register holding the emulated address of the first word; the
          emitted code overwrites it with the real address and advances it
   instr: the LDM instruction; its low 16 bits select the registers to load
   abort: address of the code to branch to when the fast path can't be used */
void JITEmuInterf2_WriteLDM(JITRegAlloc *ra, JITHostReg Rn, JITHostReg Raddr, 
const Instruction *instr, uintptr_t abort)
{
  JITHostReg fastmap, fastmapmode, temp;
  /* i starts out as the register count and is reused later as a loop index */
  int i = Decoder_LDMSTM_NumRegs(instr);
  /* Load the fastmap regs */
  fastmap = JITRegAlloc_MapReg(ra, JIT_E_Temp2, 1, JIT_H_REG_NONE, JIT_NV);
  fastmapmode = JITRegAlloc_MapReg(ra, JIT_E_Temp3, 1, JIT_H_REG_NONE, JIT_NV);
  temp = JITRegAlloc_MapReg(ra, JIT_E_Temp4, 1, JIT_H_REG_NONE, JIT_NV);
  /* Check for crossing page boundaries */
  if (i > 1) {
    /* Move the low 12 address bits to the top of temp, then add the offset of
       the last word shifted the same way: the ADDS sets C if the transfer
       runs past the end of the 4K page */
    ForwardCodeBlock_WriteCode(ra->block, JITCodeGen_DataProc_Rd_Rm(JIT_AL, 
JIT_MOV, temp, Raddr | 0xa00)); /* LSL #20 */
    ForwardCodeBlock_WriteCode(ra->block, JITCodeGen_DataProc_Rd_Rn_Imm(JIT_AL, 
JIT_ADD | JIT_S, temp, temp, (i-1)<<(2+20)));
  }
  ForwardCodeBlock_WriteCode(ra->block, 
JITCodeGen_LoadImm(JIT_AL,fastmap,ra->emu_regs[JIT_E_StateReg].hostreg,offsetof(ARMul_State,FastMap)));
  ForwardCodeBlock_WriteCode(ra->block, 
JITCodeGen_LoadImm(JIT_AL,fastmapmode,ra->emu_regs[JIT_E_StateReg].hostreg,offsetof(ARMul_State,FastMapMode)));
  /* Check for address exceptions */
  /* With more than one register the CMP only executes if the page check left
     C clear, so a page crossing keeps C set for the branch below */
  ForwardCodeBlock_WriteCode(ra->block, JITCodeGen_DataProc_Rn_Imm((i > 1 ? 
JIT_CC : JIT_AL), JIT_CMP, Raddr, 0x04000000)); /* CMP to clear V */
  /* Trigger address exception */
  ForwardCodeBlock_WriteCode(ra->block, JITCodeGen_Branch(JIT_CS, 
abort-ra->block->nextinstr));
  /* Load FlagsAndData */
  ForwardCodeBlock_WriteCode(ra->block, JITCodeGen_DataProc_Rd_Rn_Imm(JIT_AL, 
JIT_BIC, temp, Raddr, 0xf00));
  ForwardCodeBlock_WriteCode(ra->block, 
JITCodeGen_Load_Rm(JIT_AL,temp,fastmap,temp | 0x4a0)); /* LSR #9 */
  /* Check type & flags */
  ForwardCodeBlock_WriteCode(ra->block, JITCodeGen_DataProc_Rn_Rm(JIT_AL, 
JIT_TST, temp, fastmapmode));
  /* Reduced decoding: normally if all of bits 24-30 are clear it signifies an 
abort, with bit 31 signifying whether an access function is required
     But we only deal with the direct access case, so if N is set or 24-30 are 
clear we need to bail
     V=0 from CMP, so we can use LE condition code to detect failure */
  ForwardCodeBlock_WriteCode(ra->block, JITCodeGen_Branch(JIT_LE, 
abort-ra->block->nextinstr));
  /* Update Raddr to be the actual address */
  ForwardCodeBlock_WriteCode(ra->block, JITCodeGen_DataProc_Rd_Rn_Rm(JIT_AL, 
JIT_ADD, Raddr, Raddr, temp | 0x400)); /* LSL #8 */
  /* Load into the register if it's resident, otherwise go via temp */
  for(i=0;i<16;i++) {
    /* Skip registers that aren't in the instruction's register list */
    if (!(instr->instr & (1<<i))) {
      continue;
    }
    JITHostReg dest = ra->emu_regs[i].hostreg;
    if (dest == JIT_H_REG_NONE) {
      dest = temp;
    }
    ForwardCodeBlock_WriteCode(ra->block, 0xe4900004 | (dest << 12) | (Raddr << 
16)); /* LDR dest, [Raddr], #4 */
    /* Immediately write back the value (yuck) */
    ForwardCodeBlock_WriteCode(ra->block, JITEmuInterf2_StoreReg(JIT_AL, dest, 
ra->emu_regs[JIT_E_StateReg].hostreg, i));
  }
  /* Release regs */
  JITRegAlloc_LockReg(ra, fastmap, -1, false);
  JITRegAlloc_LockReg(ra, fastmapmode, -1, false);
  JITRegAlloc_LockReg(ra, temp, -1, false);
}

--- NEW FILE: decoder.h ---
#ifndef DECODER_HEADER
#define DECODER_HEADER

#include "emuinterf.h"

/* ARMv2 instruction decoder */

/* Condition codes, positioned in bits 28-31 so that they can be ORed
   straight into an instruction word */
#define JIT_EQ (0u<<28)
#define JIT_NE (1u<<28)
#define JIT_CS (2u<<28)
#define JIT_CC (3u<<28)
#define JIT_MI (4u<<28)
#define JIT_PL (5u<<28)
#define JIT_VS (6u<<28)
#define JIT_VC (7u<<28)
#define JIT_HI (8u<<28)
#define JIT_LS (9u<<28)
#define JIT_GE (10u<<28)
#define JIT_LT (11u<<28)
#define JIT_GT (12u<<28)
#define JIT_LE (13u<<28)
#define JIT_AL (14u<<28)
#define JIT_NV (15u<<28)

/* Data processing opcodes, positioned in bits 21-24 (bit 20, the S flag, is
   left clear) */
#define JIT_AND (0u<<20)
#define JIT_EOR (2u<<20)
#define JIT_SUB (4u<<20)
#define JIT_RSB (6u<<20)
#define JIT_ADD (8u<<20)
#define JIT_ADC (10u<<20)
#define JIT_SBC (12u<<20)
#define JIT_RSC (14u<<20)
#define JIT_TST (16u<<20)
#define JIT_TEQ (18u<<20)
#define JIT_CMP (20u<<20)
#define JIT_CMN (22u<<20)
#define JIT_ORR (24u<<20)
#define JIT_MOV (26u<<20)
#define JIT_BIC (28u<<20)
#define JIT_MVN (30u<<20)

/* S flag (bit 20); OR with one of the opcodes above.
   Unsigned like the other constants so that mixing them never drags signed
   arithmetic into an expression. */
#define JIT_S (1u<<20)

/* Instruction classes produced by Decoder_Decode.
   The numeric values matter: TerminateReason (metrics.h) continues its
   numbering from InstrType_Count. */
typedef enum {
  InstrType_NOP,
  InstrType_Branch,
  InstrType_DataProc,
  InstrType_Multiply,
  InstrType_LDRSTR,
  InstrType_LDMSTM,
  InstrType_SWI,
  InstrType_Other,

  /* Number of instruction classes; keep this last */
  InstrType_Count,
} InstrType;

/* A decoded instruction, as filled in by Decoder_Decode */
typedef struct {
  InstrType type; /* Instruction class */
  uint32_t instr; /* The raw instruction word */
} Instruction;

extern void Decoder_Decode(Instruction *out, uint32_t instr);

/* Condition code field (bits 28-31), left in place so that it can be compared
   directly against the JIT_EQ ... JIT_NV constants */
static inline uint32_t Decoder_CC(const Instruction *instr)
{
  const uint32_t cond_mask = 0xF0000000;
  return instr->instr & cond_mask;
}

/* 1 if the condition code is lower than AL, else 0 */
static inline uint32_t Decoder_Conditional(const Instruction *instr)
{
  const uint32_t cond = instr->instr >> 28;
  return cond < (JIT_AL >> 28);
}

/* !=0 if the branch has bit 24 set (BL rather than B) */
static inline uint32_t Decoder_Branch_BLFlag(const Instruction *instr)
{
  const uint32_t link_bit = (uint32_t) 1 << 24;
  return instr->instr & link_bit;
}

/* Byte offset from the branch instruction to its target: the signed 24 bit
   word offset held in bits 0-23, times four, plus 8.
   The sign extension is done arithmetically rather than by shifting a signed
   value, so the result doesn't depend on implementation-defined behaviour. */
static inline int32_t Decoder_Branch_Offset(const Instruction *instr)
{
  int32_t words = (int32_t) (instr->instr & 0x00ffffff);
  if (words & 0x00800000) {
    /* Top bit of the field is set: the offset is negative */
    words -= 0x01000000;
  }
  return (words * 4) + 8;
}

/* Bit 25 of a data processing instruction.
   =0 if op2 is a (shifted) register */
static inline uint32_t Decoder_DataProc_ImmFlag(const Instruction *instr)
{
  const uint32_t imm_bit = 0x02000000;
  return instr->instr & imm_bit;
}

/* !=0 if the S flag (bit 20) of a data processing instruction is set */
static inline uint32_t Decoder_DataProc_SFlag(const Instruction *instr)
{
  const uint32_t s_bit = 0x00100000;
  return instr->instr & s_bit;
}

/* true if op2 is a register (not an immediate) with bit 4 set and bit 7
   clear */
static inline bool Decoder_DataProc_IsShiftedReg(const Instruction *instr)
{
  if (Decoder_DataProc_ImmFlag(instr)) {
    return false;
  }
  return (instr->instr & 0x90) == 0x10;
}

/* Opcode field (bits 21-24), left in place so that it can be compared
   directly against the JIT_AND ... JIT_MVN constants */
static inline uint32_t Decoder_DataProc_Op(const Instruction *instr)
{
  const uint32_t opcode_mask = 0x01E00000;
  return instr->instr & opcode_mask;
}

static inline bool Decoder_DataProc_IsCompare(const Instruction *instr)
{
  return (Decoder_DataProc_Op(instr)>>23) == 2;
}

/* true if the instruction reads Rn; every opcode does except MOV and MVN */
static inline bool Decoder_DataProc_UsesRn(const Instruction *instr)
{
  switch (Decoder_DataProc_Op(instr)) {
  case JIT_MOV:
  case JIT_MVN:
    return false;
  default:
    return true;
  }
}

/* true if the instruction uses Rd.
   Non-compare instructions always do; compare instructions only do when Rd
   is 15 (P suffix) */
static inline bool Decoder_DataProc_UsesRd(const Instruction *instr)
{
  if (!Decoder_DataProc_IsCompare(instr)) {
    return true;
  }
  return (instr->instr & 0x0000f000) == 0x0000f000;
}

/* Returns true if the C flag is used as an input of the ALU */
static inline bool Decoder_DataProc_CarryIn(const Instruction *instr)
{
  switch (Decoder_DataProc_Op(instr)) {
  case JIT_ADC:
  case JIT_SBC:
  case JIT_RSC:
    /* These opcodes consume carry whatever op2 is */
    return true;
  default:
    break;
  }
  /* Otherwise only RRX (register op2 with bits 4-11 equal to 0x06) reads
     carry */
  return !Decoder_DataProc_ImmFlag(instr) && ((instr->instr & 0xff0) == 0x060);
}

/* Bit 21 of a multiply instruction.
   =0 if MUL, else MLA */
static inline uint32_t Decoder_Multiply_AccumFlag(const Instruction *instr)
{
  const uint32_t accumulate_bit = (uint32_t) 1 << 21;
  return instr->instr & accumulate_bit;
}

/* !=0 if the S flag (bit 20) of a multiply instruction is set */
static inline uint32_t Decoder_Multiply_SFlag(const Instruction *instr)
{
  const uint32_t s_bit = (uint32_t) 1 << 20;
  return instr->instr & s_bit;
}

/* Bit 20 of an LDR/STR instruction.
   =0 if store */
static inline uint32_t Decoder_LDRSTR_LoadFlag(const Instruction *instr)
{
  const uint32_t load_bit = 0x00100000;
  return instr->instr & load_bit;
}

/* Bit 24 of an LDR/STR instruction.
   !=0 if pre-indexed */
static inline uint32_t Decoder_LDRSTR_PreFlag(const Instruction *instr)
{
  const uint32_t pre_bit = 0x01000000;
  return instr->instr & pre_bit;
}

/* true if writeback: either the instruction isn't pre-indexed, or bit 21 is
   set */
static inline bool Decoder_LDRSTR_WritebackFlag(const Instruction *instr)
{
  if (!Decoder_LDRSTR_PreFlag(instr)) {
    return true;
  }
  return (instr->instr & ((uint32_t) 1 << 21)) != 0;
}

/* true if T flag: the instruction isn't pre-indexed and bit 21 is set */
static inline bool Decoder_LDRSTR_TFlag(const Instruction *instr)
{
  if (Decoder_LDRSTR_PreFlag(instr)) {
    return false;
  }
  return (instr->instr & ((uint32_t) 1 << 21)) != 0;
}

/* Bit 22 of an LDR/STR instruction.
   !=0 if byte */
static inline uint32_t Decoder_LDRSTR_ByteFlag(const Instruction *instr)
{
  const uint32_t byte_bit = 0x00400000;
  return instr->instr & byte_bit;
}

/* Bit 23 of an LDR/STR instruction.
   !=0 if up */
static inline uint32_t Decoder_LDRSTR_UpFlag(const Instruction *instr)
{
  const uint32_t up_bit = 0x00800000;
  return instr->instr & up_bit;
}

/* true if bit 25 of an LDR/STR instruction is clear.
   =0 if op2 is a (shifted) register */
static inline bool Decoder_LDRSTR_ImmFlag(const Instruction *instr)
{
  const uint32_t reg_offset_bit = (uint32_t) 1 << 25;
  return (instr->instr & reg_offset_bit) == 0;
}

/* true if offset is (probably) non-zero: register offsets are always assumed
   to be, immediate offsets are checked */
static inline bool Decoder_LDRSTR_HasOffset(const Instruction *instr)
{
  if (!Decoder_LDRSTR_ImmFlag(instr)) {
    return true;
  }
  return (instr->instr & 0xfff) != 0;
}

/* Returns true if the C flag is used as an input of the ALU.
   Only RRX does that, i.e. a register offset with bits 5-11 equal to 0x06
   (bit 4 is ignored) */
static inline bool Decoder_LDRSTR_CarryIn(const Instruction *instr)
{
  return !Decoder_LDRSTR_ImmFlag(instr) && ((instr->instr & 0xfe0) == 0x060);
}

/* Bit 20 of an LDM/STM instruction.
   =0 if store */
static inline uint32_t Decoder_LDMSTM_LoadFlag(const Instruction *instr)
{
  const uint32_t load_bit = 0x00100000;
  return instr->instr & load_bit;
}
 
/* Bit 21 of an LDM/STM instruction.
   !=0 if writeback */
static inline uint32_t Decoder_LDMSTM_WritebackFlag(const Instruction *instr)
{
  const uint32_t writeback_bit = 0x00200000;
  return instr->instr & writeback_bit;
}

/* Bit 22 of an LDM/STM instruction.
   !=0 if ^ */
static inline uint32_t Decoder_LDMSTM_HatFlag(const Instruction *instr)
{
  const uint32_t hat_bit = 0x00400000;
  return instr->instr & hat_bit;
}

/* Bit 23 of an LDM/STM instruction.
   !=0 if up */
static inline uint32_t Decoder_LDMSTM_UpFlag(const Instruction *instr)
{
  const uint32_t up_bit = 0x00800000;
  return instr->instr & up_bit;
}

/* Bit 24 of an LDM/STM instruction.
   !=0 if pre-indexed */
static inline uint32_t Decoder_LDMSTM_PreFlag(const Instruction *instr)
{
  const uint32_t pre_bit = 0x01000000;
  return instr->instr & pre_bit;
}

extern int Decoder_LDMSTM_NumRegs(const Instruction *instr);

extern int Decoder_LDMSTM_Cycles(const Instruction *instr);

/* Register number held in bits 16-19.
   Validity: DataProc && UsesRn
             Multiply && AccumFlag
             LDRSTR
             LDMSTM */
static inline uint32_t Decoder_Rn(const Instruction *instr)
{
  const uint32_t field = instr->instr & 0x000f0000;
  return field >> 16;
}

/* Register number held in bits 12-15.
   Validity: DataProc && UsesRd
             Multiply
             LDRSTR */
static inline uint32_t Decoder_Rd(const Instruction *instr)
{
  const uint32_t field = instr->instr & 0x0000f000;
  return field >> 12;
}

/* Register number held in bits 0-3.
   Validity: DataProc && !ImmFlag
             Multiply
             LDRSTR && !ImmFlag */
static inline uint32_t Decoder_Rm(const Instruction *instr)
{
  const uint32_t word = instr->instr;
  return word & 0x0000000f;
}

/* Register number held in bits 8-11.
   Validity: DataProc && IsShiftedReg
             Multiply */
static inline uint32_t Decoder_Rs(const Instruction *instr)
{
  const uint32_t field = instr->instr & 0x00000f00;
  return field >> 8;
}

#endif

--- NEW FILE: emuinterf.h ---
#ifndef JITEMUINTERF_HEADER
#define JITEMUINTERF_HEADER

/* Interface from the JIT to the main emulator
   Emulator-specific stuff goes here! */

#include "../c99.h"

/* Type used by emulator to store its state */
typedef struct ARMul_State JITEmuState;

/* Return pointer to global state object */
extern JITEmuState *JITEmuInterf_GetState(void);

#endif


------------------------------------------------------------------------------
Check out the vibrant tech community on one of the world's most
engaging tech sites, Slashdot.org! http://sdm.link/slashdot
-- 
arcem-cvs mailing list
arcem-cvs@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/arcem-cvs

Reply via email to