Modified: tomcat/jk/trunk/native/iis/pcre/sljit/sljitNativeARM_32.c
URL: http://svn.apache.org/viewvc/tomcat/jk/trunk/native/iis/pcre/sljit/sljitNativeARM_32.c?rev=1815927&r1=1815926&r2=1815927&view=diff
==============================================================================
--- tomcat/jk/trunk/native/iis/pcre/sljit/sljitNativeARM_32.c (original)
+++ tomcat/jk/trunk/native/iis/pcre/sljit/sljitNativeARM_32.c Tue Nov 21 14:37:37 2017
@@ -1,7 +1,7 @@
 /*
  *    Stack-less Just-In-Time compiler
  *
- *    Copyright 2009-2012 Zoltan Herczeg (hzmes...@freemail.hu). All rights 
reserved.
+ *    Copyright Zoltan Herczeg (hzmes...@freemail.hu). All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without 
modification, are
  * permitted provided that the following conditions are met:
@@ -38,8 +38,7 @@ SLJIT_API_FUNC_ATTRIBUTE const char* slj
 /* Last register + 1. */
 #define TMP_REG1       (SLJIT_NUMBER_OF_REGISTERS + 2)
 #define TMP_REG2       (SLJIT_NUMBER_OF_REGISTERS + 3)
-#define TMP_REG3       (SLJIT_NUMBER_OF_REGISTERS + 4)
-#define TMP_PC         (SLJIT_NUMBER_OF_REGISTERS + 5)
+#define TMP_PC         (SLJIT_NUMBER_OF_REGISTERS + 4)
 
 #define TMP_FREG1      (0)
 #define TMP_FREG2      (SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1)
@@ -55,8 +54,8 @@ SLJIT_API_FUNC_ATTRIBUTE const char* slj
        (((max_diff) / (sljit_s32)sizeof(sljit_uw)) - (CONST_POOL_ALIGNMENT - 
1))
 
 /* See sljit_emit_enter and sljit_emit_op0 if you want to change them. */
-static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 6] = {
-       0, 0, 1, 2, 11, 10, 9, 8, 7, 6, 5, 4, 13, 3, 12, 14, 15
+static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 5] = {
+       0, 0, 1, 2, 3, 11, 10, 9, 8, 7, 6, 5, 4, 13, 14, 12, 15
 };
 
 #define RM(rm) (reg_map[rm])
@@ -83,6 +82,7 @@ static const sljit_u8 reg_map[SLJIT_NUMB
 #define BLX            0xe12fff30
 #define BX             0xe12fff10
 #define CLZ            0xe16f0f10
+#define CMN_DP         0xb
 #define CMP_DP         0xa
 #define BKPT           0xe1200070
 #define EOR_DP         0x1
@@ -260,7 +260,7 @@ static SLJIT_INLINE sljit_s32 emit_blx(s
 {
        /* Must follow tightly the previous instruction (to be able to convert 
it to bl instruction). */
        SLJIT_ASSERT(compiler->cpool_diff == CONST_POOL_EMPTY || compiler->size - compiler->cpool_diff < MAX_DIFFERENCE(4092));
-       return push_inst(compiler, BLX | RM(TMP_REG1));
+       return push_inst(compiler, BLX | RM(TMP_REG2));
 }
 
 static sljit_uw patch_pc_relative_loads(sljit_uw *last_pc_patch, sljit_uw 
*code_ptr, sljit_uw* const_pool, sljit_uw cpool_size)
@@ -389,7 +389,7 @@ static SLJIT_INLINE sljit_s32 emit_imm(s
 
 #endif
 
-static SLJIT_INLINE sljit_s32 detect_jump_type(struct sljit_jump *jump, 
sljit_uw *code_ptr, sljit_uw *code)
+static SLJIT_INLINE sljit_s32 detect_jump_type(struct sljit_jump *jump, 
sljit_uw *code_ptr, sljit_uw *code, sljit_sw executable_offset)
 {
        sljit_sw diff;
 
@@ -401,7 +401,7 @@ static SLJIT_INLINE sljit_s32 detect_jum
                code_ptr--;
 
        if (jump->flags & JUMP_ADDR)
-               diff = ((sljit_sw)jump->u.target - (sljit_sw)(code_ptr + 2));
+               diff = ((sljit_sw)jump->u.target - (sljit_sw)(code_ptr + 2) - 
executable_offset);
        else {
                SLJIT_ASSERT(jump->flags & JUMP_LABEL);
                diff = ((sljit_sw)(code + jump->u.label->size) - 
(sljit_sw)(code_ptr + 2));
@@ -426,7 +426,7 @@ static SLJIT_INLINE sljit_s32 detect_jum
        }
 #else
        if (jump->flags & JUMP_ADDR)
-               diff = ((sljit_sw)jump->u.target - (sljit_sw)code_ptr);
+               diff = ((sljit_sw)jump->u.target - (sljit_sw)code_ptr - 
executable_offset);
        else {
                SLJIT_ASSERT(jump->flags & JUMP_LABEL);
                diff = ((sljit_sw)(code + jump->u.label->size) - 
(sljit_sw)code_ptr);
@@ -446,26 +446,28 @@ static SLJIT_INLINE sljit_s32 detect_jum
        return 0;
 }
 
-static SLJIT_INLINE void inline_set_jump_addr(sljit_uw addr, sljit_uw 
new_addr, sljit_s32 flush)
+static SLJIT_INLINE void inline_set_jump_addr(sljit_uw jump_ptr, sljit_sw 
executable_offset, sljit_uw new_addr, sljit_s32 flush_cache)
 {
 #if (defined SLJIT_CONFIG_ARM_V5 && SLJIT_CONFIG_ARM_V5)
-       sljit_uw *ptr = (sljit_uw*)addr;
-       sljit_uw *inst = (sljit_uw*)ptr[0];
+       sljit_uw *ptr = (sljit_uw *)jump_ptr;
+       sljit_uw *inst = (sljit_uw *)ptr[0];
        sljit_uw mov_pc = ptr[1];
        sljit_s32 bl = (mov_pc & 0x0000f000) != RD(TMP_PC);
-       sljit_sw diff = (sljit_sw)(((sljit_sw)new_addr - (sljit_sw)(inst + 2)) 
>> 2);
+       sljit_sw diff = (sljit_sw)(((sljit_sw)new_addr - (sljit_sw)(inst + 2) - 
executable_offset) >> 2);
 
        if (diff <= 0x7fffff && diff >= -0x800000) {
                /* Turn to branch. */
                if (!bl) {
                        inst[0] = (mov_pc & COND_MASK) | (B - CONDITIONAL) | 
(diff & 0xffffff);
-                       if (flush) {
+                       if (flush_cache) {
+                               inst = (sljit_uw *)SLJIT_ADD_EXEC_OFFSET(inst, 
executable_offset);
                                SLJIT_CACHE_FLUSH(inst, inst + 1);
                        }
                } else {
                        inst[0] = (mov_pc & COND_MASK) | (BL - CONDITIONAL) | 
(diff & 0xffffff);
                        inst[1] = NOP;
-                       if (flush) {
+                       if (flush_cache) {
+                               inst = (sljit_uw *)SLJIT_ADD_EXEC_OFFSET(inst, 
executable_offset);
                                SLJIT_CACHE_FLUSH(inst, inst + 2);
                        }
                }
@@ -479,12 +481,14 @@ static SLJIT_INLINE void inline_set_jump
                if (*inst != mov_pc) {
                        inst[0] = mov_pc;
                        if (!bl) {
-                               if (flush) {
+                               if (flush_cache) {
+                                       inst = (sljit_uw 
*)SLJIT_ADD_EXEC_OFFSET(inst, executable_offset);
                                        SLJIT_CACHE_FLUSH(inst, inst + 1);
                                }
                        } else {
                                inst[1] = BLX | RM(TMP_REG1);
-                               if (flush) {
+                               if (flush_cache) {
+                                       inst = (sljit_uw 
*)SLJIT_ADD_EXEC_OFFSET(inst, executable_offset);
                                        SLJIT_CACHE_FLUSH(inst, inst + 2);
                                }
                        }
@@ -492,11 +496,12 @@ static SLJIT_INLINE void inline_set_jump
                *ptr = new_addr;
        }
 #else
-       sljit_uw *inst = (sljit_uw*)addr;
+       sljit_uw *inst = (sljit_uw*)jump_ptr;
        SLJIT_ASSERT((inst[0] & 0xfff00000) == MOVW && (inst[1] & 0xfff00000) 
== MOVT);
        inst[0] = MOVW | (inst[0] & 0xf000) | ((new_addr << 4) & 0xf0000) | 
(new_addr & 0xfff);
        inst[1] = MOVT | (inst[1] & 0xf000) | ((new_addr >> 12) & 0xf0000) | 
((new_addr >> 16) & 0xfff);
-       if (flush) {
+       if (flush_cache) {
+               inst = (sljit_uw *)SLJIT_ADD_EXEC_OFFSET(inst, 
executable_offset);
                SLJIT_CACHE_FLUSH(inst, inst + 2);
        }
 #endif
@@ -504,7 +509,7 @@ static SLJIT_INLINE void inline_set_jump
 
 static sljit_uw get_imm(sljit_uw imm);
 
-static SLJIT_INLINE void inline_set_const(sljit_uw addr, sljit_sw 
new_constant, sljit_s32 flush)
+static SLJIT_INLINE void inline_set_const(sljit_uw addr, sljit_sw 
executable_offset, sljit_sw new_constant, sljit_s32 flush_cache)
 {
 #if (defined SLJIT_CONFIG_ARM_V5 && SLJIT_CONFIG_ARM_V5)
        sljit_uw *ptr = (sljit_uw*)addr;
@@ -515,7 +520,8 @@ static SLJIT_INLINE void inline_set_cons
        src2 = get_imm(new_constant);
        if (src2) {
                *inst = 0xe3a00000 | (ldr_literal & 0xf000) | src2;
-               if (flush) {
+               if (flush_cache) {
+                       inst = (sljit_uw *)SLJIT_ADD_EXEC_OFFSET(inst, 
executable_offset);
                        SLJIT_CACHE_FLUSH(inst, inst + 1);
                }
                return;
@@ -524,7 +530,8 @@ static SLJIT_INLINE void inline_set_cons
        src2 = get_imm(~new_constant);
        if (src2) {
                *inst = 0xe3e00000 | (ldr_literal & 0xf000) | src2;
-               if (flush) {
+               if (flush_cache) {
+                       inst = (sljit_uw *)SLJIT_ADD_EXEC_OFFSET(inst, 
executable_offset);
                        SLJIT_CACHE_FLUSH(inst, inst + 1);
                }
                return;
@@ -537,7 +544,8 @@ static SLJIT_INLINE void inline_set_cons
 
        if (*inst != ldr_literal) {
                *inst = ldr_literal;
-               if (flush) {
+               if (flush_cache) {
+                       inst = (sljit_uw *)SLJIT_ADD_EXEC_OFFSET(inst, 
executable_offset);
                        SLJIT_CACHE_FLUSH(inst, inst + 1);
                }
        }
@@ -547,7 +555,8 @@ static SLJIT_INLINE void inline_set_cons
        SLJIT_ASSERT((inst[0] & 0xfff00000) == MOVW && (inst[1] & 0xfff00000) 
== MOVT);
        inst[0] = MOVW | (inst[0] & 0xf000) | ((new_constant << 4) & 0xf0000) | 
(new_constant & 0xfff);
        inst[1] = MOVT | (inst[1] & 0xf000) | ((new_constant >> 12) & 0xf0000) 
| ((new_constant >> 16) & 0xfff);
-       if (flush) {
+       if (flush_cache) {
+               inst = (sljit_uw *)SLJIT_ADD_EXEC_OFFSET(inst, 
executable_offset);
                SLJIT_CACHE_FLUSH(inst, inst + 2);
        }
 #endif
@@ -562,6 +571,8 @@ SLJIT_API_FUNC_ATTRIBUTE void* sljit_gen
        sljit_uw *buf_end;
        sljit_uw size;
        sljit_uw word_count;
+       sljit_sw executable_offset;
+       sljit_sw jump_addr;
 #if (defined SLJIT_CONFIG_ARM_V5 && SLJIT_CONFIG_ARM_V5)
        sljit_uw cpool_size;
        sljit_uw cpool_skip_alignment;
@@ -602,14 +613,14 @@ SLJIT_API_FUNC_ATTRIBUTE void* sljit_gen
 
        code_ptr = code;
        word_count = 0;
+       executable_offset = SLJIT_EXEC_OFFSET(code);
 
        label = compiler->labels;
        jump = compiler->jumps;
        const_ = compiler->consts;
 
        if (label && label->size == 0) {
-               label->addr = (sljit_uw)code;
-               label->size = 0;
+               label->addr = (sljit_uw)SLJIT_ADD_EXEC_OFFSET(code, 
executable_offset);
                label = label->next;
        }
 
@@ -636,7 +647,7 @@ SLJIT_API_FUNC_ATTRIBUTE void* sljit_gen
                                                cpool_size = 0;
                                                if (label && label->size == 
word_count) {
                                                        /* Points after the 
current instruction. */
-                                                       label->addr = 
(sljit_uw)code_ptr;
+                                                       label->addr = 
(sljit_uw)SLJIT_ADD_EXEC_OFFSET(code_ptr, executable_offset);
                                                        label->size = code_ptr - code;
                                                        label = label->next;
                                                }
@@ -652,19 +663,19 @@ SLJIT_API_FUNC_ATTRIBUTE void* sljit_gen
                                SLJIT_ASSERT(!const_ || const_->addr >= 
word_count);
                                if (jump && jump->addr == word_count) {
 #if (defined SLJIT_CONFIG_ARM_V5 && SLJIT_CONFIG_ARM_V5)
-                                       if (detect_jump_type(jump, code_ptr, 
code))
+                                       if (detect_jump_type(jump, code_ptr, 
code, executable_offset))
                                                code_ptr--;
                                        jump->addr = (sljit_uw)code_ptr;
 #else
                                        jump->addr = (sljit_uw)(code_ptr - 2);
-                                       if (detect_jump_type(jump, code_ptr, 
code))
+                                       if (detect_jump_type(jump, code_ptr, 
code, executable_offset))
                                                code_ptr -= 2;
 #endif
                                        jump = jump->next;
                                }
                                if (label && label->size == word_count) {
                                        /* code_ptr can be affected above. */
-                                       label->addr = (sljit_uw)(code_ptr + 1);
+                                       label->addr = 
(sljit_uw)SLJIT_ADD_EXEC_OFFSET(code_ptr + 1, executable_offset);
                                        label->size = (code_ptr + 1) - code;
                                        label = label->next;
                                }
@@ -729,17 +740,18 @@ SLJIT_API_FUNC_ATTRIBUTE void* sljit_gen
 
        jump = compiler->jumps;
        while (jump) {
-               buf_ptr = (sljit_uw*)jump->addr;
+               buf_ptr = (sljit_uw *)jump->addr;
 
                if (jump->flags & PATCH_B) {
+                       jump_addr = (sljit_sw)SLJIT_ADD_EXEC_OFFSET(buf_ptr + 
2, executable_offset);
                        if (!(jump->flags & JUMP_ADDR)) {
                                SLJIT_ASSERT(jump->flags & JUMP_LABEL);
-                               SLJIT_ASSERT(((sljit_sw)jump->u.label->addr - 
(sljit_sw)(buf_ptr + 2)) <= 0x01ffffff && ((sljit_sw)jump->u.label->addr - 
(sljit_sw)(buf_ptr + 2)) >= -0x02000000);
-                               *buf_ptr |= (((sljit_sw)jump->u.label->addr - 
(sljit_sw)(buf_ptr + 2)) >> 2) & 0x00ffffff;
+                               SLJIT_ASSERT(((sljit_sw)jump->u.label->addr - jump_addr) <= 0x01ffffff && ((sljit_sw)jump->u.label->addr - jump_addr) >= -0x02000000);
+                               *buf_ptr |= (((sljit_sw)jump->u.label->addr - 
jump_addr) >> 2) & 0x00ffffff;
                        }
                        else {
-                               SLJIT_ASSERT(((sljit_sw)jump->u.target - 
(sljit_sw)(buf_ptr + 2)) <= 0x01ffffff && ((sljit_sw)jump->u.target - 
(sljit_sw)(buf_ptr + 2)) >= -0x02000000);
-                               *buf_ptr |= (((sljit_sw)jump->u.target - 
(sljit_sw)(buf_ptr + 2)) >> 2) & 0x00ffffff;
+                               SLJIT_ASSERT(((sljit_sw)jump->u.target - jump_addr) <= 0x01ffffff && ((sljit_sw)jump->u.target - jump_addr) >= -0x02000000);
+                               *buf_ptr |= (((sljit_sw)jump->u.target - 
jump_addr) >> 2) & 0x00ffffff;
                        }
                }
                else if (jump->flags & SLJIT_REWRITABLE_JUMP) {
@@ -747,10 +759,10 @@ SLJIT_API_FUNC_ATTRIBUTE void* sljit_gen
                        jump->addr = (sljit_uw)code_ptr;
                        code_ptr[0] = (sljit_uw)buf_ptr;
                        code_ptr[1] = *buf_ptr;
-                       inline_set_jump_addr((sljit_uw)code_ptr, (jump->flags & 
JUMP_LABEL) ? jump->u.label->addr : jump->u.target, 0);
+                       inline_set_jump_addr((sljit_uw)code_ptr, 
executable_offset, (jump->flags & JUMP_LABEL) ? jump->u.label->addr : 
jump->u.target, 0);
                        code_ptr += 2;
 #else
-                       inline_set_jump_addr((sljit_uw)buf_ptr, (jump->flags & 
JUMP_LABEL) ? jump->u.label->addr : jump->u.target, 0);
+                       inline_set_jump_addr((sljit_uw)buf_ptr, 
executable_offset, (jump->flags & JUMP_LABEL) ? jump->u.label->addr : 
jump->u.target, 0);
 #endif
                }
                else {
@@ -763,7 +775,7 @@ SLJIT_API_FUNC_ATTRIBUTE void* sljit_gen
                                buf_ptr += 1;
                        *buf_ptr = (jump->flags & JUMP_LABEL) ? 
jump->u.label->addr : jump->u.target;
 #else
-                       inline_set_jump_addr((sljit_uw)buf_ptr, (jump->flags & 
JUMP_LABEL) ? jump->u.label->addr : jump->u.target, 0);
+                       inline_set_jump_addr((sljit_uw)buf_ptr, 
executable_offset, (jump->flags & JUMP_LABEL) ? jump->u.label->addr : 
jump->u.target, 0);
 #endif
                }
                jump = jump->next;
@@ -782,7 +794,7 @@ SLJIT_API_FUNC_ATTRIBUTE void* sljit_gen
                else
                        buf_ptr += 1;
                /* Set the value again (can be a simple constant). */
-               inline_set_const((sljit_uw)code_ptr, *buf_ptr, 0);
+               inline_set_const((sljit_uw)code_ptr, executable_offset, 
*buf_ptr, 0);
                code_ptr += 2;
 
                const_ = const_->next;
@@ -792,29 +804,90 @@ SLJIT_API_FUNC_ATTRIBUTE void* sljit_gen
        SLJIT_ASSERT(code_ptr - code <= (sljit_s32)size);
 
        compiler->error = SLJIT_ERR_COMPILED;
+       compiler->executable_offset = executable_offset;
        compiler->executable_size = (code_ptr - code) * sizeof(sljit_uw);
+
+       code = (sljit_uw *)SLJIT_ADD_EXEC_OFFSET(code, executable_offset);
+       code_ptr = (sljit_uw *)SLJIT_ADD_EXEC_OFFSET(code_ptr, 
executable_offset);
+
        SLJIT_CACHE_FLUSH(code, code_ptr);
        return code;
 }
 
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_has_cpu_feature(sljit_s32 
feature_type)
+{
+       switch (feature_type) {
+       case SLJIT_HAS_FPU:
+#ifdef SLJIT_IS_FPU_AVAILABLE
+               return SLJIT_IS_FPU_AVAILABLE;
+#else
+               /* Available by default. */
+               return 1;
+#endif
+
+       case SLJIT_HAS_PRE_UPDATE:
+       case SLJIT_HAS_CLZ:
+       case SLJIT_HAS_CMOV:
+               return 1;
+
+       default:
+               return 0;
+       }
+}
+
 /* --------------------------------------------------------------------- */
 /*  Entry, exit                                                          */
 /* --------------------------------------------------------------------- */
 
+/* Creates an index in data_transfer_insts array. */
+#define WORD_DATA      0x00
+#define BYTE_DATA      0x01
+#define HALF_DATA      0x02
+#define PRELOAD_DATA   0x03
+#define SIGNED_DATA    0x04
+#define LOAD_DATA      0x08
+
 /* emit_op inp_flags.
    WRITE_BACK must be the first, since it is a flag. */
-#define WRITE_BACK     0x01
-#define ALLOW_IMM      0x02
-#define ALLOW_INV_IMM  0x04
+#define WRITE_BACK     0x10
+#define ALLOW_IMM      0x20
+#define ALLOW_INV_IMM  0x40
 #define ALLOW_ANY_IMM  (ALLOW_IMM | ALLOW_INV_IMM)
-#define ARG_TEST       0x08
 
-/* Creates an index in data_transfer_insts array. */
-#define WORD_DATA      0x00
-#define BYTE_DATA      0x10
-#define HALF_DATA      0x20
-#define SIGNED_DATA    0x40
-#define LOAD_DATA      0x80
+/* s/l - store/load (1 bit)
+   u/s - signed/unsigned (1 bit)
+   w/b/h/N - word/byte/half/NOT allowed (2 bit)
+   Storing signed and unsigned values are the same operations. */
+
+static const sljit_uw data_transfer_insts[16] = {
+/* s u w */ 0xe5000000 /* str */,
+/* s u b */ 0xe5400000 /* strb */,
+/* s u h */ 0xe10000b0 /* strh */,
+/* s u N */ 0x00000000 /* not allowed */,
+/* s s w */ 0xe5000000 /* str */,
+/* s s b */ 0xe5400000 /* strb */,
+/* s s h */ 0xe10000b0 /* strh */,
+/* s s N */ 0x00000000 /* not allowed */,
+
+/* l u w */ 0xe5100000 /* ldr */,
+/* l u b */ 0xe5500000 /* ldrb */,
+/* l u h */ 0xe11000b0 /* ldrh */,
+/* l u p */ 0xf5500000 /* preload data */,
+/* l s w */ 0xe5100000 /* ldr */,
+/* l s b */ 0xe11000d0 /* ldrsb */,
+/* l s h */ 0xe11000f0 /* ldrsh */,
+/* l s N */ 0x00000000 /* not allowed */,
+};
+
+#define EMIT_DATA_TRANSFER(type, add, wb, target_reg, base_reg, arg) \
+       (data_transfer_insts[(type) & 0xf] | ((add) << 23) | ((wb) << (21 - 4)) 
| RD(target_reg) | RN(base_reg) | (arg))
+
+/* Normal ldr/str instruction.
+   Type2: ldrsb, ldrh, ldrsh */
+#define IS_TYPE1_TRANSFER(type) \
+       (data_transfer_insts[(type) & 0xf] & 0x04000000)
+#define TYPE2_TRANSFER_IMM(imm) \
+       (((imm) & 0xf) | (((imm) & 0xf0) << 4) | (1 << 22))
 
 /* Condition: AL. */
 #define EMIT_DATA_PROCESS_INS(opcode, set_flags, dst, src1, src2) \
@@ -912,52 +985,15 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit
 /*  Operators                                                            */
 /* --------------------------------------------------------------------- */
 
-/* s/l - store/load (1 bit)
-   u/s - signed/unsigned (1 bit)
-   w/b/h/N - word/byte/half/NOT allowed (2 bit)
-   It contans 16 items, but not all are different. */
-
-static sljit_sw data_transfer_insts[16] = {
-/* s u w */ 0xe5000000 /* str */,
-/* s u b */ 0xe5400000 /* strb */,
-/* s u h */ 0xe10000b0 /* strh */,
-/* s u N */ 0x00000000 /* not allowed */,
-/* s s w */ 0xe5000000 /* str */,
-/* s s b */ 0xe5400000 /* strb */,
-/* s s h */ 0xe10000b0 /* strh */,
-/* s s N */ 0x00000000 /* not allowed */,
-
-/* l u w */ 0xe5100000 /* ldr */,
-/* l u b */ 0xe5500000 /* ldrb */,
-/* l u h */ 0xe11000b0 /* ldrh */,
-/* l u N */ 0x00000000 /* not allowed */,
-/* l s w */ 0xe5100000 /* ldr */,
-/* l s b */ 0xe11000d0 /* ldrsb */,
-/* l s h */ 0xe11000f0 /* ldrsh */,
-/* l s N */ 0x00000000 /* not allowed */,
-};
-
-#define EMIT_DATA_TRANSFER(type, add, wb, target, base1, base2) \
-       (data_transfer_insts[(type) >> 4] | ((add) << 23) | ((wb) << 21) | 
(reg_map[target] << 12) | (reg_map[base1] << 16) | (base2))
-/* Normal ldr/str instruction.
-   Type2: ldrsb, ldrh, ldrsh */
-#define IS_TYPE1_TRANSFER(type) \
-       (data_transfer_insts[(type) >> 4] & 0x04000000)
-#define TYPE2_TRANSFER_IMM(imm) \
-       (((imm) & 0xf) | (((imm) & 0xf0) << 4) | (1 << 22))
-
 /* flags: */
   /* Arguments are swapped. */
 #define ARGS_SWAPPED   0x01
   /* Inverted immediate. */
 #define INV_IMM                0x02
   /* Source and destination is register. */
-#define REG_DEST       0x04
-#define REG_SOURCE     0x08
-  /* One instruction is enough. */
-#define FAST_DEST      0x10
-  /* Multiple instructions are required. */
-#define SLOW_DEST      0x20
+#define MOVE_REG_CONV  0x04
+  /* Unused return value. */
+#define UNUSED_RETURN  0x08
 /* SET_FLAGS must be (1 << 20) as it is also the value of S bit (can be used 
for optimization). */
 #define SET_FLAGS      (1 << 20)
 /* dst: reg
@@ -966,157 +1002,135 @@ static sljit_sw data_transfer_insts[16]
    SRC2_IMM must be (1 << 25) as it is also the value of I bit (can be used 
for optimization). */
 #define SRC2_IMM       (1 << 25)
 
-#define EMIT_DATA_PROCESS_INS_AND_RETURN(opcode) \
-       return push_inst(compiler, EMIT_DATA_PROCESS_INS(opcode, flags & 
SET_FLAGS, dst, src1, (src2 & SRC2_IMM) ? src2 : RM(src2)))
-
-#define EMIT_FULL_DATA_PROCESS_INS_AND_RETURN(opcode, dst, src1, src2) \
-       return push_inst(compiler, EMIT_DATA_PROCESS_INS(opcode, flags & 
SET_FLAGS, dst, src1, src2))
-
 #define EMIT_SHIFT_INS_AND_RETURN(opcode) \
        SLJIT_ASSERT(!(flags & INV_IMM) && !(src2 & SRC2_IMM)); \
        if (compiler->shift_imm != 0x20) { \
                SLJIT_ASSERT(src1 == TMP_REG1); \
                SLJIT_ASSERT(!(flags & ARGS_SWAPPED)); \
+               \
                if (compiler->shift_imm != 0) \
-                       return push_inst(compiler, 
EMIT_DATA_PROCESS_INS(MOV_DP, flags & SET_FLAGS, dst, SLJIT_UNUSED, 
(compiler->shift_imm << 7) | (opcode << 5) | reg_map[src2])); \
-               return push_inst(compiler, EMIT_DATA_PROCESS_INS(MOV_DP, flags 
& SET_FLAGS, dst, SLJIT_UNUSED, reg_map[src2])); \
+                       return push_inst(compiler, 
EMIT_DATA_PROCESS_INS(MOV_DP, flags & SET_FLAGS, \
+                               dst, SLJIT_UNUSED, (compiler->shift_imm << 7) | 
(opcode << 5) | RM(src2))); \
+               return push_inst(compiler, EMIT_DATA_PROCESS_INS(MOV_DP, flags 
& SET_FLAGS, dst, SLJIT_UNUSED, RM(src2))); \
        } \
-       return push_inst(compiler, EMIT_DATA_PROCESS_INS(MOV_DP, flags & 
SET_FLAGS, dst, SLJIT_UNUSED, (reg_map[(flags & ARGS_SWAPPED) ? src1 : src2] << 
8) | (opcode << 5) | 0x10 | ((flags & ARGS_SWAPPED) ? reg_map[src2] : 
reg_map[src1])));
+       return push_inst(compiler, EMIT_DATA_PROCESS_INS(MOV_DP, flags & 
SET_FLAGS, \
+               dst, SLJIT_UNUSED, (reg_map[(flags & ARGS_SWAPPED) ? src1 : 
src2] << 8) | (opcode << 5) | 0x10 | RM((flags & ARGS_SWAPPED) ? src2 : src1)));
 
 static SLJIT_INLINE sljit_s32 emit_single_op(struct sljit_compiler *compiler, 
sljit_s32 op, sljit_s32 flags,
        sljit_s32 dst, sljit_s32 src1, sljit_s32 src2)
 {
-       sljit_sw mul_inst;
-
        switch (GET_OPCODE(op)) {
        case SLJIT_MOV:
                SLJIT_ASSERT(src1 == TMP_REG1 && !(flags & ARGS_SWAPPED));
                if (dst != src2) {
                        if (src2 & SRC2_IMM) {
-                               if (flags & INV_IMM)
-                                       
EMIT_FULL_DATA_PROCESS_INS_AND_RETURN(MVN_DP, dst, SLJIT_UNUSED, src2);
-                               EMIT_FULL_DATA_PROCESS_INS_AND_RETURN(MOV_DP, 
dst, SLJIT_UNUSED, src2);
+                               return push_inst(compiler, 
EMIT_DATA_PROCESS_INS((flags & INV_IMM) ? MVN_DP : MOV_DP, 0,
+                                       dst, SLJIT_UNUSED, src2));
                        }
-                       EMIT_FULL_DATA_PROCESS_INS_AND_RETURN(MOV_DP, dst, 
SLJIT_UNUSED, reg_map[src2]);
+                       return push_inst(compiler, 
EMIT_DATA_PROCESS_INS(MOV_DP, 0, dst, SLJIT_UNUSED, RM(src2)));
                }
                return SLJIT_SUCCESS;
 
        case SLJIT_MOV_U8:
        case SLJIT_MOV_S8:
                SLJIT_ASSERT(src1 == TMP_REG1 && !(flags & ARGS_SWAPPED));
-               if ((flags & (REG_DEST | REG_SOURCE)) == (REG_DEST | 
REG_SOURCE)) {
+               if (flags & MOVE_REG_CONV) {
 #if (defined SLJIT_CONFIG_ARM_V5 && SLJIT_CONFIG_ARM_V5)
                        if (op == SLJIT_MOV_U8)
                                return push_inst(compiler, 
EMIT_DATA_PROCESS_INS(AND_DP, 0, dst, src2, SRC2_IMM | 0xff));
-                       FAIL_IF(push_inst(compiler, 
EMIT_DATA_PROCESS_INS(MOV_DP, 0, dst, SLJIT_UNUSED, (24 << 7) | 
reg_map[src2])));
-                       return push_inst(compiler, 
EMIT_DATA_PROCESS_INS(MOV_DP, 0, dst, SLJIT_UNUSED, (24 << 7) | (op == 
SLJIT_MOV_U8 ? 0x20 : 0x40) | reg_map[dst]));
+                       FAIL_IF(push_inst(compiler, 
EMIT_DATA_PROCESS_INS(MOV_DP, 0, dst, SLJIT_UNUSED, (24 << 7) | RM(src2))));
+                       return push_inst(compiler, 
EMIT_DATA_PROCESS_INS(MOV_DP, 0, dst, SLJIT_UNUSED, (24 << 7) | (op == 
SLJIT_MOV_U8 ? 0x20 : 0x40) | RM(dst)));
 #else
                        return push_inst(compiler, (op == SLJIT_MOV_U8 ? UXTB : 
SXTB) | RD(dst) | RM(src2));
 #endif
                }
                else if (dst != src2) {
                        SLJIT_ASSERT(src2 & SRC2_IMM);
-                       if (flags & INV_IMM)
-                               EMIT_FULL_DATA_PROCESS_INS_AND_RETURN(MVN_DP, 
dst, SLJIT_UNUSED, src2);
-                       EMIT_FULL_DATA_PROCESS_INS_AND_RETURN(MOV_DP, dst, 
SLJIT_UNUSED, src2);
+                       return push_inst(compiler, EMIT_DATA_PROCESS_INS((flags 
& INV_IMM) ? MVN_DP : MOV_DP, 0,
+                               dst, SLJIT_UNUSED, src2));
                }
                return SLJIT_SUCCESS;
 
        case SLJIT_MOV_U16:
        case SLJIT_MOV_S16:
                SLJIT_ASSERT(src1 == TMP_REG1 && !(flags & ARGS_SWAPPED));
-               if ((flags & (REG_DEST | REG_SOURCE)) == (REG_DEST | 
REG_SOURCE)) {
+               if (flags & MOVE_REG_CONV) {
 #if (defined SLJIT_CONFIG_ARM_V5 && SLJIT_CONFIG_ARM_V5)
-                       FAIL_IF(push_inst(compiler, 
EMIT_DATA_PROCESS_INS(MOV_DP, 0, dst, SLJIT_UNUSED, (16 << 7) | 
reg_map[src2])));
-                       return push_inst(compiler, 
EMIT_DATA_PROCESS_INS(MOV_DP, 0, dst, SLJIT_UNUSED, (16 << 7) | (op == 
SLJIT_MOV_U16 ? 0x20 : 0x40) | reg_map[dst]));
+                       FAIL_IF(push_inst(compiler, 
EMIT_DATA_PROCESS_INS(MOV_DP, 0, dst, SLJIT_UNUSED, (16 << 7) | RM(src2))));
+                       return push_inst(compiler, 
EMIT_DATA_PROCESS_INS(MOV_DP, 0, dst, SLJIT_UNUSED, (16 << 7) | (op == 
SLJIT_MOV_U16 ? 0x20 : 0x40) | RM(dst)));
 #else
                        return push_inst(compiler, (op == SLJIT_MOV_U16 ? UXTH 
: SXTH) | RD(dst) | RM(src2));
 #endif
                }
                else if (dst != src2) {
                        SLJIT_ASSERT(src2 & SRC2_IMM);
-                       if (flags & INV_IMM)
-                               EMIT_FULL_DATA_PROCESS_INS_AND_RETURN(MVN_DP, 
dst, SLJIT_UNUSED, src2);
-                       EMIT_FULL_DATA_PROCESS_INS_AND_RETURN(MOV_DP, dst, 
SLJIT_UNUSED, src2);
+                       return push_inst(compiler, EMIT_DATA_PROCESS_INS((flags 
& INV_IMM) ? MVN_DP : MOV_DP, 0,
+                               dst, SLJIT_UNUSED, src2));
                }
                return SLJIT_SUCCESS;
 
        case SLJIT_NOT:
                if (src2 & SRC2_IMM) {
-                       if (flags & INV_IMM)
-                               EMIT_FULL_DATA_PROCESS_INS_AND_RETURN(MOV_DP, 
dst, SLJIT_UNUSED, src2);
-                       EMIT_FULL_DATA_PROCESS_INS_AND_RETURN(MVN_DP, dst, 
SLJIT_UNUSED, src2);
+                       return push_inst(compiler, EMIT_DATA_PROCESS_INS((flags 
& INV_IMM) ? MOV_DP : MVN_DP, flags & SET_FLAGS,
+                               dst, SLJIT_UNUSED, src2));
                }
-               EMIT_FULL_DATA_PROCESS_INS_AND_RETURN(MVN_DP, dst, 
SLJIT_UNUSED, RM(src2));
+               return push_inst(compiler, EMIT_DATA_PROCESS_INS(MVN_DP, flags 
& SET_FLAGS, dst, SLJIT_UNUSED, RM(src2)));
 
        case SLJIT_CLZ:
                SLJIT_ASSERT(!(flags & INV_IMM));
                SLJIT_ASSERT(!(src2 & SRC2_IMM));
                FAIL_IF(push_inst(compiler, CLZ | RD(dst) | RM(src2)));
-               if (flags & SET_FLAGS)
-                       EMIT_FULL_DATA_PROCESS_INS_AND_RETURN(CMP_DP, 
SLJIT_UNUSED, dst, SRC2_IMM);
                return SLJIT_SUCCESS;
 
        case SLJIT_ADD:
                SLJIT_ASSERT(!(flags & INV_IMM));
-               EMIT_DATA_PROCESS_INS_AND_RETURN(ADD_DP);
+               if ((flags & (UNUSED_RETURN | SET_FLAGS)) == (UNUSED_RETURN | 
SET_FLAGS) && !(flags & ARGS_SWAPPED))
+                       return push_inst(compiler, 
EMIT_DATA_PROCESS_INS(CMN_DP, SET_FLAGS,
+                               SLJIT_UNUSED, src1, (src2 & SRC2_IMM) ? src2 : 
RM(src2)));
+               return push_inst(compiler, EMIT_DATA_PROCESS_INS(ADD_DP, flags 
& SET_FLAGS,
+                       dst, src1, (src2 & SRC2_IMM) ? src2 : RM(src2)));
 
        case SLJIT_ADDC:
                SLJIT_ASSERT(!(flags & INV_IMM));
-               EMIT_DATA_PROCESS_INS_AND_RETURN(ADC_DP);
+               return push_inst(compiler, EMIT_DATA_PROCESS_INS(ADC_DP, flags 
& SET_FLAGS,
+                       dst, src1, (src2 & SRC2_IMM) ? src2 : RM(src2)));
 
        case SLJIT_SUB:
                SLJIT_ASSERT(!(flags & INV_IMM));
-               if (!(flags & ARGS_SWAPPED))
-                       EMIT_DATA_PROCESS_INS_AND_RETURN(SUB_DP);
-               EMIT_DATA_PROCESS_INS_AND_RETURN(RSB_DP);
+               if ((flags & (UNUSED_RETURN | SET_FLAGS)) == (UNUSED_RETURN | 
SET_FLAGS) && !(flags & ARGS_SWAPPED))
+                       return push_inst(compiler, 
EMIT_DATA_PROCESS_INS(CMP_DP, SET_FLAGS,
+                               SLJIT_UNUSED, src1, (src2 & SRC2_IMM) ? src2 : 
RM(src2)));
+               return push_inst(compiler, EMIT_DATA_PROCESS_INS(!(flags & 
ARGS_SWAPPED) ? SUB_DP : RSB_DP, flags & SET_FLAGS,
+                       dst, src1, (src2 & SRC2_IMM) ? src2 : RM(src2)));
 
        case SLJIT_SUBC:
                SLJIT_ASSERT(!(flags & INV_IMM));
-               if (!(flags & ARGS_SWAPPED))
-                       EMIT_DATA_PROCESS_INS_AND_RETURN(SBC_DP);
-               EMIT_DATA_PROCESS_INS_AND_RETURN(RSC_DP);
+               return push_inst(compiler, EMIT_DATA_PROCESS_INS(!(flags & 
ARGS_SWAPPED) ? SBC_DP : RSC_DP, flags & SET_FLAGS,
+                       dst, src1, (src2 & SRC2_IMM) ? src2 : RM(src2)));
 
        case SLJIT_MUL:
                SLJIT_ASSERT(!(flags & INV_IMM));
                SLJIT_ASSERT(!(src2 & SRC2_IMM));
-               if (SLJIT_UNLIKELY(op & SLJIT_SET_O))
-                       mul_inst = SMULL | (reg_map[TMP_REG3] << 16) | 
(reg_map[dst] << 12);
-               else
-                       mul_inst = MUL | (reg_map[dst] << 16);
 
-               if (dst != src2)
-                       FAIL_IF(push_inst(compiler, mul_inst | (reg_map[src1] 
<< 8) | reg_map[src2]));
-               else if (dst != src1)
-                       FAIL_IF(push_inst(compiler, mul_inst | (reg_map[src2] 
<< 8) | reg_map[src1]));
-               else {
-                       /* Rm and Rd must not be the same register. */
-                       SLJIT_ASSERT(dst != TMP_REG1);
-                       FAIL_IF(push_inst(compiler, 
EMIT_DATA_PROCESS_INS(MOV_DP, 0, TMP_REG1, SLJIT_UNUSED, reg_map[src2])));
-                       FAIL_IF(push_inst(compiler, mul_inst | (reg_map[src2] 
<< 8) | reg_map[TMP_REG1]));
-               }
+               if (!HAS_FLAGS(op))
+                       return push_inst(compiler, MUL | (reg_map[dst] << 16) | 
(reg_map[src2] << 8) | reg_map[src1]);
 
-               if (!(op & SLJIT_SET_O))
-                       return SLJIT_SUCCESS;
+               FAIL_IF(push_inst(compiler, SMULL | (reg_map[TMP_REG1] << 16) | 
(reg_map[dst] << 12) | (reg_map[src2] << 8) | reg_map[src1]));
 
-               /* We need to use TMP_REG3. */
-               compiler->cache_arg = 0;
-               compiler->cache_argw = 0;
-               /* cmp TMP_REG2, dst asr #31. */
-               return push_inst(compiler, EMIT_DATA_PROCESS_INS(CMP_DP, 
SET_FLAGS, SLJIT_UNUSED, TMP_REG3, RM(dst) | 0xfc0));
+               /* cmp TMP_REG1, dst asr #31. */
+               return push_inst(compiler, EMIT_DATA_PROCESS_INS(CMP_DP, 
SET_FLAGS, SLJIT_UNUSED, TMP_REG1, RM(dst) | 0xfc0));
 
        case SLJIT_AND:
-               if (!(flags & INV_IMM))
-                       EMIT_DATA_PROCESS_INS_AND_RETURN(AND_DP);
-               EMIT_DATA_PROCESS_INS_AND_RETURN(BIC_DP);
+               return push_inst(compiler, EMIT_DATA_PROCESS_INS(!(flags & 
INV_IMM) ? AND_DP : BIC_DP, flags & SET_FLAGS,
+                       dst, src1, (src2 & SRC2_IMM) ? src2 : RM(src2)));
 
        case SLJIT_OR:
                SLJIT_ASSERT(!(flags & INV_IMM));
-               EMIT_DATA_PROCESS_INS_AND_RETURN(ORR_DP);
+               return push_inst(compiler, EMIT_DATA_PROCESS_INS(ORR_DP, flags 
& SET_FLAGS, dst, src1, (src2 & SRC2_IMM) ? src2 : RM(src2)));
 
        case SLJIT_XOR:
                SLJIT_ASSERT(!(flags & INV_IMM));
-               EMIT_DATA_PROCESS_INS_AND_RETURN(EOR_DP);
+               return push_inst(compiler, EMIT_DATA_PROCESS_INS(EOR_DP, flags 
& SET_FLAGS, dst, src1, (src2 & SRC2_IMM) ? src2 : RM(src2)));
 
        case SLJIT_SHL:
                EMIT_SHIFT_INS_AND_RETURN(0);
@@ -1127,12 +1141,11 @@ static SLJIT_INLINE sljit_s32 emit_singl
        case SLJIT_ASHR:
                EMIT_SHIFT_INS_AND_RETURN(2);
        }
-       SLJIT_ASSERT_STOP();
+
+       SLJIT_UNREACHABLE();
        return SLJIT_SUCCESS;
 }
 
-#undef EMIT_DATA_PROCESS_INS_AND_RETURN
-#undef EMIT_FULL_DATA_PROCESS_INS_AND_RETURN
 #undef EMIT_SHIFT_INS_AND_RETURN
 
 /* Tests whether the immediate can be stored in the 12 bit imm field.
@@ -1312,291 +1325,116 @@ static sljit_s32 load_immediate(struct s
        /* Load integer. */
        return push_inst_with_literal(compiler, EMIT_DATA_TRANSFER(WORD_DATA | 
LOAD_DATA, 1, 0, reg, TMP_PC, 0), imm);
 #else
-       return emit_imm(compiler, reg, imm);
+       FAIL_IF(push_inst(compiler, MOVW | RD(reg) | ((imm << 4) & 0xf0000) | 
(imm & 0xfff)));
+       if (imm <= 0xffff)
+               return SLJIT_SUCCESS;
+       return push_inst(compiler, MOVT | RD(reg) | ((imm >> 12) & 0xf0000) | 
((imm >> 16) & 0xfff));
 #endif
 }
 
-/* Helper function. Dst should be reg + value, using at most 1 instruction, 
flags does not set. */
-static sljit_s32 emit_set_delta(struct sljit_compiler *compiler, sljit_s32 
dst, sljit_s32 reg, sljit_sw value)
+static SLJIT_INLINE sljit_s32 emit_op_mem(struct sljit_compiler *compiler, 
sljit_s32 flags, sljit_s32 reg,
+       sljit_s32 arg, sljit_sw argw, sljit_s32 tmp_reg)
 {
-       if (value >= 0) {
-               value = get_imm(value);
-               if (value)
-                       return push_inst(compiler, 
EMIT_DATA_PROCESS_INS(ADD_DP, 0, dst, reg, value));
-       }
-       else {
-               value = get_imm(-value);
-               if (value)
-                       return push_inst(compiler, 
EMIT_DATA_PROCESS_INS(SUB_DP, 0, dst, reg, value));
-       }
-       return SLJIT_ERR_UNSUPPORTED;
-}
+       sljit_uw offset_reg, imm;
+       sljit_uw is_type1_transfer = IS_TYPE1_TRANSFER(flags);
 
-/* Can perform an operation using at most 1 instruction. */
-static sljit_s32 getput_arg_fast(struct sljit_compiler *compiler, sljit_s32 
inp_flags, sljit_s32 reg, sljit_s32 arg, sljit_sw argw)
-{
-       sljit_uw imm;
+       SLJIT_ASSERT (arg & SLJIT_MEM);
+       SLJIT_ASSERT((arg & REG_MASK) != tmp_reg);
 
-       if (arg & SLJIT_IMM) {
-               imm = get_imm(argw);
-               if (imm) {
-                       if (inp_flags & ARG_TEST)
-                               return 1;
-                       FAIL_IF(push_inst(compiler, 
EMIT_DATA_PROCESS_INS(MOV_DP, 0, reg, SLJIT_UNUSED, imm)));
-                       return -1;
+       SLJIT_COMPILE_ASSERT(WRITE_BACK == 0x10, 
optimized_for_emit_data_transfer);
+
+       if ((arg & REG_MASK) == SLJIT_UNUSED) {
+               /* Write back is not used. */
+               if (is_type1_transfer) {
+                       FAIL_IF(load_immediate(compiler, tmp_reg, argw & 
~0xfff));
+                       argw &= 0xfff;
                }
-               imm = get_imm(~argw);
-               if (imm) {
-                       if (inp_flags & ARG_TEST)
-                               return 1;
-                       FAIL_IF(push_inst(compiler, 
EMIT_DATA_PROCESS_INS(MVN_DP, 0, reg, SLJIT_UNUSED, imm)));
-                       return -1;
+               else {
+                       FAIL_IF(load_immediate(compiler, tmp_reg, argw & 
~0xff));
+                       argw &= 0xff;
                }
-               return 0;
+
+               return push_inst(compiler, EMIT_DATA_TRANSFER(flags, 1, 0, reg, 
tmp_reg, is_type1_transfer ? argw : TYPE2_TRANSFER_IMM(argw)));
        }
 
-       SLJIT_ASSERT(arg & SLJIT_MEM);
+       if (arg & OFFS_REG_MASK) {
+               offset_reg = OFFS_REG(arg);
+               arg &= REG_MASK;
+               argw &= 0x3;
 
-       /* Fast loads/stores. */
-       if (!(arg & REG_MASK))
-               return 0;
+               if (argw != 0 && !is_type1_transfer) {
+                       SLJIT_ASSERT(!(flags & WRITE_BACK));
 
-       if (arg & OFFS_REG_MASK) {
-               if ((argw & 0x3) != 0 && !IS_TYPE1_TRANSFER(inp_flags))
-                       return 0;
+                       FAIL_IF(push_inst(compiler, 
EMIT_DATA_PROCESS_INS(ADD_DP, 0, tmp_reg, arg, RM(offset_reg) | (argw << 7))));
+                       return push_inst(compiler, EMIT_DATA_TRANSFER(flags, 1, 
0, reg, tmp_reg, TYPE2_TRANSFER_IMM(0)));
+               }
 
-               if (inp_flags & ARG_TEST)
-                       return 1;
-               FAIL_IF(push_inst(compiler, EMIT_DATA_TRANSFER(inp_flags, 1, 
inp_flags & WRITE_BACK, reg, arg & REG_MASK,
-                       RM(OFFS_REG(arg)) | (IS_TYPE1_TRANSFER(inp_flags) ? 
SRC2_IMM : 0) | ((argw & 0x3) << 7))));
-               return -1;
+               /* Bit 25: RM is offset. */
+               return push_inst(compiler, EMIT_DATA_TRANSFER(flags, 1, flags & 
WRITE_BACK, reg, arg,
+                       RM(offset_reg) | (is_type1_transfer ? (1 << 25) : 0) | 
(argw << 7)));
        }
 
-       if (IS_TYPE1_TRANSFER(inp_flags)) {
+       arg &= REG_MASK;
+
+       if (is_type1_transfer) {
+               if (argw > 0xfff) {
+                       imm = get_imm(argw & ~0xfff);
+                       if (imm) {
+                               offset_reg = (flags & WRITE_BACK) ? arg : 
tmp_reg;
+                               FAIL_IF(push_inst(compiler, 
EMIT_DATA_PROCESS_INS(ADD_DP, 0, offset_reg, arg, imm)));
+                               argw = argw & 0xfff;
+                               arg = offset_reg;
+                       }
+               }
+               else if (argw < -0xfff) {
+                       imm = get_imm(-argw & ~0xfff);
+                       if (imm) {
+                               offset_reg = (flags & WRITE_BACK) ? arg : 
tmp_reg;
+                               FAIL_IF(push_inst(compiler, 
EMIT_DATA_PROCESS_INS(SUB_DP, 0, offset_reg, arg, imm)));
+                               argw = -(-argw & 0xfff);
+                               arg = offset_reg;
+                       }
+               }
+
                if (argw >= 0 && argw <= 0xfff) {
-                       if (inp_flags & ARG_TEST)
-                               return 1;
-                       FAIL_IF(push_inst(compiler, 
EMIT_DATA_TRANSFER(inp_flags, 1, inp_flags & WRITE_BACK, reg, arg & REG_MASK, 
argw)));
-                       return -1;
+                       return push_inst(compiler, EMIT_DATA_TRANSFER(flags, 1, 
flags & WRITE_BACK, reg, arg & REG_MASK, argw));
                }
                if (argw < 0 && argw >= -0xfff) {
-                       if (inp_flags & ARG_TEST)
-                               return 1;
-                       FAIL_IF(push_inst(compiler, 
EMIT_DATA_TRANSFER(inp_flags, 0, inp_flags & WRITE_BACK, reg, arg & REG_MASK, 
-argw)));
-                       return -1;
+                       return push_inst(compiler, EMIT_DATA_TRANSFER(flags, 0, 
flags & WRITE_BACK, reg, arg & REG_MASK, -argw));
                }
        }
        else {
+               if (argw > 0xff) {
+                       imm = get_imm(argw & ~0xff);
+                       if (imm) {
+                               offset_reg = (flags & WRITE_BACK) ? arg : 
tmp_reg;
+                               FAIL_IF(push_inst(compiler, 
EMIT_DATA_PROCESS_INS(ADD_DP, 0, offset_reg, arg, imm)));
+                               argw = argw & 0xff;
+                               arg = offset_reg;
+                       }
+               }
+               else if (argw < -0xff) {
+                       imm = get_imm(-argw & ~0xff);
+                       if (imm) {
+                               offset_reg = (flags & WRITE_BACK) ? arg : 
tmp_reg;
+                               FAIL_IF(push_inst(compiler, 
EMIT_DATA_PROCESS_INS(SUB_DP, 0, offset_reg, arg, imm)));
+                               argw = -(-argw & 0xff);
+                               arg = offset_reg;
+                       }
+               }
+
                if (argw >= 0 && argw <= 0xff) {
-                       if (inp_flags & ARG_TEST)
-                               return 1;
-                       FAIL_IF(push_inst(compiler, 
EMIT_DATA_TRANSFER(inp_flags, 1, inp_flags & WRITE_BACK, reg, arg & REG_MASK, 
TYPE2_TRANSFER_IMM(argw))));
-                       return -1;
+                       return push_inst(compiler, EMIT_DATA_TRANSFER(flags, 1, 
flags & WRITE_BACK, reg, arg, TYPE2_TRANSFER_IMM(argw)));
                }
                if (argw < 0 && argw >= -0xff) {
-                       if (inp_flags & ARG_TEST)
-                               return 1;
                        argw = -argw;
-                       FAIL_IF(push_inst(compiler, 
EMIT_DATA_TRANSFER(inp_flags, 0, inp_flags & WRITE_BACK, reg, arg & REG_MASK, 
TYPE2_TRANSFER_IMM(argw))));
-                       return -1;
-               }
-       }
-
-       return 0;
-}
-
-/* See getput_arg below.
-   Note: can_cache is called only for binary operators. Those
-   operators always uses word arguments without write back. */
-static sljit_s32 can_cache(sljit_s32 arg, sljit_sw argw, sljit_s32 next_arg, 
sljit_sw next_argw)
-{
-       /* Immediate caching is not supported as it would be an operation on 
constant arguments. */
-       if (arg & SLJIT_IMM)
-               return 0;
-
-       /* Always a simple operation. */
-       if (arg & OFFS_REG_MASK)
-               return 0;
-
-       if (!(arg & REG_MASK)) {
-               /* Immediate access. */
-               if ((next_arg & SLJIT_MEM) && ((sljit_uw)argw - 
(sljit_uw)next_argw <= 0xfff || (sljit_uw)next_argw - (sljit_uw)argw <= 0xfff))
-                       return 1;
-               return 0;
-       }
-
-       if (argw <= 0xfffff && argw >= -0xfffff)
-               return 0;
-
-       if (argw == next_argw && (next_arg & SLJIT_MEM))
-               return 1;
-
-       if (arg == next_arg && ((sljit_uw)argw - (sljit_uw)next_argw <= 0xfff 
|| (sljit_uw)next_argw - (sljit_uw)argw <= 0xfff))
-               return 1;
-
-       return 0;
-}
-
-#define GETPUT_ARG_DATA_TRANSFER(add, wb, target, base, imm) \
-       if (max_delta & 0xf00) \
-               FAIL_IF(push_inst(compiler, EMIT_DATA_TRANSFER(inp_flags, add, 
wb, target, base, imm))); \
-       else \
-               FAIL_IF(push_inst(compiler, EMIT_DATA_TRANSFER(inp_flags, add, 
wb, target, base, TYPE2_TRANSFER_IMM(imm))));
-
-#define TEST_WRITE_BACK() \
-       if (inp_flags & WRITE_BACK) { \
-               tmp_r = arg & REG_MASK; \
-               if (reg == tmp_r) { \
-                       /* This can only happen for stores */ \
-                       /* since ldr reg, [reg, ...]! has no meaning */ \
-                       SLJIT_ASSERT(!(inp_flags & LOAD_DATA)); \
-                       FAIL_IF(push_inst(compiler, 
EMIT_DATA_PROCESS_INS(MOV_DP, 0, TMP_REG3, SLJIT_UNUSED, RM(reg)))); \
-                       reg = TMP_REG3; \
-               } \
-       }
-
-/* Emit the necessary instructions. See can_cache above. */
-static sljit_s32 getput_arg(struct sljit_compiler *compiler, sljit_s32 
inp_flags, sljit_s32 reg, sljit_s32 arg, sljit_sw argw, sljit_s32 next_arg, 
sljit_sw next_argw)
-{
-       sljit_s32 tmp_r;
-       sljit_sw max_delta;
-       sljit_sw sign;
-       sljit_uw imm;
-
-       if (arg & SLJIT_IMM) {
-               SLJIT_ASSERT(inp_flags & LOAD_DATA);
-               return load_immediate(compiler, reg, argw);
-       }
-
-       SLJIT_ASSERT(arg & SLJIT_MEM);
-
-       tmp_r = (inp_flags & LOAD_DATA) ? reg : TMP_REG3;
-       max_delta = IS_TYPE1_TRANSFER(inp_flags) ? 0xfff : 0xff;
-
-       if ((arg & REG_MASK) == SLJIT_UNUSED) {
-               /* Write back is not used. */
-               imm = (sljit_uw)(argw - compiler->cache_argw);
-               if ((compiler->cache_arg & SLJIT_IMM) && (imm <= 
(sljit_uw)max_delta || imm >= (sljit_uw)-max_delta)) {
-                       if (imm <= (sljit_uw)max_delta) {
-                               sign = 1;
-                               argw = argw - compiler->cache_argw;
-                       }
-                       else {
-                               sign = 0;
-                               argw = compiler->cache_argw - argw;
-                       }
-
-                       GETPUT_ARG_DATA_TRANSFER(sign, 0, reg, TMP_REG3, argw);
-                       return SLJIT_SUCCESS;
+                       return push_inst(compiler, EMIT_DATA_TRANSFER(flags, 0, 
flags & WRITE_BACK, reg, arg, TYPE2_TRANSFER_IMM(argw)));
                }
-
-               /* With write back, we can create some sophisticated loads, but
-                  it is hard to decide whether we should convert downward (0s) 
or upward (1s). */
-               imm = (sljit_uw)(argw - next_argw);
-               if ((next_arg & SLJIT_MEM) && (imm <= (sljit_uw)max_delta || 
imm >= (sljit_uw)-max_delta)) {
-                       SLJIT_ASSERT(inp_flags & LOAD_DATA);
-
-                       compiler->cache_arg = SLJIT_IMM;
-                       compiler->cache_argw = argw;
-                       tmp_r = TMP_REG3;
-               }
-
-               FAIL_IF(load_immediate(compiler, tmp_r, argw));
-               GETPUT_ARG_DATA_TRANSFER(1, 0, reg, tmp_r, 0);
-               return SLJIT_SUCCESS;
-       }
-
-       if (arg & OFFS_REG_MASK) {
-               SLJIT_ASSERT((argw & 0x3) && !(max_delta & 0xf00));
-               if (inp_flags & WRITE_BACK)
-                       tmp_r = arg & REG_MASK;
-               FAIL_IF(push_inst(compiler, EMIT_DATA_PROCESS_INS(ADD_DP, 0, 
tmp_r, arg & REG_MASK, RM(OFFS_REG(arg)) | ((argw & 0x3) << 7))));
-               return push_inst(compiler, EMIT_DATA_TRANSFER(inp_flags, 1, 0, 
reg, tmp_r, TYPE2_TRANSFER_IMM(0)));
-       }
-
-       imm = (sljit_uw)(argw - compiler->cache_argw);
-       if (compiler->cache_arg == arg && imm <= (sljit_uw)max_delta) {
-               SLJIT_ASSERT(!(inp_flags & WRITE_BACK));
-               GETPUT_ARG_DATA_TRANSFER(1, 0, reg, TMP_REG3, imm);
-               return SLJIT_SUCCESS;
-       }
-       if (compiler->cache_arg == arg && imm >= (sljit_uw)-max_delta) {
-               SLJIT_ASSERT(!(inp_flags & WRITE_BACK));
-               imm = (sljit_uw)-(sljit_sw)imm;
-               GETPUT_ARG_DATA_TRANSFER(0, 0, reg, TMP_REG3, imm);
-               return SLJIT_SUCCESS;
-       }
-
-       imm = get_imm(argw & ~max_delta);
-       if (imm) {
-               TEST_WRITE_BACK();
-               FAIL_IF(push_inst(compiler, EMIT_DATA_PROCESS_INS(ADD_DP, 0, 
tmp_r, arg & REG_MASK, imm)));
-               GETPUT_ARG_DATA_TRANSFER(1, inp_flags & WRITE_BACK, reg, tmp_r, 
argw & max_delta);
-               return SLJIT_SUCCESS;
-       }
-
-       imm = get_imm(-argw & ~max_delta);
-       if (imm) {
-               argw = -argw;
-               TEST_WRITE_BACK();
-               FAIL_IF(push_inst(compiler, EMIT_DATA_PROCESS_INS(SUB_DP, 0, 
tmp_r, arg & REG_MASK, imm)));
-               GETPUT_ARG_DATA_TRANSFER(0, inp_flags & WRITE_BACK, reg, tmp_r, 
argw & max_delta);
-               return SLJIT_SUCCESS;
-       }
-
-       if ((compiler->cache_arg & SLJIT_IMM) && compiler->cache_argw == argw) {
-               TEST_WRITE_BACK();
-               return push_inst(compiler, EMIT_DATA_TRANSFER(inp_flags, 1, 
inp_flags & WRITE_BACK, reg, arg & REG_MASK, RM(TMP_REG3) | (max_delta & 0xf00 
? SRC2_IMM : 0)));
-       }
-
-       if (argw == next_argw && (next_arg & SLJIT_MEM)) {
-               SLJIT_ASSERT(inp_flags & LOAD_DATA);
-               FAIL_IF(load_immediate(compiler, TMP_REG3, argw));
-
-               compiler->cache_arg = SLJIT_IMM;
-               compiler->cache_argw = argw;
-
-               TEST_WRITE_BACK();
-               return push_inst(compiler, EMIT_DATA_TRANSFER(inp_flags, 1, 
inp_flags & WRITE_BACK, reg, arg & REG_MASK, RM(TMP_REG3) | (max_delta & 0xf00 
? SRC2_IMM : 0)));
-       }
-
-       imm = (sljit_uw)(argw - next_argw);
-       if (arg == next_arg && !(inp_flags & WRITE_BACK) && (imm <= 
(sljit_uw)max_delta || imm >= (sljit_uw)-max_delta)) {
-               SLJIT_ASSERT(inp_flags & LOAD_DATA);
-               FAIL_IF(load_immediate(compiler, TMP_REG3, argw));
-               FAIL_IF(push_inst(compiler, EMIT_DATA_PROCESS_INS(ADD_DP, 0, 
TMP_REG3, TMP_REG3, reg_map[arg & REG_MASK])));
-
-               compiler->cache_arg = arg;
-               compiler->cache_argw = argw;
-
-               GETPUT_ARG_DATA_TRANSFER(1, 0, reg, TMP_REG3, 0);
-               return SLJIT_SUCCESS;
-       }
-
-       if ((arg & REG_MASK) == tmp_r) {
-               compiler->cache_arg = SLJIT_IMM;
-               compiler->cache_argw = argw;
-               tmp_r = TMP_REG3;
        }
 
-       FAIL_IF(load_immediate(compiler, tmp_r, argw));
-       return push_inst(compiler, EMIT_DATA_TRANSFER(inp_flags, 1, inp_flags & 
WRITE_BACK, reg, arg & REG_MASK, reg_map[tmp_r] | (max_delta & 0xf00 ? SRC2_IMM 
: 0)));
-}
-
-static SLJIT_INLINE sljit_s32 emit_op_mem(struct sljit_compiler *compiler, 
sljit_s32 flags, sljit_s32 reg, sljit_s32 arg, sljit_sw argw)
-{
-       if (getput_arg_fast(compiler, flags, reg, arg, argw))
-               return compiler->error;
-       compiler->cache_arg = 0;
-       compiler->cache_argw = 0;
-       return getput_arg(compiler, flags, reg, arg, argw, 0, 0);
-}
-
-static SLJIT_INLINE sljit_s32 emit_op_mem2(struct sljit_compiler *compiler, 
sljit_s32 flags, sljit_s32 reg, sljit_s32 arg1, sljit_sw arg1w, sljit_s32 arg2, 
sljit_sw arg2w)
-{
-       if (getput_arg_fast(compiler, flags, reg, arg1, arg1w))
-               return compiler->error;
-       return getput_arg(compiler, flags, reg, arg1, arg1w, arg2, arg2w);
+       FAIL_IF(load_immediate(compiler, tmp_reg, argw));
+       return push_inst(compiler, EMIT_DATA_TRANSFER(flags, 1, flags & 
WRITE_BACK, reg, arg,
+               RM(tmp_reg) | (is_type1_transfer ? (1 << 25) : 0)));
 }
 
 static sljit_s32 emit_op(struct sljit_compiler *compiler, sljit_s32 op, 
sljit_s32 inp_flags,
@@ -1604,68 +1442,66 @@ static sljit_s32 emit_op(struct sljit_co
        sljit_s32 src1, sljit_sw src1w,
        sljit_s32 src2, sljit_sw src2w)
 {
-       /* arg1 goes to TMP_REG1 or src reg
-          arg2 goes to TMP_REG2, imm or src reg
-          TMP_REG3 can be used for caching
-          result goes to TMP_REG2, so put result can use TMP_REG1 and 
TMP_REG3. */
+       /* src1 is reg or TMP_REG1
+          src2 is reg, TMP_REG2, or imm
+          result goes to TMP_REG2, so put result can use TMP_REG1. */
 
        /* We prefers register and simple consts. */
-       sljit_s32 dst_r;
-       sljit_s32 src1_r;
-       sljit_s32 src2_r = 0;
-       sljit_s32 sugg_src2_r = TMP_REG2;
-       sljit_s32 flags = GET_FLAGS(op) ? SET_FLAGS : 0;
-
-       compiler->cache_arg = 0;
-       compiler->cache_argw = 0;
+       sljit_s32 dst_reg;
+       sljit_s32 src1_reg;
+       sljit_s32 src2_reg;
+       sljit_s32 flags = HAS_FLAGS(op) ? SET_FLAGS : 0;
 
        /* Destination check. */
-       if (SLJIT_UNLIKELY(dst == SLJIT_UNUSED)) {
-               if (op >= SLJIT_MOV && op <= SLJIT_MOVU_S32 && !(src2 & 
SLJIT_MEM))
-                       return SLJIT_SUCCESS;
-               dst_r = TMP_REG2;
-       }
-       else if (FAST_IS_REG(dst)) {
-               dst_r = dst;
-               flags |= REG_DEST;
-               if (op >= SLJIT_MOV && op <= SLJIT_MOVU_S32)
-                       sugg_src2_r = dst_r;
-       }
-       else {
-               SLJIT_ASSERT(dst & SLJIT_MEM);
-               if (getput_arg_fast(compiler, inp_flags | ARG_TEST, TMP_REG2, 
dst, dstw)) {
-                       flags |= FAST_DEST;
-                       dst_r = TMP_REG2;
-               }
-               else {
-                       flags |= SLOW_DEST;
-                       dst_r = 0;
+       if (SLJIT_UNLIKELY(dst == SLJIT_UNUSED))
+               flags |= UNUSED_RETURN;
+
+       SLJIT_ASSERT(!(inp_flags & ALLOW_INV_IMM) || (inp_flags & ALLOW_IMM));
+
+       src2_reg = 0;
+
+       do {
+               if (!(inp_flags & ALLOW_IMM))
+                       break;
+
+               if (src2 & SLJIT_IMM) {
+                       src2_reg = get_imm(src2w);
+                       if (src2_reg)
+                               break;
+                       if (inp_flags & ALLOW_INV_IMM) {
+                               src2_reg = get_imm(~src2w);
+                               if (src2_reg) {
+                                       flags |= INV_IMM;
+                                       break;
+                               }
+                       }
+                       if (GET_OPCODE(op) == SLJIT_ADD) {
+                               src2_reg = get_imm(-src2w);
+                               if (src2_reg) {
+                                       op = SLJIT_SUB | GET_ALL_FLAGS(op);
+                                       break;
+                               }
+                       }
+                       if (GET_OPCODE(op) == SLJIT_SUB) {
+                               src2_reg = get_imm(-src2w);
+                               if (src2_reg) {
+                                       op = SLJIT_ADD | GET_ALL_FLAGS(op);
+                                       break;
+                               }
+                       }
                }
-       }
 
-       /* Source 1. */
-       if (FAST_IS_REG(src1))
-               src1_r = src1;
-       else if (FAST_IS_REG(src2)) {
-               flags |= ARGS_SWAPPED;
-               src1_r = src2;
-               src2 = src1;
-               src2w = src1w;
-       }
-       else do { /* do { } while(0) is used because of breaks. */
-               src1_r = 0;
-               if ((inp_flags & ALLOW_ANY_IMM) && (src1 & SLJIT_IMM)) {
-                       /* The second check will generate a hit. */
-                       src2_r = get_imm(src1w);
-                       if (src2_r) {
+               if (src1 & SLJIT_IMM) {
+                       src2_reg = get_imm(src1w);
+                       if (src2_reg) {
                                flags |= ARGS_SWAPPED;
                                src1 = src2;
                                src1w = src2w;
                                break;
                        }
                        if (inp_flags & ALLOW_INV_IMM) {
-                               src2_r = get_imm(~src1w);
-                               if (src2_r) {
+                               src2_reg = get_imm(~src1w);
+                               if (src2_reg) {
                                        flags |= ARGS_SWAPPED | INV_IMM;
                                        src1 = src2;
                                        src1w = src2w;
@@ -1673,9 +1509,9 @@ static sljit_s32 emit_op(struct sljit_co
                                }
                        }
                        if (GET_OPCODE(op) == SLJIT_ADD) {
-                               src2_r = get_imm(-src1w);
-                               if (src2_r) {
-                                       /* Note: ARGS_SWAPPED is intentionally 
not applied! */
+                               src2_reg = get_imm(-src1w);
+                               if (src2_reg) {
+                                       /* Note: add is commutative operation. 
*/
                                        src1 = src2;
                                        src1w = src2w;
                                        op = SLJIT_SUB | GET_ALL_FLAGS(op);
@@ -1683,110 +1519,54 @@ static sljit_s32 emit_op(struct sljit_co
                                }
                        }
                }
+       } while(0);
 
-               if (getput_arg_fast(compiler, inp_flags | LOAD_DATA, TMP_REG1, 
src1, src1w)) {
-                       FAIL_IF(compiler->error);
-                       src1_r = TMP_REG1;
-               }
-       } while (0);
-
-       /* Source 2. */
-       if (src2_r == 0) {
-               if (FAST_IS_REG(src2)) {
-                       src2_r = src2;
-                       flags |= REG_SOURCE;
-                       if (!(flags & REG_DEST) && op >= SLJIT_MOV && op <= 
SLJIT_MOVU_S32)
-                               dst_r = src2_r;
-               }
-               else do { /* do { } while(0) is used because of breaks. */
-                       if ((inp_flags & ALLOW_ANY_IMM) && (src2 & SLJIT_IMM)) {
-                               src2_r = get_imm(src2w);
-                               if (src2_r)
-                                       break;
-                               if (inp_flags & ALLOW_INV_IMM) {
-                                       src2_r = get_imm(~src2w);
-                                       if (src2_r) {
-                                               flags |= INV_IMM;
-                                               break;
-                                       }
-                               }
-                               if (GET_OPCODE(op) == SLJIT_ADD) {
-                                       src2_r = get_imm(-src2w);
-                                       if (src2_r) {
-                                               op = SLJIT_SUB | 
GET_ALL_FLAGS(op);
-                                               flags &= ~ARGS_SWAPPED;
-                                               break;
-                                       }
-                               }
-                               if (GET_OPCODE(op) == SLJIT_SUB && !(flags & 
ARGS_SWAPPED)) {
-                                       src2_r = get_imm(-src2w);
-                                       if (src2_r) {
-                                               op = SLJIT_ADD | 
GET_ALL_FLAGS(op);
-                                               flags &= ~ARGS_SWAPPED;
-                                               break;
-                                       }
-                               }
-                       }
-
-                       /* src2_r is 0. */
-                       if (getput_arg_fast(compiler, inp_flags | LOAD_DATA, 
sugg_src2_r, src2, src2w)) {
-                               FAIL_IF(compiler->error);
-                               src2_r = sugg_src2_r;
-                       }
-               } while (0);
-       }
-
-       /* src1_r, src2_r and dst_r can be zero (=unprocessed) or non-zero.
-          If they are zero, they must not be registers. */
-       if (src1_r == 0 && src2_r == 0 && dst_r == 0) {
-               if (!can_cache(src1, src1w, src2, src2w) && can_cache(src1, 
src1w, dst, dstw)) {
-                       SLJIT_ASSERT(!(flags & ARGS_SWAPPED));
-                       flags |= ARGS_SWAPPED;
-                       FAIL_IF(getput_arg(compiler, inp_flags | LOAD_DATA, 
TMP_REG1, src2, src2w, src1, src1w));
-                       FAIL_IF(getput_arg(compiler, inp_flags | LOAD_DATA, 
TMP_REG2, src1, src1w, dst, dstw));
-               }
-               else {
-                       FAIL_IF(getput_arg(compiler, inp_flags | LOAD_DATA, 
TMP_REG1, src1, src1w, src2, src2w));
-                       FAIL_IF(getput_arg(compiler, inp_flags | LOAD_DATA, 
TMP_REG2, src2, src2w, dst, dstw));
-               }
-               src1_r = TMP_REG1;
-               src2_r = TMP_REG2;
-       }
-       else if (src1_r == 0 && src2_r == 0) {
-               FAIL_IF(getput_arg(compiler, inp_flags | LOAD_DATA, TMP_REG1, 
src1, src1w, src2, src2w));
-               src1_r = TMP_REG1;
-       }
-       else if (src1_r == 0 && dst_r == 0) {
-               FAIL_IF(getput_arg(compiler, inp_flags | LOAD_DATA, TMP_REG1, 
src1, src1w, dst, dstw));
-               src1_r = TMP_REG1;
+       /* Source 1. */
+       if (FAST_IS_REG(src1))
+               src1_reg = src1;
+       else if (src1 & SLJIT_MEM) {
+               FAIL_IF(emit_op_mem(compiler, inp_flags | LOAD_DATA, TMP_REG1, 
src1, src1w, TMP_REG1));
+               src1_reg = TMP_REG1;
        }
-       else if (src2_r == 0 && dst_r == 0) {
-               FAIL_IF(getput_arg(compiler, inp_flags | LOAD_DATA, 
sugg_src2_r, src2, src2w, dst, dstw));
-               src2_r = sugg_src2_r;
+       else {
+               FAIL_IF(load_immediate(compiler, TMP_REG1, src1w));
+               src1_reg = TMP_REG1;
        }
 
-       if (dst_r == 0)
-               dst_r = TMP_REG2;
+       /* Destination. */
+       dst_reg = SLOW_IS_REG(dst) ? dst : TMP_REG2;
 
-       if (src1_r == 0) {
-               FAIL_IF(getput_arg(compiler, inp_flags | LOAD_DATA, TMP_REG1, 
src1, src1w, 0, 0));
-               src1_r = TMP_REG1;
-       }
+       if (op <= SLJIT_MOVU_P) {
+               if (dst & SLJIT_MEM) {
+                       if (inp_flags & BYTE_DATA)
+                               inp_flags &= ~SIGNED_DATA;
 
-       if (src2_r == 0) {
-               FAIL_IF(getput_arg(compiler, inp_flags | LOAD_DATA, 
sugg_src2_r, src2, src2w, 0, 0));
-               src2_r = sugg_src2_r;
+                       if (FAST_IS_REG(src2))
+                               return emit_op_mem(compiler, inp_flags, src2, 
dst, dstw, TMP_REG2);
+               }
+
+               if (FAST_IS_REG(src2) && dst_reg != TMP_REG2)
+                       flags |= MOVE_REG_CONV;
        }
 
-       FAIL_IF(emit_single_op(compiler, op, flags, dst_r, src1_r, src2_r));
+       /* Source 2. */
+       if (src2_reg == 0) {
+               src2_reg = (op <= SLJIT_MOVU_P) ? dst_reg : TMP_REG2;
 
-       if (flags & (FAST_DEST | SLOW_DEST)) {
-               if (flags & FAST_DEST)
-                       FAIL_IF(getput_arg_fast(compiler, inp_flags, dst_r, 
dst, dstw));
+               if (FAST_IS_REG(src2))
+                       src2_reg = src2;
+               else if (src2 & SLJIT_MEM)
+                       FAIL_IF(emit_op_mem(compiler, inp_flags | LOAD_DATA, 
src2_reg, src2, src2w, TMP_REG2));
                else
-                       FAIL_IF(getput_arg(compiler, inp_flags, dst_r, dst, 
dstw, 0, 0));
+                       FAIL_IF(load_immediate(compiler, src2_reg, src2w));
        }
-       return SLJIT_SUCCESS;
+
+       FAIL_IF(emit_single_op(compiler, op, flags, dst_reg, src1_reg, 
src2_reg));
+
+       if (!(dst & SLJIT_MEM))
+               return SLJIT_SUCCESS;
+
+       return emit_op_mem(compiler, inp_flags, dst_reg, dst, dstw, TMP_REG1);
 }
 
 #ifdef __cplusplus
@@ -1806,6 +1586,9 @@ extern int __aeabi_idivmod(int numerator
 
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op0(struct sljit_compiler 
*compiler, sljit_s32 op)
 {
+       sljit_sw saved_reg_list[3];
+       sljit_sw saved_reg_count;
+
        CHECK_ERROR();
        CHECK(check_sljit_emit_op0(compiler, op));
 
@@ -1819,33 +1602,38 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit
                break;
        case SLJIT_LMUL_UW:
        case SLJIT_LMUL_SW:
-#if (defined SLJIT_CONFIG_ARM_V7 && SLJIT_CONFIG_ARM_V7)
                return push_inst(compiler, (op == SLJIT_LMUL_UW ? UMULL : SMULL)
                        | (reg_map[SLJIT_R1] << 16)
                        | (reg_map[SLJIT_R0] << 12)
                        | (reg_map[SLJIT_R0] << 8)
                        | reg_map[SLJIT_R1]);
-#else
-               FAIL_IF(push_inst(compiler, EMIT_DATA_PROCESS_INS(MOV_DP, 0, 
TMP_REG1, SLJIT_UNUSED, RM(SLJIT_R1))));
-               return push_inst(compiler, (op == SLJIT_LMUL_UW ? UMULL : SMULL)
-                       | (reg_map[SLJIT_R1] << 16)
-                       | (reg_map[SLJIT_R0] << 12)
-                       | (reg_map[SLJIT_R0] << 8)
-                       | reg_map[TMP_REG1]);
-#endif
        case SLJIT_DIVMOD_UW:
        case SLJIT_DIVMOD_SW:
        case SLJIT_DIV_UW:
        case SLJIT_DIV_SW:
                SLJIT_COMPILE_ASSERT((SLJIT_DIVMOD_UW & 0x2) == 0 && 
SLJIT_DIV_UW - 0x2 == SLJIT_DIVMOD_UW, bad_div_opcode_assignments);
-               SLJIT_COMPILE_ASSERT(reg_map[2] == 1 && reg_map[3] == 2, 
bad_register_mapping);
+               SLJIT_ASSERT(reg_map[2] == 1 && reg_map[3] == 2 && reg_map[4] 
== 3);
 
-               if ((op >= SLJIT_DIV_UW) && (compiler->scratches >= 3)) {
-                       FAIL_IF(push_inst(compiler, 0xe52d2008 /* str r2, [sp, 
#-8]! */));
-                       FAIL_IF(push_inst(compiler, 0xe58d1004 /* str r1, [sp, 
#4] */));
+               saved_reg_count = 0;
+               if (compiler->scratches >= 4)
+                       saved_reg_list[saved_reg_count++] = 3;
+               if (compiler->scratches >= 3)
+                       saved_reg_list[saved_reg_count++] = 2;
+               if (op >= SLJIT_DIV_UW)
+                       saved_reg_list[saved_reg_count++] = 1;
+
+               if (saved_reg_count > 0) {
+                       FAIL_IF(push_inst(compiler, 0xe52d0000 | 
(saved_reg_count >= 3 ? 16 : 8)
+                                               | (saved_reg_list[0] << 12) /* 
str rX, [sp, #-8/-16]! */));
+                       if (saved_reg_count >= 2) {
+                               SLJIT_ASSERT(saved_reg_list[1] < 8);
+                               FAIL_IF(push_inst(compiler, 0xe58d0004 | 
(saved_reg_list[1] << 12) /* str rX, [sp, #4] */));
+                       }
+                       if (saved_reg_count >= 3) {
+                               SLJIT_ASSERT(saved_reg_list[2] < 8);
+                               FAIL_IF(push_inst(compiler, 0xe58d0008 | 
(saved_reg_list[2] << 12) /* str rX, [sp, #8] */));
+                       }
                }
-               else if ((op >= SLJIT_DIV_UW) || (compiler->scratches >= 3))
-                       FAIL_IF(push_inst(compiler, 0xe52d0008 | (op >= 
SLJIT_DIV_UW ? 0x1000 : 0x2000) /* str r1/r2, [sp, #-8]! */));
 
 #if defined(__GNUC__)
                FAIL_IF(sljit_emit_ijump(compiler, SLJIT_FAST_CALL, SLJIT_IMM,
@@ -1854,12 +1642,18 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit
 #error "Software divmod functions are needed"
 #endif
 
-               if ((op >= SLJIT_DIV_UW) && (compiler->scratches >= 3)) {
-                       FAIL_IF(push_inst(compiler, 0xe59d1004 /* ldr r1, [sp, 
#4] */));
-                       FAIL_IF(push_inst(compiler, 0xe49d2008 /* ldr r2, [sp], 
#8 */));
+               if (saved_reg_count > 0) {
+                       if (saved_reg_count >= 3) {
+                               SLJIT_ASSERT(saved_reg_list[2] < 8);
+                               FAIL_IF(push_inst(compiler, 0xe59d0008 | 
(saved_reg_list[2] << 12) /* ldr rX, [sp, #8] */));
+                       }
+                       if (saved_reg_count >= 2) {
+                               SLJIT_ASSERT(saved_reg_list[1] < 8);
+                               FAIL_IF(push_inst(compiler, 0xe59d0004 | 
(saved_reg_list[1] << 12) /* ldr rX, [sp, #4] */));
+                       }
+                       return push_inst(compiler, 0xe49d0000 | 
(saved_reg_count >= 3 ? 16 : 8)
+                                               | (saved_reg_list[0] << 12) /* 
ldr rX, [sp], #8/16 */);
                }
-               else if ((op >= SLJIT_DIV_UW) || (compiler->scratches >= 3))
-                       return push_inst(compiler, 0xe49d0008 | (op >= 
SLJIT_DIV_UW ? 0x1000 : 0x2000) /* ldr r1/r2, [sp], #8 */);
                return SLJIT_SUCCESS;
        }
 
@@ -1875,6 +1669,14 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit
        ADJUST_LOCAL_OFFSET(dst, dstw);
        ADJUST_LOCAL_OFFSET(src, srcw);
 
+       if (dst == SLJIT_UNUSED && !HAS_FLAGS(op)) {
+#if (defined SLJIT_CONFIG_ARM_V7 && SLJIT_CONFIG_ARM_V7)
+               if (op <= SLJIT_MOV_P && (src & SLJIT_MEM))
+                       return emit_op_mem(compiler, PRELOAD_DATA | LOAD_DATA, 
TMP_PC, src, srcw, TMP_REG1);
+#endif
+               return SLJIT_SUCCESS;
+       }
+
        switch (GET_OPCODE(op)) {
        case SLJIT_MOV:
        case SLJIT_MOV_U32:
@@ -1940,6 +1742,9 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit
        ADJUST_LOCAL_OFFSET(src1, src1w);
        ADJUST_LOCAL_OFFSET(src2, src2w);
 
+       if (dst == SLJIT_UNUSED && !HAS_FLAGS(op))
+               return SLJIT_SUCCESS;
+
        switch (GET_OPCODE(op)) {
        case SLJIT_ADD:
        case SLJIT_ADDC:
@@ -1996,43 +1801,6 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit
 /*  Floating point operators                                             */
 /* --------------------------------------------------------------------- */
 
-#if (defined SLJIT_CONFIG_ARM_V5 && SLJIT_CONFIG_ARM_V5)
-
-/* 0 - no fpu
-   1 - vfp */
-static sljit_s32 arm_fpu_type = -1;
-
-static void init_compiler(void)
-{
-       if (arm_fpu_type != -1)
-               return;
-
-       /* TODO: Only the OS can help to determine the correct fpu type. */
-       arm_fpu_type = 1;
-}
-
-SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_is_fpu_available(void)
-{
-#ifdef SLJIT_IS_FPU_AVAILABLE
-       return SLJIT_IS_FPU_AVAILABLE;
-#else
-       if (arm_fpu_type == -1)
-               init_compiler();
-       return arm_fpu_type;
-#endif
-}
-
-#else
-
-#define arm_fpu_type 1
-
-SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_is_fpu_available(void)
-{
-       /* Always available. */
-       return 1;
-}
-
-#endif
 
 #define FPU_LOAD (1 << 20)
 #define EMIT_FPU_DATA_TRANSFER(inst, add, base, freg, offs) \
@@ -2042,72 +1810,54 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit
 
 static sljit_s32 emit_fop_mem(struct sljit_compiler *compiler, sljit_s32 
flags, sljit_s32 reg, sljit_s32 arg, sljit_sw argw)
 {
-       sljit_sw tmp;
        sljit_uw imm;
        sljit_sw inst = VSTR_F32 | (flags & (SLJIT_F32_OP | FPU_LOAD));
+
        SLJIT_ASSERT(arg & SLJIT_MEM);
+       arg &= ~SLJIT_MEM;
 
        if (SLJIT_UNLIKELY(arg & OFFS_REG_MASK)) {
-               FAIL_IF(push_inst(compiler, EMIT_DATA_PROCESS_INS(ADD_DP, 0, 
TMP_REG1, arg & REG_MASK, RM(OFFS_REG(arg)) | ((argw & 0x3) << 7))));
-               arg = SLJIT_MEM | TMP_REG1;
+               FAIL_IF(push_inst(compiler, EMIT_DATA_PROCESS_INS(ADD_DP, 0, 
TMP_REG2, arg & REG_MASK, RM(OFFS_REG(arg)) | ((argw & 0x3) << 7))));
+               arg = TMP_REG2;
                argw = 0;
        }
 
        /* Fast loads and stores. */
-       if ((arg & REG_MASK)) {
+       if (arg) {
                if (!(argw & ~0x3fc))
                        return push_inst(compiler, EMIT_FPU_DATA_TRANSFER(inst, 
1, arg & REG_MASK, reg, argw >> 2));
                if (!(-argw & ~0x3fc))
                        return push_inst(compiler, EMIT_FPU_DATA_TRANSFER(inst, 
0, arg & REG_MASK, reg, (-argw) >> 2));
-       }
 
-       if (compiler->cache_arg == arg) {
-               tmp = argw - compiler->cache_argw;
-               if (!(tmp & ~0x3fc))
-                       return push_inst(compiler, EMIT_FPU_DATA_TRANSFER(inst, 
1, TMP_REG3, reg, tmp >> 2));
-               if (!(-tmp & ~0x3fc))
-                       return push_inst(compiler, EMIT_FPU_DATA_TRANSFER(inst, 
0, TMP_REG3, reg, -tmp >> 2));
-               if (emit_set_delta(compiler, TMP_REG3, TMP_REG3, tmp) != 
SLJIT_ERR_UNSUPPORTED) {
-                       FAIL_IF(compiler->error);
-                       compiler->cache_argw = argw;
-                       return push_inst(compiler, EMIT_FPU_DATA_TRANSFER(inst, 
1, TMP_REG3, reg, 0));
-               }
-       }
-
-       if (arg & REG_MASK) {
-               if (emit_set_delta(compiler, TMP_REG1, arg & REG_MASK, argw) != 
SLJIT_ERR_UNSUPPORTED) {
-                       FAIL_IF(compiler->error);
-                       return push_inst(compiler, EMIT_FPU_DATA_TRANSFER(inst, 
1, TMP_REG1, reg, 0));
-               }
                imm = get_imm(argw & ~0x3fc);
                if (imm) {
-                       FAIL_IF(push_inst(compiler, 
EMIT_DATA_PROCESS_INS(ADD_DP, 0, TMP_REG1, arg & REG_MASK, imm)));
-                       return push_inst(compiler, EMIT_FPU_DATA_TRANSFER(inst, 
1, TMP_REG1, reg, (argw & 0x3fc) >> 2));
+                       FAIL_IF(push_inst(compiler, 
EMIT_DATA_PROCESS_INS(ADD_DP, 0, TMP_REG2, arg & REG_MASK, imm)));
+                       return push_inst(compiler, EMIT_FPU_DATA_TRANSFER(inst, 
1, TMP_REG2, reg, (argw & 0x3fc) >> 2));
                }
                imm = get_imm(-argw & ~0x3fc);
                if (imm) {
                        argw = -argw;
-                       FAIL_IF(push_inst(compiler, 
EMIT_DATA_PROCESS_INS(SUB_DP, 0, TMP_REG1, arg & REG_MASK, imm)));
-                       return push_inst(compiler, EMIT_FPU_DATA_TRANSFER(inst, 
0, TMP_REG1, reg, (argw & 0x3fc) >> 2));
+                       FAIL_IF(push_inst(compiler, 
EMIT_DATA_PROCESS_INS(SUB_DP, 0, TMP_REG2, arg & REG_MASK, imm)));
+                       return push_inst(compiler, EMIT_FPU_DATA_TRANSFER(inst, 
0, TMP_REG2, reg, (argw & 0x3fc) >> 2));
                }
        }
 
-       compiler->cache_arg = arg;
-       compiler->cache_argw = argw;
-       if (arg & REG_MASK) {
-               FAIL_IF(load_immediate(compiler, TMP_REG1, argw));
-               FAIL_IF(push_inst(compiler, EMIT_DATA_PROCESS_INS(ADD_DP, 0, 
TMP_REG3, arg & REG_MASK, reg_map[TMP_REG1])));
+       if (arg) {
+               FAIL_IF(load_immediate(compiler, TMP_REG2, argw));
+               FAIL_IF(push_inst(compiler, EMIT_DATA_PROCESS_INS(ADD_DP, 0, 
TMP_REG2, arg & REG_MASK, RM(TMP_REG2))));
        }
        else
-               FAIL_IF(load_immediate(compiler, TMP_REG3, argw));
+               FAIL_IF(load_immediate(compiler, TMP_REG2, argw));
 
-       return push_inst(compiler, EMIT_FPU_DATA_TRANSFER(inst, 1, TMP_REG3, 
reg, 0));
+       return push_inst(compiler, EMIT_FPU_DATA_TRANSFER(inst, 1, TMP_REG2, 
reg, 0));
 }
 
 static SLJIT_INLINE sljit_s32 sljit_emit_fop1_conv_sw_from_f64(struct 
sljit_compiler *compiler, sljit_s32 op,
        sljit_s32 dst, sljit_sw dstw,
        sljit_s32 src, sljit_sw srcw)
 {
+       op ^= SLJIT_F32_OP;
+
        if (src & SLJIT_MEM) {
                FAIL_IF(emit_fop_mem(compiler, (op & SLJIT_F32_OP) | FPU_LOAD, 
TMP_FREG1, src, srcw));
                src = TMP_FREG1;
@@ -2115,9 +1865,6 @@ static SLJIT_INLINE sljit_s32 sljit_emit
 
        FAIL_IF(push_inst(compiler, EMIT_FPU_OPERATION(VCVT_S32_F32, op & 
SLJIT_F32_OP, TMP_FREG1, src, 0)));
 
-       if (dst == SLJIT_UNUSED)
-               return SLJIT_SUCCESS;
-
        if (FAST_IS_REG(dst))
                return push_inst(compiler, VMOV | (1 << 20) | RD(dst) | 
(TMP_FREG1 << 16));
 
@@ -2131,6 +1878,8 @@ static SLJIT_INLINE sljit_s32 sljit_emit
 {
        sljit_s32 dst_r = FAST_IS_REG(dst) ? dst : TMP_FREG1;
 
+       op ^= SLJIT_F32_OP;
+
        if (FAST_IS_REG(src))
                FAIL_IF(push_inst(compiler, VMOV | RD(src) | (TMP_FREG1 << 
16)));
        else if (src & SLJIT_MEM) {
@@ -2153,6 +1902,8 @@ static SLJIT_INLINE sljit_s32 sljit_emit
        sljit_s32 src1, sljit_sw src1w,
        sljit_s32 src2, sljit_sw src2w)
 {
+       op ^= SLJIT_F32_OP;
+
        if (src1 & SLJIT_MEM) {
                FAIL_IF(emit_fop_mem(compiler, (op & SLJIT_F32_OP) | FPU_LOAD, 
TMP_FREG1, src1, src1w));
                src1 = TMP_FREG1;
@@ -2174,16 +1925,15 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit
        sljit_s32 dst_r;
 
        CHECK_ERROR();
-       compiler->cache_arg = 0;
-       compiler->cache_argw = 0;
-       if (GET_OPCODE(op) != SLJIT_CONV_F64_FROM_F32)
-               op ^= SLJIT_F32_OP;
 
        SLJIT_COMPILE_ASSERT((SLJIT_F32_OP == 0x100), float_transfer_bit_error);
        SELECT_FOP1_OPERATION_WITH_CHECKS(compiler, op, dst, dstw, src, srcw);
 
        dst_r = FAST_IS_REG(dst) ? dst : TMP_FREG1;
 
+       if (GET_OPCODE(op) != SLJIT_CONV_F64_FROM_F32)
+               op ^= SLJIT_F32_OP;
+
        if (src & SLJIT_MEM) {
                FAIL_IF(emit_fop_mem(compiler, (op & SLJIT_F32_OP) | FPU_LOAD, 
dst_r, src, srcw));
                src = dst_r;
@@ -2228,8 +1978,6 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit
        ADJUST_LOCAL_OFFSET(src1, src1w);
        ADJUST_LOCAL_OFFSET(src2, src2w);
 
-       compiler->cache_arg = 0;
-       compiler->cache_argw = 0;
        op ^= SLJIT_F32_OP;
 
        dst_r = FAST_IS_REG(dst) ? dst : TMP_FREG1;
@@ -2282,21 +2030,13 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit
        CHECK(check_sljit_emit_fast_enter(compiler, dst, dstw));
        ADJUST_LOCAL_OFFSET(dst, dstw);
 
-       /* For UNUSED dst. Uncommon, but possible. */
-       if (dst == SLJIT_UNUSED)
-               return SLJIT_SUCCESS;
+       SLJIT_ASSERT(reg_map[TMP_REG1] == 14);
 
        if (FAST_IS_REG(dst))
-               return push_inst(compiler, EMIT_DATA_PROCESS_INS(MOV_DP, 0, 
dst, SLJIT_UNUSED, RM(TMP_REG3)));
+               return push_inst(compiler, EMIT_DATA_PROCESS_INS(MOV_DP, 0, 
dst, SLJIT_UNUSED, RM(TMP_REG1)));
 
        /* Memory. */
-       if (getput_arg_fast(compiler, WORD_DATA, TMP_REG3, dst, dstw))
-               return compiler->error;
-       /* TMP_REG3 is used for caching. */
-       FAIL_IF(push_inst(compiler, EMIT_DATA_PROCESS_INS(MOV_DP, 0, TMP_REG2, 
SLJIT_UNUSED, RM(TMP_REG3))));
-       compiler->cache_arg = 0;
-       compiler->cache_argw = 0;
-       return getput_arg(compiler, WORD_DATA, TMP_REG2, dst, dstw, 0, 0);
+       return emit_op_mem(compiler, WORD_DATA, TMP_REG1, dst, dstw, TMP_REG2);
 }
 
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fast_return(struct 
sljit_compiler *compiler, sljit_s32 src, sljit_sw srcw)
@@ -2305,21 +2045,16 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit
        CHECK(check_sljit_emit_fast_return(compiler, src, srcw));
        ADJUST_LOCAL_OFFSET(src, srcw);
 
+       SLJIT_ASSERT(reg_map[TMP_REG1] == 14);
+
        if (FAST_IS_REG(src))
-               FAIL_IF(push_inst(compiler, EMIT_DATA_PROCESS_INS(MOV_DP, 0, 
TMP_REG3, SLJIT_UNUSED, RM(src))));
-       else if (src & SLJIT_MEM) {
-               if (getput_arg_fast(compiler, WORD_DATA | LOAD_DATA, TMP_REG3, 
src, srcw))
-                       FAIL_IF(compiler->error);
-               else {
-                       compiler->cache_arg = 0;
-                       compiler->cache_argw = 0;
-                       FAIL_IF(getput_arg(compiler, WORD_DATA | LOAD_DATA, 
TMP_REG2, src, srcw, 0, 0));
-                       FAIL_IF(push_inst(compiler, 
EMIT_DATA_PROCESS_INS(MOV_DP, 0, TMP_REG3, SLJIT_UNUSED, RM(TMP_REG2))));
-               }
-       }
+               FAIL_IF(push_inst(compiler, EMIT_DATA_PROCESS_INS(MOV_DP, 0, 
TMP_REG1, 0, RM(src))));
+       else if (src & SLJIT_MEM)
+               FAIL_IF(emit_op_mem(compiler, WORD_DATA | LOAD_DATA, TMP_REG1, 
src, srcw, TMP_REG2));
        else if (src & SLJIT_IMM)
-               FAIL_IF(load_immediate(compiler, TMP_REG3, srcw));
-       return push_inst(compiler, BLX | RM(TMP_REG3));
+               FAIL_IF(load_immediate(compiler, TMP_REG1, srcw));
+
+       return push_inst(compiler, BX | RM(TMP_REG1));
 }
 
 /* --------------------------------------------------------------------- */
@@ -2414,7 +2149,7 @@ SLJIT_API_FUNC_ATTRIBUTE struct sljit_ju
        if (type >= SLJIT_FAST_CALL)
                PTR_FAIL_IF(prepare_blx(compiler));
        PTR_FAIL_IF(push_inst_with_unique_literal(compiler, 
((EMIT_DATA_TRANSFER(WORD_DATA | LOAD_DATA, 1, 0,
-               type <= SLJIT_JUMP ? TMP_PC : TMP_REG1, TMP_PC, 0)) & 
~COND_MASK) | get_cc(type), 0));
+               type <= SLJIT_JUMP ? TMP_PC : TMP_REG2, TMP_PC, 0)) & 
~COND_MASK) | get_cc(type), 0));
 
        if (jump->flags & SLJIT_REWRITABLE_JUMP) {
                jump->addr = compiler->size;
@@ -2431,8 +2166,8 @@ SLJIT_API_FUNC_ATTRIBUTE struct sljit_ju
 #else
        if (type >= SLJIT_FAST_CALL)
                jump->flags |= IS_BL;
-       PTR_FAIL_IF(emit_imm(compiler, TMP_REG1, 0));
-       PTR_FAIL_IF(push_inst(compiler, (((type <= SLJIT_JUMP ? BX : BLX) | 
RM(TMP_REG1)) & ~COND_MASK) | get_cc(type)));
+       PTR_FAIL_IF(emit_imm(compiler, TMP_REG2, 0));
+       PTR_FAIL_IF(push_inst(compiler, (((type <= SLJIT_JUMP ? BX : BLX) | 
RM(TMP_REG2)) & ~COND_MASK) | get_cc(type)));
        jump->addr = compiler->size;
 #endif
        return jump;
@@ -2452,7 +2187,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit
                        return push_inst(compiler, (type <= SLJIT_JUMP ? BX : 
BLX) | RM(src));
 
                SLJIT_ASSERT(src & SLJIT_MEM);
-               FAIL_IF(emit_op_mem(compiler, WORD_DATA | LOAD_DATA, TMP_REG2, 
src, srcw));
+               FAIL_IF(emit_op_mem(compiler, WORD_DATA | LOAD_DATA, TMP_REG2, 
src, srcw, TMP_REG2));
                return push_inst(compiler, (type <= SLJIT_JUMP ? BX : BLX) | 
RM(TMP_REG2));
        }
 
@@ -2464,12 +2199,12 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit
 #if (defined SLJIT_CONFIG_ARM_V5 && SLJIT_CONFIG_ARM_V5)
        if (type >= SLJIT_FAST_CALL)
                FAIL_IF(prepare_blx(compiler));
-       FAIL_IF(push_inst_with_unique_literal(compiler, 
EMIT_DATA_TRANSFER(WORD_DATA | LOAD_DATA, 1, 0, type <= SLJIT_JUMP ? TMP_PC : 
TMP_REG1, TMP_PC, 0), 0));
+       FAIL_IF(push_inst_with_unique_literal(compiler, 
EMIT_DATA_TRANSFER(WORD_DATA | LOAD_DATA, 1, 0, type <= SLJIT_JUMP ? TMP_PC : 
TMP_REG2, TMP_PC, 0), 0));
        if (type >= SLJIT_FAST_CALL)
                FAIL_IF(emit_blx(compiler));
 #else
-       FAIL_IF(emit_imm(compiler, TMP_REG1, 0));
-       FAIL_IF(push_inst(compiler, (type <= SLJIT_JUMP ? BX : BLX) | 
RM(TMP_REG1)));
+       FAIL_IF(emit_imm(compiler, TMP_REG2, 0));
+       FAIL_IF(push_inst(compiler, (type <= SLJIT_JUMP ? BX : BLX) | 
RM(TMP_REG2)));
 #endif
        jump->addr = compiler->size;
        return SLJIT_SUCCESS;
@@ -2477,55 +2212,80 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit
 
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_flags(struct sljit_compiler 
*compiler, sljit_s32 op,
        sljit_s32 dst, sljit_sw dstw,
-       sljit_s32 src, sljit_sw srcw,
        sljit_s32 type)
 {
        sljit_s32 dst_r, flags = GET_ALL_FLAGS(op);
        sljit_uw cc, ins;
 
        CHECK_ERROR();
-       CHECK(check_sljit_emit_op_flags(compiler, op, dst, dstw, src, srcw, 
type));
+       CHECK(check_sljit_emit_op_flags(compiler, op, dst, dstw, type));
        ADJUST_LOCAL_OFFSET(dst, dstw);
-       ADJUST_LOCAL_OFFSET(src, srcw);
-
-       if (dst == SLJIT_UNUSED)
-               return SLJIT_SUCCESS;
 
        op = GET_OPCODE(op);
        cc = get_cc(type & 0xff);
-       dst_r = FAST_IS_REG(dst) ? dst : TMP_REG2;
+       dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
 
        if (op < SLJIT_ADD) {
                FAIL_IF(push_inst(compiler, EMIT_DATA_PROCESS_INS(MOV_DP, 0, 
dst_r, SLJIT_UNUSED, SRC2_IMM | 0)));
                FAIL_IF(push_inst(compiler, (EMIT_DATA_PROCESS_INS(MOV_DP, 0, 
dst_r, SLJIT_UNUSED, SRC2_IMM | 1) & ~COND_MASK) | cc));
-               return (dst_r == TMP_REG2) ? emit_op_mem(compiler, WORD_DATA, 
TMP_REG2, dst, dstw) : SLJIT_SUCCESS;
+               if (dst & SLJIT_MEM)
+                       return emit_op_mem(compiler, WORD_DATA, TMP_REG1, dst, 
dstw, TMP_REG2);
+               return SLJIT_SUCCESS;
        }
 
        ins = (op == SLJIT_AND ? AND_DP : (op == SLJIT_OR ? ORR_DP : EOR_DP));
-       if ((op == SLJIT_OR || op == SLJIT_XOR) && FAST_IS_REG(dst) && dst == 
src) {
-               FAIL_IF(push_inst(compiler, (EMIT_DATA_PROCESS_INS(ins, 0, dst, 
dst, SRC2_IMM | 1) & ~COND_MASK) | cc));
-               /* The condition must always be set, even if the ORR/EOR is not 
executed above. */
-               return (flags & SLJIT_SET_E) ? push_inst(compiler, 
EMIT_DATA_PROCESS_INS(MOV_DP, SET_FLAGS, TMP_REG1, SLJIT_UNUSED, RM(dst))) : 
SLJIT_SUCCESS;
-       }
 
-       compiler->cache_arg = 0;
-       compiler->cache_argw = 0;
-       if (src & SLJIT_MEM) {
-               FAIL_IF(emit_op_mem2(compiler, WORD_DATA | LOAD_DATA, TMP_REG1, 
src, srcw, dst, dstw));
-               src = TMP_REG1;
-               srcw = 0;
-       } else if (src & SLJIT_IMM) {
+       if (dst & SLJIT_MEM)
+               FAIL_IF(emit_op_mem(compiler, WORD_DATA | LOAD_DATA, TMP_REG1, 
dst, dstw, TMP_REG2));
+
+       FAIL_IF(push_inst(compiler, (EMIT_DATA_PROCESS_INS(ins, 0, dst_r, 
dst_r, SRC2_IMM | 1) & ~COND_MASK) | cc));
+
+       if (op == SLJIT_AND)
+               FAIL_IF(push_inst(compiler, (EMIT_DATA_PROCESS_INS(ins, 0, 
dst_r, dst_r, SRC2_IMM | 0) & ~COND_MASK) | (cc ^ 0x10000000)));
+
+       if (dst & SLJIT_MEM)
+               FAIL_IF(emit_op_mem(compiler, WORD_DATA, TMP_REG1, dst, dstw, 
TMP_REG2));
+
+       if (flags & SLJIT_SET_Z)
+               return push_inst(compiler, EMIT_DATA_PROCESS_INS(MOV_DP, 
SET_FLAGS, TMP_REG2, SLJIT_UNUSED, RM(dst_r)));
+       return SLJIT_SUCCESS;
+}
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_cmov(struct sljit_compiler 
*compiler, sljit_s32 type,
+       sljit_s32 dst_reg,
+       sljit_s32 src, sljit_sw srcw)
+{
+       sljit_uw cc, tmp;
+
+       CHECK_ERROR();
+       CHECK(check_sljit_emit_cmov(compiler, type, dst_reg, src, srcw));
+
+       dst_reg &= ~SLJIT_I32_OP;
+
+       cc = get_cc(type & 0xff);
+
+       if (SLJIT_UNLIKELY(src & SLJIT_IMM)) {
+               tmp = get_imm(srcw);
+               if (tmp)
+                       return push_inst(compiler, 
(EMIT_DATA_PROCESS_INS(MOV_DP, 0, dst_reg, SLJIT_UNUSED, tmp) & ~COND_MASK) | 
cc);
+
+               tmp = get_imm(~srcw);
+               if (tmp)
+                       return push_inst(compiler, 
(EMIT_DATA_PROCESS_INS(MVN_DP, 0, dst_reg, SLJIT_UNUSED, tmp) & ~COND_MASK) | 
cc);
+
+#if (defined SLJIT_CONFIG_ARM_V7 && SLJIT_CONFIG_ARM_V7)
+               tmp = (sljit_uw) srcw;
+               FAIL_IF(push_inst(compiler, (MOVW & ~COND_MASK) | cc | 
RD(dst_reg) | ((tmp << 4) & 0xf0000) | (tmp & 0xfff)));
+               if (tmp <= 0xffff)
+                       return SLJIT_SUCCESS;
+               return push_inst(compiler, (MOVT & ~COND_MASK) | cc | 
RD(dst_reg) | ((tmp >> 12) & 0xf0000) | ((tmp >> 16) & 0xfff));
+#else
                FAIL_IF(load_immediate(compiler, TMP_REG1, srcw));
                src = TMP_REG1;
-               srcw = 0;
+#endif
        }
 
-       FAIL_IF(push_inst(compiler, (EMIT_DATA_PROCESS_INS(ins, 0, dst_r, src, 
SRC2_IMM | 1) & ~COND_MASK) | cc));
-       FAIL_IF(push_inst(compiler, (EMIT_DATA_PROCESS_INS(ins, 0, dst_r, src, 
SRC2_IMM | 0) & ~COND_MASK) | (cc ^ 0x10000000)));
-       if (dst_r == TMP_REG2)
-               FAIL_IF(emit_op_mem2(compiler, WORD_DATA, TMP_REG2, dst, dstw, 
0, 0));
-
-       return (flags & SLJIT_SET_E) ? push_inst(compiler, 
EMIT_DATA_PROCESS_INS(MOV_DP, SET_FLAGS, TMP_REG1, SLJIT_UNUSED, RM(dst_r))) : 
SLJIT_SUCCESS;
+       return push_inst(compiler, (EMIT_DATA_PROCESS_INS(MOV_DP, 0, dst_reg, 
SLJIT_UNUSED, RM(src)) & ~COND_MASK) | cc);
 }
 
 SLJIT_API_FUNC_ATTRIBUTE struct sljit_const* sljit_emit_const(struct 
sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw, sljit_sw init_value)
@@ -2551,16 +2311,16 @@ SLJIT_API_FUNC_ATTRIBUTE struct sljit_co
        set_const(const_, compiler);
 
        if (dst & SLJIT_MEM)
-               PTR_FAIL_IF(emit_op_mem(compiler, WORD_DATA, TMP_REG2, dst, 
dstw));
+               PTR_FAIL_IF(emit_op_mem(compiler, WORD_DATA, TMP_REG2, dst, 
dstw, TMP_REG1));
        return const_;
 }
 
-SLJIT_API_FUNC_ATTRIBUTE void sljit_set_jump_addr(sljit_uw addr, sljit_uw 
new_addr)
+SLJIT_API_FUNC_ATTRIBUTE void sljit_set_jump_addr(sljit_uw addr, sljit_uw 
new_target, sljit_sw executable_offset)
 {
-       inline_set_jump_addr(addr, new_addr, 1);
+       inline_set_jump_addr(addr, executable_offset, new_target, 1);
 }
 
-SLJIT_API_FUNC_ATTRIBUTE void sljit_set_const(sljit_uw addr, sljit_sw 
new_constant)
+SLJIT_API_FUNC_ATTRIBUTE void sljit_set_const(sljit_uw addr, sljit_sw 
new_constant, sljit_sw executable_offset)
 {
-       inline_set_const(addr, new_constant, 1);
+       inline_set_const(addr, executable_offset, new_constant, 1);
 }

Modified: tomcat/jk/trunk/native/iis/pcre/sljit/sljitNativeARM_64.c
URL: 
http://svn.apache.org/viewvc/tomcat/jk/trunk/native/iis/pcre/sljit/sljitNativeARM_64.c?rev=1815927&r1=1815926&r2=1815927&view=diff
==============================================================================
--- tomcat/jk/trunk/native/iis/pcre/sljit/sljitNativeARM_64.c (original)
+++ tomcat/jk/trunk/native/iis/pcre/sljit/sljitNativeARM_64.c Tue Nov 21 
14:37:37 2017
@@ -1,7 +1,7 @@
 /*
  *    Stack-less Just-In-Time compiler
  *
- *    Copyright 2009-2012 Zoltan Herczeg (hzmes...@freemail.hu). All rights 
reserved.
+ *    Copyright Zoltan Herczeg (hzmes...@freemail.hu). All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without 
modification, are
  * permitted provided that the following conditions are met:
@@ -76,6 +76,7 @@ static const sljit_u8 reg_map[SLJIT_NUMB
 #define BRK 0xd4200000
 #define CBZ 0xb4000000
 #define CLZ 0xdac01000
+#define CSEL 0x9a800000
 #define CSINC 0x9a800400
 #define EOR 0xca000000
 #define EORI 0xd2000000
@@ -151,7 +152,7 @@ static SLJIT_INLINE void modify_imm64_co
        inst[3] = MOVK | dst | ((new_imm >> 48) << 5) | (3 << 21);
 }
 
-static SLJIT_INLINE sljit_s32 detect_jump_type(struct sljit_jump *jump, 
sljit_ins *code_ptr, sljit_ins *code)
+static SLJIT_INLINE sljit_s32 detect_jump_type(struct sljit_jump *jump, 
sljit_ins *code_ptr, sljit_ins *code, sljit_sw executable_offset)
 {
        sljit_sw diff;
        sljit_uw target_addr;
@@ -165,9 +166,10 @@ static SLJIT_INLINE sljit_s32 detect_jum
                target_addr = jump->u.target;
        else {
                SLJIT_ASSERT(jump->flags & JUMP_LABEL);
-               target_addr = (sljit_uw)(code + jump->u.label->size);
+               target_addr = (sljit_uw)(code + jump->u.label->size) + 
(sljit_uw)executable_offset;
        }
-       diff = (sljit_sw)target_addr - (sljit_sw)(code_ptr + 4);
+
+       diff = (sljit_sw)target_addr - (sljit_sw)(code_ptr + 4) - 
executable_offset;
 
        if (jump->flags & IS_COND) {
                diff += sizeof(sljit_ins);
@@ -211,6 +213,7 @@ SLJIT_API_FUNC_ATTRIBUTE void* sljit_gen
        sljit_ins *buf_ptr;
        sljit_ins *buf_end;
        sljit_uw word_count;
+       sljit_sw executable_offset;
        sljit_uw addr;
        sljit_s32 dst;
 
@@ -228,6 +231,8 @@ SLJIT_API_FUNC_ATTRIBUTE void* sljit_gen
 
        code_ptr = code;
        word_count = 0;
+       executable_offset = SLJIT_EXEC_OFFSET(code);
+
        label = compiler->labels;
        jump = compiler->jumps;
        const_ = compiler->consts;
@@ -242,13 +247,13 @@ SLJIT_API_FUNC_ATTRIBUTE void* sljit_gen
                        SLJIT_ASSERT(!jump || jump->addr >= word_count);
                        SLJIT_ASSERT(!const_ || const_->addr >= word_count);
                        if (label && label->size == word_count) {
-                               label->addr = (sljit_uw)code_ptr;
+                               label->addr = 
(sljit_uw)SLJIT_ADD_EXEC_OFFSET(code_ptr, executable_offset);
                                label->size = code_ptr - code;
                                label = label->next;
                        }
                        if (jump && jump->addr == word_count) {
                                        jump->addr = (sljit_uw)(code_ptr - 4);
-                                       code_ptr -= detect_jump_type(jump, 
code_ptr, code);
+                                       code_ptr -= detect_jump_type(jump, 
code_ptr, code, executable_offset);
                                        jump = jump->next;
                        }
                        if (const_ && const_->addr == word_count) {
@@ -263,7 +268,7 @@ SLJIT_API_FUNC_ATTRIBUTE void* sljit_gen
        } while (buf);
 
        if (label && label->size == word_count) {
-               label->addr = (sljit_uw)code_ptr;
+               label->addr = (sljit_uw)SLJIT_ADD_EXEC_OFFSET(code_ptr, 
executable_offset);
                label->size = code_ptr - code;
                label = label->next;
        }
@@ -277,9 +282,10 @@ SLJIT_API_FUNC_ATTRIBUTE void* sljit_gen
        while (jump) {
                do {
                        addr = (jump->flags & JUMP_LABEL) ? jump->u.label->addr 
: jump->u.target;
-                       buf_ptr = (sljit_ins*)jump->addr;
+                       buf_ptr = (sljit_ins *)jump->addr;
+
                        if (jump->flags & PATCH_B) {
-                               addr = (sljit_sw)(addr - jump->addr) >> 2;
+                               addr = (sljit_sw)(addr - 
(sljit_uw)SLJIT_ADD_EXEC_OFFSET(buf_ptr, executable_offset)) >> 2;
                                SLJIT_ASSERT((sljit_sw)addr <= 0x1ffffff && 
(sljit_sw)addr >= -0x2000000);
                                buf_ptr[0] = ((jump->flags & IS_BL) ? BL : B) | 
(addr & 0x3ffffff);
                                if (jump->flags & IS_COND)
@@ -287,7 +293,7 @@ SLJIT_API_FUNC_ATTRIBUTE void* sljit_gen
                                break;
                        }
                        if (jump->flags & PATCH_COND) {
-                               addr = (sljit_sw)(addr - jump->addr) >> 2;
+                               addr = (sljit_sw)(addr - 
(sljit_uw)SLJIT_ADD_EXEC_OFFSET(buf_ptr, executable_offset)) >> 2;
                                SLJIT_ASSERT((sljit_sw)addr <= 0x3ffff && 
(sljit_sw)addr >= -0x40000);
                                buf_ptr[0] = (buf_ptr[0] & ~0xffffe0) | ((addr 
& 0x7ffff) << 5);
                                break;
@@ -308,11 +314,37 @@ SLJIT_API_FUNC_ATTRIBUTE void* sljit_gen
        }
 
        compiler->error = SLJIT_ERR_COMPILED;
+       compiler->executable_offset = executable_offset;
        compiler->executable_size = (code_ptr - code) * sizeof(sljit_ins);
+
+       code = (sljit_ins *)SLJIT_ADD_EXEC_OFFSET(code, executable_offset);
+       code_ptr = (sljit_ins *)SLJIT_ADD_EXEC_OFFSET(code_ptr, 
executable_offset);
+
        SLJIT_CACHE_FLUSH(code, code_ptr);
        return code;
 }
 
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_has_cpu_feature(sljit_s32 
feature_type)
+{
+       switch (feature_type) {
+       case SLJIT_HAS_FPU:
+#ifdef SLJIT_IS_FPU_AVAILABLE
+               return SLJIT_IS_FPU_AVAILABLE;
+#else
+               /* Available by default. */
+               return 1;
+#endif
+
+       case SLJIT_HAS_PRE_UPDATE:
+       case SLJIT_HAS_CLZ:
+       case SLJIT_HAS_CMOV:
+               return 1;
+
+       default:
+               return 0;
+       }
+}
+
 /* --------------------------------------------------------------------- */
 /*  Core code generator functions.                                       */
 /* --------------------------------------------------------------------- */
@@ -365,7 +397,7 @@ static sljit_ins logical_imm(sljit_sw im
        uimm = (sljit_uw)imm;
        while (1) {
                if (len <= 0) {
-                       SLJIT_ASSERT_STOP();
+                       SLJIT_UNREACHABLE();
                        return 0;
                }
                mask = ((sljit_uw)1 << len) - 1;
@@ -635,7 +667,7 @@ static sljit_s32 emit_op_imm(struct slji
                        }
                        goto set_flags;
                default:
-                       SLJIT_ASSERT_STOP();
+                       SLJIT_UNREACHABLE();
                        break;
                }
 
@@ -702,7 +734,7 @@ static sljit_s32 emit_op_imm(struct slji
        case SLJIT_NOT:
                SLJIT_ASSERT(arg1 == TMP_REG1);
                FAIL_IF(push_inst(compiler, (ORN ^ inv_bits) | RD(dst) | 
RN(TMP_ZERO) | RM(arg2)));
-               goto set_flags;
+               break; /* Set flags. */
        case SLJIT_NEG:
                SLJIT_ASSERT(arg1 == TMP_REG1);
                if (flags & SET_FLAGS)
@@ -710,8 +742,7 @@ static sljit_s32 emit_op_imm(struct slji
                return push_inst(compiler, (SUB ^ inv_bits) | RD(dst) | 
RN(TMP_ZERO) | RM(arg2));
        case SLJIT_CLZ:
                SLJIT_ASSERT(arg1 == TMP_REG1);
-               FAIL_IF(push_inst(compiler, (CLZ ^ inv_bits) | RD(dst) | 
RN(arg2)));
-               goto set_flags;
+               return push_inst(compiler, (CLZ ^ inv_bits) | RD(dst) | 
RN(arg2));
        case SLJIT_ADD:
                CHECK_FLAGS(1 << 29);
                return push_inst(compiler, (ADD ^ inv_bits) | RD(dst) | 
RN(arg1) | RM(arg2));
@@ -740,24 +771,24 @@ static sljit_s32 emit_op_imm(struct slji
                return push_inst(compiler, (AND ^ inv_bits) | RD(dst) | 
RN(arg1) | RM(arg2));
        case SLJIT_OR:
                FAIL_IF(push_inst(compiler, (ORR ^ inv_bits) | RD(dst) | 
RN(arg1) | RM(arg2)));
-               goto set_flags;
+               break; /* Set flags. */
        case SLJIT_XOR:
                FAIL_IF(push_inst(compiler, (EOR ^ inv_bits) | RD(dst) | 
RN(arg1) | RM(arg2)));
-               goto set_flags;
+               break; /* Set flags. */
        case SLJIT_SHL:
                FAIL_IF(push_inst(compiler, (LSLV ^ inv_bits) | RD(dst) | 
RN(arg1) | RM(arg2)));
-               goto set_flags;
+               break; /* Set flags. */
        case SLJIT_LSHR:
                FAIL_IF(push_inst(compiler, (LSRV ^ inv_bits) | RD(dst) | 
RN(arg1) | RM(arg2)));
-               goto set_flags;
+               break; /* Set flags. */
        case SLJIT_ASHR:
                FAIL_IF(push_inst(compiler, (ASRV ^ inv_bits) | RD(dst) | 
RN(arg1) | RM(arg2)));
-               goto set_flags;
+               break; /* Set flags. */
+       default:
+               SLJIT_UNREACHABLE();
+               return SLJIT_SUCCESS;
        }
 
-       SLJIT_ASSERT_STOP();
-       return SLJIT_SUCCESS;
-
 set_flags:
        if (flags & SET_FLAGS)
                return push_inst(compiler, (SUBS ^ inv_bits) | RD(TMP_ZERO) | 
RN(dst) | RM(TMP_ZERO));
@@ -859,6 +890,10 @@ static sljit_s32 getput_arg_fast(struct
        }
 
        arg &= REG_MASK;
+
+       if (arg == SLJIT_UNUSED)
+               return 0;
+
        if (argw >= 0 && (argw >> shift) <= 0xfff && (argw & ((1 << shift) - 
1)) == 0) {
                if (SLJIT_UNLIKELY(flags & ARG_TEST))
                        return 1;
@@ -919,21 +954,23 @@ static sljit_s32 getput_arg(struct sljit
                next_argw = 0;
        }
 
-       tmp_r = (flags & STORE) ? TMP_REG3 : reg;
+       tmp_r = ((flags & STORE) || (flags == (WORD_SIZE | SIGNED))) ? TMP_REG3 : reg;
 
        if (SLJIT_UNLIKELY((flags & UPDATE) && (arg & REG_MASK))) {
                /* Update only applies if a base register exists. */
                other_r = OFFS_REG(arg);
                if (!other_r) {
                        other_r = arg & REG_MASK;
-                       if (other_r != reg && argw >= 0 && argw <= 0xffffff) {
+                       SLJIT_ASSERT(other_r != reg);
+
+                       if (argw >= 0 && argw <= 0xffffff) {
                                if ((argw & 0xfff) != 0)
                                        FAIL_IF(push_inst(compiler, ADDI | 
RD(other_r) | RN(other_r) | ((argw & 0xfff) << 10)));
                                if (argw >> 12)
                                        FAIL_IF(push_inst(compiler, ADDI | (1 
<< 22) | RD(other_r) | RN(other_r) | ((argw >> 12) << 10)));
                                return push_inst(compiler, sljit_mem_imm[flags 
& 0x3] | (shift << 30) | RT(reg) | RN(other_r));
                        }
-                       else if (other_r != reg && argw < 0 && argw >= -0xffffff) {
+                       else if (argw < 0 && argw >= -0xffffff) {
                                argw = -argw;
                                if ((argw & 0xfff) != 0)
                                        FAIL_IF(push_inst(compiler, SUBI | 
RD(other_r) | RN(other_r) | ((argw & 0xfff) << 10)));
@@ -966,18 +1003,8 @@ static sljit_s32 getput_arg(struct sljit
 
                /* No caching here. */
                arg &= REG_MASK;
-               argw &= 0x3;
-               if (!argw || argw == shift) {
-                       FAIL_IF(push_inst(compiler, sljit_mem_reg[flags & 0x3] 
| (shift << 30) | RT(reg) | RN(arg) | RM(other_r) | (argw ? (1 << 12) : 0)));
-                       return push_inst(compiler, ADD | RD(arg) | RN(arg) | 
RM(other_r) | (argw << 10));
-               }
-               if (arg != reg) {
-                       FAIL_IF(push_inst(compiler, ADD | RD(arg) | RN(arg) | 
RM(other_r) | (argw << 10)));
-                       return push_inst(compiler, sljit_mem_imm[flags & 0x3] | 
(shift << 30) | RT(reg) | RN(arg));
-               }
-               FAIL_IF(push_inst(compiler, ADD | RD(TMP_LR) | RN(arg) | 
RM(other_r) | (argw << 10)));
-               FAIL_IF(push_inst(compiler, sljit_mem_imm[flags & 0x3] | (shift 
<< 30) | RT(reg) | RN(TMP_LR)));
-               return push_inst(compiler, ORR | RD(arg) | RN(TMP_ZERO) | 
RM(TMP_LR));
+               FAIL_IF(push_inst(compiler, sljit_mem_reg[flags & 0x3] | (shift 
<< 30) | RT(reg) | RN(arg) | RM(other_r)));
+               return push_inst(compiler, ADD | RD(arg) | RN(arg) | 
RM(other_r));
        }
 
        if (arg & OFFS_REG_MASK) {
@@ -998,16 +1025,16 @@ static sljit_s32 getput_arg(struct sljit
                }
        }
 
-       if (argw >= 0 && argw <= 0xffffff && (argw & ((1 << shift) - 1)) == 0) {
-               FAIL_IF(push_inst(compiler, ADDI | (1 << 22) | RD(tmp_r) | 
RN(arg & REG_MASK) | ((argw >> 12) << 10)));
-               return push_inst(compiler, sljit_mem_imm[flags & 0x3] | (shift 
<< 30)
-                       | RT(reg) | RN(tmp_r) | ((argw & 0xfff) << (10 - 
shift)));
-       }
-
        diff = argw - next_argw;
        next_arg = (arg & REG_MASK) && (arg == next_arg) && diff <= 0xfff && 
diff >= -0xfff && diff != 0;
        arg &= REG_MASK;
 
+       if (arg != SLJIT_UNUSED && argw >= 0 && argw <= 0xffffff && (argw & ((1 << shift) - 1)) == 0) {
+               FAIL_IF(push_inst(compiler, ADDI | (1 << 22) | RD(tmp_r) | 
RN(arg) | ((argw >> 12) << 10)));
+               return push_inst(compiler, sljit_mem_imm[flags & 0x3] | (shift 
<< 30)
+                       | RT(reg) | RN(tmp_r) | ((argw & 0xfff) << (10 - 
shift)));
+       }
+
        if (arg && compiler->cache_arg == SLJIT_MEM) {
                if (compiler->cache_argw == argw)
                        return push_inst(compiler, sljit_mem_reg[flags & 0x3] | 
(shift << 30) | RT(reg) | RN(arg) | RM(TMP_REG3));
@@ -1290,6 +1317,23 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit
        compiler->cache_arg = 0;
        compiler->cache_argw = 0;
 
+       if (dst == SLJIT_UNUSED && !HAS_FLAGS(op)) {
+               if (op <= SLJIT_MOV_P && (src & SLJIT_MEM)) {
+                       SLJIT_ASSERT(reg_map[1] == 0 && reg_map[3] == 2 && 
reg_map[5] == 4);
+
+                       if (op >= SLJIT_MOV_U8 && op <= SLJIT_MOV_S8)
+                               dst = 5;
+                       else if (op >= SLJIT_MOV_U16 && op <= SLJIT_MOV_S16)
+                               dst = 3;
+                       else
+                               dst = 1;
+
+                       /* Signed word sized load is the prefetch instruction. */
+                       return emit_op_mem(compiler, WORD_SIZE | SIGNED, dst, 
src, srcw);
+               }
+               return SLJIT_SUCCESS;
+       }
+
        dst_r = SLOW_IS_REG(dst) ? dst : TMP_REG1;
 
        op = GET_OPCODE(op);
@@ -1364,7 +1408,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit
                                srcw = (sljit_s32)srcw;
                        break;
                default:
-                       SLJIT_ASSERT_STOP();
+                       SLJIT_UNREACHABLE();
                        flags = 0;
                        break;
                }
@@ -1391,7 +1435,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit
                return SLJIT_SUCCESS;
        }
 
-       flags = GET_FLAGS(op_flags) ? SET_FLAGS : 0;
+       flags = HAS_FLAGS(op_flags) ? SET_FLAGS : 0;
        mem_flags = WORD_SIZE;
        if (op_flags & SLJIT_I32_OP) {
                flags |= INT_OP;
@@ -1443,8 +1487,11 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit
        compiler->cache_arg = 0;
        compiler->cache_argw = 0;
 
+       if (dst == SLJIT_UNUSED && !HAS_FLAGS(op))
+               return SLJIT_SUCCESS;
+
        dst_r = SLOW_IS_REG(dst) ? dst : TMP_REG1;
-       flags = GET_FLAGS(op) ? SET_FLAGS : 0;
+       flags = HAS_FLAGS(op) ? SET_FLAGS : 0;
        mem_flags = WORD_SIZE;
        if (op & SLJIT_I32_OP) {
                flags |= INT_OP;
@@ -1537,16 +1584,6 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit
 /*  Floating point operators                                             */
 /* --------------------------------------------------------------------- */
 
-SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_is_fpu_available(void)
-{
-#ifdef SLJIT_IS_FPU_AVAILABLE
-       return SLJIT_IS_FPU_AVAILABLE;
-#else
-       /* Available by default. */
-       return 1;
-#endif
-}
-
 static sljit_s32 emit_fop_mem(struct sljit_compiler *compiler, sljit_s32 
flags, sljit_s32 reg, sljit_s32 arg, sljit_sw argw)
 {
        sljit_u32 shift = MEM_SIZE_SHIFT(flags);
@@ -1604,7 +1641,7 @@ static SLJIT_INLINE sljit_s32 sljit_emit
        sljit_s32 dst, sljit_sw dstw,
        sljit_s32 src, sljit_sw srcw)
 {
-       sljit_s32 dst_r = SLOW_IS_REG(dst) ? dst : TMP_REG1;
+       sljit_s32 dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
        sljit_ins inv_bits = (op & SLJIT_F32_OP) ? (1 << 22) : 0;
 
        if (GET_OPCODE(op) == SLJIT_CONV_S32_FROM_F64)
@@ -1617,7 +1654,7 @@ static SLJIT_INLINE sljit_s32 sljit_emit
 
        FAIL_IF(push_inst(compiler, (FCVTZS ^ inv_bits) | RD(dst_r) | VN(src)));
 
-       if (dst_r == TMP_REG1 && dst != SLJIT_UNUSED)
+       if (dst & SLJIT_MEM)
                return emit_op_mem(compiler, ((GET_OPCODE(op) == 
SLJIT_CONV_S32_FROM_F64) ? INT_SIZE : WORD_SIZE) | STORE, TMP_REG1, dst, dstw);
        return SLJIT_SUCCESS;
 }
@@ -1775,10 +1812,6 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit
        CHECK(check_sljit_emit_fast_enter(compiler, dst, dstw));
        ADJUST_LOCAL_OFFSET(dst, dstw);
 
-       /* For UNUSED dst. Uncommon, but possible. */
-       if (dst == SLJIT_UNUSED)
-               return SLJIT_SUCCESS;
-
        if (FAST_IS_REG(dst))
                return push_inst(compiler, ORR | RD(dst) | RN(TMP_ZERO) | 
RM(TMP_LR));
 
@@ -1856,7 +1889,7 @@ static sljit_uw get_cc(sljit_s32 type)
                return 0x6;
 
        default:
-               SLJIT_ASSERT_STOP();
+               SLJIT_UNREACHABLE();
                return 0xe;
        }
 }
@@ -1966,19 +1999,14 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit
 
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_flags(struct sljit_compiler 
*compiler, sljit_s32 op,
        sljit_s32 dst, sljit_sw dstw,
-       sljit_s32 src, sljit_sw srcw,
        sljit_s32 type)
 {
-       sljit_s32 dst_r, flags, mem_flags;
+       sljit_s32 dst_r, src_r, flags, mem_flags;
        sljit_ins cc;
 
        CHECK_ERROR();
-       CHECK(check_sljit_emit_op_flags(compiler, op, dst, dstw, src, srcw, 
type));
+       CHECK(check_sljit_emit_op_flags(compiler, op, dst, dstw, type));
        ADJUST_LOCAL_OFFSET(dst, dstw);
-       ADJUST_LOCAL_OFFSET(src, srcw);
-
-       if (dst == SLJIT_UNUSED)
-               return SLJIT_SUCCESS;
 
        cc = get_cc(type & 0xff);
        dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
@@ -1992,26 +2020,50 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit
 
        compiler->cache_arg = 0;
        compiler->cache_argw = 0;
-       flags = GET_FLAGS(op) ? SET_FLAGS : 0;
+       flags = HAS_FLAGS(op) ? SET_FLAGS : 0;
        mem_flags = WORD_SIZE;
        if (op & SLJIT_I32_OP) {
                flags |= INT_OP;
                mem_flags = INT_SIZE;
        }
 
-       if (src & SLJIT_MEM) {
-               FAIL_IF(emit_op_mem2(compiler, mem_flags, TMP_REG1, src, srcw, 
dst, dstw));
+       src_r = dst;
+
+       if (dst & SLJIT_MEM) {
+               FAIL_IF(emit_op_mem2(compiler, mem_flags, TMP_REG1, dst, dstw, 
dst, dstw));
+               src_r = TMP_REG1;
+       }
+
+       FAIL_IF(push_inst(compiler, CSINC | (cc << 12) | RD(TMP_REG2) | 
RN(TMP_ZERO) | RM(TMP_ZERO)));
+       emit_op_imm(compiler, flags | GET_OPCODE(op), dst_r, src_r, TMP_REG2);
+
+       if (dst & SLJIT_MEM)
+               return emit_op_mem2(compiler, mem_flags | STORE, TMP_REG1, dst, 
dstw, 0, 0);
+       return SLJIT_SUCCESS;
+}
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_cmov(struct sljit_compiler 
*compiler, sljit_s32 type,
+       sljit_s32 dst_reg,
+       sljit_s32 src, sljit_sw srcw)
+{
+       sljit_ins inv_bits = (dst_reg & SLJIT_I32_OP) ? (1 << 31) : 0;
+       sljit_ins cc;
+
+       CHECK_ERROR();
+       CHECK(check_sljit_emit_cmov(compiler, type, dst_reg, src, srcw));
+
+       if (SLJIT_UNLIKELY(src & SLJIT_IMM)) {
+               if (dst_reg & SLJIT_I32_OP)
+                       srcw = (sljit_s32)srcw;
+               FAIL_IF(load_immediate(compiler, TMP_REG1, srcw));
                src = TMP_REG1;
                srcw = 0;
-       } else if (src & SLJIT_IMM)
-               flags |= ARG1_IMM;
+       }
 
-       FAIL_IF(push_inst(compiler, CSINC | (cc << 12) | RD(TMP_REG2) | 
RN(TMP_ZERO) | RM(TMP_ZERO)));
-       emit_op_imm(compiler, flags | GET_OPCODE(op), dst_r, src, TMP_REG2);
+       cc = get_cc(type & 0xff);
+       dst_reg &= ~SLJIT_I32_OP;
 
-       if (dst_r != TMP_REG1)
-               return SLJIT_SUCCESS;
-       return emit_op_mem2(compiler, mem_flags | STORE, TMP_REG1, dst, dstw, 
0, 0);
+       return push_inst(compiler, (CSEL ^ inv_bits) | (cc << 12) | RD(dst_reg) 
| RN(dst_reg) | RM(src));
 }
 
 SLJIT_API_FUNC_ATTRIBUTE struct sljit_const* sljit_emit_const(struct 
sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw, sljit_sw init_value)
@@ -2027,7 +2079,7 @@ SLJIT_API_FUNC_ATTRIBUTE struct sljit_co
        PTR_FAIL_IF(!const_);
        set_const(const_, compiler);
 
-       dst_r = SLOW_IS_REG(dst) ? dst : TMP_REG1;
+       dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
        PTR_FAIL_IF(emit_imm64_const(compiler, dst_r, init_value));
 
        if (dst & SLJIT_MEM)
@@ -2035,16 +2087,18 @@ SLJIT_API_FUNC_ATTRIBUTE struct sljit_co
        return const_;
 }
 
-SLJIT_API_FUNC_ATTRIBUTE void sljit_set_jump_addr(sljit_uw addr, sljit_uw 
new_addr)
+SLJIT_API_FUNC_ATTRIBUTE void sljit_set_jump_addr(sljit_uw addr, sljit_uw new_target, sljit_sw executable_offset)
 {
        sljit_ins* inst = (sljit_ins*)addr;
-       modify_imm64_const(inst, new_addr);
+       modify_imm64_const(inst, new_target);
+       inst = (sljit_ins *)SLJIT_ADD_EXEC_OFFSET(inst, executable_offset);
        SLJIT_CACHE_FLUSH(inst, inst + 4);
 }
 
-SLJIT_API_FUNC_ATTRIBUTE void sljit_set_const(sljit_uw addr, sljit_sw 
new_constant)
+SLJIT_API_FUNC_ATTRIBUTE void sljit_set_const(sljit_uw addr, sljit_sw new_constant, sljit_sw executable_offset)
 {
        sljit_ins* inst = (sljit_ins*)addr;
        modify_imm64_const(inst, new_constant);
+       inst = (sljit_ins *)SLJIT_ADD_EXEC_OFFSET(inst, executable_offset);
        SLJIT_CACHE_FLUSH(inst, inst + 4);
 }



---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscr...@tomcat.apache.org
For additional commands, e-mail: dev-h...@tomcat.apache.org

Reply via email to