As present on Atom and Haswell processors.

Signed-off-by: Richard Henderson <r...@twiddle.net>
---
 disas/i386.c          |   8 ++--
 tcg/i386/tcg-target.c | 127 ++++++++++++++++++++++++++++++++++----------------
 2 files changed, 91 insertions(+), 44 deletions(-)
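For anyone who hasn't met the instruction: MOVBE is a load or store that
byte-swaps the data in flight, so one instruction can replace the mov plus
bswap pair the backend otherwise emits for byte-reversed guest accesses.
A minimal C sketch of the 32-bit load semantics, purely as illustration
(the helper name is invented and is not part of the patch):

#include <stdint.h>
#include <string.h>

/* Roughly what "movbe reg32, mem32" computes: an ordinary 32-bit load
   followed by a byte swap, folded into a single instruction.  */
static inline uint32_t movbe_load32(const void *mem)
{
    uint32_t v;
    memcpy(&v, mem, sizeof(v));    /* plain load */
    return __builtin_bswap32(v);   /* byte swap (GCC/Clang builtin) */
}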
Here's to "stress testing" a Haswell laptop before Santa delivers it.  ;-)

r~

diff --git a/disas/i386.c b/disas/i386.c
index 47f1f2e..beb33f0 100644
--- a/disas/i386.c
+++ b/disas/i386.c
@@ -2632,16 +2632,16 @@ static const struct dis386 prefix_user_table[][4] = {
 
   /* PREGRP87 */
   {
-    { "(bad)",  { XX } },
-    { "(bad)",  { XX } },
+    { "movbe",  { Gv, M } },
+    { "movbe",  { Gw, M } },
     { "(bad)",  { XX } },
     { "crc32",  { Gdq, { CRC32_Fixup, b_mode } } },
   },
 
   /* PREGRP88 */
   {
-    { "(bad)",  { XX } },
-    { "(bad)",  { XX } },
+    { "movbe",  { M, Gv } },
+    { "movbe",  { M, Gw } },
     { "(bad)",  { XX } },
     { "crc32",  { Gdq, { CRC32_Fixup, v_mode } } },
   },
diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
index 7ac8e45..e76f3f3 100644
--- a/tcg/i386/tcg-target.c
+++ b/tcg/i386/tcg-target.c
@@ -99,18 +99,28 @@ static const int tcg_target_call_oarg_regs[] = {
 # define TCG_REG_L1 TCG_REG_EDX
 #endif
 
+#ifdef CONFIG_CPUID_H
+#include <cpuid.h>
+#endif
+
 /* For 32-bit, we are going to attempt to determine at runtime whether cmov
    is available.  However, the host compiler must supply <cpuid.h>, as we're
    not going to go so far as our own inline assembly.  */
 #if TCG_TARGET_REG_BITS == 64
 # define have_cmov 1
 #elif defined(CONFIG_CPUID_H)
-#include <cpuid.h>
 static bool have_cmov;
 #else
 # define have_cmov 0
 #endif
 
+/* There is no pre-processor definition for MOVBE to fall back on.  */
+#ifdef CONFIG_CPUID_H
+static bool have_movbe;
+#else
+#define have_movbe 0
+#endif
+
 static uint8_t *tb_ret_addr;
 
 static void patch_reloc(uint8_t *code_ptr, int type,
@@ -254,6 +264,7 @@ static inline int tcg_target_const_match(tcg_target_long val,
 # define P_REXB_RM 0
 # define P_GS 0
 #endif
+#define P_EXT38 0x8000  /* 0x0f 0x38 opcode prefix */
 
 #define OPC_ARITH_EvIz  (0x81)
 #define OPC_ARITH_EvIb  (0x83)
@@ -279,6 +290,8 @@ static inline int tcg_target_const_match(tcg_target_long val,
 #define OPC_MOVB_EvIz   (0xc6)
 #define OPC_MOVL_EvIz   (0xc7)
 #define OPC_MOVL_Iv     (0xb8)
+#define OPC_MOVBE_GyMy  (0xf0 | P_EXT | P_EXT38)
+#define OPC_MOVBE_MyGy  (0xf1 | P_EXT | P_EXT38)
 #define OPC_MOVSBL      (0xbe | P_EXT)
 #define OPC_MOVSWL      (0xbf | P_EXT)
 #define OPC_MOVSLQ      (0x63 | P_REXW)
@@ -400,6 +413,9 @@ static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
 
     if (opc & P_EXT) {
         tcg_out8(s, 0x0f);
+        if (opc & P_EXT38) {
+            tcg_out8(s, 0x38);
+        }
     }
     tcg_out8(s, opc);
 }
@@ -411,6 +427,9 @@ static void tcg_out_opc(TCGContext *s, int opc)
     }
     if (opc & P_EXT) {
         tcg_out8(s, 0x0f);
+        if (opc & P_EXT38) {
+            tcg_out8(s, 0x38);
+        }
     }
     tcg_out8(s, opc);
 }
@@ -1336,7 +1355,14 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
                                    TCGReg base, intptr_t ofs, int seg,
                                    TCGMemOp memop)
 {
-    const TCGMemOp bswap = memop & MO_BSWAP;
+    const TCGMemOp real_bswap = memop & MO_BSWAP;
+    TCGMemOp bswap = real_bswap;
+    int movop = OPC_MOVL_GvEv;
+
+    if (real_bswap && have_movbe) {
+        bswap = 0;
+        movop = OPC_MOVBE_GyMy;
+    }
 
     switch (memop & MO_SSIZE) {
     case MO_UB:
@@ -1346,32 +1372,45 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
         tcg_out_modrm_offset(s, OPC_MOVSBL + P_REXW + seg, datalo, base, ofs);
         break;
     case MO_UW:
-        tcg_out_modrm_offset(s, OPC_MOVZWL + seg, datalo, base, ofs);
-        if (bswap) {
-            tcg_out_rolw_8(s, datalo);
+        if (real_bswap && have_movbe) {
+            tcg_out_modrm_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
+                                 datalo, base, ofs);
+            tcg_out_ext16u(s, datalo, datalo);
+        } else {
+            tcg_out_modrm_offset(s, OPC_MOVZWL + seg, datalo, base, ofs);
+            if (bswap) {
+                tcg_out_rolw_8(s, datalo);
+            }
         }
         break;
     case MO_SW:
-        if (bswap) {
-            tcg_out_modrm_offset(s, OPC_MOVZWL + seg, datalo, base, ofs);
-            tcg_out_rolw_8(s, datalo);
-            tcg_out_modrm(s, OPC_MOVSWL + P_REXW, datalo, datalo);
+        if (real_bswap) {
+            if (have_movbe) {
+                tcg_out_modrm_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
+                                     datalo, base, ofs);
+            } else {
+                tcg_out_modrm_offset(s, OPC_MOVZWL + seg, datalo, base, ofs);
+                tcg_out_rolw_8(s, datalo);
+            }
+            tcg_out_ext16s(s, datalo, datalo, P_REXW);
         } else {
             tcg_out_modrm_offset(s, OPC_MOVSWL + P_REXW + seg,
                                  datalo, base, ofs);
         }
         break;
     case MO_UL:
-        tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg, datalo, base, ofs);
+        tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
         if (bswap) {
             tcg_out_bswap32(s, datalo);
         }
         break;
 #if TCG_TARGET_REG_BITS == 64
     case MO_SL:
-        if (bswap) {
-            tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg, datalo, base, ofs);
-            tcg_out_bswap32(s, datalo);
+        if (real_bswap) {
+            tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
+            if (bswap) {
+                tcg_out_bswap32(s, datalo);
+            }
             tcg_out_ext32s(s, datalo, datalo);
         } else {
             tcg_out_modrm_offset(s, OPC_MOVSLQ + seg, datalo, base, ofs);
@@ -1380,27 +1419,22 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
 #endif
     case MO_Q:
         if (TCG_TARGET_REG_BITS == 64) {
-            tcg_out_modrm_offset(s, OPC_MOVL_GvEv + P_REXW + seg,
-                                 datalo, base, ofs);
+            tcg_out_modrm_offset(s, movop + P_REXW + seg, datalo, base, ofs);
             if (bswap) {
                 tcg_out_bswap64(s, datalo);
             }
         } else {
-            if (bswap) {
+            if (real_bswap) {
                 int t = datalo;
                 datalo = datahi;
                 datahi = t;
             }
             if (base != datalo) {
-                tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg,
-                                     datalo, base, ofs);
-                tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg,
-                                     datahi, base, ofs + 4);
+                tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
+                tcg_out_modrm_offset(s, movop + seg, datahi, base, ofs + 4);
             } else {
-                tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg,
-                                     datahi, base, ofs + 4);
-                tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg,
-                                     datalo, base, ofs);
+                tcg_out_modrm_offset(s, movop + seg, datahi, base, ofs + 4);
+                tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
             }
             if (bswap) {
                 tcg_out_bswap32(s, datalo);
@@ -1476,13 +1510,19 @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
                                    TCGReg base, intptr_t ofs, int seg,
                                    TCGMemOp memop)
 {
-    const TCGMemOp bswap = memop & MO_BSWAP;
-
     /* ??? Ideally we wouldn't need a scratch register.  For user-only,
        we could perform the bswap twice to restore the original value
        instead of moving to the scratch.  But as it is, the L constraint
        means that TCG_REG_L0 is definitely free here.  */
     const TCGReg scratch = TCG_REG_L0;
+    const TCGMemOp real_bswap = memop & MO_BSWAP;
+    TCGMemOp bswap = real_bswap;
+    int movop = OPC_MOVL_EvGv;
+
+    if (real_bswap && have_movbe) {
+        bswap = 0;
+        movop = OPC_MOVBE_MyGy;
+    }
 
     switch (memop & MO_SIZE) {
     case MO_8:
@@ -1501,8 +1541,7 @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
             tcg_out_rolw_8(s, scratch);
             datalo = scratch;
         }
-        tcg_out_modrm_offset(s, OPC_MOVL_EvGv + P_DATA16 + seg,
-                             datalo, base, ofs);
+        tcg_out_modrm_offset(s, movop + P_DATA16 + seg, datalo, base, ofs);
         break;
     case MO_32:
         if (bswap) {
@@ -1510,7 +1549,7 @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
             tcg_out_bswap32(s, scratch);
             datalo = scratch;
         }
-        tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, datalo, base, ofs);
+        tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
         break;
     case MO_64:
         if (TCG_TARGET_REG_BITS == 64) {
@@ -1519,8 +1558,7 @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
                 tcg_out_bswap64(s, scratch);
                 datalo = scratch;
             }
-            tcg_out_modrm_offset(s, OPC_MOVL_EvGv + P_REXW + seg,
-                                 datalo, base, ofs);
+            tcg_out_modrm_offset(s, movop + P_REXW + seg, datalo, base, ofs);
         } else if (bswap) {
             tcg_out_mov(s, TCG_TYPE_I32, scratch, datahi);
             tcg_out_bswap32(s, scratch);
@@ -1529,8 +1567,13 @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
             tcg_out_bswap32(s, scratch);
             tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, scratch, base, ofs+4);
         } else {
-            tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, datalo, base, ofs);
-            tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, datahi, base, ofs+4);
+            if (real_bswap) {
+                int t = datalo;
+                datalo = datahi;
+                datahi = t;
+            }
+            tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
+            tcg_out_modrm_offset(s, movop + seg, datahi, base, ofs+4);
         }
         break;
     default:
@@ -2157,15 +2200,19 @@ static void tcg_target_qemu_prologue(TCGContext *s)
 
 static void tcg_target_init(TCGContext *s)
 {
-    /* For 32-bit, 99% certainty that we're running on hardware that supports
-       cmov, but we still need to check.  In case cmov is not available, we'll
-       use a small forward branch.  */
+    unsigned a, b, c, d;
+
+    if (__get_cpuid(1, &a, &b, &c, &d)) {
 #ifndef have_cmov
-    {
-        unsigned a, b, c, d;
-        have_cmov = (__get_cpuid(1, &a, &b, &c, &d) && (d & bit_CMOV));
-    }
+        /* For 32-bit, 99% certainty that we're running on hardware that
+           supports cmov, but we still need to check.  In case cmov is not
+           available, we'll use a small forward branch.  */
+        have_cmov = (d & bit_CMOV) != 0;
 #endif
+#ifndef have_movbe
+        have_movbe = (c & bit_MOVBE) != 0;
+#endif
+    }
 
     if (TCG_TARGET_REG_BITS == 64) {
         tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_I32], 0, 0xffff);
-- 
1.8.4.2
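For completeness, a standalone sketch of the runtime detection that
tcg_target_init() now performs, assuming a host compiler that ships
<cpuid.h> with __get_cpuid(), bit_CMOV (CPUID.1:EDX bit 15) and
bit_MOVBE (CPUID.1:ECX bit 22), as GCC and Clang do:

#include <stdbool.h>
#include <stdio.h>
#include <cpuid.h>

int main(void)
{
    unsigned a, b, c, d;
    bool have_cmov = false, have_movbe = false;

    /* Leaf 1 returns the basic feature flags in ECX/EDX.  */
    if (__get_cpuid(1, &a, &b, &c, &d)) {
        have_cmov = (d & bit_CMOV) != 0;
        have_movbe = (c & bit_MOVBE) != 0;
    }
    printf("cmov: %d  movbe: %d\n", have_cmov, have_movbe);
    return 0;
}

On a MOVBE-capable host (Atom, and Haswell or later) the second flag comes
back set, and the qemu_ld/st fast paths above switch from mov+bswap to a
single movbe (0x0f 0x38 0xf0/0xf1, which is what the new P_EXT38 prefix
emits).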