On Fri, Jan 31, 2014 at 08:43:38AM -0600, Richard Henderson wrote: > These three-operand shift instructions do not require the shift count > to be placed into ECX. This reduces the number of mov insns required, > with the mere addition of a new register constraint. > > Don't attempt to get rid of the matching constraint, as that's impossible > to manipulate with just a new constraint. In addition, constant shifts > still need the matching constraint. > > Signed-off-by: Richard Henderson <r...@twiddle.net> > --- > tcg/i386/tcg-target.c | 61 > +++++++++++++++++++++++++++++++++++++++++---------- > 1 file changed, 50 insertions(+), 11 deletions(-) > > diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c > index 4f6b9c1..fef1717 100644 > --- a/tcg/i386/tcg-target.c > +++ b/tcg/i386/tcg-target.c > @@ -133,6 +133,12 @@ static bool have_movbe; > it there. Therefore we always define the variable. */ > bool have_bmi1; > > +#if defined(CONFIG_CPUID_H) && defined(bit_BMI2) > +static bool have_bmi2; > +#else > +# define have_bmi2 0 > +#endif > + > static uint8_t *tb_ret_addr; > > static void patch_reloc(uint8_t *code_ptr, int type, > @@ -175,6 +181,7 @@ static int target_parse_constraint(TCGArgConstraint *ct, > const char **pct_str) > tcg_regset_set_reg(ct->u.regs, TCG_REG_EBX); > break; > case 'c': > + case_c: > ct->ct |= TCG_CT_REG; > tcg_regset_set_reg(ct->u.regs, TCG_REG_ECX); > break; > @@ -203,6 +210,7 @@ static int target_parse_constraint(TCGArgConstraint *ct, > const char **pct_str) > tcg_regset_set32(ct->u.regs, 0, 0xf); > break; > case 'r': > + case_r: > ct->ct |= TCG_CT_REG; > if (TCG_TARGET_REG_BITS == 64) { > tcg_regset_set32(ct->u.regs, 0, 0xffff); > @@ -210,6 +218,13 @@ static int target_parse_constraint(TCGArgConstraint *ct, > const char **pct_str) > tcg_regset_set32(ct->u.regs, 0, 0xff); > } > break; > + case 'C': > + /* With SHRX et al, we need not use ECX as shift count register. */ > + if (have_bmi2) { > + goto case_r; > + } else { > + goto case_c; > + } > > /* qemu_ld/st address constraint */ > case 'L': > @@ -283,6 +298,8 @@ static inline int tcg_target_const_match(tcg_target_long > val, > # define P_REXB_RM 0 > # define P_GS 0 > #endif > +#define P_SIMDF3 0x10000 /* 0xf3 opcode prefix */ > +#define P_SIMDF2 0x20000 /* 0xf2 opcode prefix */ > > #define OPC_ARITH_EvIz (0x81) > #define OPC_ARITH_EvIb (0x83) > @@ -325,6 +342,9 @@ static inline int tcg_target_const_match(tcg_target_long > val, > #define OPC_SHIFT_1 (0xd1) > #define OPC_SHIFT_Ib (0xc1) > #define OPC_SHIFT_cl (0xd3) > +#define OPC_SARX (0xf7 | P_EXT38 | P_SIMDF3) > +#define OPC_SHLX (0xf7 | P_EXT38 | P_DATA16) > +#define OPC_SHRX (0xf7 | P_EXT38 | P_SIMDF2) > #define OPC_TESTL (0x85) > #define OPC_XCHG_ax_r32 (0x90) > > @@ -493,7 +513,14 @@ static void tcg_out_vex_modrm(TCGContext *s, int opc, > int r, int v, int rm) > > tmp = (r & 8 ? 0 : 0x80); /* VEX.R */ > } > - tmp |= (opc & P_DATA16 ? 1 : 0); /* VEX.pp */ > + /* VEX.pp */ > + if (opc & P_DATA16) { > + tmp |= 1; /* 0x66 */ > + } else if (opc & P_SIMDF3) { > + tmp |= 2; /* 0xf3 */ > + } else if (opc & P_SIMDF2) { > + tmp |= 3; /* 0xf2 */ > + } > tmp |= (~v & 15) << 3; /* VEX.vvvv */ > tcg_out8(s, tmp); > tcg_out8(s, opc); > @@ -1689,7 +1716,7 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg > *args, bool is64) > static inline void tcg_out_op(TCGContext *s, TCGOpcode opc, > const TCGArg *args, const int *const_args) > { > - int c, rexw = 0; > + int c, vexop, rexw = 0; > > #if TCG_TARGET_REG_BITS == 64 > # define OP_32_64(x) \ > @@ -1860,19 +1887,28 @@ static inline void tcg_out_op(TCGContext *s, > TCGOpcode opc, > > OP_32_64(shl): > c = SHIFT_SHL; > - goto gen_shift; > + vexop = OPC_SHLX; > + goto gen_shift_maybe_vex; > OP_32_64(shr): > c = SHIFT_SHR; > - goto gen_shift; > + vexop = OPC_SHRX; > + goto gen_shift_maybe_vex; > OP_32_64(sar): > c = SHIFT_SAR; > - goto gen_shift; > + vexop = OPC_SARX; > + goto gen_shift_maybe_vex; > OP_32_64(rotl): > c = SHIFT_ROL; > goto gen_shift; > OP_32_64(rotr): > c = SHIFT_ROR; > goto gen_shift; > + gen_shift_maybe_vex: > + if (have_bmi2 && !const_args[2]) { > + tcg_out_vex_modrm(s, vexop + rexw, args[0], args[2], args[1]); > + break; > + } > + /* FALLTHRU */ > gen_shift: > if (const_args[2]) { > tcg_out_shifti(s, c + rexw, args[0], args[2]); > @@ -2065,9 +2101,9 @@ static const TCGTargetOpDef x86_op_defs[] = { > { INDEX_op_xor_i32, { "r", "0", "ri" } }, > { INDEX_op_andc_i32, { "r", "r", "ri" } }, > > - { INDEX_op_shl_i32, { "r", "0", "ci" } }, > - { INDEX_op_shr_i32, { "r", "0", "ci" } }, > - { INDEX_op_sar_i32, { "r", "0", "ci" } }, > + { INDEX_op_shl_i32, { "r", "0", "Ci" } }, > + { INDEX_op_shr_i32, { "r", "0", "Ci" } }, > + { INDEX_op_sar_i32, { "r", "0", "Ci" } }, > { INDEX_op_rotl_i32, { "r", "0", "ci" } }, > { INDEX_op_rotr_i32, { "r", "0", "ci" } }, > > @@ -2123,9 +2159,9 @@ static const TCGTargetOpDef x86_op_defs[] = { > { INDEX_op_xor_i64, { "r", "0", "re" } }, > { INDEX_op_andc_i64, { "r", "r", "rI" } }, > > - { INDEX_op_shl_i64, { "r", "0", "ci" } }, > - { INDEX_op_shr_i64, { "r", "0", "ci" } }, > - { INDEX_op_sar_i64, { "r", "0", "ci" } }, > + { INDEX_op_shl_i64, { "r", "0", "Ci" } }, > + { INDEX_op_shr_i64, { "r", "0", "Ci" } }, > + { INDEX_op_sar_i64, { "r", "0", "Ci" } }, > { INDEX_op_rotl_i64, { "r", "0", "ci" } }, > { INDEX_op_rotr_i64, { "r", "0", "ci" } }, > > @@ -2283,6 +2319,9 @@ static void tcg_target_init(TCGContext *s) > #ifdef bit_BMI > have_bmi1 = (b & bit_BMI) != 0; > #endif > +#ifndef have_bmi2 > + have_bmi2 = (b & bit_BMI2) != 0; > +#endif > } > > if (TCG_TARGET_REG_BITS == 64) {
Reviewed-by: Aurelien Jarno <aurel...@aurel32.net> -- Aurelien Jarno GPG: 1024D/F1BCDB73 aurel...@aurel32.net http://www.aurel32.net