Re: [Qemu-devel] [PATCH 5/5] tcg/i386: Use SHLX/SHRX/SARX instructions
On 02/16/2014 08:21 AM, Paolo Bonzini wrote: > Il 31/01/2014 15:43, Richard Henderson ha scritto: >> +gen_shift_maybe_vex: >> +if (have_bmi2 && !const_args[2]) { >> +tcg_out_vex_modrm(s, vexop + rexw, args[0], args[2], args[1]); >> +break; >> +} >> +/* FALLTHRU */ > > What if args[2] happens to be ECX? I ran some measurements and as I expected this basically never happens. For 64-bit, I never saw it occur. For 32-bit, 1/800 of all shifts used ecx. For 64-bit, the use of shlx et al is always a size win. The mov and shift, including their rex prefixes, are 3 bytes each, while the shlx is 5 byes. For 32-bit, things are more complicated. The mov and shift are 2 bytes each, so the use of shlx is by itself a 1 byte size penalty. Except that sometimes the avoidance of the mov results in fewer spills, and thus fewer bytes overall. So overall I see the barest fraction (< 0.01%) size decrease across all TBs. r~
Re: [Qemu-devel] [PATCH 5/5] tcg/i386: Use SHLX/SHRX/SARX instructions
On Fri, Jan 31, 2014 at 08:43:38AM -0600, Richard Henderson wrote: > These three-operand shift instructions do not require the shift count > to be placed into ECX. This reduces the number of mov insns required, > with the mere addition of a new register constraint. > > Don't attempt to get rid of the matching constraint, as that's impossible > to manipulate with just a new constraint. In addition, constant shifts > still need the matching constraint. > > Signed-off-by: Richard Henderson > --- > tcg/i386/tcg-target.c | 61 > +-- > 1 file changed, 50 insertions(+), 11 deletions(-) > > diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c > index 4f6b9c1..fef1717 100644 > --- a/tcg/i386/tcg-target.c > +++ b/tcg/i386/tcg-target.c > @@ -133,6 +133,12 @@ static bool have_movbe; > it there. Therefore we always define the variable. */ > bool have_bmi1; > > +#if defined(CONFIG_CPUID_H) && defined(bit_BMI2) > +static bool have_bmi2; > +#else > +# define have_bmi2 0 > +#endif > + > static uint8_t *tb_ret_addr; > > static void patch_reloc(uint8_t *code_ptr, int type, > @@ -175,6 +181,7 @@ static int target_parse_constraint(TCGArgConstraint *ct, > const char **pct_str) > tcg_regset_set_reg(ct->u.regs, TCG_REG_EBX); > break; > case 'c': > +case_c: > ct->ct |= TCG_CT_REG; > tcg_regset_set_reg(ct->u.regs, TCG_REG_ECX); > break; > @@ -203,6 +210,7 @@ static int target_parse_constraint(TCGArgConstraint *ct, > const char **pct_str) > tcg_regset_set32(ct->u.regs, 0, 0xf); > break; > case 'r': > +case_r: > ct->ct |= TCG_CT_REG; > if (TCG_TARGET_REG_BITS == 64) { > tcg_regset_set32(ct->u.regs, 0, 0x); > @@ -210,6 +218,13 @@ static int target_parse_constraint(TCGArgConstraint *ct, > const char **pct_str) > tcg_regset_set32(ct->u.regs, 0, 0xff); > } > break; > +case 'C': > +/* With SHRX et al, we need not use ECX as shift count register. */ > +if (have_bmi2) { > +goto case_r; > +} else { > +goto case_c; > +} > > /* qemu_ld/st address constraint */ > case 'L': > @@ -283,6 +298,8 @@ static inline int tcg_target_const_match(tcg_target_long > val, > # define P_REXB_RM 0 > # define P_GS 0 > #endif > +#define P_SIMDF30x1 /* 0xf3 opcode prefix */ > +#define P_SIMDF20x2 /* 0xf2 opcode prefix */ > > #define OPC_ARITH_EvIz (0x81) > #define OPC_ARITH_EvIb (0x83) > @@ -325,6 +342,9 @@ static inline int tcg_target_const_match(tcg_target_long > val, > #define OPC_SHIFT_1 (0xd1) > #define OPC_SHIFT_Ib (0xc1) > #define OPC_SHIFT_cl (0xd3) > +#define OPC_SARX(0xf7 | P_EXT38 | P_SIMDF3) > +#define OPC_SHLX(0xf7 | P_EXT38 | P_DATA16) > +#define OPC_SHRX(0xf7 | P_EXT38 | P_SIMDF2) > #define OPC_TESTL(0x85) > #define OPC_XCHG_ax_r32 (0x90) > > @@ -493,7 +513,14 @@ static void tcg_out_vex_modrm(TCGContext *s, int opc, > int r, int v, int rm) > > tmp = (r & 8 ? 0 : 0x80); /* VEX.R */ > } > -tmp |= (opc & P_DATA16 ? 1 : 0); /* VEX.pp */ > +/* VEX.pp */ > +if (opc & P_DATA16) { > +tmp |= 1; /* 0x66 */ > +} else if (opc & P_SIMDF3) { > +tmp |= 2; /* 0xf3 */ > +} else if (opc & P_SIMDF2) { > +tmp |= 3; /* 0xf2 */ > +} > tmp |= (~v & 15) << 3; /* VEX. */ > tcg_out8(s, tmp); > tcg_out8(s, opc); > @@ -1689,7 +1716,7 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg > *args, bool is64) > static inline void tcg_out_op(TCGContext *s, TCGOpcode opc, >const TCGArg *args, const int *const_args) > { > -int c, rexw = 0; > +int c, vexop, rexw = 0; > > #if TCG_TARGET_REG_BITS == 64 > # define OP_32_64(x) \ > @@ -1860,19 +1887,28 @@ static inline void tcg_out_op(TCGContext *s, > TCGOpcode opc, > > OP_32_64(shl): > c = SHIFT_SHL; > -goto gen_shift; > +vexop = OPC_SHLX; > +goto gen_shift_maybe_vex; > OP_32_64(shr): > c = SHIFT_SHR; > -goto gen_shift; > +vexop = OPC_SHRX; > +goto gen_shift_maybe_vex; > OP_32_64(sar): > c = SHIFT_SAR; > -goto gen_shift; > +vexop = OPC_SARX; > +goto gen_shift_maybe_vex; > OP_32_64(rotl): > c = SHIFT_ROL; > goto gen_shift; > OP_32_64(rotr): > c = SHIFT_ROR; > goto gen_shift; > +gen_shift_maybe_vex: > +if (have_bmi2 && !const_args[2]) { > +tcg_out_vex_modrm(s, vexop + rexw, args[0], args[2], args[1]); > +break; > +} > +/* FALLTHRU */ > gen_shift: > if (const_args[2]) { >
Re: [Qemu-devel] [PATCH 5/5] tcg/i386: Use SHLX/SHRX/SARX instructions
On 02/16/2014 06:21 AM, Paolo Bonzini wrote: > Il 31/01/2014 15:43, Richard Henderson ha scritto: >> +gen_shift_maybe_vex: >> +if (have_bmi2 && !const_args[2]) { >> +tcg_out_vex_modrm(s, vexop + rexw, args[0], args[2], args[1]); >> +break; >> +} >> +/* FALLTHRU */ > > What if args[2] happens to be ECX? shlx handles that just fine. I don't think it's worth an extra check to fall back to shl on the off-chance that ecx is used; it's pretty far down on the register allocation order list, so it wouldn't happen often. r~
Re: [Qemu-devel] [PATCH 5/5] tcg/i386: Use SHLX/SHRX/SARX instructions
Il 31/01/2014 15:43, Richard Henderson ha scritto: +gen_shift_maybe_vex: +if (have_bmi2 && !const_args[2]) { +tcg_out_vex_modrm(s, vexop + rexw, args[0], args[2], args[1]); +break; +} +/* FALLTHRU */ What if args[2] happens to be ECX? Apart from this, Reviewed-by: Paolo Bonzini so feel free to post PATCH 6/5 and then squash it in the pull request. Paolo
[Qemu-devel] [PATCH 5/5] tcg/i386: Use SHLX/SHRX/SARX instructions
These three-operand shift instructions do not require the shift count to be placed into ECX. This reduces the number of mov insns required, with the mere addition of a new register constraint. Don't attempt to get rid of the matching constraint, as that's impossible to manipulate with just a new constraint. In addition, constant shifts still need the matching constraint. Signed-off-by: Richard Henderson --- tcg/i386/tcg-target.c | 61 +-- 1 file changed, 50 insertions(+), 11 deletions(-) diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c index 4f6b9c1..fef1717 100644 --- a/tcg/i386/tcg-target.c +++ b/tcg/i386/tcg-target.c @@ -133,6 +133,12 @@ static bool have_movbe; it there. Therefore we always define the variable. */ bool have_bmi1; +#if defined(CONFIG_CPUID_H) && defined(bit_BMI2) +static bool have_bmi2; +#else +# define have_bmi2 0 +#endif + static uint8_t *tb_ret_addr; static void patch_reloc(uint8_t *code_ptr, int type, @@ -175,6 +181,7 @@ static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str) tcg_regset_set_reg(ct->u.regs, TCG_REG_EBX); break; case 'c': +case_c: ct->ct |= TCG_CT_REG; tcg_regset_set_reg(ct->u.regs, TCG_REG_ECX); break; @@ -203,6 +210,7 @@ static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str) tcg_regset_set32(ct->u.regs, 0, 0xf); break; case 'r': +case_r: ct->ct |= TCG_CT_REG; if (TCG_TARGET_REG_BITS == 64) { tcg_regset_set32(ct->u.regs, 0, 0x); @@ -210,6 +218,13 @@ static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str) tcg_regset_set32(ct->u.regs, 0, 0xff); } break; +case 'C': +/* With SHRX et al, we need not use ECX as shift count register. */ +if (have_bmi2) { +goto case_r; +} else { +goto case_c; +} /* qemu_ld/st address constraint */ case 'L': @@ -283,6 +298,8 @@ static inline int tcg_target_const_match(tcg_target_long val, # define P_REXB_RM 0 # define P_GS 0 #endif +#define P_SIMDF30x1 /* 0xf3 opcode prefix */ +#define P_SIMDF20x2 /* 0xf2 opcode prefix */ #define OPC_ARITH_EvIz (0x81) #define OPC_ARITH_EvIb (0x83) @@ -325,6 +342,9 @@ static inline int tcg_target_const_match(tcg_target_long val, #define OPC_SHIFT_1(0xd1) #define OPC_SHIFT_Ib (0xc1) #define OPC_SHIFT_cl (0xd3) +#define OPC_SARX(0xf7 | P_EXT38 | P_SIMDF3) +#define OPC_SHLX(0xf7 | P_EXT38 | P_DATA16) +#define OPC_SHRX(0xf7 | P_EXT38 | P_SIMDF2) #define OPC_TESTL (0x85) #define OPC_XCHG_ax_r32(0x90) @@ -493,7 +513,14 @@ static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm) tmp = (r & 8 ? 0 : 0x80); /* VEX.R */ } -tmp |= (opc & P_DATA16 ? 1 : 0); /* VEX.pp */ +/* VEX.pp */ +if (opc & P_DATA16) { +tmp |= 1; /* 0x66 */ +} else if (opc & P_SIMDF3) { +tmp |= 2; /* 0xf3 */ +} else if (opc & P_SIMDF2) { +tmp |= 3; /* 0xf2 */ +} tmp |= (~v & 15) << 3; /* VEX. */ tcg_out8(s, tmp); tcg_out8(s, opc); @@ -1689,7 +1716,7 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64) static inline void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *const_args) { -int c, rexw = 0; +int c, vexop, rexw = 0; #if TCG_TARGET_REG_BITS == 64 # define OP_32_64(x) \ @@ -1860,19 +1887,28 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc, OP_32_64(shl): c = SHIFT_SHL; -goto gen_shift; +vexop = OPC_SHLX; +goto gen_shift_maybe_vex; OP_32_64(shr): c = SHIFT_SHR; -goto gen_shift; +vexop = OPC_SHRX; +goto gen_shift_maybe_vex; OP_32_64(sar): c = SHIFT_SAR; -goto gen_shift; +vexop = OPC_SARX; +goto gen_shift_maybe_vex; OP_32_64(rotl): c = SHIFT_ROL; goto gen_shift; OP_32_64(rotr): c = SHIFT_ROR; goto gen_shift; +gen_shift_maybe_vex: +if (have_bmi2 && !const_args[2]) { +tcg_out_vex_modrm(s, vexop + rexw, args[0], args[2], args[1]); +break; +} +/* FALLTHRU */ gen_shift: if (const_args[2]) { tcg_out_shifti(s, c + rexw, args[0], args[2]); @@ -2065,9 +2101,9 @@ static const TCGTargetOpDef x86_op_defs[] = { { INDEX_op_xor_i32, { "r", "0", "ri" } }, { INDEX_op_andc_i32, { "r", "r", "ri" } }, -{ INDEX_op_shl_i32, { "r", "0", "ci" } }, -{ INDEX_op_shr_i32, { "r", "0", "ci" } }, -{ INDEX_op_sar_i32, { "r", "0", "