Re: [Qemu-devel] [PATCH 5/5] tcg/i386: Use SHLX/SHRX/SARX instructions

2014-02-17 Thread Richard Henderson
On 02/16/2014 08:21 AM, Paolo Bonzini wrote:
> Il 31/01/2014 15:43, Richard Henderson ha scritto:
>> +gen_shift_maybe_vex:
>> +if (have_bmi2 && !const_args[2]) {
>> +tcg_out_vex_modrm(s, vexop + rexw, args[0], args[2], args[1]);
>> +break;
>> +}
>> +/* FALLTHRU */
> 
> What if args[2] happens to be ECX?

I ran some measurements and as I expected this basically never happens.  For
64-bit, I never saw it occur.  For 32-bit, 1/800 of all shifts used ecx.

For 64-bit, the use of shlx et al is always a size win.  The mov and shift,
including their rex prefixes, are 3 bytes each, while the shlx is 5 byes.

For 32-bit, things are more complicated.  The mov and shift are 2 bytes each,
so the use of shlx is by itself a 1 byte size penalty.  Except that sometimes
the avoidance of the mov results in fewer spills, and thus fewer bytes overall.
 So overall I see the barest fraction (< 0.01%) size decrease across all TBs.


r~



Re: [Qemu-devel] [PATCH 5/5] tcg/i386: Use SHLX/SHRX/SARX instructions

2014-02-16 Thread Aurelien Jarno
On Fri, Jan 31, 2014 at 08:43:38AM -0600, Richard Henderson wrote:
> These three-operand shift instructions do not require the shift count
> to be placed into ECX.  This reduces the number of mov insns required,
> with the mere addition of a new register constraint.
> 
> Don't attempt to get rid of the matching constraint, as that's impossible
> to manipulate with just a new constraint.  In addition, constant shifts
> still need the matching constraint.
> 
> Signed-off-by: Richard Henderson 
> ---
>  tcg/i386/tcg-target.c | 61 
> +--
>  1 file changed, 50 insertions(+), 11 deletions(-)
> 
> diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
> index 4f6b9c1..fef1717 100644
> --- a/tcg/i386/tcg-target.c
> +++ b/tcg/i386/tcg-target.c
> @@ -133,6 +133,12 @@ static bool have_movbe;
> it there.  Therefore we always define the variable.  */
>  bool have_bmi1;
>  
> +#if defined(CONFIG_CPUID_H) && defined(bit_BMI2)
> +static bool have_bmi2;
> +#else
> +# define have_bmi2 0
> +#endif
> +
>  static uint8_t *tb_ret_addr;
>  
>  static void patch_reloc(uint8_t *code_ptr, int type,
> @@ -175,6 +181,7 @@ static int target_parse_constraint(TCGArgConstraint *ct, 
> const char **pct_str)
>  tcg_regset_set_reg(ct->u.regs, TCG_REG_EBX);
>  break;
>  case 'c':
> +case_c:
>  ct->ct |= TCG_CT_REG;
>  tcg_regset_set_reg(ct->u.regs, TCG_REG_ECX);
>  break;
> @@ -203,6 +210,7 @@ static int target_parse_constraint(TCGArgConstraint *ct, 
> const char **pct_str)
>  tcg_regset_set32(ct->u.regs, 0, 0xf);
>  break;
>  case 'r':
> +case_r:
>  ct->ct |= TCG_CT_REG;
>  if (TCG_TARGET_REG_BITS == 64) {
>  tcg_regset_set32(ct->u.regs, 0, 0x);
> @@ -210,6 +218,13 @@ static int target_parse_constraint(TCGArgConstraint *ct, 
> const char **pct_str)
>  tcg_regset_set32(ct->u.regs, 0, 0xff);
>  }
>  break;
> +case 'C':
> +/* With SHRX et al, we need not use ECX as shift count register.  */
> +if (have_bmi2) {
> +goto case_r;
> +} else {
> +goto case_c;
> +}
>  
>  /* qemu_ld/st address constraint */
>  case 'L':
> @@ -283,6 +298,8 @@ static inline int tcg_target_const_match(tcg_target_long 
> val,
>  # define P_REXB_RM   0
>  # define P_GS   0
>  #endif
> +#define P_SIMDF30x1 /* 0xf3 opcode prefix */
> +#define P_SIMDF20x2 /* 0xf2 opcode prefix */
>  
>  #define OPC_ARITH_EvIz   (0x81)
>  #define OPC_ARITH_EvIb   (0x83)
> @@ -325,6 +342,9 @@ static inline int tcg_target_const_match(tcg_target_long 
> val,
>  #define OPC_SHIFT_1  (0xd1)
>  #define OPC_SHIFT_Ib (0xc1)
>  #define OPC_SHIFT_cl (0xd3)
> +#define OPC_SARX(0xf7 | P_EXT38 | P_SIMDF3)
> +#define OPC_SHLX(0xf7 | P_EXT38 | P_DATA16)
> +#define OPC_SHRX(0xf7 | P_EXT38 | P_SIMDF2)
>  #define OPC_TESTL(0x85)
>  #define OPC_XCHG_ax_r32  (0x90)
>  
> @@ -493,7 +513,14 @@ static void tcg_out_vex_modrm(TCGContext *s, int opc, 
> int r, int v, int rm)
>  
>  tmp = (r & 8 ? 0 : 0x80);  /* VEX.R */
>  }
> -tmp |= (opc & P_DATA16 ? 1 : 0);   /* VEX.pp */
> +/* VEX.pp */
> +if (opc & P_DATA16) {
> +tmp |= 1;  /* 0x66 */
> +} else if (opc & P_SIMDF3) {
> +tmp |= 2;  /* 0xf3 */
> +} else if (opc & P_SIMDF2) {
> +tmp |= 3;  /* 0xf2 */
> +}
>  tmp |= (~v & 15) << 3; /* VEX. */
>  tcg_out8(s, tmp);
>  tcg_out8(s, opc);
> @@ -1689,7 +1716,7 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg 
> *args, bool is64)
>  static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
>const TCGArg *args, const int *const_args)
>  {
> -int c, rexw = 0;
> +int c, vexop, rexw = 0;
>  
>  #if TCG_TARGET_REG_BITS == 64
>  # define OP_32_64(x) \
> @@ -1860,19 +1887,28 @@ static inline void tcg_out_op(TCGContext *s, 
> TCGOpcode opc,
>  
>  OP_32_64(shl):
>  c = SHIFT_SHL;
> -goto gen_shift;
> +vexop = OPC_SHLX;
> +goto gen_shift_maybe_vex;
>  OP_32_64(shr):
>  c = SHIFT_SHR;
> -goto gen_shift;
> +vexop = OPC_SHRX;
> +goto gen_shift_maybe_vex;
>  OP_32_64(sar):
>  c = SHIFT_SAR;
> -goto gen_shift;
> +vexop = OPC_SARX;
> +goto gen_shift_maybe_vex;
>  OP_32_64(rotl):
>  c = SHIFT_ROL;
>  goto gen_shift;
>  OP_32_64(rotr):
>  c = SHIFT_ROR;
>  goto gen_shift;
> +gen_shift_maybe_vex:
> +if (have_bmi2 && !const_args[2]) {
> +tcg_out_vex_modrm(s, vexop + rexw, args[0], args[2], args[1]);
> +break;
> +}
> +/* FALLTHRU */
>  gen_shift:
>  if (const_args[2]) {
>

Re: [Qemu-devel] [PATCH 5/5] tcg/i386: Use SHLX/SHRX/SARX instructions

2014-02-16 Thread Richard Henderson
On 02/16/2014 06:21 AM, Paolo Bonzini wrote:
> Il 31/01/2014 15:43, Richard Henderson ha scritto:
>> +gen_shift_maybe_vex:
>> +if (have_bmi2 && !const_args[2]) {
>> +tcg_out_vex_modrm(s, vexop + rexw, args[0], args[2], args[1]);
>> +break;
>> +}
>> +/* FALLTHRU */
> 
> What if args[2] happens to be ECX?

shlx handles that just fine.  I don't think it's worth an extra check to
fall back to shl on the off-chance that ecx is used; it's pretty far down
on the register allocation order list, so it wouldn't happen often.


r~



Re: [Qemu-devel] [PATCH 5/5] tcg/i386: Use SHLX/SHRX/SARX instructions

2014-02-16 Thread Paolo Bonzini

Il 31/01/2014 15:43, Richard Henderson ha scritto:

+gen_shift_maybe_vex:
+if (have_bmi2 && !const_args[2]) {
+tcg_out_vex_modrm(s, vexop + rexw, args[0], args[2], args[1]);
+break;
+}
+/* FALLTHRU */


What if args[2] happens to be ECX?

Apart from this,

Reviewed-by: Paolo Bonzini 

so feel free to post PATCH 6/5 and then squash it in the pull request.

Paolo



[Qemu-devel] [PATCH 5/5] tcg/i386: Use SHLX/SHRX/SARX instructions

2014-01-31 Thread Richard Henderson
These three-operand shift instructions do not require the shift count
to be placed into ECX.  This reduces the number of mov insns required,
with the mere addition of a new register constraint.

Don't attempt to get rid of the matching constraint, as that's impossible
to manipulate with just a new constraint.  In addition, constant shifts
still need the matching constraint.

Signed-off-by: Richard Henderson 
---
 tcg/i386/tcg-target.c | 61 +--
 1 file changed, 50 insertions(+), 11 deletions(-)

diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
index 4f6b9c1..fef1717 100644
--- a/tcg/i386/tcg-target.c
+++ b/tcg/i386/tcg-target.c
@@ -133,6 +133,12 @@ static bool have_movbe;
it there.  Therefore we always define the variable.  */
 bool have_bmi1;
 
+#if defined(CONFIG_CPUID_H) && defined(bit_BMI2)
+static bool have_bmi2;
+#else
+# define have_bmi2 0
+#endif
+
 static uint8_t *tb_ret_addr;
 
 static void patch_reloc(uint8_t *code_ptr, int type,
@@ -175,6 +181,7 @@ static int target_parse_constraint(TCGArgConstraint *ct, 
const char **pct_str)
 tcg_regset_set_reg(ct->u.regs, TCG_REG_EBX);
 break;
 case 'c':
+case_c:
 ct->ct |= TCG_CT_REG;
 tcg_regset_set_reg(ct->u.regs, TCG_REG_ECX);
 break;
@@ -203,6 +210,7 @@ static int target_parse_constraint(TCGArgConstraint *ct, 
const char **pct_str)
 tcg_regset_set32(ct->u.regs, 0, 0xf);
 break;
 case 'r':
+case_r:
 ct->ct |= TCG_CT_REG;
 if (TCG_TARGET_REG_BITS == 64) {
 tcg_regset_set32(ct->u.regs, 0, 0x);
@@ -210,6 +218,13 @@ static int target_parse_constraint(TCGArgConstraint *ct, 
const char **pct_str)
 tcg_regset_set32(ct->u.regs, 0, 0xff);
 }
 break;
+case 'C':
+/* With SHRX et al, we need not use ECX as shift count register.  */
+if (have_bmi2) {
+goto case_r;
+} else {
+goto case_c;
+}
 
 /* qemu_ld/st address constraint */
 case 'L':
@@ -283,6 +298,8 @@ static inline int tcg_target_const_match(tcg_target_long 
val,
 # define P_REXB_RM 0
 # define P_GS   0
 #endif
+#define P_SIMDF30x1 /* 0xf3 opcode prefix */
+#define P_SIMDF20x2 /* 0xf2 opcode prefix */
 
 #define OPC_ARITH_EvIz (0x81)
 #define OPC_ARITH_EvIb (0x83)
@@ -325,6 +342,9 @@ static inline int tcg_target_const_match(tcg_target_long 
val,
 #define OPC_SHIFT_1(0xd1)
 #define OPC_SHIFT_Ib   (0xc1)
 #define OPC_SHIFT_cl   (0xd3)
+#define OPC_SARX(0xf7 | P_EXT38 | P_SIMDF3)
+#define OPC_SHLX(0xf7 | P_EXT38 | P_DATA16)
+#define OPC_SHRX(0xf7 | P_EXT38 | P_SIMDF2)
 #define OPC_TESTL  (0x85)
 #define OPC_XCHG_ax_r32(0x90)
 
@@ -493,7 +513,14 @@ static void tcg_out_vex_modrm(TCGContext *s, int opc, int 
r, int v, int rm)
 
 tmp = (r & 8 ? 0 : 0x80);  /* VEX.R */
 }
-tmp |= (opc & P_DATA16 ? 1 : 0);   /* VEX.pp */
+/* VEX.pp */
+if (opc & P_DATA16) {
+tmp |= 1;  /* 0x66 */
+} else if (opc & P_SIMDF3) {
+tmp |= 2;  /* 0xf3 */
+} else if (opc & P_SIMDF2) {
+tmp |= 3;  /* 0xf2 */
+}
 tmp |= (~v & 15) << 3; /* VEX. */
 tcg_out8(s, tmp);
 tcg_out8(s, opc);
@@ -1689,7 +1716,7 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg 
*args, bool is64)
 static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
   const TCGArg *args, const int *const_args)
 {
-int c, rexw = 0;
+int c, vexop, rexw = 0;
 
 #if TCG_TARGET_REG_BITS == 64
 # define OP_32_64(x) \
@@ -1860,19 +1887,28 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode 
opc,
 
 OP_32_64(shl):
 c = SHIFT_SHL;
-goto gen_shift;
+vexop = OPC_SHLX;
+goto gen_shift_maybe_vex;
 OP_32_64(shr):
 c = SHIFT_SHR;
-goto gen_shift;
+vexop = OPC_SHRX;
+goto gen_shift_maybe_vex;
 OP_32_64(sar):
 c = SHIFT_SAR;
-goto gen_shift;
+vexop = OPC_SARX;
+goto gen_shift_maybe_vex;
 OP_32_64(rotl):
 c = SHIFT_ROL;
 goto gen_shift;
 OP_32_64(rotr):
 c = SHIFT_ROR;
 goto gen_shift;
+gen_shift_maybe_vex:
+if (have_bmi2 && !const_args[2]) {
+tcg_out_vex_modrm(s, vexop + rexw, args[0], args[2], args[1]);
+break;
+}
+/* FALLTHRU */
 gen_shift:
 if (const_args[2]) {
 tcg_out_shifti(s, c + rexw, args[0], args[2]);
@@ -2065,9 +2101,9 @@ static const TCGTargetOpDef x86_op_defs[] = {
 { INDEX_op_xor_i32, { "r", "0", "ri" } },
 { INDEX_op_andc_i32, { "r", "r", "ri" } },
 
-{ INDEX_op_shl_i32, { "r", "0", "ci" } },
-{ INDEX_op_shr_i32, { "r", "0", "ci" } },
-{ INDEX_op_sar_i32, { "r", "0", "