Re: [PATCH v2 17/25] target/i386: move C0-FF opcodes to new decoder (except for x87)

2024-05-06 Thread Richard Henderson

On 5/6/24 01:09, Paolo Bonzini wrote:

The shift instructions are rewritten instead of reusing code from the old
decoder.  Rotates use CC_OP_ADCOX more extensively and generally rely
more on the optimizer, so that the code generators are shared between
the immediate-count and variable-count cases.

In particular, this makes gen_RCL and gen_RCR pretty efficient for the
count == 1 case, which becomes (apart from a few extra movs) something like:

   (compute_cc_all if needed)
   // save old value for OF calculation
   mov cc_src2, T0
   // the bulk of RCL is just this!
   deposit T0, cc_src, T0, 1, TARGET_LONG_BITS - 1
   // compute carry
   shr cc_dst, cc_src2, length - 1
   and cc_dst, cc_dst, 1
   // compute overflow
   xor cc_src2, cc_src2, T0
   extract cc_src2, cc_src2, length - 1, 1

32-bit MUL and IMUL are also slightly more efficient on 64-bit hosts.

Signed-off-by: Paolo Bonzini
---
  target/i386/tcg/decode-new.h |1 +
  target/i386/tcg/translate.c  |   23 +-
  target/i386/tcg/decode-new.c.inc |  142 +
  target/i386/tcg/emit.c.inc   | 1014 +-
  4 files changed, 1169 insertions(+), 11 deletions(-)


Reviewed-by: Richard Henderson 

r~



[PATCH v2 17/25] target/i386: move C0-FF opcodes to new decoder (except for x87)

2024-05-06 Thread Paolo Bonzini
The shift instructions are rewritten instead of reusing code from the old
decoder.  Rotates use CC_OP_ADCOX more extensively and generally rely
more on the optimizer, so that the code generators are shared between
the immediate-count and variable-count cases.

In particular, this makes gen_RCL and gen_RCR pretty efficient for the
count == 1 case, which becomes (apart from a few extra movs) something like:

  (compute_cc_all if needed)
  // save old value for OF calculation
  mov cc_src2, T0
  // the bulk of RCL is just this!
  deposit T0, cc_src, T0, 1, TARGET_LONG_BITS - 1
  // compute carry
  shr cc_dst, cc_src2, length - 1
  and cc_dst, cc_dst, 1
  // compute overflow
  xor cc_src2, cc_src2, T0
  extract cc_src2, cc_src2, length - 1, 1

32-bit MUL and IMUL are also slightly more efficient on 64-bit hosts.

Signed-off-by: Paolo Bonzini 
---
 target/i386/tcg/decode-new.h |1 +
 target/i386/tcg/translate.c  |   23 +-
 target/i386/tcg/decode-new.c.inc |  142 +
 target/i386/tcg/emit.c.inc   | 1014 +-
 4 files changed, 1169 insertions(+), 11 deletions(-)

diff --git a/target/i386/tcg/decode-new.h b/target/i386/tcg/decode-new.h
index 790ad5e1d00..77bb31eb143 100644
--- a/target/i386/tcg/decode-new.h
+++ b/target/i386/tcg/decode-new.h
@@ -89,6 +89,7 @@ typedef enum X86OpSize {
 X86_SIZE_x,  /* 128/256-bit, based on operand size */
 X86_SIZE_y,  /* 32/64-bit, based on operand size */
 X86_SIZE_z,  /* 16-bit for 16-bit operand size, else 32-bit */
+X86_SIZE_z_f64,  /* 32-bit for 32-bit operand size or 64-bit mode, else 16-bit */
 
 /* Custom */
 X86_SIZE_d64,
diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index 708fe023224..79b6e2760fe 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -38,6 +38,9 @@
 #include "exec/helper-info.c.inc"
 #undef  HELPER_H
 
+/* Fixes for Windows namespace pollution.  */
+#undef IN
+#undef OUT
 
 #define PREFIX_REPZ   0x01
 #define PREFIX_REPNZ  0x02
@@ -2488,14 +2491,24 @@ static inline int insn_const_size(MemOp ot)
 }
 }
 
+static void gen_conditional_jump_labels(DisasContext *s, target_long diff,
+TCGLabel *not_taken, TCGLabel *taken)
+{
+if (not_taken) {
+gen_set_label(not_taken);
+}
+gen_jmp_rel_csize(s, 0, 1);
+
+gen_set_label(taken);
+gen_jmp_rel(s, s->dflag, diff, 0);
+}
+
 static void gen_jcc(DisasContext *s, int b, int diff)
 {
 TCGLabel *l1 = gen_new_label();
 
 gen_jcc1(s, b, l1);
-gen_jmp_rel_csize(s, 0, 1);
-gen_set_label(l1);
-gen_jmp_rel(s, s->dflag, diff, 0);
+gen_conditional_jump_labels(s, diff, NULL, l1);
 }
 
 static void gen_cmovcc1(DisasContext *s, int b, TCGv dest, TCGv src)
@@ -2752,7 +2765,7 @@ static void gen_unknown_opcode(CPUX86State *env, DisasContext *s)
 
 /* an interrupt is different from an exception because of the
privilege checks */
-static void gen_interrupt(DisasContext *s, int intno)
+static void gen_interrupt(DisasContext *s, uint8_t intno)
 {
 gen_update_cc_op(s);
 gen_update_eip_cur(s);
@@ -3183,7 +3196,7 @@ static bool disas_insn(DisasContext *s, CPUState *cpu)
 #ifndef CONFIG_USER_ONLY
 use_new &= b <= limit;
 #endif
-if (use_new && b <= 0xbf) {
+if (use_new && (b < 0xd8 || b >= 0xe0)) {
 disas_insn_new(s, cpu, b);
 return true;
 }
diff --git a/target/i386/tcg/decode-new.c.inc b/target/i386/tcg/decode-new.c.inc
index 55fc0173a41..a47ecab6dd4 100644
--- a/target/i386/tcg/decode-new.c.inc
+++ b/target/i386/tcg/decode-new.c.inc
@@ -135,6 +135,8 @@
 ## __VA_ARGS__\
 }
 
+#define X86_OP_GROUP1(op, op0, s0, ...)   \
+X86_OP_GROUP3(op, op0, s0, 2op, s0, None, None, ## __VA_ARGS__)
 #define X86_OP_GROUP2(op, op0, s0, op1, s1, ...)  \
 X86_OP_GROUP3(op, op0, s0, 2op, s0, op1, s1, ## __VA_ARGS__)
 #define X86_OP_GROUPw(op, op0, s0, ...)   \
@@ -1174,6 +1176,83 @@ static void decode_group1A(DisasContext *s, CPUX86State *env, X86OpEntry *entry,
 }
 }
 
+static void decode_group2(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
+{
+static const X86GenFunc group2_gen[8] = {
+gen_ROL, gen_ROR, gen_RCL, gen_RCR,
+gen_SHL, gen_SHR, gen_SHL /* SAL, undocumented */, gen_SAR,
+};
+int op = (get_modrm(s, env) >> 3) & 7;
+entry->gen = group2_gen[op];
+if (op == 7) {
+entry->special = X86_SPECIAL_SExtT0;
+} else {
+entry->special = X86_SPECIAL_ZExtT0;
+}
+}
+
+static void decode_group3(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
+{
+static const X86OpEntry opcodes_grp3[16] = {
+/* 0xf6 */
+[0x00] = X86_OP_ENTRYrr(AND, E,b, I,b),
+[0x02] = X86_OP_ENTRY1(NOT,  E,b,  lock),
+[0x03] = X86_OP_ENTRY1(NEG,