This is more efficient both when generating code and when testing
flags.
Signed-off-by: Paolo Bonzini <[email protected]>
---
target/i386/cpu.h | 13 ++++++++++++-
target/i386/cpu-dump.c | 2 ++
target/i386/tcg/cc_helper.c | 6 ++++++
target/i386/tcg/translate.c | 13 +++++++++++++
target/i386/tcg/emit.c.inc | 33 ++++++---------------------------
5 files changed, 39 insertions(+), 28 deletions(-)
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index ecca38ed0b5..314e773a5d4 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -1515,7 +1515,18 @@ typedef enum {
CC_OP_POPCNTL__,
CC_OP_POPCNTQ__,
CC_OP_POPCNT = sizeof(target_ulong) == 8 ? CC_OP_POPCNTQ__ : CC_OP_POPCNTL__,
-#define CC_OP_LAST_BWLQ CC_OP_POPCNTQ__
+
+ /*
+ * Note that only CC_OP_SBB_SELF (i.e. the one with MO_TL size)
+ * is used or implemented, because the translation produces a
+ * sign-extended CC_DST.
+ */
+ CC_OP_SBB_SELFB__, /* S/Z/C/A via CC_DST, O clear, P set. */
+ CC_OP_SBB_SELFW__,
+ CC_OP_SBB_SELFL__,
+ CC_OP_SBB_SELFQ__,
+ CC_OP_SBB_SELF = sizeof(target_ulong) == 8 ? CC_OP_SBB_SELFQ__ : CC_OP_SBB_SELFL__,
+#define CC_OP_LAST_BWLQ CC_OP_SBB_SELFQ__
CC_OP_DYNAMIC, /* must use dynamic code to get cc_op */
} CCOp;
diff --git a/target/i386/cpu-dump.c b/target/i386/cpu-dump.c
index 67bf31e0caa..20a3002f013 100644
--- a/target/i386/cpu-dump.c
+++ b/target/i386/cpu-dump.c
@@ -91,6 +91,8 @@ static const char * const cc_op_str[] = {
[CC_OP_BMILGQ] = "BMILGQ",
[CC_OP_POPCNT] = "POPCNT",
+
+ [CC_OP_SBB_SELF] = "SBBx,x",
};
static void
diff --git a/target/i386/tcg/cc_helper.c b/target/i386/tcg/cc_helper.c
index 2c4170b5b77..91e492196af 100644
--- a/target/i386/tcg/cc_helper.c
+++ b/target/i386/tcg/cc_helper.c
@@ -100,6 +100,9 @@ target_ulong helper_cc_compute_all(target_ulong dst, target_ulong src1,
return src1;
case CC_OP_POPCNT:
return dst ? 0 : CC_Z;
+ case CC_OP_SBB_SELF:
+ /* dst is either all zeros (--Z-P-) or all ones (-S-APC) */
+ return (dst & (CC_Z|CC_A|CC_C|CC_S)) ^ (CC_P | CC_Z);
case CC_OP_ADCX:
return compute_all_adcx(dst, src1, src2);
@@ -326,6 +329,9 @@ target_ulong helper_cc_compute_c(target_ulong dst, target_ulong src1,
case CC_OP_MULQ:
return src1 != 0;
+ case CC_OP_SBB_SELF:
+ return dst & 1;
+
case CC_OP_ADCX:
case CC_OP_ADCOX:
return dst;
diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index e91715af817..17ad4ccacaf 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -304,6 +304,7 @@ static const uint8_t cc_op_live_[] = {
[CC_OP_ADOX] = USES_CC_SRC | USES_CC_SRC2,
[CC_OP_ADCOX] = USES_CC_DST | USES_CC_SRC | USES_CC_SRC2,
[CC_OP_POPCNT] = USES_CC_DST,
+ [CC_OP_SBB_SELF] = USES_CC_DST,
};
static uint8_t cc_op_live(CCOp op)
@@ -938,6 +939,9 @@ static CCPrepare gen_prepare_eflags_c(DisasContext *s, TCGv reg)
size = cc_op_size(s->cc_op);
return gen_prepare_val_nz(cpu_cc_src, size, false);
+ case CC_OP_SBB_SELF:
+ return (CCPrepare) { .cond = TCG_COND_NE, .reg = cpu_cc_dst };
+
case CC_OP_ADCX:
case CC_OP_ADCOX:
return (CCPrepare) { .cond = TCG_COND_NE, .reg = cpu_cc_dst,
@@ -999,6 +1003,7 @@ static CCPrepare gen_prepare_eflags_o(DisasContext *s, TCGv reg)
case CC_OP_ADCOX:
return (CCPrepare) { .cond = TCG_COND_NE, .reg = cpu_cc_src2,
.no_setcond = true };
+ case CC_OP_SBB_SELF:
case CC_OP_LOGICB ... CC_OP_LOGICQ:
case CC_OP_POPCNT:
return (CCPrepare) { .cond = TCG_COND_NEVER };
@@ -1078,6 +1083,14 @@ static CCPrepare gen_prepare_cc(DisasContext *s, int b, TCGv reg)
}
break;
+ case CC_OP_SBB_SELF:
+ /* checking for nonzero is usually the most efficient */
+ if (jcc_op == JCC_L || jcc_op == JCC_B || jcc_op == JCC_S) {
+ jcc_op = JCC_Z;
+ inv = !inv;
+ }
+ goto slow_jcc;
+
case CC_OP_LOGICB ... CC_OP_LOGICQ:
/* Mostly used for test+jump */
size = s->cc_op - CC_OP_LOGICB;
diff --git a/target/i386/tcg/emit.c.inc b/target/i386/tcg/emit.c.inc
index 8dac4d09da1..0fde3d669d9 100644
--- a/target/i386/tcg/emit.c.inc
+++ b/target/i386/tcg/emit.c.inc
@@ -3876,37 +3876,16 @@ static void gen_SBB(DisasContext *s, X86DecodedInsn *decode)
return;
}
- c_in = tcg_temp_new();
- gen_compute_eflags_c(s, c_in);
-
- /*
- * Here the change is as follows:
- * CC_SBB: src1 = T0, src2 = T0, src3 = c_in
- * CC_SUB: src1 = 0, src2 = c_in (no src3)
- *
- * The difference also does not matter:
- * - AF is bit 4 of dst^src1^src2, but bit 4 of src1^src2 is zero in both cases
- * therefore AF comes straight from dst (in fact it is c_in)
- * - for OF, src1 and src2 have the same sign in both cases, meaning there
- * can be no overflow
- */
+ /* SBB x,x has its own CCOp so that's even easier. */
if (decode->e.op2 != X86_TYPE_I && !decode->op[0].has_ea &&
decode->op[0].n == decode->op[2].n) {
- if (s->cc_op == CC_OP_DYNAMIC) {
- tcg_gen_neg_tl(s->T0, c_in);
- } else {
- /*
- * Do not negate c_in because it will often be dead and only the
- * instruction generated by negsetcond will survive.
- */
- gen_neg_setcc(s, JCC_B << 1, s->T0);
- }
- tcg_gen_movi_tl(s->cc_srcT, 0);
- decode->cc_src = c_in;
- decode->cc_dst = s->T0;
- decode->cc_op = CC_OP_SUBB + ot;
+ gen_neg_setcc(s, JCC_B << 1, s->T0);
+ prepare_update1_cc(decode, s, CC_OP_SBB_SELF);
return;
}
+ c_in = tcg_temp_new();
+ gen_compute_eflags_c(s, c_in);
+
if (s->prefix & PREFIX_LOCK) {
tcg_gen_add_tl(s->T0, s->T1, c_in);
tcg_gen_neg_tl(s->T0, s->T0);
--
2.52.0