The catch here is that these are whole-vector operations (not independent 128-bit lanes). We abuse the SSE_OPF_SCALAR flag to select the memory operand width appropriately.
Signed-off-by: Paul Brook <p...@nowt.org>
---
 target/i386/ops_sse.h        | 51 ++++++++++++++++++++++++++++++++++++
 target/i386/ops_sse_header.h |  8 ++++++
 target/i386/tcg/translate.c  | 42 ++++++++++++++++++++++++++++-
 3 files changed, 100 insertions(+), 1 deletion(-)

diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h
index a1f50f0c8b..4115c9a257 100644
--- a/target/i386/ops_sse.h
+++ b/target/i386/ops_sse.h
@@ -3071,7 +3071,57 @@ void glue(helper_aeskeygenassist, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
 #endif
 #endif
 
+#if SHIFT >= 1
+void glue(helper_vbroadcastb, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
+{
+    uint8_t val = s->B(0);
+    int i;
+
+    for (i = 0; i < 16 * SHIFT; i++) {
+        d->B(i) = val;
+    }
+}
+
+void glue(helper_vbroadcastw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
+{
+    uint16_t val = s->W(0);
+    int i;
+
+    for (i = 0; i < 8 * SHIFT; i++) {
+        d->W(i) = val;
+    }
+}
+
+void glue(helper_vbroadcastl, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
+{
+    uint32_t val = s->L(0);
+    int i;
+
+    for (i = 0; i < 4 * SHIFT; i++) {
+        d->L(i) = val;
+    }
+}
+
+void glue(helper_vbroadcastq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
+{
+    uint64_t val = s->Q(0);
+    d->Q(0) = val;
+    d->Q(1) = val;
 #if SHIFT == 2
+    d->Q(2) = val;
+    d->Q(3) = val;
+#endif
+}
+
+#if SHIFT == 2
+void glue(helper_vbroadcastdq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
+{
+    d->Q(0) = s->Q(0);
+    d->Q(1) = s->Q(1);
+    d->Q(2) = s->Q(0);
+    d->Q(3) = s->Q(1);
+}
+
 void helper_vzeroall(CPUX86State *env)
 {
     int i;
@@ -3118,6 +3168,7 @@ void helper_vzeroupper_hi8(CPUX86State *env)
 }
 #endif
 #endif
+#endif
 
 #undef SSE_HELPER_S
 
diff --git a/target/i386/ops_sse_header.h b/target/i386/ops_sse_header.h
index 48f0945917..51e02cd4fa 100644
--- a/target/i386/ops_sse_header.h
+++ b/target/i386/ops_sse_header.h
@@ -411,7 +411,14 @@ DEF_HELPER_4(glue(aeskeygenassist, SUFFIX), void, env, Reg, Reg, i32)
 DEF_HELPER_5(glue(pclmulqdq, SUFFIX), void, env, Reg, Reg, Reg, i32)
 #endif
 
+/* AVX helpers */
+#if SHIFT >= 1
+DEF_HELPER_3(glue(vbroadcastb, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_3(glue(vbroadcastw, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_3(glue(vbroadcastl, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_3(glue(vbroadcastq, SUFFIX), void, env, Reg, Reg)
 #if SHIFT == 2
+DEF_HELPER_3(glue(vbroadcastdq, SUFFIX), void, env, Reg, Reg)
 DEF_HELPER_1(vzeroall, void, env)
 DEF_HELPER_1(vzeroupper, void, env)
 #ifdef TARGET_X86_64
@@ -419,6 +426,7 @@ DEF_HELPER_1(vzeroall_hi8, void, env)
 DEF_HELPER_1(vzeroupper_hi8, void, env)
 #endif
 #endif
+#endif
 
 #undef SHIFT
 #undef Reg
diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index ba70aeb039..59ab1dc562 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -3255,6 +3255,11 @@ static const struct SSEOpHelper_table6 sse_op_table6[256] = {
     [0x14] = BLENDV_OP(blendvps, SSE41, 0),
     [0x15] = BLENDV_OP(blendvpd, SSE41, 0),
     [0x17] = CMP_OP(ptest, SSE41),
+    /* TODO: Some vbroadcast variants require AVX2 */
+    [0x18] = UNARY_OP(vbroadcastl, AVX, SSE_OPF_SCALAR), /* vbroadcastss */
+    [0x19] = UNARY_OP(vbroadcastq, AVX, SSE_OPF_SCALAR), /* vbroadcastsd */
+#define gen_helper_vbroadcastdq_xmm NULL
+    [0x1a] = UNARY_OP(vbroadcastdq, AVX, SSE_OPF_SCALAR), /* vbroadcastf128 */
     [0x1c] = UNARY_OP_MMX(pabsb, SSSE3),
     [0x1d] = UNARY_OP_MMX(pabsw, SSSE3),
     [0x1e] = UNARY_OP_MMX(pabsd, SSSE3),
@@ -3286,6 +3291,16 @@ static const struct SSEOpHelper_table6 sse_op_table6[256] = {
     [0x40] = BINARY_OP(pmulld, SSE41, SSE_OPF_MMX),
 #define gen_helper_phminposuw_ymm NULL
     [0x41] = UNARY_OP(phminposuw, SSE41, 0),
+    /* vpbroadcastd */
+    [0x58] = UNARY_OP(vbroadcastl, AVX, SSE_OPF_SCALAR | SSE_OPF_MMX),
+    /* vpbroadcastq */
+    [0x59] = UNARY_OP(vbroadcastq, AVX, SSE_OPF_SCALAR | SSE_OPF_MMX),
+    /* vbroadcasti128 */
+    [0x5a] = UNARY_OP(vbroadcastdq, AVX, SSE_OPF_SCALAR | SSE_OPF_MMX),
+    /* vpbroadcastb */
+    [0x78] = UNARY_OP(vbroadcastb, AVX, SSE_OPF_SCALAR | SSE_OPF_MMX),
+    /* vpbroadcastw */
+    [0x79] = UNARY_OP(vbroadcastw, AVX, SSE_OPF_SCALAR | SSE_OPF_MMX),
 #define gen_helper_aesimc_ymm NULL
     [0xdb] = UNARY_OP(aesimc, AES, 0),
     [0xdc] = BINARY_OP(aesenc, AES, 0),
@@ -4323,6 +4338,24 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b,
             op2_offset = offsetof(CPUX86State, xmm_t0);
             gen_lea_modrm(env, s, modrm);
             switch (b) {
+            case 0x78: /* vpbroadcastb */
+                size = 8;
+                break;
+            case 0x79: /* vpbroadcastw */
+                size = 16;
+                break;
+            case 0x18: /* vbroadcastss */
+            case 0x58: /* vpbroadcastd */
+                size = 32;
+                break;
+            case 0x19: /* vbroadcastsd */
+            case 0x59: /* vpbroadcastq */
+                size = 64;
+                break;
+            case 0x1a: /* vbroadcastf128 */
+            case 0x5a: /* vbroadcasti128 */
+                size = 128;
+                break;
             case 0x20: case 0x30: /* pmovsxbw, pmovzxbw */
             case 0x23: case 0x33: /* pmovsxwd, pmovzxwd */
             case 0x25: case 0x35: /* pmovsxdq, pmovzxdq */
@@ -4346,10 +4379,17 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b,
             default:
                 size = 128;
             }
-            if (s->vex_l) {
+            /* 256-bit vbroadcasts only load a single element. */
+            if ((op6.flags & SSE_OPF_SCALAR) == 0 && s->vex_l) {
                 size *= 2;
             }
             switch (size) {
+            case 8:
+                tcg_gen_qemu_ld_tl(s->tmp0, s->A0,
+                                   s->mem_index, MO_UB);
+                tcg_gen_st8_tl(s->tmp0, cpu_env, op2_offset +
+                    offsetof(ZMMReg, ZMM_B(0)));
+                break;
             case 16:
                 tcg_gen_qemu_ld_tl(s->tmp0, s->A0,
                                    s->mem_index, MO_LEUW);
-- 
2.36.0