Make the pshufw, pshuflw, pshufhw, pshufd, shufps and shufpd helpers
suitable for use with tcg_gen_gvec_* functions.

Signed-off-by: Jan Bobek <jan.bo...@gmail.com>
---
 target/i386/ops_sse.h        | 141 ++++++++++++++++++++++++-----------
 target/i386/ops_sse_header.h |  12 +--
 target/i386/translate.c      |  34 ++++-----
 3 files changed, 119 insertions(+), 68 deletions(-)
diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h
index 8172324e34..2e50d91a25 100644
--- a/target/i386/ops_sse.h
+++ b/target/i386/ops_sse.h
@@ -551,70 +551,123 @@ void glue(helper_maskmov, SUFFIX)(CPUX86State *env, Reg *a, Reg *b,
 }
 
 #if SHIFT == 0
-void glue(helper_pshufw, SUFFIX)(Reg *d, Reg *s, int order)
+void glue(helper_pshufw, SUFFIX)(Reg *d, Reg *a, uint32_t desc)
 {
-    Reg r;
+    const intptr_t oprsz = simd_oprsz(desc); /* bytes covered by the loop */
+    const intptr_t maxsz = simd_maxsz(desc);
+    const uint8_t ctrl = simd_data(desc); /* four 2-bit lane selectors */
 
-    r.W(0) = s->W(order & 3);
-    r.W(1) = s->W((order >> 2) & 3);
-    r.W(2) = s->W((order >> 4) & 3);
-    r.W(3) = s->W((order >> 6) & 3);
-    *d = r;
+    for (intptr_t i = 0; 4 * i * sizeof(uint16_t) < oprsz; ++i) {
+        const uint16_t t0 = a->W(4 * i + ((ctrl >> 0) & 3));
+        const uint16_t t1 = a->W(4 * i + ((ctrl >> 2) & 3));
+        const uint16_t t2 = a->W(4 * i + ((ctrl >> 4) & 3));
+        const uint16_t t3 = a->W(4 * i + ((ctrl >> 6) & 3));
+
+        d->W(4 * i + 0) = t0; /* stores follow all reads of this group */
+        d->W(4 * i + 1) = t1;
+        d->W(4 * i + 2) = t2;
+        d->W(4 * i + 3) = t3;
+    }
+    glue(clear_high, SUFFIX)(d, oprsz, maxsz);
 }
 #else
-void helper_shufps(Reg *d, Reg *s, int order)
+void glue(helper_pshuflw, SUFFIX)(Reg *d, Reg *a, uint32_t desc)
 {
-    Reg r;
+    const intptr_t oprsz = simd_oprsz(desc); /* bytes covered by the loop */
+    const intptr_t maxsz = simd_maxsz(desc);
+    const uint8_t ctrl = simd_data(desc); /* four 2-bit lane selectors */
 
-    r.L(0) = d->L(order & 3);
-    r.L(1) = d->L((order >> 2) & 3);
-    r.L(2) = s->L((order >> 4) & 3);
-    r.L(3) = s->L((order >> 6) & 3);
-    *d = r;
+    for (intptr_t i = 0; 8 * i * sizeof(uint16_t) < oprsz; ++i) {
+        const uint16_t t0 = a->W(8 * i + ((ctrl >> 0) & 3));
+        const uint16_t t1 = a->W(8 * i + ((ctrl >> 2) & 3));
+        const uint16_t t2 = a->W(8 * i + ((ctrl >> 4) & 3));
+        const uint16_t t3 = a->W(8 * i + ((ctrl >> 6) & 3));
+
+        d->W(8 * i + 0) = t0; /* stores follow all reads of this group */
+        d->W(8 * i + 1) = t1;
+        d->W(8 * i + 2) = t2;
+        d->W(8 * i + 3) = t3;
+        d->Q(2 * i + 1) = a->Q(2 * i + 1); /* high qword copied as-is */
+    }
+    glue(clear_high, SUFFIX)(d, oprsz, maxsz);
 }
 
-void helper_shufpd(Reg *d, Reg *s, int order)
+void glue(helper_pshufhw, SUFFIX)(Reg *d, Reg *a, uint32_t desc)
 {
-    Reg r;
+    const intptr_t oprsz = simd_oprsz(desc); /* bytes covered by the loop */
+    const intptr_t maxsz = simd_maxsz(desc);
+    const uint8_t ctrl = simd_data(desc); /* four 2-bit lane selectors */
+
+    for (intptr_t i = 0; 8 * i * sizeof(uint16_t) < oprsz; ++i) {
+        const uint16_t t0 = a->W(8 * i + 4 + ((ctrl >> 0) & 3));
+        const uint16_t t1 = a->W(8 * i + 4 + ((ctrl >> 2) & 3));
+        const uint16_t t2 = a->W(8 * i + 4 + ((ctrl >> 4) & 3));
+        const uint16_t t3 = a->W(8 * i + 4 + ((ctrl >> 6) & 3));
 
-    r.Q(0) = d->Q(order & 1);
-    r.Q(1) = s->Q((order >> 1) & 1);
-    *d = r;
+        d->Q(2 * i + 0) = a->Q(2 * i + 0); /* low qword copied as-is */
+        d->W(8 * i + 4) = t0;
+        d->W(8 * i + 5) = t1;
+        d->W(8 * i + 6) = t2;
+        d->W(8 * i + 7) = t3;
+    }
+    glue(clear_high, SUFFIX)(d, oprsz, maxsz);
 }
 
-void glue(helper_pshufd, SUFFIX)(Reg *d, Reg *s, int order)
+void glue(helper_pshufd, SUFFIX)(Reg *d, Reg *a, uint32_t desc)
 {
-    Reg r;
+    const intptr_t oprsz = simd_oprsz(desc); /* bytes covered by the loop */
+    const intptr_t maxsz = simd_maxsz(desc);
+    const uint8_t ctrl = simd_data(desc); /* four 2-bit lane selectors */
+
+    for (intptr_t i = 0; 4 * i * sizeof(uint32_t) < oprsz; ++i) {
+        const uint32_t t0 = a->L(4 * i + ((ctrl >> 0) & 3));
+        const uint32_t t1 = a->L(4 * i + ((ctrl >> 2) & 3));
+        const uint32_t t2 = a->L(4 * i + ((ctrl >> 4) & 3));
+        const uint32_t t3 = a->L(4 * i + ((ctrl >> 6) & 3));
+
+        d->L(4 * i + 0) = t0; /* stores follow all reads of this group */
+        d->L(4 * i + 1) = t1;
+        d->L(4 * i + 2) = t2;
+        d->L(4 * i + 3) = t3;
 
-    r.L(0) = s->L(order & 3);
-    r.L(1) = s->L((order >> 2) & 3);
-    r.L(2) = s->L((order >> 4) & 3);
-    r.L(3) = s->L((order >> 6) & 3);
-    *d = r;
+    }
+    glue(clear_high, SUFFIX)(d, oprsz, maxsz);
 }
 
-void glue(helper_pshuflw, SUFFIX)(Reg *d, Reg *s, int order)
+void glue(helper_shufps, SUFFIX)(Reg *d, Reg *a, Reg *b, uint32_t desc)
 {
-    Reg r;
-
-    r.W(0) = s->W(order & 3);
-    r.W(1) = s->W((order >> 2) & 3);
-    r.W(2) = s->W((order >> 4) & 3);
-    r.W(3) = s->W((order >> 6) & 3);
-    r.Q(1) = s->Q(1);
-    *d = r;
+    const intptr_t oprsz = simd_oprsz(desc); /* bytes covered by the loop */
+    const intptr_t maxsz = simd_maxsz(desc);
+    const uint8_t ctrl = simd_data(desc); /* four 2-bit lane selectors */
+
+    for (intptr_t i = 0; 4 * i * sizeof(uint32_t) < oprsz; ++i) {
+        const uint32_t t0 = a->L(4 * i + ((ctrl >> 0) & 3));
+        const uint32_t t1 = a->L(4 * i + ((ctrl >> 2) & 3));
+        const uint32_t t2 = b->L(4 * i + ((ctrl >> 4) & 3));
+        const uint32_t t3 = b->L(4 * i + ((ctrl >> 6) & 3));
+
+        d->L(4 * i + 0) = t0; /* 32-bit lanes, same width as the reads */
+        d->L(4 * i + 1) = t1;
+        d->L(4 * i + 2) = t2;
+        d->L(4 * i + 3) = t3;
+    }
+    glue(clear_high, SUFFIX)(d, oprsz, maxsz);
 }
 
-void glue(helper_pshufhw, SUFFIX)(Reg *d, Reg *s, int order)
+void glue(helper_shufpd, SUFFIX)(Reg *d, Reg *a, Reg *b, uint32_t desc)
 {
-    Reg r;
-
-    r.Q(0) = s->Q(0);
-    r.W(4) = s->W(4 + (order & 3));
-    r.W(5) = s->W(4 + ((order >> 2) & 3));
-    r.W(6) = s->W(4 + ((order >> 4) & 3));
-    r.W(7) = s->W(4 + ((order >> 6) & 3));
-    *d = r;
+    const intptr_t oprsz = simd_oprsz(desc); /* bytes covered by the loop */
+    const intptr_t maxsz = simd_maxsz(desc);
+    const uint8_t ctrl = simd_data(desc); /* two 1-bit lane selectors */
+
+    for (intptr_t i = 0; 2 * i * sizeof(uint64_t) < oprsz; ++i) {
+        const uint64_t t0 = a->Q(2 * i + ((ctrl >> 0) & 1));
+        const uint64_t t1 = b->Q(2 * i + ((ctrl >> 1) & 1));
+
+        d->Q(2 * i + 0) = t0; /* low qword from a, high qword from b */
+        d->Q(2 * i + 1) = t1;
+    }
+    glue(clear_high, SUFFIX)(d, oprsz, maxsz);
 }
 #endif
 
diff --git a/target/i386/ops_sse_header.h b/target/i386/ops_sse_header.h
index ee8bd4c1af..207d41e248 100644
--- a/target/i386/ops_sse_header.h
+++ b/target/i386/ops_sse_header.h
@@ -78,13 +78,13 @@ DEF_HELPER_4(glue(psadbw, SUFFIX), void, Reg, Reg, Reg, i32)
 DEF_HELPER_4(glue(maskmov, SUFFIX), void, env, Reg, Reg, tl)
 
 #if SHIFT == 0
-DEF_HELPER_3(glue(pshufw, SUFFIX), void, Reg, Reg, int)
+DEF_HELPER_3(glue(pshufw, SUFFIX), void, Reg, Reg, i32)
 #else
-DEF_HELPER_3(shufps, void, Reg, Reg, int)
-DEF_HELPER_3(shufpd, void, Reg, Reg, int)
-DEF_HELPER_3(glue(pshufd, SUFFIX), void, Reg, Reg, int)
-DEF_HELPER_3(glue(pshuflw, SUFFIX), void, Reg, Reg, int)
-DEF_HELPER_3(glue(pshufhw, SUFFIX), void, Reg, Reg, int)
+DEF_HELPER_3(glue(pshuflw, SUFFIX), void, Reg, Reg, i32)
+DEF_HELPER_3(glue(pshufhw, SUFFIX), void, Reg, Reg, i32)
+DEF_HELPER_3(glue(pshufd, SUFFIX), void, Reg, Reg, i32)
+DEF_HELPER_4(glue(shufps, SUFFIX), void, Reg, Reg, Reg, i32)
+DEF_HELPER_4(glue(shufpd, SUFFIX), void, Reg, Reg, Reg, i32)
 #endif
 
 #if SHIFT == 1
diff --git a/target/i386/translate.c b/target/i386/translate.c
index 3554086336..bb4120a848 100644
--- a/target/i386/translate.c
+++ b/target/i386/translate.c
@@ -2763,8 +2763,6 @@ static const SSEFunc_0_epp sse_op_table1[256][4] = {
     [0x5b] = { gen_helper_cvtdq2ps, gen_helper_cvtps2dq, gen_helper_cvttps2dq },
 
     [0xc2] = SSE_FOP(cmpeq),
-    [0xc6] = { (SSEFunc_0_epp)gen_helper_shufps,
-               (SSEFunc_0_epp)gen_helper_shufpd }, /* XXX: casts */
 
     /* SSSE3, SSE4, MOVBE, CRC32, BMI1, BMI2, ADX. */
     [0x38] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL },
@@ -6971,22 +6969,22 @@ DEF_GEN_INSN3_HELPER_EPP(pshufb, pshufb_mmx, Pq, Pq, Qq)
 DEF_GEN_INSN3_HELPER_EPP(pshufb, pshufb_xmm, Vdq, Vdq, Wdq)
 DEF_GEN_INSN3_HELPER_EPP(vpshufb, pshufb_xmm, Vdq, Hdq, Wdq)
 DEF_GEN_INSN3_HELPER_EPP(vpshufb, pshufb_xmm, Vqq, Hqq, Wqq)
-DEF_GEN_INSN3_HELPER_PPI(pshufw, pshufw_mmx, Pq, Qq, Ib)
-DEF_GEN_INSN3_HELPER_PPI(pshuflw, pshuflw_xmm, Vdq, Wdq, Ib)
-DEF_GEN_INSN3_HELPER_PPI(vpshuflw, pshuflw_xmm, Vdq, Wdq, Ib)
-DEF_GEN_INSN3_HELPER_PPI(vpshuflw, pshuflw_xmm, Vqq, Wqq, Ib)
-DEF_GEN_INSN3_HELPER_PPI(pshufhw, pshufhw_xmm, Vdq, Wdq, Ib)
-DEF_GEN_INSN3_HELPER_PPI(vpshufhw, pshufhw_xmm, Vdq, Wdq, Ib)
-DEF_GEN_INSN3_HELPER_PPI(vpshufhw, pshufhw_xmm, Vqq, Wqq, Ib)
-DEF_GEN_INSN3_HELPER_PPI(pshufd, pshufd_xmm, Vdq, Wdq, Ib)
-DEF_GEN_INSN3_HELPER_PPI(vpshufd, pshufd_xmm, Vdq, Wdq, Ib)
-DEF_GEN_INSN3_HELPER_PPI(vpshufd, pshufd_xmm, Vqq, Wqq, Ib)
-DEF_GEN_INSN4_HELPER_PPI(shufps, shufps, Vdq, Vdq, Wdq, Ib)
-DEF_GEN_INSN4_HELPER_PPI(vshufps, shufps, Vdq, Hdq, Wdq, Ib)
-DEF_GEN_INSN4_HELPER_PPI(vshufps, shufps, Vqq, Hqq, Wqq, Ib)
-DEF_GEN_INSN4_HELPER_PPI(shufpd, shufpd, Vdq, Vdq, Wdq, Ib)
-DEF_GEN_INSN4_HELPER_PPI(vshufpd, shufpd, Vdq, Hdq, Wdq, Ib)
-DEF_GEN_INSN4_HELPER_PPI(vshufpd, shufpd, Vqq, Hqq, Wqq, Ib)
+DEF_GEN_INSN3_GVEC(pshufw, Pq, Qq, Ib, 2i_ool, MM_OPRSZ, MM_MAXSZ, pshufw_mmx)
+DEF_GEN_INSN3_GVEC(pshuflw, Vdq, Wdq, Ib, 2i_ool, XMM_OPRSZ, XMM_MAXSZ, pshuflw_xmm)
+DEF_GEN_INSN3_GVEC(vpshuflw, Vdq, Wdq, Ib, 2i_ool, XMM_OPRSZ, XMM_MAXSZ, pshuflw_xmm)
+DEF_GEN_INSN3_GVEC(vpshuflw, Vqq, Wqq, Ib, 2i_ool, XMM_OPRSZ, XMM_MAXSZ, pshuflw_xmm)
+DEF_GEN_INSN3_GVEC(pshufhw, Vdq, Wdq, Ib, 2i_ool, XMM_OPRSZ, XMM_MAXSZ, pshufhw_xmm)
+DEF_GEN_INSN3_GVEC(vpshufhw, Vdq, Wdq, Ib, 2i_ool, XMM_OPRSZ, XMM_MAXSZ, pshufhw_xmm)
+DEF_GEN_INSN3_GVEC(vpshufhw, Vqq, Wqq, Ib, 2i_ool, XMM_OPRSZ, XMM_MAXSZ, pshufhw_xmm)
+DEF_GEN_INSN3_GVEC(pshufd, Vdq, Wdq, Ib, 2i_ool, XMM_OPRSZ, XMM_MAXSZ, pshufd_xmm)
+DEF_GEN_INSN3_GVEC(vpshufd, Vdq, Wdq, Ib, 2i_ool, XMM_OPRSZ, XMM_MAXSZ, pshufd_xmm)
+DEF_GEN_INSN3_GVEC(vpshufd, Vqq, Wqq, Ib, 2i_ool, XMM_OPRSZ, XMM_MAXSZ, pshufd_xmm)
+DEF_GEN_INSN4_GVEC(shufps, Vdq, Vdq, Wdq, Ib, 3i_ool, XMM_OPRSZ, XMM_MAXSZ, shufps_xmm)
+DEF_GEN_INSN4_GVEC(vshufps, Vdq, Hdq, Wdq, Ib, 3i_ool, XMM_OPRSZ, XMM_MAXSZ, shufps_xmm)
+DEF_GEN_INSN4_GVEC(vshufps, Vqq, Hqq, Wqq, Ib, 3i_ool, XMM_OPRSZ, XMM_MAXSZ, shufps_xmm)
+DEF_GEN_INSN4_GVEC(shufpd, Vdq, Vdq, Wdq, Ib, 3i_ool, XMM_OPRSZ, XMM_MAXSZ, shufpd_xmm)
+DEF_GEN_INSN4_GVEC(vshufpd, Vdq, Hdq, Wdq, Ib, 3i_ool, XMM_OPRSZ, XMM_MAXSZ, shufpd_xmm)
+DEF_GEN_INSN4_GVEC(vshufpd, Vqq, Hqq, Wqq, Ib, 3i_ool, XMM_OPRSZ, XMM_MAXSZ, shufpd_xmm)
 
 DEF_GEN_INSN4_HELPER_EPPI(blendps, blendps_xmm, Vdq, Vdq, Wdq, Ib)
 DEF_GEN_INSN4_HELPER_EPPI(vblendps, blendps_xmm, Vdq, Hdq, Wdq, Ib)
-- 
2.20.1