On Fri, 8 May 2020 at 16:22, Richard Henderson <richard.hender...@linaro.org> wrote: > > Create vectorized versions of handle_shri_with_rndacc > for shift+round and shift+round+accumulate. Add out-of-line > helpers in preparation for longer vector lengths from SVE. > > Signed-off-by: Richard Henderson <richard.hender...@linaro.org> > --- > target/arm/helper.h | 20 ++ > target/arm/translate.h | 9 + > target/arm/translate-a64.c | 11 +- > target/arm/translate.c | 461 +++++++++++++++++++++++++++++++++++-- > target/arm/vec_helper.c | 50 ++++ > 5 files changed, 525 insertions(+), 26 deletions(-)
> +void gen_gvec_srshr(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, > + int64_t shift, uint32_t opr_sz, uint32_t max_sz) > +{ > + static const TCGOpcode vecop_list[] = { > + INDEX_op_shri_vec, INDEX_op_sari_vec, INDEX_op_add_vec, 0 > + }; > + static const GVecGen2i ops[4] = { > + { .fni8 = gen_srshr8_i64, > + .fniv = gen_srshr_vec, > + .fno = gen_helper_gvec_srshr_b, > + .opt_opc = vecop_list, > + .vece = MO_8 }, > + { .fni8 = gen_srshr16_i64, > + .fniv = gen_srshr_vec, > + .fno = gen_helper_gvec_srshr_h, > + .opt_opc = vecop_list, > + .vece = MO_16 }, > + { .fni4 = gen_srshr32_i32, > + .fniv = gen_srshr_vec, > + .fno = gen_helper_gvec_srshr_s, > + .opt_opc = vecop_list, > + .vece = MO_32 }, > + { .fni8 = gen_srshr64_i64, > + .fniv = gen_srshr_vec, > + .fno = gen_helper_gvec_srshr_d, > + .prefer_i64 = TCG_TARGET_REG_BITS == 64, > + .opt_opc = vecop_list, > + .vece = MO_64 }, > + }; > + > + /* tszimm encoding produces immediates in the range [1..esize] */ > + tcg_debug_assert(shift > 0); This assert can be triggered: #4 0x00000000004ee086 in gen_gvec_ursra (vece=3, rd_ofs=3112, rm_ofs=3296, shift=-6, opr_sz=8, max_sz=8) at /home/petmay01/linaro/qemu-from-laptop/qemu/target/arm/translate.c:4413 #5 0x00000000004f17ad in disas_neon_data_insn (s=0x7fffffffe000, insn=4089066426) at /home/petmay01/linaro/qemu-from-laptop/qemu/target/arm/translate.c:5702 #6 0x0000000000510dfa in disas_arm_insn (s=0x7fffffffe000, insn=4089066426) at /home/petmay01/linaro/qemu-from-laptop/qemu/target/arm/translate.c:10803 #7 0x0000000000511b6b in arm_tr_translate_insn (dcbase=0x7fffffffe000, cpu=0xd04ea0) at /home/petmay01/linaro/qemu-from-laptop/qemu/target/arm/translate.c:11277 #8 0x000000000045a833 in translator_loop (ops=0xc2ad40 <arm_translator_ops>, db=0x7fffffffe000, cpu=0xd04ea0, tb=0x7fffe80e3540 <code_gen_buffer+931091>, max_insns=512) at /home/petmay01/linaro/qemu-from-laptop/qemu/accel/tcg/translator.c:102 #9 0x0000000000512303 in gen_intermediate_code (cpu=0xd04ea0, tb=0x7fffe80e3540 <code_gen_buffer+931091>, max_insns=512) at /home/petmay01/linaro/qemu-from-laptop/qemu/target/arm/translate.c:11568 #10 0x0000000000458c01 in tb_gen_code (cpu=0xd04ea0, pc=4268188000, cs_base=0, flags=1196032, cflags=-16777216) at /home/petmay01/linaro/qemu-from-laptop/qemu/accel/tcg/translate-all.c:1718 #11 0x0000000000455f73 in tb_find (cpu=0xd04ea0, last_tb=0x0, tb_exit=0, cf_mask=0) at /home/petmay01/linaro/qemu-from-laptop/qemu/accel/tcg/cpu-exec.c:407 #12 0x00000000004566d3 in cpu_exec (cpu=0xd04ea0) at /home/petmay01/linaro/qemu-from-laptop/qemu/accel/tcg/cpu-exec.c:731 #13 0x000000000049a380 in cpu_loop (env=0xd0d180) at /home/petmay01/linaro/qemu-from-laptop/qemu/linux-user/arm/cpu_loop.c:219 #14 0x0000000000465637 in main (argc=5, argv=0x7fffffffeb28, envp=0x7fffffffeb58) at /home/petmay01/linaro/qemu-from-laptop/qemu/linux-user/main.c:872 > + tcg_debug_assert(shift <= (8 << vece)); > + > + if (shift == (8 << vece)) { > + /* > + * Shifts larger than the element size are architecturally valid. > + * Signed results in all sign bits. With rounding, this produces > + * (-1 + 1) >> 1 == 0, or (0 + 1) >> 1 == 0. > + * I.e. always zero. > + */ > + tcg_gen_gvec_dup_imm(vece, rd_ofs, opr_sz, max_sz, 0); > + } else { > + tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]); > + } > +} > @@ -5269,6 +5685,28 @@ static int disas_neon_data_insn(DisasContext *s, > uint32_t insn) > } > return 0; > > + case 2: /* VRSHR */ > + /* Right shift comes here negative. */ > + shift = -shift; > + if (u) { > + gen_gvec_urshr(size, rd_ofs, rm_ofs, shift, > + vec_size, vec_size); > + } else { > + gen_gvec_srshr(size, rd_ofs, rm_ofs, shift, > + vec_size, vec_size); > + } > + return 0; > + > + case 3: /* VRSRA */ > + if (u) { > + gen_gvec_ursra(size, rd_ofs, rm_ofs, shift, > + vec_size, vec_size); > + } else { > + gen_gvec_srsra(size, rd_ofs, rm_ofs, shift, > + vec_size, vec_size); > + } > + return 0; I think the VRSRA case needs the same "shift = -shift" as VRSHR. thanks -- PMM