Richard Henderson <richard.hender...@linaro.org> writes:
> Better handling of non-power-of-2 tails as seen with Arm 8-byte
> vector operations.
>
> Signed-off-by: Richard Henderson <richard.hender...@linaro.org>

Reviewed-by: Alex Bennée <alex.ben...@linaro.org>

> ---
>  tcg/tcg-op-gvec.c | 82 ++++++++++++++++++++++++++++++++++++-----------
>  1 file changed, 63 insertions(+), 19 deletions(-)
>
> diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
> index 5a6cc19812..43cac1a0bf 100644
> --- a/tcg/tcg-op-gvec.c
> +++ b/tcg/tcg-op-gvec.c
> @@ -326,11 +326,34 @@ void tcg_gen_gvec_5_ptr(uint32_t dofs, uint32_t aofs,
> uint32_t bofs,
>     in units of LNSZ. This limits the expansion of inline code. */
>  static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz)
>  {
> -    if (oprsz % lnsz == 0) {
> -        uint32_t lnct = oprsz / lnsz;
> -        return lnct >= 1 && lnct <= MAX_UNROLL;
> +    uint32_t q, r;
> +
> +    if (oprsz < lnsz) {
> +        return false;
>      }
> -    return false;
> +
> +    q = oprsz / lnsz;
> +    r = oprsz % lnsz;
> +    tcg_debug_assert((r & 7) == 0);
> +
> +    if (lnsz < 16) {
> +        /* For sizes below 16, accept no remainder. */
> +        if (r != 0) {
> +            return false;
> +        }
> +    } else {
> +        /*
> +         * Recall that ARM SVE allows vector sizes that are not a
> +         * power of 2, but always a multiple of 16. The intent is
> +         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
> +         * In addition, expand_clr needs to handle a multiple of 8.
> +         * Thus we can handle the tail with one more operation per
> +         * diminishing power of 2.
> +         */
> +        q += ctpop32(r);
> +    }
> +
> +    return q <= MAX_UNROLL;
>  }
>
>  static void expand_clr(uint32_t dofs, uint32_t maxsz);
> @@ -402,22 +425,31 @@ static void gen_dup_i64(unsigned vece, TCGv_i64 out,
> TCGv_i64 in)
>  static TCGType choose_vector_type(const TCGOpcode *list, unsigned vece,
>                                    uint32_t size, bool prefer_i64)
>  {
> -    if (TCG_TARGET_HAS_v256 && check_size_impl(size, 32)) {
> -        /*
> -         * Recall that ARM SVE allows vector sizes that are not a
> -         * power of 2, but always a multiple of 16. The intent is
> -         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
> -         * It is hard to imagine a case in which v256 is supported
> -         * but v128 is not, but check anyway.
> -         */
> -        if (tcg_can_emit_vecop_list(list, TCG_TYPE_V256, vece)
> -            && (size % 32 == 0
> -                || tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece))) {
> -            return TCG_TYPE_V256;
> -        }
> +    /*
> +     * Recall that ARM SVE allows vector sizes that are not a
> +     * power of 2, but always a multiple of 16. The intent is
> +     * that e.g. size == 80 would be expanded with 2x32 + 1x16.
> +     * It is hard to imagine a case in which v256 is supported
> +     * but v128 is not, but check anyway.
> +     * In addition, expand_clr needs to handle a multiple of 8.
> +     */
> +    if (TCG_TARGET_HAS_v256 &&
> +        check_size_impl(size, 32) &&
> +        tcg_can_emit_vecop_list(list, TCG_TYPE_V256, vece) &&
> +        (!(size & 16) ||
> +         (TCG_TARGET_HAS_v128 &&
> +          tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece))) &&
> +        (!(size & 8) ||
> +         (TCG_TARGET_HAS_v64 &&
> +          tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) {
> +        return TCG_TYPE_V256;
>      }
> -    if (TCG_TARGET_HAS_v128 && check_size_impl(size, 16)
> -        && tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece)) {
> +    if (TCG_TARGET_HAS_v128 &&
> +        check_size_impl(size, 16) &&
> +        tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece) &&
> +        (!(size & 8) ||
> +         (TCG_TARGET_HAS_v64 &&
> +          tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) {
>          return TCG_TYPE_V128;
>      }
>      if (TCG_TARGET_HAS_v64 && !prefer_i64 && check_size_impl(size, 8)
> @@ -432,6 +464,18 @@ static void do_dup_store(TCGType type, uint32_t dofs,
> uint32_t oprsz,
>  {
>      uint32_t i = 0;
>
> +    tcg_debug_assert(oprsz >= 8);
> +
> +    /*
> +     * This may be expand_clr for the tail of an operation, e.g.
> +     * oprsz == 8 && maxsz == 64. The first 8 bytes of this store
> +     * are misaligned wrt the maximum vector size, so do that first.
> +     */
> +    if (dofs & 8) {
> +        tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64);
> +        i += 8;
> +    }
> +
>      switch (type) {
>      case TCG_TYPE_V256:
>          /*

-- 
Alex Bennée