On Wed, 2 Jul 2025 at 13:38, Richard Henderson <richard.hender...@linaro.org> wrote: > > Signed-off-by: Richard Henderson <richard.hender...@linaro.org> > --- > target/arm/tcg/helper-sme.h | 20 ++++++ > target/arm/tcg/sme_helper.c | 116 +++++++++++++++++++++++++++++++++ > target/arm/tcg/translate-sme.c | 35 ++++++++++ > target/arm/tcg/sme.decode | 22 +++++++ > 4 files changed, 193 insertions(+)
> index d69d57c4cb..906d369d37 100644 > --- a/target/arm/tcg/sme_helper.c > +++ b/target/arm/tcg/sme_helper.c > @@ -1561,6 +1561,64 @@ void HELPER(sme2_fcvt_n)(void *vd, void *vs, > float_status *fpst, uint32_t desc) > } > } > > +#define SQCVT2(NAME, TW, TN, HW, HN, SAT) \ > +void HELPER(NAME)(void *vd, void *vs, uint32_t desc) \ > +{ \ > + ARMVectorReg scratch; \ > + size_t oprsz = simd_oprsz(desc), n = oprsz / sizeof(TW); \ > + TW *s0 = vs, *s1 = vs + sizeof(ARMVectorReg); \ > + TN *d = vd; \ > + if ((vd - vs) < 2 * sizeof(ARMVectorReg)) { \ Does this do the right thing if Vd is less than Vs? Pointer differences are signed, I think, so for eg vd == 0 vs == 16 we unnecessarily use the scratch reg. Maybe clearer to write (vd >= vs && vd < (vs + 2 * sizeof(..)) (Similarly for other use of this condition later in the patch.) > + d = (TN *)&scratch; \ > + } \ > + for (size_t i = 0; i < n; ++i) { \ > + d[HN(i)] = SAT(s0[HW(i)]); \ > + d[HN(i) + n] = SAT(s1[HW(i)]); \ Should this be HN(i + n) ? > + } \ > + if (d != vd) { \ > + memcpy(vd, d, oprsz); \ > + } \ > +} > + > +SQCVT2(sme2_sqcvt_sh, int32_t, int16_t, H4, H2, do_ssat_h) > +SQCVT2(sme2_uqcvt_sh, uint32_t, uint16_t, H4, H2, do_usat_h) > +SQCVT2(sme2_sqcvtu_sh, int32_t, uint16_t, H4, H2, do_usat_h) > + > +#undef SQCVT2 > + > +#define SQCVT4(NAME, TW, TN, HW, HN, SAT) \ > +void HELPER(NAME)(void *vd, void *vs, uint32_t desc) \ > +{ \ > + ARMVectorReg scratch; \ > + size_t oprsz = simd_oprsz(desc), n = oprsz / sizeof(TW); \ > + TW *s0 = vs, *s1 = vs + sizeof(ARMVectorReg); \ > + TW *s2 = vs + 2 * sizeof(ARMVectorReg); \ > + TW *s3 = vs + 3 * sizeof(ARMVectorReg); \ > + TN *d = vd; \ > + if ((vd - vs) < 4 * sizeof(ARMVectorReg)) { \ > + d = (TN *)&scratch; \ > + } \ > + for (size_t i = 0; i < n; ++i) { \ > + d[HN(i)] = SAT(s0[HW(i)]); \ > + d[HN(i) + n] = SAT(s1[HW(i)]); \ > + d[HN(i) + 2 * n] = SAT(s2[HW(i)]); \ > + d[HN(i) + 3 * n] = SAT(s3[HW(i)]); \ Similarly I thought the Hn macros would need these to be HN(i + n) etc. > + } \ > + if (d != vd) { \ > + memcpy(vd, d, oprsz); \ > + } \ > +} > +#define SQCVTN2(NAME, TW, TN, HW, HN, SAT) \ > +void HELPER(NAME)(void *vd, void *vs, uint32_t desc) \ > +{ \ > + ARMVectorReg scratch; \ > + size_t oprsz = simd_oprsz(desc), n = oprsz / sizeof(TW); \ > + TW *s0 = vs, *s1 = vs + sizeof(ARMVectorReg); \ > + TN *d = vd; \ > + if ((vd - vs) < 2 * sizeof(ARMVectorReg)) { \ > + d = (TN *)&scratch; \ > + } \ > + for (size_t i = 0; i < n; ++i) { \ > + d[HN(2 * i + 0)] = SAT(s0[HW(i)]); \ > + d[HN(2 * i + 1)] = SAT(s1[HW(i)]); \ Hmm, here we do do HN(whole expr)... > + } \ > + if (d != vd) { \ > + memcpy(vd, d, oprsz); \ > + } \ > +} > + thanks -- PMM