On Wed, 2 Jul 2025 at 13:38, Richard Henderson
<richard.hender...@linaro.org> wrote:
>
> Signed-off-by: Richard Henderson <richard.hender...@linaro.org>
> ---
>  target/arm/tcg/helper-sme.h    |  20 ++++++
>  target/arm/tcg/sme_helper.c    | 116 +++++++++++++++++++++++++++++++++
>  target/arm/tcg/translate-sme.c |  35 ++++++++++
>  target/arm/tcg/sme.decode      |  22 +++++++
>  4 files changed, 193 insertions(+)


> index d69d57c4cb..906d369d37 100644
> --- a/target/arm/tcg/sme_helper.c
> +++ b/target/arm/tcg/sme_helper.c
> @@ -1561,6 +1561,64 @@ void HELPER(sme2_fcvt_n)(void *vd, void *vs, 
> float_status *fpst, uint32_t desc)
>      }
>  }
>
> +#define SQCVT2(NAME, TW, TN, HW, HN, SAT)                       \
> +void HELPER(NAME)(void *vd, void *vs, uint32_t desc)            \
> +{                                                               \
> +    ARMVectorReg scratch;                                       \
> +    size_t oprsz = simd_oprsz(desc), n = oprsz / sizeof(TW);    \
> +    TW *s0 = vs, *s1 = vs + sizeof(ARMVectorReg);               \
> +    TN *d = vd;                                                 \
> +    if ((vd - vs) < 2 * sizeof(ARMVectorReg)) {                 \

Does this do the right thing if Vd is less than Vs?
Pointer differences are signed, I think, so for eg vd == 0
vs == 16 we unnecessarily use the scratch reg.
Maybe clearer to write
   (vd >= vs && vd < (vs + 2 * sizeof(..))

(Similarly for other use of this condition later in the patch.)


> +        d = (TN *)&scratch;                                     \
> +    }                                                           \
> +    for (size_t i = 0; i < n; ++i) {                            \
> +        d[HN(i)] = SAT(s0[HW(i)]);                              \
> +        d[HN(i) + n] = SAT(s1[HW(i)]);                          \

Should this be HN(i + n) ?

> +    }                                                           \
> +    if (d != vd) {                                              \
> +        memcpy(vd, d, oprsz);                                   \
> +    }                                                           \
> +}
> +
> +SQCVT2(sme2_sqcvt_sh, int32_t, int16_t, H4, H2, do_ssat_h)
> +SQCVT2(sme2_uqcvt_sh, uint32_t, uint16_t, H4, H2, do_usat_h)
> +SQCVT2(sme2_sqcvtu_sh, int32_t, uint16_t, H4, H2, do_usat_h)
> +
> +#undef SQCVT2
> +
> +#define SQCVT4(NAME, TW, TN, HW, HN, SAT)                       \
> +void HELPER(NAME)(void *vd, void *vs, uint32_t desc)            \
> +{                                                               \
> +    ARMVectorReg scratch;                                       \
> +    size_t oprsz = simd_oprsz(desc), n = oprsz / sizeof(TW);    \
> +    TW *s0 = vs, *s1 = vs + sizeof(ARMVectorReg);               \
> +    TW *s2 = vs + 2 * sizeof(ARMVectorReg);                     \
> +    TW *s3 = vs + 3 * sizeof(ARMVectorReg);                     \
> +    TN *d = vd;                                                 \
> +    if ((vd - vs) < 4 * sizeof(ARMVectorReg)) {                 \
> +        d = (TN *)&scratch;                                     \
> +    }                                                           \
> +    for (size_t i = 0; i < n; ++i) {                            \
> +        d[HN(i)] = SAT(s0[HW(i)]);                              \
> +        d[HN(i) + n] = SAT(s1[HW(i)]);                          \
> +        d[HN(i) + 2 * n] = SAT(s2[HW(i)]);                      \
> +        d[HN(i) + 3 * n] = SAT(s3[HW(i)]);                      \

Similarly I thought the Hn macros would need these to be
HN(i + n) etc.

> +    }                                                           \
> +    if (d != vd) {                                              \
> +        memcpy(vd, d, oprsz);                                   \
> +    }                                                           \
> +}

> +#define SQCVTN2(NAME, TW, TN, HW, HN, SAT)                      \
> +void HELPER(NAME)(void *vd, void *vs, uint32_t desc)            \
> +{                                                               \
> +    ARMVectorReg scratch;                                       \
> +    size_t oprsz = simd_oprsz(desc), n = oprsz / sizeof(TW);    \
> +    TW *s0 = vs, *s1 = vs + sizeof(ARMVectorReg);               \
> +    TN *d = vd;                                                 \
> +    if ((vd - vs) < 2 * sizeof(ARMVectorReg)) {                 \
> +        d = (TN *)&scratch;                                     \
> +    }                                                           \
> +    for (size_t i = 0; i < n; ++i) {                            \
> +        d[HN(2 * i + 0)] = SAT(s0[HW(i)]);                      \
> +        d[HN(2 * i + 1)] = SAT(s1[HW(i)]);                      \

Hmm, here we do do HN(whole expr)...


> +    }                                                           \
> +    if (d != vd) {                                              \
> +        memcpy(vd, d, oprsz);                                   \
> +    }                                                           \
> +}
> +

thanks
-- PMM

Reply via email to