Hello,

On Thu, Feb 14, 2019 at 5:00 AM Richard Henderson
<richard.hender...@linaro.org> wrote:
>
> Note that float16_to_float32 rightly squashes SNaN to QNaN.
> But of course pickNaNMulAdd, for ARM, selects SNaNs first.
> So we have to preserve SNaN long enough for the correct NaN
> to be selected. Thus float16_to_float32_by_bits.
>
> Signed-off-by: Richard Henderson <richard.hender...@linaro.org>
> ---
>  target/arm/helper.h     |   9 +++
>  target/arm/vec_helper.c | 154 ++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 163 insertions(+)
>
> diff --git a/target/arm/helper.h b/target/arm/helper.h
> index 53a38188c6..0302e13604 100644
> --- a/target/arm/helper.h
> +++ b/target/arm/helper.h
> @@ -653,6 +653,15 @@ DEF_HELPER_FLAGS_6(gvec_fmla_idx_s, TCG_CALL_NO_RWG,
>  DEF_HELPER_FLAGS_6(gvec_fmla_idx_d, TCG_CALL_NO_RWG,
>                     void, ptr, ptr, ptr, ptr, ptr, i32)
>
> +DEF_HELPER_FLAGS_5(gvec_fmlal_h, TCG_CALL_NO_RWG,
> +                   void, ptr, ptr, ptr, ptr, i32)
> +DEF_HELPER_FLAGS_5(gvec_fmlsl_h, TCG_CALL_NO_RWG,
> +                   void, ptr, ptr, ptr, ptr, i32)
> +DEF_HELPER_FLAGS_5(gvec_fmlal_idx_h, TCG_CALL_NO_RWG,
> +                   void, ptr, ptr, ptr, ptr, i32)
> +DEF_HELPER_FLAGS_5(gvec_fmlsl_idx_h, TCG_CALL_NO_RWG,
> +                   void, ptr, ptr, ptr, ptr, i32)
> +
>  #ifdef TARGET_AARCH64
>  #include "helper-a64.h"
>  #include "helper-sve.h"
> diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c
> index 37f338732e..0c3b3de961 100644
> --- a/target/arm/vec_helper.c
> +++ b/target/arm/vec_helper.c
> @@ -766,3 +766,157 @@ DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4)
>  DO_FMLA_IDX(gvec_fmla_idx_d, float64, )
>
>  #undef DO_FMLA_IDX
> +
> +/*
> + * Convert float16 to float32, raising no exceptions and
> + * preserving exceptional values, including SNaN.
> + * This is effectively an unpack+repack operation.
> + */
> +static float32 float16_to_float32_by_bits(uint32_t f16)
> +{
> +    const int f16_bias = 15;
> +    const int f32_bias = 127;
> +    uint32_t sign = extract32(f16, 15, 1);
> +    uint32_t exp = extract32(f16, 10, 5);
> +    uint32_t frac = extract32(f16, 0, 10);
> +
> +    if (exp == 0x1f) {
> +        /* Inf or NaN */
> +        exp = 0xff;
> +    } else if (exp == 0) {
> +        /* Zero or denormal. */
> +        if (frac != 0) {
> +            /*
> +             * Denormal; these are all normal float32.
> +             * Shift the fraction so that the msb is at bit 11,
> +             * then remove bit 11 as the implicit bit of the
> +             * normalized float32. Note that we still go through
> +             * the shift for normal numbers below, to put the
> +             * float32 fraction at the right place.
> +             */
> +            int shift = clz32(frac) - 21;
> +            frac = (frac << shift) & 0x3ff;
> +            exp = f32_bias - f16_bias - shift + 1;

If FZ16 is set, this should flush to zero. This means you will have
to use both fp_status (for the muladd) and fp_status_f16 (for this
function), and so you should pass cpu_env to the helpers rather than
the fp_status.
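Something like the following, to make that concrete. This is an
untested sketch: I'm assuming the helpers take cpu_env as the fourth
pointer argument, and that FZ16 is visible via
get_flush_inputs_to_zero() on vfp.fp_status_f16 (names as in the
current target/arm code; adjust as needed).

static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16)
{
    const int f16_bias = 15;
    const int f32_bias = 127;
    uint32_t sign = extract32(f16, 15, 1);
    uint32_t exp = extract32(f16, 10, 5);
    uint32_t frac = extract32(f16, 0, 10);

    if (exp == 0x1f) {
        /* Inf or NaN */
        exp = 0xff;
    } else if (exp == 0) {
        /* Zero or denormal. */
        if (frac != 0) {
            if (fz16) {
                /* FZ16 set: flush the denormal input to zero,
                   keeping only the sign. */
                frac = 0;
            } else {
                int shift = clz32(frac) - 21;
                frac = (frac << shift) & 0x3ff;
                exp = f32_bias - f16_bias - shift + 1;
            }
        }
    } else {
        /* Normal number; adjust the bias. */
        exp += f32_bias - f16_bias;
    }
    sign <<= 31;
    exp <<= 23;
    frac <<= 23 - 10;

    return sign | exp | frac;
}

void HELPER(gvec_fmlal_h)(void *vd, void *vn, void *vm,
                          void *venv, uint32_t desc)
{
    CPUARMState *env = venv;
    /* FZ16 governs the f16 -> f32 conversion of the inputs,
       while the f32 muladd itself runs under the usual fp_status. */
    bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16);
    float_status *fpst = &env->vfp.fp_status;
    intptr_t i, oprsz = simd_oprsz(desc);
    int is_2 = extract32(desc, SIMD_DATA_SHIFT, 1);
    int is_q = oprsz == 16;
    float32 *d = vd;
    uint64_t n_4 = load4_f16(vn, is_q, is_2);
    uint64_t m_4 = load4_f16(vm, is_q, is_2);

    for (i = 0; i < oprsz / 4; i++) {
        float32 n = float16_to_float32_by_bits(extract64(n_4, i * 16, 16),
                                               fz16);
        float32 m = float16_to_float32_by_bits(extract64(m_4, i * 16, 16),
                                               fz16);
        d[H4(i)] = float32_muladd(n, m, d[H4(i)], 0, fpst);
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}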
Thanks,

Laurent

> +        }
> +    } else {
> +        /* Normal number; adjust the bias. */
> +        exp += f32_bias - f16_bias;
> +    }
> +    sign <<= 31;
> +    exp <<= 23;
> +    frac <<= 23 - 10;
> +
> +    return sign | exp | frac;
> +}
> +
> +static float32 fmlal(float32 a, float16 n16, float16 m16, float_status *fpst)
> +{
> +    float32 n = float16_to_float32_by_bits(n16);
> +    float32 m = float16_to_float32_by_bits(m16);
> +    return float32_muladd(n, m, a, 0, fpst);
> +}
> +
> +static float32 fmlsl(float32 a, float16 n16, float16 m16, float_status *fpst)
> +{
> +    float32 n = float16_to_float32_by_bits(n16);
> +    float32 m = float16_to_float32_by_bits(m16);
> +    return float32_muladd(float32_chs(n), m, a, 0, fpst);
> +}
> +
> +static inline uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
> +{
> +    /*
> +     * Branchless load of u32[0], u64[0], u32[1], or u64[1].
> +     * Load the 2nd qword iff is_q & is_2.
> +     * Shift to the 2nd dword iff !is_q & is_2.
> +     * For !is_q & !is_2, the upper bits of the result are garbage.
> +     */
> +    return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5);
> +}
> +
> +/*
> + * Note that FMLAL and FMLSL require oprsz == 8 or oprsz == 16,
> + * as there are not yet SVE versions that might use blocking.
> + */
> +
> +void HELPER(gvec_fmlal_h)(void *vd, void *vn, void *vm,
> +                          void *fpst, uint32_t desc)
> +{
> +    intptr_t i, oprsz = simd_oprsz(desc);
> +    int is_2 = extract32(desc, SIMD_DATA_SHIFT, 1);
> +    int is_q = oprsz == 16;
> +    float32 *d = vd;
> +    uint64_t n_4, m_4;
> +
> +    /* Pre-load all of the f16 data, avoiding overlap issues. */
> +    n_4 = load4_f16(vn, is_q, is_2);
> +    m_4 = load4_f16(vm, is_q, is_2);
> +
> +    for (i = 0; i < oprsz / 4; i++) {
> +        d[H4(i)] = fmlal(d[H4(i)], extract64(n_4, i * 16, 16),
> +                         extract64(m_4, i * 16, 16), fpst);
> +    }
> +    clear_tail(d, oprsz, simd_maxsz(desc));
> +}
> +
> +void HELPER(gvec_fmlsl_h)(void *vd, void *vn, void *vm,
> +                          void *fpst, uint32_t desc)
> +{
> +    intptr_t i, oprsz = simd_oprsz(desc);
> +    int is_2 = extract32(desc, SIMD_DATA_SHIFT, 1);
> +    int is_q = oprsz == 16;
> +    float32 *d = vd;
> +    uint64_t n_4, m_4;
> +
> +    /* Pre-load all of the f16 data, avoiding overlap issues. */
> +    n_4 = load4_f16(vn, is_q, is_2);
> +    m_4 = load4_f16(vm, is_q, is_2);
> +
> +    for (i = 0; i < oprsz / 4; i++) {
> +        d[H4(i)] = fmlsl(d[H4(i)], extract64(n_4, i * 16, 16),
> +                         extract64(m_4, i * 16, 16), fpst);
> +    }
> +    clear_tail(d, oprsz, simd_maxsz(desc));
> +}
> +
> +void HELPER(gvec_fmlal_idx_h)(void *vd, void *vn, void *vm,
> +                              void *fpst, uint32_t desc)
> +{
> +    intptr_t i, oprsz = simd_oprsz(desc);
> +    int is_2 = extract32(desc, SIMD_DATA_SHIFT, 1);
> +    int index = extract32(desc, SIMD_DATA_SHIFT + 1, 3);
> +    int is_q = oprsz == 16;
> +    float32 *d = vd;
> +    uint64_t n_4;
> +    float16 m_1;
> +
> +    /* Pre-load all of the f16 data, avoiding overlap issues. */
> +    n_4 = load4_f16(vn, is_q, is_2);
> +    m_1 = ((float16 *)vm)[H2(index)];
> +
> +    for (i = 0; i < oprsz / 4; i++) {
> +        d[H4(i)] = fmlal(d[H4(i)], extract64(n_4, i * 16, 16), m_1, fpst);
> +    }
> +    clear_tail(d, oprsz, simd_maxsz(desc));
> +}
> +
> +void HELPER(gvec_fmlsl_idx_h)(void *vd, void *vn, void *vm,
> +                              void *fpst, uint32_t desc)
> +{
> +    intptr_t i, oprsz = simd_oprsz(desc);
> +    int is_2 = extract32(desc, SIMD_DATA_SHIFT, 1);
> +    int index = extract32(desc, SIMD_DATA_SHIFT + 1, 3);
> +    int is_q = oprsz == 16;
> +    float32 *d = vd;
> +    uint64_t n_4;
> +    float16 m_1;
> +
> +    /* Pre-load all of the f16 data, avoiding overlap issues. */
> +    n_4 = load4_f16(vn, is_q, is_2);
> +    m_1 = ((float16 *)vm)[H2(index)];
> +
> +    for (i = 0; i < oprsz / 4; i++) {
> +        d[H4(i)] = fmlsl(d[H4(i)], extract64(n_4, i * 16, 16), m_1, fpst);
> +    }
> +    clear_tail(d, oprsz, simd_maxsz(desc));
> +}
> --
> 2.17.2
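P.S.: on the translate-a64.c side, passing cpu_env instead of the
fp_status pointer would then look roughly like this (also untested;
gen_fmlal_vec is just a placeholder name for wherever you expand the
op, and is_2 still travels in the descriptor data as in your patch):

static void gen_fmlal_vec(DisasContext *s, int rd, int rn, int rm,
                          bool is_q, bool is_2)
{
    /* Pass cpu_env so the helper can reach both fp_status and
       fp_status_f16, rather than a single pre-chosen float_status. */
    tcg_gen_gvec_3_ptr(vec_full_reg_offset(s, rd),
                       vec_full_reg_offset(s, rn),
                       vec_full_reg_offset(s, rm), cpu_env,
                       is_q ? 16 : 8, vec_full_reg_size(s),
                       is_2, gen_helper_gvec_fmlal_h);
}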