Re: Performance of floating point instructions

Siarhei Siamashka Wed, 10 Mar 2010 14:32:19 -0800

On Wednesday 10 March 2010, Laurent GUERBY wrote:
> On Wed, 2010-03-10 at 21:54 +0200, Siarhei Siamashka wrote:
> > I wonder why the compiler does not use real NEON instructions with
> > -ffast-math option, it should be quite useful even for scalar code.
> >
> > something like:
> >
> > vld1.32  {d0[0]}, [r0]
> > vadd.f32 d0, d0, d0
> > vst1.32  {d0[0]}, [r0]
> >
> > instead of:
> >
> > flds     s0, [r0]
> > fadds    s0, s0, s0
> > fsts     s0, [r0]
> >
> > for:
> >
> > *float_ptr = *float_ptr + *float_ptr;
> >
> > At least NEON is pipelined and should be a lot faster on more complex
> > code examples where it can actually benefit from pipelining. On x86, SSE2
> > is used quite nicely for floating point math.
>
> Hi,
>
> Please open a report on http://gcc.gnu.org/bugzilla with your test
> sources and command line; at least the GCC developers will notice there's
> interest :).


This sounds reasonable :)

> GCC comes with some builtins for neon, they're defined in arm_neon.h
> see below.

This does not sound like a good idea. If the code has to be modified and
changed into something nonportable, there are way better options than
intrinsics.

Regarding the use of NEON instructions via C++ operator overloading: a test
program is attached.

# gcc -O3 -mcpu=cortex-a8 -mfpu=neon -mfloat-abi=softfp -ffast-math
      -o neon_float neon_float.cpp

=== ieee754 floats ===

real    0m3.396s
user    0m3.391s
sys     0m0.000s

=== runfast floats ===

real    0m2.285s
user    0m2.273s
sys     0m0.008s

=== NEON C++ wrapper ===

real    0m1.312s
user    0m1.313s
sys     0m0.000s

But the quality of generated code is quite bad. That's also something to be
reported to gcc bugzilla :)

-- 
Best regards,
Siarhei Siamashka

#include <stdio.h>
#include <arm_neon.h>

#if 1
class fast_float
{
    float32x2_t data;
public:
    fast_float(float x) { data = vset_lane_f32(x, data, 0); }
    fast_float(const fast_float &x) { data = x.data; }
    fast_float(const float32x2_t &x) { data = x; }
    operator float () { return vget_lane_f32(data, 0); }

    friend fast_float operator+(const fast_float &a, const fast_float &b);
    friend fast_float operator*(const fast_float &a, const fast_float &b);

    const fast_float &operator+=(fast_float a)
    {
        data = vadd_f32(data, a.data);
        return *this;
    }
};
fast_float operator+(const fast_float &a, const fast_float &b)
{
    return vadd_f32(a.data, b.data);
}
fast_float operator*(const fast_float &a, const fast_float &b)
{
    return vmul_f32(a.data, b.data);
}
#else
typedef float fast_float;
#endif

float f(float *a, float *b)
{
    int i;
    fast_float accumulator = 0;
    for (i = 0; i < 1024; i += 16)
    {
        accumulator += (fast_float)a[i + 0] * (fast_float)b[i + 0];
        accumulator += (fast_float)a[i + 1] * (fast_float)b[i + 1];
        accumulator += (fast_float)a[i + 2] * (fast_float)b[i + 2];
        accumulator += (fast_float)a[i + 3] * (fast_float)b[i + 3];
        accumulator += (fast_float)a[i + 4] * (fast_float)b[i + 4];
        accumulator += (fast_float)a[i + 5] * (fast_float)b[i + 5];
        accumulator += (fast_float)a[i + 6] * (fast_float)b[i + 6];
        accumulator += (fast_float)a[i + 7] * (fast_float)b[i + 7];
        accumulator += (fast_float)a[i + 8] * (fast_float)b[i + 8];
        accumulator += (fast_float)a[i + 9] * (fast_float)b[i + 9];
        accumulator += (fast_float)a[i + 10] * (fast_float)b[i + 10];
        accumulator += (fast_float)a[i + 11] * (fast_float)b[i + 11];
        accumulator += (fast_float)a[i + 12] * (fast_float)b[i + 12];
        accumulator += (fast_float)a[i + 13] * (fast_float)b[i + 13];
        accumulator += (fast_float)a[i + 14] * (fast_float)b[i + 14];
        accumulator += (fast_float)a[i + 15] * (fast_float)b[i + 15];
    }
    return accumulator;
}

volatile float dummy;
float buf1[1024];
float buf2[1024];

int main()
{
    int i;
    int tmp;
    __asm__ volatile(
        "fmrx       %[tmp], fpscr\n"
        "orr        %[tmp], %[tmp], #(1 << 24)\n" /* flush-to-zero */
        "orr        %[tmp], %[tmp], #(1 << 25)\n" /* default NaN */
        "bic        %[tmp], %[tmp], #((1 << 15) | (1 << 12) | (1 << 11) | (1 << 10) | (1 << 9) | (1 << 8))\n" /* clear exception bits */
        "fmxr       fpscr, %[tmp]\n"
        : [tmp] "=r" (tmp)
      );
    for (i = 0; i < 1024; i++)
    {
        buf1[i] = buf2[i] = i % 16;
    }
    for (i = 0; i < 100000; i++)
    {
        dummy = f(buf1, buf2);
    }
    printf("%f\n", (double)dummy);
    return 0;
}

_______________________________________________
maemo-developers mailing list
maemo-developers@maemo.org
https://lists.maemo.org/mailman/listinfo/maemo-developers

Re: Performance of floating point instructions

Reply via email to