https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88510

            Bug ID: 88510
           Summary: GCC generates inefficient U64x2 scalar multiply for
                    NEON32
           Product: gcc
           Version: 8.2.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: husseydevin at gmail dot com
  Target Milestone: ---

Note: I use these typedefs here for brevity.

typedef uint64x2_t U64x2;
typedef uint32x2_t U32x2;
typedef uint32x2x2_t U32x2x2;
typedef uint32x4_t U32x4;

GCC and Clang both have issues with this code on ARMv7a NEON, and will switch
to scalar:

U64x2 multiply(U64x2 top, U64x2 bot)
{
    return top * bot;
}

gcc-8 -mfloat-abi=hard -mfpu=neon -O3 -S -march=armv7-a 

multiply:
        push    {r4, r5, r6, r7, lr}
        sub     sp, sp, #20
        vmov    r0, r1, d0  @ v2di
        vmov    r6, r7, d2  @ v2di
        vmov    r2, r3, d1  @ v2di
        vmov    r4, r5, d3  @ v2di
        mul     lr, r0, r7
        mla     lr, r6, r1, lr
        mul     ip, r2, r5
        umull   r0, r1, r0, r6
        mla     ip, r4, r3, ip
        add     r1, lr, r1
        umull   r2, r3, r2, r4
        strd    r0, [sp]
        add     r3, ip, r3
        strd    r2, [sp, #8]
        vld1.64 {d0-d1}, [sp:64]
        add     sp, sp, #20
        pop     {r4, r5, r6, r7, pc}

Clang's output is even worse. You can compare both outputs, as well as the i386
SSE4.1 code, here: https://godbolt.org/z/35owtL

Related LLVM bug 39967: https://bugs.llvm.org/show_bug.cgi?id=39967

I started the discussion in the LLVM bug, as Clang had the worse problem, and we
have come up with a few options for faster code that do not require going
scalar. You can also find the benchmark file (with outdated tests) and its
results there. They are from Clang, but since the code uses intrinsics, the
results should be similar for GCC.

While we don't have a vmulq_u64 intrinsic, we do have faster ways to multiply
without going scalar: since only the low 64 bits of each product are kept,
top * bot == (topLo * botLo) + ((topHi * botLo + topLo * botHi) << 32), and NEON
has 32x32->64 multiplies (vmull/vmlal) to build exactly that.

I have benchmarked the code and found this option, which is based on the code
emitted for SSE4.1:

U64x2 goodmul_sse(U64x2 top, U64x2 bot)
{
    U32x2 topHi = vshrn_n_u64(top, 32);     // U32x2 topHi  = top >> 32;
    U32x2 topLo = vmovn_u64(top);           // U32x2 topLo  = top & 0xFFFFFFFF;
    U32x2 botHi = vshrn_n_u64(bot, 32);     // U32x2 botHi  = bot >> 32;
    U32x2 botLo = vmovn_u64(bot);           // U32x2 botLo  = bot & 0xFFFFFFFF;

    U64x2 ret64 = vmull_u32(topHi, botLo);  // U64x2 ret64   = (U64x2)topHi * (U64x2)botLo;
    ret64 = vmlal_u32(ret64, topLo, botHi); //       ret64  += (U64x2)topLo * (U64x2)botHi;
    ret64 = vshlq_n_u64(ret64, 32);         //       ret64 <<= 32;
    ret64 = vmlal_u32(ret64, topLo, botLo); //       ret64  += (U64x2)topLo * (U64x2)botLo;
    return ret64;
}

If GCC can figure out how to interleave one or two of the operands, for
example, changing this:

    U64x2 inp1 = vld1q_u64(p);
    U64x2 inp2 = vld1q_u64(q);
    vec = goodmul_sse(inp1, inp2);

to this (if it knows inp1 and/or inp2 are only used for multiplication):

    U32x2x2 inp1 = vld2_u32(p);
    U32x2x2 inp2 = vld2_u32(q);
    vec = goodmul_sse_interleaved(inp1, inp2);

then we can do this and save 4 cycles:

U64x2 goodmul_sse_interleaved(const U32x2x2 top, const U32x2x2 bot)
{
    U64x2 ret64 = vmull_u32(top.val[1], bot.val[0]);  // U64x2 ret64   = (U64x2)topHi * (U64x2)botLo;
    ret64 = vmlal_u32(ret64, top.val[0], bot.val[1]); //       ret64  += (U64x2)topLo * (U64x2)botHi;
    ret64 = vshlq_n_u64(ret64, 32);                   //       ret64 <<= 32;
    ret64 = vmlal_u32(ret64, top.val[0], bot.val[0]); //       ret64  += (U64x2)topLo * (U64x2)botLo;
    return ret64;
}
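
For reference, here is how the two snippets above fit together; a minimal
sketch (the wrapper name mult2_u64 is made up here, and the val[0]/val[1]
split assumes little-endian element order):

U64x2 mult2_u64(const uint64_t *p, const uint64_t *q)
{
    // vld2_u32 de-interleaves: val[0] gets the low 32 bits of each 64-bit
    // element and val[1] the high 32 bits, so the vshrn/vmovn narrowing in
    // goodmul_sse is not needed.
    U32x2x2 top = vld2_u32((const uint32_t *)p);
    U32x2x2 bot = vld2_u32((const uint32_t *)q);
    return goodmul_sse_interleaved(top, bot);
}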

Another user posted this alternative (typos fixed).

It seems to take two fewer cycles than goodmul_sse when the operands are not
interleaved (not 100% sure about that), but it is two cycles slower than
goodmul_sse_interleaved when they are fully interleaved.

U64x2 twomul(U64x2 top, U64x2 bot)
{
    U32x2 top_low = vmovn_u64(top);                         // low 32 bits of each lane
    U32x2 bot_low = vmovn_u64(bot);
    U32x4 top_re = vreinterpretq_u32_u64(top);              // top as 32-bit halves
    U32x4 bot_re = vrev64q_u32(vreinterpretq_u32_u64(bot)); // bot with the halves of each lane swapped
    U32x4 prod = vmulq_u32(top_re, bot_re);                 // topLo*botHi and topHi*botLo (mod 2^32)
    U64x2 paired = vpaddlq_u32(prod);                       // widening pairwise add: sum of the cross products
    U64x2 shifted = vshlq_n_u64(paired, 32);                // shift the cross terms into the high 32 bits
    return vmlal_u32(shifted, top_low, bot_low);            // add the full 64-bit topLo*botLo product
}

Either of these is faster than the scalar code currently generated.
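
Here is a quick correctness check (the test values are arbitrary; it assumes
the typedefs and functions above are in the same file) that compares the
vector results against plain scalar multiplies:

#include <arm_neon.h>
#include <stdio.h>

int main(void)
{
    uint64_t a[2] = { 0x9E3779B185EBCA87ULL, 0xC2B2AE3D27D4EB4FULL };
    uint64_t b[2] = { 0x165667B19E3779F9ULL, 0x85EBCA77C2B2AE63ULL };
    uint64_t out[2];

    vst1q_u64(out, twomul(vld1q_u64(a), vld1q_u64(b)));  // or goodmul_sse

    for (int i = 0; i < 2; i++) {
        uint64_t expect = a[i] * b[i];  // scalar reference result
        printf("%016llx vs %016llx: %s\n",
               (unsigned long long)out[i], (unsigned long long)expect,
               out[i] == expect ? "ok" : "MISMATCH");
    }
    return 0;
}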
