gcc -O2 -g3 [...] addmul_N.c -DN=8 -DCLOCK=1694000000
$ ./t.out
mpn_addmul_8:     2845ms (1.782 cycles/limb) [973.59 Gb/s]
mpn_addmul_8:     2620ms (1.641 cycles/limb) [1057.20 Gb/s]
mpn_addmul_8:     2625ms (1.644 cycles/limb) [1055.19 Gb/s]
mpn_addmul_8:     2625ms (1.644 cycles/limb) [1055.19 Gb/s]
mpn_addmul_8:     2625ms (1.644 cycles/limb) [1055.19 Gb/s]

(Clearly there's something wrong with the bandwidth calculation.  ;-)
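
For what it's worth: at 1.644 cycles/limb and CLOCK=1694000000 the routine
retires about 1.03e9 limbs/s, which for 32-bit limbs is roughly 33 Gb/s; the
printed 1055 Gb/s is almost exactly 32 times that, so perhaps the limb width
is being applied twice. A hedged sketch of the intended arithmetic follows
(addmul_N.c isn't shown here, so the function and parameter names are
hypothetical):

#include <stdio.h>

/* Hypothetical reporting helper: ms = measured wall time, limbs = total
   limbs processed across all iterations, clock_hz = the -DCLOCK value.  */
static void
report (const char *name, double ms, double limbs, double clock_hz)
{
  double secs            = ms / 1e3;
  double cycles_per_limb = secs * clock_hz / limbs;
  double gbits_per_sec   = limbs * 32.0 / secs / 1e9;  /* count the 32 bits once */

  printf ("%s: %8.0fms (%.3f cycles/limb) [%.2f Gb/s]\n",
          name, ms, cycles_per_limb, gbits_per_sec);
}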

It's somewhat unfortunate that neon load/store insns are not conditional. I've made do with conditionally loading limbs into core registers and copying them over (the ldrgt/movle plus vmov.32 pairs in the loop below).

It might be worth trying two copies of this main loop: one for when more than 8 limbs remain (and the 9th rp limb must be loaded), and one for the final 8 limbs. But I don't have time today, and my suspicion is that it would get no better than 1.60 cycles/limb, if any improvement can be measured at all.

Any real improvement on this routine will have to be algorithmic, using a different mechanism to compute and apply carries. And there it's likely someone else who will have to be clever.

What sizes are really interesting for addmul decomposition?

Would it be useful to do a prime size like addmul_7? It would run at pretty much the same speed as addmul_8, since nearly the same insns would be used; only the last vector would be a D register instead of a Q register.
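
As a point of reference for the decomposition question, a basecase multiply
would consume these routines roughly as below. This is only a sketch,
patterned after the structure of GMP's mpn/generic/mul_basecase.c;
mpn_addmul_8 is declared by hand since it is internal, and 32-bit limbs and
un >= 8 are assumed.

#include <gmp.h>

/* The routine from the .asm below; not part of the public gmp.h API.  */
mp_limb_t mpn_addmul_8 (mp_ptr, mp_srcptr, mp_size_t, mp_srcptr);

static void
mul_basecase_sketch (mp_ptr rp, mp_srcptr up, mp_size_t un,
                     mp_srcptr vp, mp_size_t vn)
{
  rp[un] = mpn_mul_1 (rp, up, un, vp[0]);        /* first row: plain mul */
  rp += 1, vp += 1, vn -= 1;

  while (vn >= 8)                                /* bulk of the work */
    {
      rp[un + 7] = mpn_addmul_8 (rp, up, un, vp);
      rp += 8, vp += 8, vn -= 8;
    }

  while (vn >= 1)                                /* tail: vn is now 0..7 */
    {
      rp[un] = mpn_addmul_1 (rp, up, un, vp[0]);
      rp += 1, vp += 1, vn -= 1;
    }
}

Seen this way, the interesting sizes are really just the tail cases vn mod 8,
which is where an addmul_7 (or a ladder of smaller routines) would or
wouldn't pay off.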

There are still lots of free registers -- over half, in fact. The same scheme would easily fit k=16 without spilling. But given the distance between dependencies already, I suspect we're near the point of diminishing returns.


r~
dnl  ARM neon mpn_addmul_8.
dnl
dnl  Copyright 2013 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

        .fpu    neon
        .arch   armv6t2
        .arm

C            cycles/limb
        
define(`rp',`r0')
define(`up',`r1')
define(`n', `r2')
define(`vp',`r3')

define(`Dv01', `d0')    C v2si of two multiplier limbs
define(`Dv23', `d1')    C ... 2nd
define(`Dv45', `d2')    C ... 3rd
define(`Dv67', `d3')    C ... 4th
define(`Du0', `d4')     C v2si of one replicated multiplicand limb
define(`Qa9', `q3')     C v4si with [0] containing 9th addend limb
define(`Da9', `d6')

define(`Qc01', `q8')    C v2di of product or carry
define(`Dc0', `d16')    C v1di (scalar) aliases of Qc01
define(`Dc1', `d17')
define(`Qc23', `q9')    C ... 2nd
define(`Dc2', `d18')
define(`Dc3', `d19')
define(`Qc45', `q10')   C ... 3rd
define(`Dc4', `d20')
define(`Dc5', `d21')
define(`Qc67', `q11')   C ... 4th
define(`Dc6', `d22')
define(`Dc7', `d23')

ASM_START()
PROLOGUE(mpn_addmul_8)
        vld1.32         {Dc0, Dc1, Dc2, Dc3}, [rp]      @ load 8 rp limbs
        cmp             n, #8                           @ more than 8 limbs?
        vld1.32         {Dv01, Dv23, Dv45, Dv67}, [vp]  @ load all vp limbs
        vld1.32         {Du0[0]}, [up]                  @ load first up
        ldr             ip, [up, #4]                    @ load 2nd up limb
        ldrgt           r3, [rp, #32]                   @ maybe load 9th rp
        add             up, up, #8
        moveq           r3, #0                          @ maybe zero 9th rp

        vmov.i32        Qa9, #0                 @ clear addend input
        vmovl.u32       Qc67, Dc3               @ extend rp limbs to carry-in
        vmovl.u32       Qc45, Dc2
        vmovl.u32       Qc23, Dc1
        vmovl.u32       Qc01, Dc0

        .balign 16
        @ Main loop -- 8 x 1 multiplications
.Loop:
        vmlal.u32       Qc01, Dv01, Du0[0]      @ compute all products
        vmov.32         Da9[0], r3              @ move 9th rp into place
        vmlal.u32       Qc23, Dv23, Du0[0]
        cmp             n, #9                   @ more than 8 limbs left?
        vmlal.u32       Qc45, Dv45, Du0[0]
        ldrgt           r3, [rp, #36]           @ maybe load next rp limb
        vmlal.u32       Qc67, Dv67, Du0[0]
        movle           r3, #0                  @ maybe zero next rp limb
        vmov.32         Du0[0], ip              @ move next up limb into place
        subs            n, n, #1
        vst1.u32        {Dc0[0]}, [rp]!         @ store lowest product limb
        ldrne           ip, [up], #4            @ maybe load next up limb

        vext.32         Qc01, Qc01, Qc23, #1    @ rotate carry down one
        vext.32         Qc23, Qc23, Qc45, #1
        vext.32         Qc45, Qc45, Qc67, #1
        vext.32         Qc67, Qc67, Qa9, #1     @ rotate in 9th addend or 0.
        vpaddl.u32      Qc01, Qc01              @ fold carries
        vpaddl.u32      Qc23, Qc23
        vpaddl.u32      Qc45, Qc45
        vpaddl.u32      Qc67, Qc67
        bne             .Loop

        @ Done with all products.  Finish propagating carries:
        @ store seven carry limbs and return the eighth.
        @ c0-c7 = { c7l, c6, c5, c4, c3, c2, c1, c0 }
        vst1.32         {Dc0[0]}, [rp]!         @ one...
        vext.32         Qc01, Qc01, Qc23, #1
        vext.32         Qc23, Qc23, Qc45, #1
        vext.32         Qc45, Qc45, Qc67, #1
        vext.32         Dc6, Dc6, Dc7, #1
        vpaddl.u32      Qc01, Qc01
        vpaddl.u32      Qc23, Qc23
        vpaddl.u32      Qc45, Qc45
        vpaddl.u32      Dc6, Dc6

        @ c0-c7 = { 0, c6l, c5, c4, c3, c2, c1, c0 }
        vst1.32         {Dc0[0]}, [rp]!         @ two...
        vext.32         Qc01, Qc01, Qc23, #1
        vext.32         Qc23, Qc23, Qc45, #1
        vext.32         Qc45, Qc45, Qc67, #1
        vpaddl.u32      Qc01, Qc01
        vpaddl.u32      Qc23, Qc23
        vpaddl.u32      Qc45, Qc45

        @ c0-c7 = { 0, 0, c5l, c4, c3, c2, c1, c0 }
        vst1.32         {Dc0[0]}, [rp]!         @ three...
        vext.32         Qc01, Qc01, Qc23, #1
        vext.32         Qc23, Qc23, Qc45, #1
        vext.32         Dc4, Dc4, Dc5, #1
        vpaddl.u32      Qc01, Qc01
        vpaddl.u32      Qc23, Qc23
        vpaddl.u32      Dc4, Dc4

        @ c0-c7 = { 0, 0, 0, c4l, c3, c2, c1, c0 }
        vst1.32         {Dc0[0]}, [rp]!         @ four...
        vext.32         Qc01, Qc01, Qc23, #1
        vext.32         Qc23, Qc23, Qc45, #1
        vpaddl.u32      Qc01, Qc01
        vpaddl.u32      Qc23, Qc23

        @ c0-c7 = { 0, 0, 0, 0, c3l, c2, c1, c0 }
        vst1.32         {Dc0[0]}, [rp]!         @ five...
        vext.32         Qc01, Qc01, Qc23, #1
        vext.32         Dc2, Dc2, Dc3, #1
        vpaddl.u32      Qc01, Qc01
        vpaddl.u32      Dc2, Dc2

        @ c0-c7 = { 0, 0, 0, 0, 0, c2l, c1, c0 }
        vst1.32         {Dc0[0]}, [rp]!         @ six...
        vext.32         Qc01, Qc01, Qc23, #1
        vpaddl.u32      Qc01, Qc01

        @ c0-c7 = { 0, 0, 0, 0, 0, 0, c1l, c0 }
        vst1.32         {Dc0[0]}, [rp]!         @ seven...
        vext.32         Dc0, Dc0, Dc1, #1
        vpaddl.u32      Dc0, Dc0

        @ c0-c7 = { 0, 0, 0, 0, 0, 0, 0, c0l }
        vmov.32         r0, Dc0[0]              @ eight...
        bx              lr

EPILOGUE()
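
For anyone who wants to sanity-check the carry scheme without ARM hardware,
here is a scalar C model of my reading of the routine above; it is not GMP
source, it assumes 32-bit limbs and n >= 8 (the asm unconditionally consumes
eight rp limbs as carry-in), and it mirrors the eight 64-bit accumulators and
the vext/vpaddl folding:

#include <stdint.h>

/* {rp,n} + {up,n} * {vp,8} is an (n+8)-limb value; write its low n+7 limbs
   to rp and return the most significant limb.  */
uint32_t
ref_addmul_8 (uint32_t *rp, const uint32_t *up, long n, const uint32_t *vp)
{
  uint64_t c[8];                /* one accumulator per v limb (Qc01..Qc67) */
  long i;
  int j, k;

  for (j = 0; j < 8; j++)       /* vmovl.u32: rp limbs become carry-in */
    c[j] = rp[j];

  for (i = 0; i < n; i++)
    {
      uint32_t u = up[i];
      uint64_t a9 = (i + 8 < n) ? rp[i + 8] : 0;    /* 9th addend limb or 0 */

      for (j = 0; j < 8; j++)   /* vmlal.u32: accumulate the 8 products */
        c[j] += (uint64_t) vp[j] * u;

      rp[i] = (uint32_t) c[0];  /* vst1 of Dc0[0]: lowest product limb */

      for (j = 0; j < 7; j++)   /* vext + vpaddl: rotate and fold carries */
        c[j] = (c[j] >> 32) + (uint32_t) c[j + 1];
      c[7] = (c[7] >> 32) + a9;
    }

  for (k = 0; k < 7; k++)       /* epilogue: store seven carry limbs... */
    {
      rp[n + k] = (uint32_t) c[0];
      for (j = 0; j < 7; j++)
        c[j] = (c[j] >> 32) + (uint32_t) c[j + 1];
      c[7] >>= 32;
    }
  return (uint32_t) c[0];       /* ...and return the eighth */
}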