ni...@lysator.liu.se (Niels Möller) writes:

> Maybe later, but for now, A9 is my target platform. But it seems you're
> right that Neon is almost useless there.

I'm attaching the functions I've been testing, in case anyone else would
like to play with them.

/Niels

dnl  ARM neon mpn_addmul_4.

dnl  Contributed to the GNU project by Niels Möller

dnl  Copyright 2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

        .fpu    neon
        .arm
        .arch   armv6t2

C Argument registers
define(`rp',`r0')
define(`up',`r1')
define(`n', `r2')
define(`vp',`r3')

C NEON register map.  Only d0-d6 and q8 are named here; the macros that
C were defined but never referenced by the code have been dropped.

C Invariant v limbs, loaded once on entry
define(`Dv01', `d0')
define(`Dv23', `d1')

C Recurrency variables, two 64-bit accumulators per q register
define(`Qc01', `q1')
define(`Qc23', `q2')
C Aliases, the d halves of the q registers above
define(`Dc0', `d2')
define(`Dc1', `d3')
define(`Dc2', `d4')
define(`Dc3', `d5')

C Current u limb, loaded into both 32-bit lanes
define(`Du00', `d6')
C Next limb read through vp; Dtmp is the low half of Qtmp
define(`Qtmp', `q8')
define(`Dtmp', `d16')
        
C mpn_addmul_4: multiply the n limbs at up by the four limbs at vp and add
C the product to the n limbs at rp.
C
C In:     rp (r0), up (r1), n (r2), vp (r3).
C Out:    n+3 limbs are stored at rp; the limb above them is returned in r0.
C Needs:  n >= 4.  The main loop count n-4 is only tested for zero, so a
C         smaller n wraps around instead of terminating.
C Writes: r0-r3, the flags, d0-d6 and d16.
C
C Dc0..Dc3 are 64-bit accumulators.  Every iteration adds u * v[i] to each
C of them, stores the low word of Dc0, and then moves the whole chain down
C by 32 bits, so that each accumulator becomes its own high word plus the
C low word of the next one up.  The top accumulator takes in the next limb
C read from memory, or zero once those limbs are exhausted.
ASM_START()
PROLOGUE(mpn_addmul_4)
        vld1.32 {Dv01, Dv23}, [vp]
        C From here on we read at vp and store at rp; vp starts at rp and
        C stays four limbs ahead of it.
        mov     vp, rp
        vld1.32 {Dc0, Dc1}, [vp]!
        vmov.i32        Qc23, #0
        C Pad to get Qc01 = [0, r1, 0, r0], Qc23 = [0, r3, 0, r2]
        vzip.32 Qc01, Qc23
        C Let n denote the number of result words left to read
        subs    n, #4
        beq     .Lend
        .balign 16
.Loop:

        vld1.32 {Du00[]}, [up]!
        C Critical path starts here
        vmlal.u32       Qc01, Dv01, Du00
        vmlal.u32       Qc23, Dv23, Du00
        vld1.32         {Dtmp[0]}, [vp]!
        subs    n, #1

        C We have:   Qc23 = [c3, l2, c2, l1], Qc01 = [c1, l0, c0, r0]
        C Rotate to: Qc23 = [r4, c3, l2, c2], Qc01 = [l1, c1, l0, c0]
        C Then add:  Qc23 = [r4+ c3, l2+ c2], Qc01 = [l1+ c1, l0+ c0]
        vst1.32         {Dc0[0]}, [rp]!
        vext.32 Qc01, Qc01, Qc23, #1
        vext.32 Qc23, Qc23, Qtmp, #1
        vpaddl.u32      Qc01, Qc01
        vpaddl.u32      Qc23, Qc23
        bne     .Loop

.Lend:
        C Repeat 4 more times, without reading any new limbs from vp.
        C Dtmp is cleared so that zero is shifted into the top accumulator.
        vmov.i32        Dtmp, #0
        mov     n, #4
.Lend_loop:

        vld1.32 {Du00[]}, [up]!
        vmlal.u32       Qc01, Dv01, Du00
        vmlal.u32       Qc23, Dv23, Du00
        subs    n, #1

        vst1.32         {Dc0[0]}, [rp]!
        vext.32 Qc01, Qc01, Qc23, #1
        vext.32 Qc23, Qc23, Qtmp, #1
        vpaddl.u32      Qc01, Qc01
        vpaddl.u32      Qc23, Qc23
        bne     .Lend_loop

        C We have Qc23 = [c3, c2], Qc01 = [c1, c0] as (small) 64-bit values
        C and need to add it together: store the low word of each, then
        C carry its high word into the next accumulator.
        vst1.32         {Dc0[0]}, [rp]!
        vshr.u64        Dc0, Dc0, #32
        vadd.i64        Dc1, Dc1, Dc0
        vst1.32         {Dc1[0]}, [rp]!
        vshr.u64        Dc1, Dc1, #32
        vadd.i64        Dc2, Dc2, Dc1
        vst1.32         {Dc2[0]}, [rp]!
        vshr.u64        Dc2, Dc2, #32
        vadd.i64        Dc3, Dc3, Dc2
        C The low word of the last accumulator is the return value
        vmov.32 r0, Dc3[0]

        bx              lr
EPILOGUE()
        
dnl  ARM neon mpn_addmul_6.

dnl  Contributed to the GNU project by Niels Möller

dnl  Copyright 2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

        .fpu    neon
        .arm
        .arch   armv6t2

C Argument registers
define(`rp', `r0')
define(`up', `r1')
define(`n',  `r2')
define(`vp', `r3')

C NEON register map
C
C   d0-d2      Dv01 Dv23 Dv45   the six v limbs, loaded once on entry
C   d3         Du00             current u limb, in both 32-bit lanes
C   q2 q3 q8   Qc01 Qc23 Qc45   six 64-bit accumulators
C   q9         Qtmp             its low half Dtmp receives the next limb
C                               read through vp

define(`Dv01', `d0')
define(`Dv23', `d1')
define(`Dv45', `d2')

define(`Du00', `d3')

C Accumulators, each q register followed by its two d halves
define(`Qc01', `q2')
define(`Dc0',  `d4')
define(`Dc1',  `d5')

define(`Qc23', `q3')
define(`Dc2',  `d6')
define(`Dc3',  `d7')

define(`Qc45', `q8')
define(`Dc4',  `d16')
define(`Dc5',  `d17')

define(`Qtmp', `q9')
define(`Dtmp', `d18')

C mpn_addmul_6: multiply the n limbs at up by the six limbs at vp and add
C the product to the n limbs at rp.
C
C In:     rp (r0), up (r1), n (r2), vp (r3).
C Out:    n+5 limbs are stored at rp; the limb above them is returned in r0.
C Needs:  n >= 6.  The main loop count n-6 is only tested for zero, so a
C         smaller n wraps around instead of terminating.
C Writes: r0-r3, the flags, d0-d7 and d16-d18.
C
C Dc0..Dc5 are 64-bit accumulators.  Every iteration adds u * v[i] to each
C of them, stores the low word of Dc0, and then moves the whole chain down
C by 32 bits, so that each accumulator becomes its own high word plus the
C low word of the next one up.  The top accumulator takes in the next limb
C read from memory, or zero once those limbs are exhausted.
ASM_START()
PROLOGUE(mpn_addmul_6)
        vldm    vp, {Dv01,Dv23,Dv45}
        C From here on we read at vp and store at rp; vp starts at rp and
        C stays six limbs ahead of it.
        mov     vp, rp
        vldm    vp!, {Dc0,Dc1,Dc2}
        C Dc2 is the low half of Qc23, which is zeroed next; save r5:r4 first
        vmov    Dc4, Dc2
        vmov.i32        Qc23, #0
        C Pad to get Qc01 = [0, r1, 0, r0], Qc23 = [0, r3, 0, r2]
        vzip.32 Qc01, Qc23
        vmov.i32        Dc5, #0
        C Pad to get Qc45 = [Dc5, Dc4] = [0, r5, 0, r4]
        vzip.32 Dc4, Dc5
        C Let n denote the number of result words left to read
        subs    n, #6
        beq     .Lend
.Loop:

        vld1.32 {Du00[]}, [up]!
        C Critical path starts here
        vmlal.u32       Qc01, Dv01, Du00
        vmlal.u32       Qc23, Dv23, Du00
        vmlal.u32       Qc45, Dv45, Du00
        vld1.32         {Dtmp[0]}, [vp]!
        subs    n, #1

        C We have, listed as Qc45, Qc23, Qc01:
        C            [c5, l4, c4, l3], [c3, l2, c2, l1], [c1, l0, c0, r0]
        C Rotate to: [r6, c5, l4, c4], [l3, c3, l2, c2], [l1, c1, l0, c0]
        C Then add:  [r6+ c5, l4+ c4], [l3+ c3, l2+ c2], [l1+ c1, l0+ c0]
        vst1.32         {Dc0[0]}, [rp]!
        vext.32 Qc01, Qc01, Qc23, #1
        vext.32 Qc23, Qc23, Qc45, #1
        vext.32 Qc45, Qc45, Qtmp, #1
        vpaddl.u32      Qc01, Qc01
        vpaddl.u32      Qc23, Qc23
        vpaddl.u32      Qc45, Qc45
        bne     .Loop

.Lend:
        C Repeat 6 more times, without reading any new limbs from vp.
        C Dtmp is cleared so that zero is shifted into the top accumulator.
        vmov.i32        Dtmp, #0
        mov     n, #6
.Lend_loop:

        vld1.32 {Du00[]}, [up]!
        vmlal.u32       Qc01, Dv01, Du00
        vmlal.u32       Qc23, Dv23, Du00
        vmlal.u32       Qc45, Dv45, Du00
        subs    n, #1

        vst1.32         {Dc0[0]}, [rp]!
        vext.32 Qc01, Qc01, Qc23, #1
        vext.32 Qc23, Qc23, Qc45, #1
        vext.32 Qc45, Qc45, Qtmp, #1
        vpaddl.u32      Qc01, Qc01
        vpaddl.u32      Qc23, Qc23
        vpaddl.u32      Qc45, Qc45
        bne     .Lend_loop

        C We have Qc45, Qc23, Qc01 = [c5, c4], [c3, c2], [c1, c0] as (small)
        C 64-bit values and need to add it together: store the low word of
        C each, then carry its high word into the next accumulator.
        vst1.32         {Dc0[0]}, [rp]!
        vshr.u64        Dc0, Dc0, #32
        vadd.i64        Dc1, Dc1, Dc0
        vst1.32         {Dc1[0]}, [rp]!
        vshr.u64        Dc1, Dc1, #32
        vadd.i64        Dc2, Dc2, Dc1
        vst1.32         {Dc2[0]}, [rp]!
        vshr.u64        Dc2, Dc2, #32
        vadd.i64        Dc3, Dc3, Dc2
        vst1.32         {Dc3[0]}, [rp]!
        vshr.u64        Dc3, Dc3, #32
        vadd.i64        Dc4, Dc4, Dc3
        vst1.32         {Dc4[0]}, [rp]!
        vshr.u64        Dc4, Dc4, #32
        vadd.i64        Dc5, Dc5, Dc4

        C The low word of the last accumulator is the return value
        vmov.32 r0, Dc5[0]

        bx              lr
EPILOGUE()
dnl  ARM neon mpn_addmul_8.

dnl  Contributed to the GNU project by Richard Henderson and Niels Möller

dnl  Copyright 2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

        .fpu    neon
        .arm
        .arch   armv6t2

C Argument registers
define(`rp', `r0')
define(`up', `r1')
define(`n',  `r2')
define(`vp', `r3')

C NEON register map
C
C   d0-d3         Dv01 Dv23 Dv45 Dv67   the eight v limbs, loaded once
C   q2 q3 q8 q9   Qc01 Qc23 Qc45 Qc67   eight 64-bit accumulators
C   q10           Qtmp                  its low half Dtmp receives the next
C                                       limb read through vp
C   d22           Du00                  current u limb, in both 32-bit lanes

define(`Dv01', `d0')
define(`Dv23', `d1')
define(`Dv45', `d2')
define(`Dv67', `d3')

C Accumulators, each q register followed by its two d halves
define(`Qc01', `q2')
define(`Dc0',  `d4')
define(`Dc1',  `d5')

define(`Qc23', `q3')
define(`Dc2',  `d6')
define(`Dc3',  `d7')

define(`Qc45', `q8')
define(`Dc4',  `d16')
define(`Dc5',  `d17')

define(`Qc67', `q9')
define(`Dc6',  `d18')
define(`Dc7',  `d19')

define(`Qtmp', `q10')
define(`Dtmp', `d20')

define(`Du00', `d22')

C mpn_addmul_8: multiply the n limbs at up by the eight limbs at vp and add
C the product to the n limbs at rp.
C
C In:     rp (r0), up (r1), n (r2), vp (r3).
C Out:    n+7 limbs are stored at rp; the limb above them is returned in r0.
C Needs:  n >= 8.  The main loop count n-8 is only tested for zero, so a
C         smaller n wraps around instead of terminating.
C Writes: r0-r3, the flags, d0-d7, d16-d20 and d22.
C
C Dc0..Dc7 are 64-bit accumulators.  Every iteration adds u * v[i] to each
C of them, stores the low word of Dc0, and then moves the whole chain down
C by 32 bits, so that each accumulator becomes its own high word plus the
C low word of the next one up.  The top accumulator takes in the next limb
C read from memory, or zero once those limbs are exhausted.
C
C The u limb is loaded one iteration ahead: the first load is done before
C the main loop and every pass loads the limb for the following pass.  The
C last pass is therefore unrolled without a load, which keeps the number of
C limbs read from up at exactly n.
ASM_START()
PROLOGUE(mpn_addmul_8)
        vldm    vp, {Dv01,Dv23,Dv45,Dv67}
        C From here on we read at vp and store at rp; vp starts at rp and
        C stays eight limbs ahead of it.
        mov     vp, rp
        vldm    vp!, {Dc0,Dc1,Dc2,Dc3}
        C Qc23 currently holds r7..r4; move them up before zeroing it
        vmov    Qc45, Qc23
        vmov.i32        Qc23, #0
        vmov.i32        Qc67, #0
        C Pad to get Qc01 = [0, r1, 0, r0], Qc23 = [0, r3, 0, r2]
        vzip.32 Qc01, Qc23
        C Pad to get Qc45 = [0, r5, 0, r4], Qc67 = [0, r7, 0, r6]
        vzip.32 Qc45, Qc67
        C Let n denote the number of result words left to read
        subs    n, #8
        C The load leaves the flags from subs intact for the beq below
        vld1.32 {Du00[]}, [up]!
        beq     .Lend

        .balign 16
.Loop:
        vld1.32         {Dtmp[0]}, [vp]!

        C Critical path starts here
        vmlal.u32       Qc01, Dv01, Du00
        vmlal.u32       Qc23, Dv23, Du00
        vmlal.u32       Qc45, Dv45, Du00
        vmlal.u32       Qc67, Dv67, Du00
        vld1.32 {Du00[]}, [up]!
        subs    n, #1

        C Shift and add
        vst1.32         {Dc0[0]}, [rp]!
        vext.32 Qc01, Qc01, Qc23, #1
        vext.32 Qc23, Qc23, Qc45, #1
        vext.32 Qc45, Qc45, Qc67, #1
        vext.32 Qc67, Qc67, Qtmp, #1
        vpaddl.u32      Qc01, Qc01
        vpaddl.u32      Qc23, Qc23
        vpaddl.u32      Qc45, Qc45
        vpaddl.u32      Qc67, Qc67
        bne     .Loop

.Lend:
        C Repeat 8 more times, without reading any new limbs from vp:
        C 7 passes of the loop below plus the unrolled wind-down pass.
        C Dtmp is cleared so that zero is shifted into the top accumulator.
        vmov.i32        Dtmp, #0
        mov     n, #7
.Lend_loop:

        vmlal.u32       Qc01, Dv01, Du00
        vmlal.u32       Qc23, Dv23, Du00
        vmlal.u32       Qc45, Dv45, Du00
        vmlal.u32       Qc67, Dv67, Du00
        vld1.32 {Du00[]}, [up]!
        subs    n, #1

        vst1.32         {Dc0[0]}, [rp]!
        vext.32 Qc01, Qc01, Qc23, #1
        vext.32 Qc23, Qc23, Qc45, #1
        vext.32 Qc45, Qc45, Qc67, #1
        vext.32 Qc67, Qc67, Qtmp, #1
        vpaddl.u32      Qc01, Qc01
        vpaddl.u32      Qc23, Qc23
        vpaddl.u32      Qc45, Qc45
        vpaddl.u32      Qc67, Qc67
        bne     .Lend_loop

        C Wind down, already read Du00
        vmlal.u32       Qc01, Dv01, Du00
        vmlal.u32       Qc23, Dv23, Du00
        vmlal.u32       Qc45, Dv45, Du00
        vmlal.u32       Qc67, Dv67, Du00

        vst1.32         {Dc0[0]}, [rp]!
        vext.32 Qc01, Qc01, Qc23, #1
        vext.32 Qc23, Qc23, Qc45, #1
        vext.32 Qc45, Qc45, Qc67, #1
        vext.32 Qc67, Qc67, Qtmp, #1
        vpaddl.u32      Qc01, Qc01
        vpaddl.u32      Qc23, Qc23
        vpaddl.u32      Qc45, Qc45
        vpaddl.u32      Qc67, Qc67

        C FIXME: Somehow combine above vext vpaddl with below additions?

        C We have c7-c0 as (small) 64-bit values and need to add it together:
        C store the low word of each, then carry its high word into the next
        C accumulator.
        vst1.32         {Dc0[0]}, [rp]!
        vshr.u64        Dc0, Dc0, #32
        vadd.i64        Dc1, Dc1, Dc0
        vst1.32         {Dc1[0]}, [rp]!
        vshr.u64        Dc1, Dc1, #32
        vadd.i64        Dc2, Dc2, Dc1
        vst1.32         {Dc2[0]}, [rp]!
        vshr.u64        Dc2, Dc2, #32
        vadd.i64        Dc3, Dc3, Dc2
        vst1.32         {Dc3[0]}, [rp]!
        vshr.u64        Dc3, Dc3, #32
        vadd.i64        Dc4, Dc4, Dc3
        vst1.32         {Dc4[0]}, [rp]!
        vshr.u64        Dc4, Dc4, #32
        vadd.i64        Dc5, Dc5, Dc4
        vst1.32         {Dc5[0]}, [rp]!
        vshr.u64        Dc5, Dc5, #32
        vadd.i64        Dc6, Dc6, Dc5
        vst1.32         {Dc6[0]}, [rp]!
        vshr.u64        Dc6, Dc6, #32
        vadd.i64        Dc7, Dc7, Dc6

        C The low word of the last accumulator is the return value
        vmov.32 r0, Dc7[0]

        bx              lr
EPILOGUE()
-- 
Niels Möller. PGP-encrypted email is preferred. Keyid C0B98E26.
Internet email is subject to wholesale government surveillance.
_______________________________________________
gmp-devel mailing list
gmp-devel@gmplib.org
http://gmplib.org/mailman/listinfo/gmp-devel

Reply via email to