ni...@lysator.liu.se (Niels Möller) writes: > Maybe later, but for now, A9 is my target platform. But it seems you're > right that Neon is almost useless there.
I'm attaching the functions I've been testing, in case anyone else would like to play with them. /Niels
dnl ARM neon mpn_addmul_4.

dnl Contributed to the GNU project by Niels Möller

dnl Copyright 2013 Free Software Foundation, Inc.

dnl This file is part of the GNU MP Library.

dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of the GNU Lesser General Public License as published
dnl by the Free Software Foundation; either version 3 of the License, or (at
dnl your option) any later version.

dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
dnl License for more details.

dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

	.fpu	neon
	.arm
	.arch	armv6t2

C mpn_addmul_4
C
C Multiplies the n limbs at up by the four limbs at vp and adds the product
C to the n limbs read from rp.  The low n + 3 limbs of the sum are stored at
C rp and the most significant limb is returned.
C
C In:     r0 = rp, r1 = up, r2 = n, r3 = vp
C Out:    r0 = most significant limb of the result
C Writes: r0-r3, flags, d0-d6, d16.  d8-d15 are not touched.
C
C The code requires n >= 4: the first four limbs at rp are loaded
C unconditionally, and the main loop runs n - 4 times with only the
C exactly-zero case branching around it.
C NOTE(review): limbs are handled as 32-bit quantities throughout; the
C calling convention is presumably AAPCS -- confirm against the callers.

define(`rp',`r0')
define(`up',`r1')
define(`n', `r2')
define(`vp',`r3')

C Invariant v limbs
define(`Dv01', `d0')
define(`Dv23', `d1')

C Recurrence variables, two 64-bit column sums per Q register
define(`Qc01', `q1')
define(`Qc23', `q2')

C Aliases: Dc0/Dc1 are the halves of Qc01, Dc2/Dc3 the halves of Qc23
define(`Dc0', `d2')
define(`Dc1', `d3')
define(`Dc2', `d4')
define(`Dc3', `d5')

C Current u limb, replicated in both 32-bit lanes
define(`Du00', `d6')

C Next limb read from rp; only lane 0 of Dtmp (the low half of Qtmp) is used
define(`Qtmp', `q8')
define(`Dtmp', `d16')

ASM_START()
PROLOGUE(mpn_addmul_4)
	vld1.32	{Dv01, Dv23}, [vp]
	C The v limbs are now in registers, so the vp register is reused as the
	C read pointer into rp.  We read at vp, and store at rp.
	mov	vp, rp
	vld1.32	{Dc0, Dc1}, [vp]!
	vmov.i32	Qc23, #0
	vzip.32	Qc01, Qc23
	C Pad to get Qc01 = [0, r1, 0, r0], Qc23 = [0, r3, 0, r2]
	C (32-bit lanes listed most significant first)

	C Let n denote the number of result words left to read
	subs	n, #4
	beq	.Lend

	.balign	16
.Loop:
	vld1.32	{Du00[]}, [up]!
	C Critical path starts here
	vmlal.u32	Qc01, Dv01, Du00
	vmlal.u32	Qc23, Dv23, Du00
	vld1.32	Dtmp[0], [vp]!
	subs	n, #1
	C We have:   Qc23 = [c3, l2, c2, l1], Qc01 = [c1, l0, c0, r0]
	C Rotate to: Qc23 = [r4, c3, l2, c2], Qc01 = [l1, c1, l0, c0]
	C Then add:  Qc23 = [r4+ c3, l2+ c2], Qc01 = [l1+ c1, l0+ c0]
	C The low word of Dc0 is a finished result limb and is stored first.
	vst1.32	Dc0[0], [rp]!
	vext.32	Qc01, Qc01, Qc23, #1
	vext.32	Qc23, Qc23, Qtmp, #1
	vpaddl.u32	Qc01, Qc01
	vpaddl.u32	Qc23, Qc23
	bne	.Loop

.Lend:
	C Repeat 4 more times, without reading any new limbs from vp.
	C A zero is shifted in instead of a limb from rp.
	vmov.i32	Dtmp, #0
	mov	n, #4
.Lend_loop:
	vld1.32	{Du00[]}, [up]!
	vmlal.u32	Qc01, Dv01, Du00
	vmlal.u32	Qc23, Dv23, Du00
	subs	n, #1
	vst1.32	Dc0[0], [rp]!
	vext.32	Qc01, Qc01, Qc23, #1
	vext.32	Qc23, Qc23, Qtmp, #1
	vpaddl.u32	Qc01, Qc01
	vpaddl.u32	Qc23, Qc23
	bne	.Lend_loop

	C We have Qc23 = [c3, c2], Qc01 = [c1, c0] as (small) 64-bit values
	C and need to add it together: store the low word of each sum and
	C carry its high word into the next one.
	vst1.32	Dc0[0], [rp]!
	vshr.u64	Dc0, Dc0, #32
	vadd.i64	Dc1, Dc1, Dc0
	vst1.32	Dc1[0], [rp]!
	vshr.u64	Dc1, Dc1, #32
	vadd.i64	Dc2, Dc2, Dc1
	vst1.32	Dc2[0], [rp]!
	vshr.u64	Dc2, Dc2, #32
	vadd.i64	Dc3, Dc3, Dc2
	C The low word of the last sum is the return value
	vmov.32	r0, Dc3[0]
	bx	lr
EPILOGUE()
dnl ARM neon mpn_addmul_6.

dnl Contributed to the GNU project by Niels Möller

dnl Copyright 2013 Free Software Foundation, Inc.

dnl This file is part of the GNU MP Library.

dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of the GNU Lesser General Public License as published
dnl by the Free Software Foundation; either version 3 of the License, or (at
dnl your option) any later version.

dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
dnl License for more details.

dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

	.fpu	neon
	.arm
	.arch	armv6t2

C mpn_addmul_6
C
C Multiplies the n limbs at up by the six limbs at vp and adds the product
C to the n limbs read from rp.  The low n + 5 limbs of the sum are stored at
C rp and the most significant limb is returned.
C
C In:     r0 = rp, r1 = up, r2 = n, r3 = vp
C Out:    r0 = most significant limb of the result
C Writes: r0-r3, flags, d0-d7, d16-d18.  d8-d15 are not touched.
C
C The code requires n >= 6: six limbs at rp are loaded up front and the
C main loop count is n - 6, with only the exactly-zero case skipped.

C Arguments
define(`rp',`r0')
define(`up',`r1')
define(`n', `r2')
define(`vp',`r3')

C Read pointer into rp.  It lives in the same register as vp, which is
C free once the v limbs have been loaded.
define(`rrp',`r3')

C The six v limbs, two per D register; constant through the whole function
define(`Dv01', `d0')
define(`Dv23', `d1')
define(`Dv45', `d2')

C Current u limb, replicated in both 32-bit lanes
define(`Duu', `d3')

C Running column sums, two 64-bit sums per Q register
define(`Qs01', `q2')
define(`Qs23', `q3')
define(`Qs45', `q8')

C The same sums, addressed one 64-bit half at a time
define(`Ds0', `d4')
define(`Ds1', `d5')
define(`Ds2', `d6')
define(`Ds3', `d7')
define(`Ds4', `d16')
define(`Ds5', `d17')

C Incoming limb from rp; only lane 0 of Din (the low half of Qin) is used
define(`Qin', `q9')
define(`Din', `d18')

ASM_START()
PROLOGUE(mpn_addmul_6)
	C Fetch v0..v5, then start reading rp through the freed register
	vldm	vp, {Dv01,Dv23,Dv45}
	mov	rrp, rp
	vldm	rrp!, {Ds0,Ds1,Ds2}

	C Spread the six loaded limbs r0..r5 out to one limb per 64-bit lane.
	C Ds2 holds r5, r4 and overlaps Qs23, so it is copied away before
	C Qs23 is cleared.
	vmov	Ds4, Ds2
	vmov.i32	Qs23, #0
	vzip.32	Qs01, Qs23
	C Now Qs01 = [0, r1, 0, r0] and Qs23 = [0, r3, 0, r2]
	C (32-bit lanes listed most significant first)
	vmov.i32	Ds5, #0
	vzip.32	Ds4, Ds5
	C Now Qs45 = [Ds5, Ds4] = [ 0, r5, 0, r4 ]

	C From here on n counts the limbs of rp that remain to be read
	subs	n, #6
	beq	.Ltail

	C NOTE(review): this loop head is not aligned, while the 4- and 8-limb
	C variants place .balign 16 before theirs -- confirm whether intended.
.Lmain:
	vld1.32	{Duu[]}, [up]!
	C The critical path starts at the multiply-accumulates
	vmlal.u32	Qs01, Dv01, Duu
	vmlal.u32	Qs23, Dv23, Duu
	vmlal.u32	Qs45, Dv45, Duu
	vld1.32	Din[0], [rrp]!
	subs	n, #1
	C Before: Qs45, Qs23, Qs01 =
	C   [c5, l4, c4, l3], [c3, l2, c2, l1], [c1, l0, c0, r0]
	C The low word r0 is a finished limb and is written out.  The three
	C registers are then shifted down one word as a unit, taking the new
	C limb r6 in at the top:
	C   [r6, c5, l4, c4], [l3, c3, l2, c2], [l1, c1, l0, c0]
	C and adjacent words are summed pairwise:
	C   [r6+ c5, l4+ c4], [l3+ c3, l2+ c2], [l1+ c1, l0+ c0]
	vst1.32	Ds0[0], [rp]!
	vext.32	Qs01, Qs01, Qs23, #1
	vext.32	Qs23, Qs23, Qs45, #1
	vext.32	Qs45, Qs45, Qin, #1
	vpaddl.u32	Qs01, Qs01
	vpaddl.u32	Qs23, Qs23
	vpaddl.u32	Qs45, Qs45
	bne	.Lmain

.Ltail:
	C Six further rounds in which nothing more is read from rp; a zero
	C is shifted in at the top instead.
	vmov.i32	Din, #0
	mov	n, #6
.Ltail_loop:
	vld1.32	{Duu[]}, [up]!
	vmlal.u32	Qs01, Dv01, Duu
	vmlal.u32	Qs23, Dv23, Duu
	vmlal.u32	Qs45, Dv45, Duu
	subs	n, #1
	vst1.32	Ds0[0], [rp]!
	vext.32	Qs01, Qs01, Qs23, #1
	vext.32	Qs23, Qs23, Qs45, #1
	vext.32	Qs45, Qs45, Qin, #1
	vpaddl.u32	Qs01, Qs01
	vpaddl.u32	Qs23, Qs23
	vpaddl.u32	Qs45, Qs45
	bne	.Ltail_loop

	C Qs45, Qs23, Qs01 = [c5, c4], [c3, c2], [c1, c0] are now (small)
	C 64-bit values.  Resolve them from the bottom up: store the low word
	C of each sum and add its high word into the next sum.
	vst1.32	Ds0[0], [rp]!
	vshr.u64	Ds0, Ds0, #32
	vadd.i64	Ds1, Ds1, Ds0

	vst1.32	Ds1[0], [rp]!
	vshr.u64	Ds1, Ds1, #32
	vadd.i64	Ds2, Ds2, Ds1

	vst1.32	Ds2[0], [rp]!
	vshr.u64	Ds2, Ds2, #32
	vadd.i64	Ds3, Ds3, Ds2

	vst1.32	Ds3[0], [rp]!
	vshr.u64	Ds3, Ds3, #32
	vadd.i64	Ds4, Ds4, Ds3

	vst1.32	Ds4[0], [rp]!
	vshr.u64	Ds4, Ds4, #32
	vadd.i64	Ds5, Ds5, Ds4

	C The low word of the top sum is returned rather than stored
	vmov.32	r0, Ds5[0]
	bx	lr
EPILOGUE()
dnl ARM neon mpn_addmul_8.

dnl Contributed to the GNU project by Richard Hendersson and Niels Möller

dnl Copyright 2013 Free Software Foundation, Inc.

dnl This file is part of the GNU MP Library.

dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of the GNU Lesser General Public License as published
dnl by the Free Software Foundation; either version 3 of the License, or (at
dnl your option) any later version.

dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
dnl License for more details.

dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

	.fpu	neon
	.arm
	.arch	armv6t2

C mpn_addmul_8
C
C Multiplies the n limbs at up by the eight limbs at vp and adds the product
C to the n limbs read from rp.  The low n + 7 limbs of the sum are stored at
C rp and the most significant limb is returned.
C
C In:     r0 = rp, r1 = up, r2 = n, r3 = vp
C Out:    r0 = most significant limb of the result
C Writes: r0-r3, flags, d0-d7, d16-d20, d22.  d8-d15 are not touched.
C
C The code requires n >= 8: eight limbs at rp are loaded up front and the
C main loop count is n - 8, with only the exactly-zero case skipped.
C Exactly n limbs are read from up; the u limb for each round is loaded one
C round ahead.

define(`rp',`r0')
define(`up',`r1')
define(`n', `r2')
define(`vp',`r3')

C Invariant v limbs
define(`Dv01', `d0')
define(`Dv23', `d1')
define(`Dv45', `d2')
define(`Dv67', `d3')

C Recurrence variables, two 64-bit column sums per Q register
define(`Qc01', `q2')
define(`Qc23', `q3')
define(`Qc45', `q8')
define(`Qc67', `q9')

C Aliases: Dc(2k) and Dc(2k+1) are the halves of the matching Q register
define(`Dc0', `d4')
define(`Dc1', `d5')
define(`Dc2', `d6')
define(`Dc3', `d7')
define(`Dc4', `d16')
define(`Dc5', `d17')
define(`Dc6', `d18')
define(`Dc7', `d19')

C Current u limb, replicated in both 32-bit lanes
define(`Du00', `d22')

C Next limb read from rp; only lane 0 of Dtmp (the low half of Qtmp) is used
define(`Qtmp', `q10')
define(`Dtmp', `d20')

ASM_START()
PROLOGUE(mpn_addmul_8)
	vldm	vp, {Dv01,Dv23,Dv45,Dv67}
	C The v limbs are now in registers, so the vp register is reused as the
	C read pointer into rp.  We read at vp, and store at rp.
	mov	vp, rp
	vldm	vp!, {Dc0,Dc1,Dc2,Dc3}
	C The load filled Qc01 and Qc23 with r0..r7.  Move the upper four limbs
	C to Qc45, then clear Qc23 and Qc67 so the zips interleave zeros.
	vmov	Qc45, Qc23
	vmov.i32	Qc23, #0
	vmov.i32	Qc67, #0
	vzip.32	Qc01, Qc23
	C Pad to get Qc01 = [0, r1, 0, r0], Qc23 = [0, r3, 0, r2]
	vzip.32	Qc45, Qc67
	C Pad to get Qc45 = [0, r5, 0, r4], Qc67 = [0, r7, 0, r6]
	C (32-bit lanes listed most significant first)

	C Let n denote the number of result words left to read
	subs	n, #8
	C Preload the first u limb.  The load leaves the flags from the subs
	C above intact for the branch below.
	vld1.32	{Du00[]}, [up]!
	beq	.Lend

	.balign	16
.Loop:
	vld1.32	Dtmp[0], [vp]!
	C Critical path starts here
	vmlal.u32	Qc01, Dv01, Du00
	vmlal.u32	Qc23, Dv23, Du00
	vmlal.u32	Qc45, Dv45, Du00
	vmlal.u32	Qc67, Dv67, Du00
	C Load the u limb for the next round
	vld1.32	{Du00[]}, [up]!
	subs	n, #1
	C Shift and add: store the finished low limb, shift all four registers
	C down one word as a unit (taking the new rp limb in at the top), then
	C sum adjacent words pairwise into 64-bit lanes.
	vst1.32	Dc0[0], [rp]!
	vext.32	Qc01, Qc01, Qc23, #1
	vext.32	Qc23, Qc23, Qc45, #1
	vext.32	Qc45, Qc45, Qc67, #1
	vext.32	Qc67, Qc67, Qtmp, #1
	vpaddl.u32	Qc01, Qc01
	vpaddl.u32	Qc23, Qc23
	vpaddl.u32	Qc45, Qc45
	vpaddl.u32	Qc67, Qc67
	bne	.Loop

.Lend:
	C Eight more rounds without reading any new limbs from vp; a zero is
	C shifted in instead.  Seven of them run in the loop below, which also
	C loads the next u limb.  The eighth is the unrolled wind-down round
	C after the loop, which uses the u limb that is already loaded.
	vmov.i32	Dtmp, #0
	mov	n, #7
.Lend_loop:
	vmlal.u32	Qc01, Dv01, Du00
	vmlal.u32	Qc23, Dv23, Du00
	vmlal.u32	Qc45, Dv45, Du00
	vmlal.u32	Qc67, Dv67, Du00
	vld1.32	{Du00[]}, [up]!
	subs	n, #1
	vst1.32	Dc0[0], [rp]!
	vext.32	Qc01, Qc01, Qc23, #1
	vext.32	Qc23, Qc23, Qc45, #1
	vext.32	Qc45, Qc45, Qc67, #1
	vext.32	Qc67, Qc67, Qtmp, #1
	vpaddl.u32	Qc01, Qc01
	vpaddl.u32	Qc23, Qc23
	vpaddl.u32	Qc45, Qc45
	vpaddl.u32	Qc67, Qc67
	bne	.Lend_loop

	C Wind down, already read Du00
	vmlal.u32	Qc01, Dv01, Du00
	vmlal.u32	Qc23, Dv23, Du00
	vmlal.u32	Qc45, Dv45, Du00
	vmlal.u32	Qc67, Dv67, Du00
	vst1.32	Dc0[0], [rp]!
	vext.32	Qc01, Qc01, Qc23, #1
	vext.32	Qc23, Qc23, Qc45, #1
	vext.32	Qc45, Qc45, Qc67, #1
	vext.32	Qc67, Qc67, Qtmp, #1
	vpaddl.u32	Qc01, Qc01
	vpaddl.u32	Qc23, Qc23
	vpaddl.u32	Qc45, Qc45
	vpaddl.u32	Qc67, Qc67

	C FIXME: Somehow combine above vext vpaddl with below additions?

	C We have c7-c0 as (small) 64-bit values and need to add it together:
	C store the low word of each sum and carry its high word into the next.
	vst1.32	Dc0[0], [rp]!
	vshr.u64	Dc0, Dc0, #32
	vadd.i64	Dc1, Dc1, Dc0
	vst1.32	Dc1[0], [rp]!
	vshr.u64	Dc1, Dc1, #32
	vadd.i64	Dc2, Dc2, Dc1
	vst1.32	Dc2[0], [rp]!
	vshr.u64	Dc2, Dc2, #32
	vadd.i64	Dc3, Dc3, Dc2
	vst1.32	Dc3[0], [rp]!
	vshr.u64	Dc3, Dc3, #32
	vadd.i64	Dc4, Dc4, Dc3
	vst1.32	Dc4[0], [rp]!
	vshr.u64	Dc4, Dc4, #32
	vadd.i64	Dc5, Dc5, Dc4
	vst1.32	Dc5[0], [rp]!
	vshr.u64	Dc5, Dc5, #32
	vadd.i64	Dc6, Dc6, Dc5
	vst1.32	Dc6[0], [rp]!
	vshr.u64	Dc6, Dc6, #32
	vadd.i64	Dc7, Dc7, Dc6
	C The low word of the last sum is the return value
	vmov.32	r0, Dc7[0]
	bx	lr
EPILOGUE()
-- Niels Möller. PGP-encrypted email is preferred. Keyid C0B98E26. Internet email is subject to wholesale government surveillance.
_______________________________________________ gmp-devel mailing list gmp-devel@gmplib.org http://gmplib.org/mailman/listinfo/gmp-devel