ni...@lysator.liu.se (Niels Möller) writes:

> I'll also try using fewer updates of the up pointer, that seems to save
> half a cycle, and could perhaps speed up addmul_1 too.

No speedup for addmul_1, unfortunately, but a saving for submul_1. Here
are new versions of both files (for mpn/arm/v6). I wonder if this
submul_1 complement trick is useful on some other platforms too, e.g.,
64-bit sparc?

They run at 3.25 c/l (addmul_1) and 3.9 c/l (submul_1) on a Cortex-A9:

$ GMP_CPU_FREQUENCY=1e9 ./speed -s1-1000 -f 1.2 -C mpn_addmul_1.17 mpn_submul_1.17
clock_gettime is 1.000ns accurate
overhead 8.98 cycles, precision 1000 units of 1.00e-06 secs, CPU freq 1000.00 MHz
        mpn_addmul_1.17 mpn_submul_1.17
1             19.9985      #18.7576
2             10.9929      #10.8108
3             #7.9880        8.6664
4             #6.4099        6.9205
5             #5.9251        6.3792
6             #5.4384        6.1228
7             #5.2336        5.9634
8             #4.8648        5.4154
9             #4.8359        5.2633
10            #4.5423        5.2216
12            #4.3122        4.8634
14            #4.1876        4.8111
16            #4.0881        4.6616
19            #4.0045        4.5861
22            #3.8592        4.4916
26            #3.7191        4.4362
31            #3.7194        4.3459
37            #3.6437        4.2051
44            #3.8368        4.3953
52            #3.5039        4.1120
62            #3.6497        4.2448
74            #3.6015        4.1965
88            #3.5376        4.1343
105           #3.5087        4.0832
126           #3.4988        4.1222
151           #3.4397        3.9997
181           #3.3900        3.9654
217           #3.3483        3.9500
260           #3.2994        3.9175
312           #3.3313        3.8723
374           #3.3173        3.8869
448           #3.2657        3.9315
537           #3.2799        3.9140
644           #3.3077        3.8574
772           #3.2621        3.9059
926           #3.2673        3.8317

dnl  ARM mpn_submul_1.

dnl  Copyright 2012, 2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C            cycles/limb
C StrongARM:     -
C XScale         -
C Cortex-A7      ?
C Cortex-A8      ?
C Cortex-A9      3.9
C Cortex-A15     ?

C TODO
C  * Micro-optimise feed-in code.
C  * Optimise for n=1,2 by delaying register saving.
C  * Try using ldm/stm.

C Register roles.  rp, up, n and v0 are read as inputs on entry.  The code
C below additionally uses:
C   r12     running carry limb (high word operand/result of every umaal)
C   r4, r5  limbs loaded from up, complemented with mvn before being used
C   r6, r7  limbs loaded from rp, replaced by the umaal low word, stored back
define(`rp',`r0')
define(`up',`r1')
define(`n', `r2')
define(`v0',`r3')

ASM_START()
C mpn_submul_1
C
C For each of the n limbs, rp[i] is replaced by the low word of
C rp[i] + (~up[i]) * v0 + carry, and the function returns v0 minus the final
C carry limb.
C
C Complement trick: with B = 2^32 we have ~u = B - 1 - u, so the sum of
C (~up[i]) * v0 * B^i over all n limbs equals (B^n - 1) * v0 - {up,n} * v0.
C Seeding the carry with v0 cancels the lone v0 term, so the whole
C computation yields {rp,n} - {up,n} * v0 + B^n * v0.  The low n limbs are
C the wanted difference, and the carry limb left in r12 is v0 minus the
C borrow, hence the return value v0 - r12.
C
C umaal lo, hi, a, b computes the 64-bit value a * b + lo + hi and writes
C its low word to lo and its high word to hi.
C
C Structure: a feed-in section selected by n mod 4 preloads the first limbs
C and jumps into the matching stage of a 4-way unrolled, software-pipelined
C loop; a short tail finishes the last limb.
C
C NOTE(review): n = 0 is not special-cased; it would take the fi0 path and
C touch four limbs.  Presumably callers guarantee n >= 1 -- confirm.
PROLOGUE(mpn_submul_1)
        stmfd   sp!, { r4, r5, r6, r7 }         C save r4-r7, restored before return

        ands    r6, n, #3                       C r6 = n mod 4, selects the feed-in path
        mov     r12, v0                         C seed the carry with v0; flags untouched
        beq     L(fi0)                          C n mod 4 == 0
        cmp     r6, #2
        bcc     L(fi1)                          C n mod 4 == 1
        beq     L(fi2)                          C n mod 4 == 2, else fall through with 3

C Feed-in for n mod 4 == 3.
L(fi3): ldr     r4, [up], #12                   C r4 = up[0]; up advances 3 limbs
        mvn     r4, r4                          C complement the limb just loaded
        ldr     r6, [rp, #0]                    C r6 = rp[0]; rp is advanced later at lo3
        ldr     r5, [up, #-8]                   C r5 = up[1], complemented at lo3
        b       L(lo3)

C Feed-in for n mod 4 == 0.
L(fi0): ldr     r5, [up], #16                   C r5 = up[0]; up advances 4 limbs
        mvn     r5, r5                          C complement the limb just loaded
        ldr     r7, [rp], #4                    C r7 = rp[0]; rp advances 1 limb
        ldr     r4, [up, #-12]                  C r4 = up[1], complemented at lo0
        b       L(lo0)

C Feed-in for n mod 4 == 1.
L(fi1): ldr     r4, [up], #20                   C r4 = up[0]; up advances 5 limbs
        mvn     r4, r4                          C complement the limb just loaded
        ldr     r6, [rp], #8                    C r6 = rp[0]; rp advances 2 limbs
        subs    n, n, #1                        C n becomes a multiple of 4
        beq     L(1)                            C n was 1: only the tail multiply remains
        ldr     r5, [up, #-16]                  C r5 = up[1], complemented at lo1
        b       L(lo1)

C Feed-in for n mod 4 == 2.
L(fi2): ldr     r5, [up], #8                    C r5 = up[0]; up advances 2 limbs
        mvn     r5, r5                          C complement the limb just loaded
        ldr     r7, [rp], #12                   C r7 = rp[0]; rp advances 3 limbs
        ldr     r4, [up, #-4]                   C r4 = up[1], complemented at lo2
        b       L(lo2)

C Main loop, four limbs per iteration.  Each stage multiplies the operands
C prepared by the previous stage, complements the up limb loaded by the
C previous stage, loads the next rp and up limbs, and stores the limb it just
C produced.  The store for the lo2 stage is deferred to top or to the tail.
        ALIGN(16)
L(top): ldr     r6, [rp, #-8]                   C next rp limb for lo1
        ldr     r5, [up], #16                   C next up limb; single up update per pass
        str     r7, [rp, #-12]                  C store the result produced at lo2
L(lo1): umaal   r6, r12, r4, v0
        mvn     r5, r5
        ldr     r7, [rp, #-4]
        ldr     r4, [up, #-12]
        str     r6, [rp, #-8]
L(lo0): umaal   r7, r12, r5, v0
        mvn     r4, r4
        ldr     r6, [rp, #0]
        ldr     r5, [up, #-8]
        str     r7, [rp, #-4]
L(lo3): umaal   r6, r12, r4, v0
        mvn     r5, r5
        ldr     r7, [rp, #4]
        ldr     r4, [up, #-4]
        str     r6, [rp], #16                   C store, then advance rp by 4 limbs
L(lo2): umaal   r7, r12, r5, v0
        mvn     r4, r4                          C no flag update, so subs below decides
        subs    n, n, #4
        bhi     L(top)                          C loop while n was above 4 before the subs

C Tail: store the pending lo2 result and process the final limb.
        ldr     r6, [rp, #-8]                   C final rp limb
        str     r7, [rp, #-12]                  C store the result produced at lo2
L(1):   umaal   r6, r12, r4, v0                 C r4 already holds the complemented up limb
        str     r6, [rp, #-8]
        sub     r0, v0, r12                     C return v0 - carry; r0 no longer needed as rp
        ldmfd   sp!, { r4, r5, r6, r7 }
        bx      lr
EPILOGUE()
dnl  ARM mpn_addmul_1.

dnl  Copyright 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C            cycles/limb
C StrongARM:     -
C XScale         -
C Cortex-A7      ?
C Cortex-A8      ?
C Cortex-A9      3.25
C Cortex-A15     4

C TODO
C  * Micro-optimise feed-in code.
C  * Optimise for n=1,2 by delaying register saving.
C  * Try using ldm/stm.

C Register roles.  rp, up, n and v0 are read as inputs on entry.  The code
C below additionally uses:
C   r12     running carry limb (high word operand/result of every umaal)
C   r4, r5  limbs loaded from up
C   r6, r7  limbs loaded from rp, replaced by the umaal low word, stored back
define(`rp',`r0')
define(`up',`r1')
define(`n', `r2')
define(`v0',`r3')

ASM_START()
C mpn_addmul_1
C
C For each of the n limbs, rp[i] is replaced by the low word of
C rp[i] + up[i] * v0 + carry, with the carry starting at zero.  The final
C carry limb is returned in r0.
C
C umaal lo, hi, a, b computes the 64-bit value a * b + lo + hi and writes
C its low word to lo and its high word to hi.
C
C Structure: a feed-in section selected by n mod 4 preloads the first limbs
C and jumps into the matching stage of a 4-way unrolled, software-pipelined
C loop; a short tail finishes the last limb.
C
C NOTE(review): n = 0 is not special-cased; it would take the fi0 path and
C touch four limbs.  Presumably callers guarantee n >= 1 -- confirm.
PROLOGUE(mpn_addmul_1)
        stmfd   sp!, { r4, r5, r6, r7 }         C save r4-r7, restored before return

        ands    r6, n, #3                       C r6 = n mod 4, selects the feed-in path
        mov     r12, #0                         C carry starts at zero; flags untouched
        beq     L(fi0)                          C n mod 4 == 0
        cmp     r6, #2
        bcc     L(fi1)                          C n mod 4 == 1
        beq     L(fi2)                          C n mod 4 == 2, else fall through with 3

C Feed-in for n mod 4 == 3.
L(fi3): ldr     r4, [up], #12                   C r4 = up[0]; up advances 3 limbs
        ldr     r6, [rp, #0]                    C r6 = rp[0]; rp is advanced later at lo3
        ldr     r5, [up, #-8]                   C r5 = up[1]
        b       L(lo3)

C Feed-in for n mod 4 == 0.
L(fi0): ldr     r5, [up], #16                   C r5 = up[0]; up advances 4 limbs
        ldr     r7, [rp], #4                    C r7 = rp[0]; rp advances 1 limb
        ldr     r4, [up, #-12]                  C r4 = up[1]
        b       L(lo0)

C Feed-in for n mod 4 == 1.
L(fi1): ldr     r4, [up], #20                   C r4 = up[0]; up advances 5 limbs
        ldr     r6, [rp], #8                    C r6 = rp[0]; rp advances 2 limbs
        subs    n, n, #1                        C n becomes a multiple of 4
        beq     L(1)                            C n was 1: only the tail multiply remains
        ldr     r5, [up, #-16]                  C r5 = up[1]
        b       L(lo1)

C Feed-in for n mod 4 == 2.
L(fi2): ldr     r5, [up], #8                    C r5 = up[0]; up advances 2 limbs
        ldr     r7, [rp], #12                   C r7 = rp[0]; rp advances 3 limbs
        ldr     r4, [up, #-4]                   C r4 = up[1]
        b       L(lo2)

C Main loop, four limbs per iteration.  Each stage multiplies the operands
C loaded by the previous stage, loads the next rp and up limbs, and stores
C the limb it just produced.  The store for the lo2 stage is deferred to top
C or to the tail.
        ALIGN(16)
L(top): ldr     r6, [rp, #-8]                   C next rp limb for lo1
        ldr     r5, [up], #16                   C next up limb; single up update per pass
        str     r7, [rp, #-12]                  C store the result produced at lo2
L(lo1): umaal   r6, r12, r4, v0
        ldr     r7, [rp, #-4]
        ldr     r4, [up, #-12]
        str     r6, [rp, #-8]
L(lo0): umaal   r7, r12, r5, v0
        ldr     r6, [rp, #0]
        ldr     r5, [up, #-8]
        str     r7, [rp, #-4]
L(lo3): umaal   r6, r12, r4, v0
        ldr     r7, [rp, #4]
        ldr     r4, [up, #-4]
        str     r6, [rp], #16                   C store, then advance rp by 4 limbs
L(lo2): umaal   r7, r12, r5, v0
        subs    n, n, #4
        bhi     L(top)                          C loop while n was above 4 before the subs

C Tail: store the pending lo2 result and process the final limb.
        ldr     r6, [rp, #-8]                   C final rp limb
        str     r7, [rp, #-12]                  C store the result produced at lo2
L(1):   umaal   r6, r12, r4, v0
        str     r6, [rp, #-8]
        mov     r0, r12                         C return the final carry limb
        ldmfd   sp!, { r4, r5, r6, r7 }
        bx      lr
EPILOGUE()
Regards,
/Niels

-- 
Niels Möller. PGP-encrypted email is preferred. Keyid C0B98E26.
Internet email is subject to wholesale government surveillance.
_______________________________________________
gmp-devel mailing list
gmp-devel@gmplib.org
http://gmplib.org/mailman/listinfo/gmp-devel

Reply via email to