On Tue, 2022-01-04 at 21:28 +0100, Niels Möller wrote:
>
> > +define(`FUNC_ALIGN', `5')
> > +PROLOGUE(_nettle_ecc_secp384r1_modp)
> > +
> > + std H0, -48(SP)
> > + std H1, -40(SP)
> > + std H2, -32(SP)
> > + std H3, -24(SP)
> > + std H4, -16(SP)
> > + std H5, -8(SP)
>
> I find it clearer to use register names rather than the m4 defines for
> save and restore of callee-save registers.
Here's the modified code, which uses the actual register names when
saving the callee-saved registers to the stack and restoring them from it.
Amitay.
--
Before marriage, a man yearns for the woman he loves. After marriage,
the
'Y' becomes silent.
C powerpc64/ecc-secp384r1-modp.asm
ifelse(`
Copyright (C) 2021 Martin Schwenke, Amitay Isaacs & Alastair D´Silva, IBM
Corporation
Based on x86_64/ecc-secp256r1-redc.asm
This file is part of GNU Nettle.
GNU Nettle is free software: you can redistribute it and/or
modify it under the terms of either:
* the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your
option) any later version.
or
* the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your
option) any later version.
or both in parallel, as here.
GNU Nettle is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received copies of the GNU General Public License and
the GNU Lesser General Public License along with this program. If
not, see http://www.gnu.org/licenses/.
')
.file "ecc-secp384r1-modp.asm"
C Register usage:
C The names below are m4 aliases for PowerPC general purpose registers.
C
C SP is the stack pointer. The callee-saved registers are stored at
C negative offsets from it.
define(`SP', `r1')
C RP is the base address the six result limbs are stored to.
C XP is the base address the twelve input limbs are loaded from.
define(`RP', `r4')
define(`XP', `r5')
C D5 keeps the low limb produced by the initial folding until it is
C added into T5 near the end. Before that it is used once as scratch.
define(`D5', `r6')
C T0-T5 accumulate the result. They start as the low six input limbs
C and are stored to RP at the end. T1 and T2 are also used briefly as
C scratch for input limbs 6 and 7 before that.
define(`T0', `r7')
define(`T1', `r8')
define(`T2', `r9')
define(`T3', `r10')
define(`T4', `r11')
define(`T5', `r12')
C H0-H5 hold the high limbs that are being folded into T0-T5.
C They map to r14-r19, which the function saves on entry and restores
C before returning.
define(`H0', `r14')
define(`H1', `r15')
define(`H2', `r16')
define(`H3', `r17')
define(`H4', `r18')
define(`H5', `r19')
C C2 and C0 collect carries that are folded in at the end.
C NOTE(review): r3 presumably carries the unused first argument m on
C entry, which is why it is free for C2. Confirm against the ABI.
define(`C2', `r3')
C C0 shares its register with H5. It is first written only after the
C last read of H5.
define(`C0', H5) C Overlap
C TMP shares its register with XP. It is first written only after the
C last load from XP.
define(`TMP', XP) C Overlap
C void ecc_secp384r1_modp (const struct ecc_modulo *m, mp_limb_t *rp, mp_limb_t *xp)
.text
C _nettle_ecc_secp384r1_modp
C
C Folds a twelve-limb input down to six limbs modulo the secp384r1
C prime p = 2^384 - 2^128 - 2^96 + 2^32 - 1.
C
C Input:  twelve 64-bit limbs loaded from XP at byte offsets 0 to 88.
C Output: six 64-bit limbs stored to RP at byte offsets 0 to 40.
C
C With B = 2^64 the folding uses B^6 = B^2 + 2^32 (B-1) + 1 (mod p),
C which gives the three terms handled below: +1, +B^2 and 2^32 (B-1).
C
C The code is branch free. It contains no compare against p and no
C conditional subtraction, so the stored value is the folded sum as is.
C
C Stack: SP is not adjusted and no calls are made. r14-r19 are saved in
C the 48 bytes directly below SP.
C NOTE(review): this assumes the ABI allows a leaf function to use that
C area below SP. Confirm for every ABI this file is built for.
C
C Most instructions depend on the carry bit left by the previous one,
C so the instruction order inside each add or subtract chain matters.
define(`FUNC_ALIGN', `5')
PROLOGUE(_nettle_ecc_secp384r1_modp)
C Save the callee-saved registers backing H0-H5.
std r14, -48(SP)
std r15, -40(SP)
std r16, -32(SP)
std r17, -24(SP)
std r18, -16(SP)
std r19, -8(SP)
C First get top 2 limbs, which need folding twice.
C B^10 = B^6 + B^4 + 2^32 (B-1)B^4.
C We handle the terms as follows:
C
C B^6: Folded immediately.
C
C B^4: Delayed, added in the next folding.
C
C 2^32(B-1) B^4: Low half limb delayed until the next
C folding. Top 1.5 limbs subtracted and shifted now, resulting
C in 2.5 limbs. The low limb saved in D5, high 1.5 limbs added
C in.
ld H4, 80(XP)
ld H5, 88(XP)
C Shift right 32 bits, into H1, H0
C D5 is only scratch here. It gets its real value from the subfic below.
srdi H1, H5, 32
sldi D5, H5, 32
srdi H0, H4, 32
or H0, H0, D5
C Multiply H1 H0 by B-1, computed as the value shifted up one limb
C minus the value itself:
C
C     H1 H0  0
C   -    H1 H0
C   ----------
C     H1 H0 D5
subfic D5, H0, 0
subfe H0, H1, H0
addme H1, H1
C B^6 term: add the top two input limbs H5 H4 to H1 H0.
C The carry out is collected in C2.
li C2, 0
addc H0, H4, H0
adde H1, H5, H1
addze C2, C2
C Add in to high part
C T1 and T2 are scratch for input limbs 6 and 7 here. They are
C reloaded with limbs 1 and 2 below.
ld T1, 48(XP)
ld T2, 56(XP)
addc H0, T1, H0
adde H1, T2, H1
addze C2, C2 C Do C2 later
C +1 term
C Load the low six limbs into T0-T5 and input limbs 8 and 9 into H2
C and H3, then add H0..H5 to T0..T5.
C H4 and H5 still hold input limbs 10 and 11, so the last two adds
C also bring in the delayed B^4 term from the first folding.
ld T0, 0(XP)
ld T1, 8(XP)
ld T2, 16(XP)
ld T3, 24(XP)
ld T4, 32(XP)
ld T5, 40(XP)
ld H2, 64(XP)
ld H3, 72(XP)
addc T0, H0, T0
adde T1, H1, T1
adde T2, H2, T2
adde T3, H3, T3
adde T4, H4, T4
adde T5, H5, T5
C H5 is dead from here on, so its register becomes the carry
C accumulator C0.
li C0, 0
addze C0, C0
C +B^2 term
C Add H0..H3 two limbs higher, into T2..T5.
addc T2, H0, T2
adde T3, H1, T3
adde T4, H2, T4
adde T5, H3, T5
addze C0, C0
C Shift left, including low half of H4
C H4..H0 are shifted left by 32 bits as one five-limb value. The high
C half of H4 is shifted out. TMP is scratch and overwrites XP, which
C is no longer needed since all loads are done.
sldi H4, H4, 32
srdi TMP, H3, 32
or H4, TMP, H4
sldi H3, H3, 32
srdi TMP, H2, 32
or H3, TMP, H3
sldi H2, H2, 32
srdi TMP, H1, 32
or H2, TMP, H2
sldi H1, H1, 32
srdi TMP, H0, 32
or H1, TMP, H1
sldi H0, H0, 32
C Multiply the shifted value by B-1, again as shifted minus itself:
C
C     H4 H3 H2 H1 H0  0
C   -    H4 H3 H2 H1 H0
C   -------------------
C     H4 H3 H2 H1 H0 TMP
subfic TMP, H0, 0
subfe H0, H1, H0
subfe H1, H2, H1
subfe H2, H3, H2
subfe H3, H4, H3
addme H4, H4
C Add the six-limb product, TMP lowest and H4 highest, into T0..T5.
addc T0, TMP, T0
adde T1, H0, T1
adde T2, H1, T2
adde T3, H2, T3
adde T4, H3, T4
adde T5, H4, T5
addze C0, C0
C Remains to add in C2 and C0
C Set H1, H0 = (2^96 - 2^32 + 1) C0
C H1 = C0 2^32, H0 = C0 - H1, then the borrow is taken out of H1.
sldi H1, C0, 32
subfc H0, H1, C0
addme H1, H1
C Set H3, H2 = (2^96 - 2^32 + 1) C2
sldi H3, C2, 32
subfc H2, H3, C2
addme H3, H3
C H3 H2 is added at limbs 2 and 3 below, so adding C0 to H2 here
C supplies the B^2 term that belongs to C0.
C NOTE(review): the carry out of this addc is overwritten by the next
C addc and never used. Presumably it cannot occur. Confirm.
addc H2, C0, H2
C C0 has been consumed. Reuse it for the carry out of the next chain.
li C0, 0
addc T0, H0, T0
adde T1, H1, T1
adde T2, H2, T2
adde T3, H3, T3
adde T4, C2, T4
adde T5, D5, T5 C Value delayed from initial folding
addze C0, C0
C Final unlikely carry
C Fold C0 once more: (2^96 - 2^32 + 1) C0 at limbs 0 and 1, C0 itself
C at limb 2, and propagate the carry through T3..T5.
C NOTE(review): a carry out of the last addze is not handled.
C Presumably it cannot occur. Confirm.
sldi H1, C0, 32
subfc H0, H1, C0
addme H1, H1
addc T0, H0, T0
adde T1, H1, T1
adde T2, C0, T2
addze T3, T3
addze T4, T4
addze T5, T5
C Store the six result limbs.
std T0, 0(RP)
std T1, 8(RP)
std T2, 16(RP)
std T3, 24(RP)
std T4, 32(RP)
std T5, 40(RP)
C Restore the callee-saved registers and return.
ld r14, -48(SP)
ld r15, -40(SP)
ld r16, -32(SP)
ld r17, -24(SP)
ld r18, -16(SP)
ld r19, -8(SP)
blr
EPILOGUE(_nettle_ecc_secp384r1_modp)
_______________________________________________
nettle-bugs mailing list
[email protected]
http://lists.lysator.liu.se/mailman/listinfo/nettle-bugs