I made a merge request in the main repo that enables optimized GHASH on
AArch64 architecture. The implementation is based on Niels Möller's
enhanced algorithm which yields more speedup on AArch64 arch in
comparison with intel algorithm. Using the Karatsuba algorithm with Intel
algorithm yielded an overhead so I dropped its benchmark result. I'll
attach the file of Intel algorithm implementation here since it's not
include in the MR.
Here is the benchmark result on AArch64:
*---------------------------------------------------------------------------------------------*
| C version | Intel algorithm | Niels Möller's enhanced
algorithm |
| 208 Mbyte/s | 2781 Mbyte/s | 3255 Mbyte/s
|
*---------------------------------------------------------------------------------------------*
This is +17% performance boost of the enhanced algorithm over the Intel
algorithm, it's not as impressive as PowerPC benchmark result but it did a
great job on AArch64 considering PMULL instruction doesn't have
the assistance that vpmsumd offers by multiply four polynomials then
summing.
I tried to avoid using the stack in this implementation so I wrote a
procedure to handle leftovers by just using the registers, let me know if
there's a room for improvement here.
regards,
Mamone
C arm/v8/gcm-hash.asm
ifelse(`
Copyright (C) 2020 Niels Möller and Mamone Tarsha
This file is part of GNU Nettle.
GNU Nettle is free software: you can redistribute it and/or
modify it under the terms of either:
* the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your
option) any later version.
or
* the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your
option) any later version.
or both in parallel, as here.
GNU Nettle is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received copies of the GNU General Public License and
the GNU Lesser General Public License along with this program. If
not, see http://www.gnu.org/licenses/.
')
C gcm_set_key() assigns H value in the middle element of the table
C H_Idx is an element index, not a byte offset: the code multiplies it by 16
C when forming the address of H inside the table.
define(`H_Idx', `128')
.file "gcm-hash.asm"
.text
C void gcm_init_key (union gcm_block *table)
C This function populates the gcm table as the following layout
C
*******************************************************************************
C | H1M = (H1 div x⁶⁴)||((H1 mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷)) div x⁶⁴
|
C | H1L = (H1 mod x⁶⁴)||(((H1 mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷)) mod x⁶⁴) + (H1 div
x⁶⁴) |
C |
|
C | H2M = (H2 div x⁶⁴)||((H2 mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷)) div x⁶⁴
|
C | H2L = (H2 mod x⁶⁴)||(((H2 mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷)) mod x⁶⁴) + (H2 div
x⁶⁴) |
C |
|
C | H3M = (H3 div x⁶⁴)||((H3 mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷)) div x⁶⁴
|
C | H3L = (H3 mod x⁶⁴)||(((H3 mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷)) mod x⁶⁴) + (H3 div
x⁶⁴) |
C |
|
C | H4M = (H3 div x⁶⁴)||((H4 mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷)) div x⁶⁴
|
C | H4L = (H3 mod x⁶⁴)||(((H4 mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷)) mod x⁶⁴) + (H4 div
x⁶⁴) |
C
*******************************************************************************
C ---------------------------------------------------------------------------
C void _nettle_gcm_init_key (union gcm_block *table)
C
C In:    x0 = table. Element H_Idx of the table holds the 16-byte value H.
C Out:   the first eight 16-byte elements of the table are overwritten, in
C        this order: H, H_t, H2, H2_t, H3, H3_t, H4, H4_t.
C        Here H is the input value after the one-bit shift and conditional
C        XOR performed below, and each X_t is X with its two 64-bit halves
C        swapped.
C        H2 is produced by the multiply/reduce sequence applied to (H, H),
C        H3 by the same sequence applied to (H, H2), H4 applied to (H2, H2).
C Clobb: x9, x10, v0-v7, v16-v26. x0 is advanced by 64.
C
C NOTE(review): the layout comment that precedes these defines describes
C HnM/HnL entries; the stores in this routine write the vectors listed
C above instead. Confirm which description is intended for this variant.
C ---------------------------------------------------------------------------

C Register names used by _nettle_gcm_init_key.
define(`TABLE', `x0')
define(`ZERO', `v0')
define(`EMSB', `v1')
define(`POLY', `v2')
define(`B', `v3')
define(`H', `v4')
define(`H_t', `v5')
define(`H2', `v6')
define(`H2_t', `v7')
define(`H3', `v16')
define(`H3_t', `v17')
define(`H4', `v18')
define(`H4_t', `v19')
define(`H_m', `v20')
define(`H_m1', `v21')
define(`H_h', `v22')
define(`H_l', `v23')
define(`RP', `v24')
define(`Ml', `v25')
define(`Mh', `v26')

PROLOGUE(_nettle_gcm_init_key)
    C Load the 16 bytes of H in memory order. ld1 with the .16b arrangement
    C places memory byte i in register byte i on both little- and big-endian
    C targets, which a load of the whole q register does not guarantee.
    add            x9,TABLE,#16*H_Idx
    ld1            {H.16b},[x9]

    C Broadcast the first byte of H; after the arithmetic shift by 7 below
    C every byte of EMSB is 0xff if the top bit of that byte was set, else 0.
    dup            EMSB.16b,H.b[0]
    C Reverse the bytes inside each 64-bit half.
    rev64          H.16b,H.16b

    C POLY = { 0xC200000000000000, 1 } for the conditional XOR.
    mov            x9,#0xC200000000000000
    mov            x10,#1
    mov            POLY.d[0],x9
    mov            POLY.d[1],x10
    sshr           EMSB.16b,EMSB.16b,#7
    and            EMSB.16b,EMSB.16b,POLY.16b

    C Shift the 128-bit value left by one bit. B collects bit 63 of each
    C half; the AND with POLY keeps only the bit from the second half
    C (POLY.d[1] is 1, bit 0 of POLY.d[0] is clear), and the ext moves it
    C into bit 0 of the first half.
    ushr           B.2d,H.2d,#63
    and            B.16b,B.16b,POLY.16b
    ext            B.16b,B.16b,B.16b,#8
    shl            H.2d,H.2d,#1
    orr            H.16b,H.16b,B.16b
    C XOR in the constant when the original top bit was set.
    eor            H.16b,H.16b,EMSB.16b

    eor            ZERO.16b,ZERO.16b,ZERO.16b
    C From here on both halves of POLY hold 0xC200000000000000.
    dup            POLY.2d,POLY.d[0]
    C H_t = H with its 64-bit halves swapped.
    ext            H_t.16b,H.16b,H.16b,#8

    C --- H2: multiply H by H ---
    C Four 64x64 carry-less products; the two cross products are summed
    C into H_m.
    pmull          H_m.1q,H.1d,H_t.1d
    pmull2         H_m1.1q,H.2d,H_t.2d
    pmull          H_h.1q,H.1d,H.1d
    pmull2         H_l.1q,H.2d,H.2d
    eor            H_m.16b,H_m.16b,H_m1.16b
    C Fold the products down to 128 bits with two multiplications by POLY
    C (presumably the reduction modulo the GCM polynomial; RP holds the
    C reduction term).
    pmull          RP.1q,H_l.1d,POLY.1d
    ext            Ml.16b,ZERO.16b,H_m.16b,#8
    ext            Mh.16b,H_m.16b,ZERO.16b,#8
    ext            RP.16b,RP.16b,RP.16b,#8
    eor            H_l.16b,H_l.16b,Ml.16b
    eor            H_h.16b,H_h.16b,Mh.16b
    eor            H_l.16b,H_l.16b,RP.16b
    pmull2         RP.1q,H_l.2d,POLY.2d
    eor            H_h.16b,H_h.16b,H_l.16b
    eor            H2_t.16b,H_h.16b,RP.16b
    ext            H2.16b,H2_t.16b,H2_t.16b,#8

    C Store table elements 0..3 and advance TABLE by 64 bytes.
    st1            {H.16b,H_t.16b,H2.16b,H2_t.16b},[TABLE],#64

    C --- H3: multiply H by H2 ---
    pmull          H_m.1q,H.1d,H2_t.1d
    pmull2         H_m1.1q,H.2d,H2_t.2d
    pmull          H_h.1q,H.1d,H2.1d
    pmull2         H_l.1q,H.2d,H2.2d
    eor            H_m.16b,H_m.16b,H_m1.16b
    pmull          RP.1q,H_l.1d,POLY.1d
    ext            Ml.16b,ZERO.16b,H_m.16b,#8
    ext            Mh.16b,H_m.16b,ZERO.16b,#8
    ext            RP.16b,RP.16b,RP.16b,#8
    eor            H_l.16b,H_l.16b,Ml.16b
    eor            H_h.16b,H_h.16b,Mh.16b
    eor            H_l.16b,H_l.16b,RP.16b
    pmull2         RP.1q,H_l.2d,POLY.2d
    eor            H_h.16b,H_h.16b,H_l.16b
    eor            H3_t.16b,H_h.16b,RP.16b
    ext            H3.16b,H3_t.16b,H3_t.16b,#8

    C --- H4: multiply H2 by H2 ---
    pmull          H_m.1q,H2.1d,H2_t.1d
    pmull2         H_m1.1q,H2.2d,H2_t.2d
    pmull          H_h.1q,H2.1d,H2.1d
    pmull2         H_l.1q,H2.2d,H2.2d
    eor            H_m.16b,H_m.16b,H_m1.16b
    pmull          RP.1q,H_l.1d,POLY.1d
    ext            Ml.16b,ZERO.16b,H_m.16b,#8
    ext            Mh.16b,H_m.16b,ZERO.16b,#8
    ext            RP.16b,RP.16b,RP.16b,#8
    eor            H_l.16b,H_l.16b,Ml.16b
    eor            H_h.16b,H_h.16b,Mh.16b
    eor            H_l.16b,H_l.16b,RP.16b
    pmull2         RP.1q,H_l.2d,POLY.2d
    eor            H_h.16b,H_h.16b,H_l.16b
    eor            H4_t.16b,H_h.16b,RP.16b
    ext            H4.16b,H4_t.16b,H4_t.16b,#8

    C Store table elements 4..7 (TABLE already points at element 4).
    st1            {H3.16b,H3_t.16b,H4.16b,H4_t.16b},[TABLE]
    ret
EPILOGUE(_nettle_gcm_init_key)
C ---------------------------------------------------------------------------
C void gcm_hash (const struct gcm_key *key, union gcm_block *x,
C                size_t length, const uint8_t *data)
C
C In:    x0 = key; its first eight 16-byte elements are read as
C             H, H_t, H2, H2_t, H3, H3_t, H4, H4_t
C        x1 = x, the 16-byte state, read at entry and written at exit
C        x2 = length in bytes
C        x3 = data
C Out:   the state at x is updated with all length bytes of data.
C Flow:  64-byte groups in a loop, then at most one 32-byte group, at most
C        one 16-byte block, and finally a partial block of 1..15 bytes that
C        is zero-padded in registers (no stack is used).
C Clobb: x2, x3, x7-x10, v0-v7, v16-v31, condition flags.
C ---------------------------------------------------------------------------

C Register names used by _nettle_gcm_hash.
define(`TABLE', `x0')
define(`X', `x1')
define(`LENGTH', `x2')
define(`DATA', `x3')
define(`POLY', `v0')
define(`ZERO', `v1')
define(`D', `v2')
define(`C0', `v3')
define(`C1', `v4')
define(`C2', `v5')
define(`C3', `v6')
define(`RP', `v7')
define(`H', `v16')
define(`H_t', `v17')
define(`H2', `v18')
define(`H2_t', `v19')
define(`H3', `v20')
define(`H3_t', `v21')
define(`H4', `v22')
define(`H4_t', `v23')
define(`H_m', `v24')
define(`H_m1', `v25')
define(`H_h', `v26')
define(`H_l', `v27')
define(`H_m2', `v28')
define(`H_m3', `v29')
define(`H_h2', `v30')
define(`H_l2', `v31')
C Ml and Mh share registers with C1 and C2. Every path below finishes
C reading C1 and C2 before the reduction writes Ml and Mh.
define(`Ml', `v4')
define(`Mh', `v5')

PROLOGUE(_nettle_gcm_hash)
    C Both halves of POLY hold 0xC200000000000000.
    mov            x10,#0xC200000000000000
    mov            POLY.d[0],x10
    dup            POLY.2d,POLY.d[0]
    eor            ZERO.16b,ZERO.16b,ZERO.16b

    C D = current state, bytes reversed inside each 64-bit half.
    ld1            {D.16b},[X]
    rev64          D.16b,D.16b

    C x10 = number of bytes covered by whole 64-byte groups.
    ands           x10,LENGTH,#-64
    b.eq           L2x

    add            x9,TABLE,64
    ld1            {H.16b,H_t.16b,H2.16b,H2_t.16b},[TABLE]
    ld1            {H3.16b,H3_t.16b,H4.16b,H4_t.16b},[x9]

L4x_loop:
    C Load four blocks and byte-reverse each 64-bit half.
    ld1            {C0.16b,C1.16b,C2.16b,C3.16b},[DATA],#64
    rev64          C0.16b,C0.16b
    rev64          C1.16b,C1.16b
    rev64          C2.16b,C2.16b
    rev64          C3.16b,C3.16b
    C The running state is folded into the first block.
    eor            C0.16b,C0.16b,D.16b

    C C1 times H3
    pmull          H_m.1q,C1.1d,H3_t.1d
    pmull2         H_m1.1q,C1.2d,H3_t.2d
    pmull          H_h.1q,C1.1d,H3.1d
    pmull2         H_l.1q,C1.2d,H3.2d

    C C2 times H2, accumulated
    pmull          H_m2.1q,C2.1d,H2_t.1d
    pmull2         H_m3.1q,C2.2d,H2_t.2d
    pmull          H_h2.1q,C2.1d,H2.1d
    pmull2         H_l2.1q,C2.2d,H2.2d
    eor            H_m.16b,H_m.16b,H_m2.16b
    eor            H_m1.16b,H_m1.16b,H_m3.16b
    eor            H_h.16b,H_h.16b,H_h2.16b
    eor            H_l.16b,H_l.16b,H_l2.16b

    C C3 times H, accumulated
    pmull          H_m2.1q,C3.1d,H_t.1d
    pmull2         H_m3.1q,C3.2d,H_t.2d
    pmull          H_h2.1q,C3.1d,H.1d
    pmull2         H_l2.1q,C3.2d,H.2d
    eor            H_m.16b,H_m.16b,H_m2.16b
    eor            H_m1.16b,H_m1.16b,H_m3.16b
    eor            H_h.16b,H_h.16b,H_h2.16b
    eor            H_l.16b,H_l.16b,H_l2.16b

    C (C0 xor D) times H4, accumulated
    pmull          H_m2.1q,C0.1d,H4_t.1d
    pmull2         H_m3.1q,C0.2d,H4_t.2d
    pmull          H_h2.1q,C0.1d,H4.1d
    pmull2         H_l2.1q,C0.2d,H4.2d
    eor            H_m.16b,H_m.16b,H_m2.16b
    eor            H_m1.16b,H_m1.16b,H_m3.16b
    eor            H_h.16b,H_h.16b,H_h2.16b
    eor            H_l.16b,H_l.16b,H_l2.16b

    C One shared fold of the accumulated products down to 128 bits, using
    C two multiplications by POLY. The final ext swaps the halves to give
    C the new state D.
    eor            H_m.16b,H_m.16b,H_m1.16b
    pmull          RP.1q,H_l.1d,POLY.1d
    ext            Ml.16b,ZERO.16b,H_m.16b,#8
    ext            Mh.16b,H_m.16b,ZERO.16b,#8
    ext            RP.16b,RP.16b,RP.16b,#8
    eor            H_l.16b,H_l.16b,Ml.16b
    eor            H_h.16b,H_h.16b,Mh.16b
    eor            H_l.16b,H_l.16b,RP.16b
    pmull2         RP.1q,H_l.2d,POLY.2d
    eor            H_h.16b,H_h.16b,H_l.16b
    eor            D.16b,H_h.16b,RP.16b
    ext            D.16b,D.16b,D.16b,#8

    subs           x10,x10,64
    b.ne           L4x_loop

    C Keep only the bytes not consumed by the 64-byte loop.
    and            LENGTH,LENGTH,#63

L2x:
    C At most one 32-byte group remains at this point.
    tst            LENGTH,#-32
    b.eq           L1x

    ld1            {H.16b,H_t.16b,H2.16b,H2_t.16b},[TABLE]

    ld1            {C0.16b,C1.16b},[DATA],#32
    rev64          C0.16b,C0.16b
    rev64          C1.16b,C1.16b
    eor            C0.16b,C0.16b,D.16b

    C C1 times H
    pmull          H_m.1q,C1.1d,H_t.1d
    pmull2         H_m1.1q,C1.2d,H_t.2d
    pmull          H_h.1q,C1.1d,H.1d
    pmull2         H_l.1q,C1.2d,H.2d

    C (C0 xor D) times H2, accumulated
    pmull          H_m2.1q,C0.1d,H2_t.1d
    pmull2         H_m3.1q,C0.2d,H2_t.2d
    pmull          H_h2.1q,C0.1d,H2.1d
    pmull2         H_l2.1q,C0.2d,H2.2d
    eor            H_m.16b,H_m.16b,H_m2.16b
    eor            H_m1.16b,H_m1.16b,H_m3.16b
    eor            H_h.16b,H_h.16b,H_h2.16b
    eor            H_l.16b,H_l.16b,H_l2.16b

    C Fold to 128 bits, as in the 64-byte loop.
    eor            H_m.16b,H_m.16b,H_m1.16b
    pmull          RP.1q,H_l.1d,POLY.1d
    ext            Ml.16b,ZERO.16b,H_m.16b,#8
    ext            Mh.16b,H_m.16b,ZERO.16b,#8
    ext            RP.16b,RP.16b,RP.16b,#8
    eor            H_l.16b,H_l.16b,Ml.16b
    eor            H_h.16b,H_h.16b,Mh.16b
    eor            H_l.16b,H_l.16b,RP.16b
    pmull2         RP.1q,H_l.2d,POLY.2d
    eor            H_h.16b,H_h.16b,H_l.16b
    eor            D.16b,H_h.16b,RP.16b
    ext            D.16b,D.16b,D.16b,#8

    and            LENGTH,LENGTH,#31

L1x:
    C At most one whole 16-byte block remains at this point.
    tst            LENGTH,#-16
    b.eq           Lmod

    ld1            {H.16b,H_t.16b},[TABLE]

    ld1            {C0.16b},[DATA],#16
    rev64          C0.16b,C0.16b
    eor            C0.16b,C0.16b,D.16b

    C (C0 xor D) times H
    pmull          H_m.1q,C0.1d,H_t.1d
    pmull2         H_m1.1q,C0.2d,H_t.2d
    pmull          H_h.1q,C0.1d,H.1d
    pmull2         H_l.1q,C0.2d,H.2d

    eor            H_m.16b,H_m.16b,H_m1.16b
    pmull          RP.1q,H_l.1d,POLY.1d
    ext            Ml.16b,ZERO.16b,H_m.16b,#8
    ext            Mh.16b,H_m.16b,ZERO.16b,#8
    ext            RP.16b,RP.16b,RP.16b,#8
    eor            H_l.16b,H_l.16b,Ml.16b
    eor            H_h.16b,H_h.16b,Mh.16b
    eor            H_l.16b,H_l.16b,RP.16b
    pmull2         RP.1q,H_l.2d,POLY.2d
    eor            H_h.16b,H_h.16b,H_l.16b
    eor            D.16b,H_h.16b,RP.16b
    ext            D.16b,D.16b,D.16b,#8

Lmod:
    C Only the low four bits of LENGTH are examined from here on; they give
    C the size of the trailing partial block.
    tst            LENGTH,#15
    b.eq           Ldone

    ld1            {H.16b,H_t.16b},[TABLE]

    C Bit 3 set: at least 8 bytes remain. Load them in memory order into
    C the low half of C0 (ld1 with .8b is endian-neutral, unlike a load of
    C the d register) and clear the high half.
    tbz            LENGTH,3,Lmod_8
    ld1            {C0.8b},[DATA],#8
    rev64          C0.16b,C0.16b
    mov            x10,#0
    mov            C0.d[1],x10

Lmod_8:
    tst            LENGTH,#7
    b.eq           Lmod_8_done

    C Assemble the last 1..7 bytes in x9. x8 is the shift amount; it starts
    C at 64 and drops by 8 before each use, so the first byte lands in the
    C most significant byte, matching what rev64 gives for a full load.
    mov            x9,#0
    mov            x8,#64
    and            x7,LENGTH,#7
Lmod_8_loop:
    C ldrb zero-extends into x10, so no separate clearing is needed.
    ldrb           w10,[DATA],#1
    sub            x8,x8,#8
    lsl            x10,x10,x8
    orr            x9,x9,x10
    subs           x7,x7,#1
    b.ne           Lmod_8_loop

    C If 8 bytes were already loaded, the assembled word is the high half;
    C otherwise it is the low half and the high half is zero.
    tbz            LENGTH,3,Lmod_8_load
    mov            C0.d[1],x9
    b              Lmod_8_done
Lmod_8_load:
    mov            x10,#0
    mov            C0.d[0],x9
    mov            C0.d[1],x10

Lmod_8_done:
    eor            C0.16b,C0.16b,D.16b

    C (padded C0 xor D) times H
    pmull          H_m.1q,C0.1d,H_t.1d
    pmull2         H_m1.1q,C0.2d,H_t.2d
    pmull          H_h.1q,C0.1d,H.1d
    pmull2         H_l.1q,C0.2d,H.2d

    eor            H_m.16b,H_m.16b,H_m1.16b
    pmull          RP.1q,H_l.1d,POLY.1d
    ext            Ml.16b,ZERO.16b,H_m.16b,#8
    ext            Mh.16b,H_m.16b,ZERO.16b,#8
    ext            RP.16b,RP.16b,RP.16b,#8
    eor            H_l.16b,H_l.16b,Ml.16b
    eor            H_h.16b,H_h.16b,Mh.16b
    eor            H_l.16b,H_l.16b,RP.16b
    pmull2         RP.1q,H_l.2d,POLY.2d
    eor            H_h.16b,H_h.16b,H_l.16b
    eor            D.16b,H_h.16b,RP.16b
    ext            D.16b,D.16b,D.16b,#8

Ldone:
    C Undo the byte reversal and write the state back.
    rev64          D.16b,D.16b
    st1            {D.16b},[X]
    ret
EPILOGUE(_nettle_gcm_hash)
_______________________________________________
nettle-bugs mailing list
[email protected]
http://lists.lysator.liu.se/mailman/listinfo/nettle-bugs