Re: [AArch64] Optimize GHASH

Maamoun TK Mon, 14 Dec 2020 15:00:59 -0800

I forgot to mention that I ran the benchmark on gcc17 in the GCC Farm.

regards,
Mamone


On Tue, Dec 15, 2020 at 12:12 AM Maamoun TK <[email protected]>
wrote:

> I made a merge request in the main repo that enables optimized GHASH on
> the AArch64 architecture. The implementation is based on Niels Möller's
> enhanced algorithm, which yields more speedup on AArch64 than the Intel
> algorithm. Using the Karatsuba algorithm with the Intel algorithm added
> overhead, so I dropped its benchmark result. I'll attach the file with the
> Intel algorithm implementation here since it's not included in the MR.
>
> Here is the benchmark result on AArch64:
>
>
> *---------------------------------------------------------------------------------------------*
> | C version       |   Intel algorithm  |   Niels Möller's enhanced algorithm  |
> |  208 Mbyte/s    |   2781 Mbyte/s     |   3255 Mbyte/s                       |
> *---------------------------------------------------------------------------------------------*
>
> This is a +17% performance boost for the enhanced algorithm over the Intel
> algorithm. It's not as impressive as the PowerPC benchmark result, but it
> does a great job on AArch64 considering that the PMULL instruction doesn't
> have the assistance that vpmsumd offers by multiplying four polynomials
> and then summing.
>
> I tried to avoid using the stack in this implementation, so I wrote a
> procedure that handles leftovers using only the registers. Let me know if
> there's room for improvement here.
>
> regards,
> Mamone
>
> C arm/v8/gcm-hash.asm
>
> ifelse(`
>    Copyright (C) 2020 Niels Möller and Mamone Tarsha
>    This file is part of GNU Nettle.
>
>    GNU Nettle is free software: you can redistribute it and/or
>    modify it under the terms of either:
>
>      * the GNU Lesser General Public License as published by the Free
>        Software Foundation; either version 3 of the License, or (at your
>        option) any later version.
>
>    or
>
>      * the GNU General Public License as published by the Free
>        Software Foundation; either version 2 of the License, or (at your
>        option) any later version.
>
>    or both in parallel, as here.
>
>    GNU Nettle is distributed in the hope that it will be useful,
>    but WITHOUT ANY WARRANTY; without even the implied warranty of
>    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>    General Public License for more details.
>
>    You should have received copies of the GNU General Public License and
>    the GNU Lesser General Public License along with this program.  If
>    not, see http://www.gnu.org/licenses/.
> ')
>
> C gcm_set_key() assigns H value in the middle element of the table.
> C H_Idx is the index of that element; _nettle_gcm_init_key multiplies it
> C by 16 to form the byte offset from which it loads H.
> define(`H_Idx', `128')
>
> .file "gcm-hash.asm"
>
> .text
>
>     C void gcm_init_key (union gcm_block *table)
>
> C This function populates the gcm table with the following layout
> C *******************************************************************************
> C | H1M = (H1 div x⁶⁴)||((H1 mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷)) div x⁶⁴              |
> C | H1L = (H1 mod x⁶⁴)||(((H1 mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷)) mod x⁶⁴) + (H1 div x⁶⁴) |
> C |                                                                             |
> C | H2M = (H2 div x⁶⁴)||((H2 mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷)) div x⁶⁴              |
> C | H2L = (H2 mod x⁶⁴)||(((H2 mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷)) mod x⁶⁴) + (H2 div x⁶⁴) |
> C |                                                                             |
> C | H3M = (H3 div x⁶⁴)||((H3 mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷)) div x⁶⁴              |
> C | H3L = (H3 mod x⁶⁴)||(((H3 mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷)) mod x⁶⁴) + (H3 div x⁶⁴) |
> C |                                                                             |
> C | H4M = (H4 div x⁶⁴)||((H4 mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷)) div x⁶⁴              |
> C | H4L = (H4 mod x⁶⁴)||(((H4 mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷)) mod x⁶⁴) + (H4 div x⁶⁴) |
> C *******************************************************************************
>
>
> C Register assignments for _nettle_gcm_init_key.
> C Only x0 and vector registers v0-v7 and v16-v26 are named here, so
> C v8-v15 are never touched by that function.
>
> C Argument: pointer to the table (first integer argument register).
> define(`TABLE', `x0')
>
> C Constants and scratch used while preparing H.
> define(`ZERO', `v0')
> define(`EMSB', `v1')
> define(`POLY', `v2')
> define(`B', `v3')
>
> C H and the three products derived from it. Each name ending in _t holds
> C the same value with its two 64-bit halves exchanged. HQ is the 128-bit
> C scalar view (q4) of H (v4), used by the ldr that fetches H.
> define(`H', `v4')
> define(`HQ', `q4')
> define(`H_t', `v5')
> define(`H2', `v6')
> define(`H2_t', `v7')
> define(`H3', `v16')
> define(`H3_t', `v17')
> define(`H4', `v18')
> define(`H4_t', `v19')
> C Partial products of one multiplication and temporaries of the
> C reduction step that follows it.
> define(`H_m', `v20')
> define(`H_m1', `v21')
> define(`H_h', `v22')
> define(`H_l', `v23')
> define(`RP', `v24')
> define(`Ml', `v25')
> define(`Mh', `v26')
>
>
> C _nettle_gcm_init_key: derive the multiplier table used by gcm_hash.
> C
> C In:     TABLE (x0) = table base. H is read from byte offset 16*H_Idx.
> C Out:    eight 16-byte entries are written starting at the table base, in
> C         the order H, H_t, H2, H2_t, H3, H3_t, H4, H4_t. Here H is the
> C         adjusted key computed below, H2 = H*H, H3 = H*H2, H4 = H2*H2
> C         (each product reduced with POLY), and every entry ending in _t
> C         is the preceding entry with its two 64-bit halves exchanged.
> C Clobb:  x0 (advanced by 64), x9, x10, v0-v7, v16-v26.
> C Stack:  not used.
> C
> C NOTE(review): the layout comment ahead of this function describes
> C H?M/H?L entries, while the stores below write each value followed by
> C its half-swapped copy. Confirm which description is current.
> PROLOGUE(_nettle_gcm_init_key)
>     C Fetch the 16-byte H stored at element H_Idx of the table.
>     ldr            HQ,[TABLE,#16*H_Idx]
>     C Replicate byte 0 of H into every lane; this must happen before the
>     C byte order is changed by rev64.
>     dup            EMSB.16b,H.b[0]
>     C Reverse the byte order inside each 64-bit half of H.
>     rev64          H.16b,H.16b
>     C POLY.d[0] = 0xC200000000000000, POLY.d[1] = 1.
>     mov            x9,#0xC200000000000000
>     mov            x10,#1
>     mov            POLY.d[0],x9
>     mov            POLY.d[1],x10
>     C The arithmetic shift turns each byte into all-ones or zero depending
>     C on bit 7 of the original byte 0, so EMSB becomes POLY or zero.
>     sshr           EMSB.16b,EMSB.16b,#7
>     and            EMSB.16b,EMSB.16b,POLY.16b
>     C Shift H left by one bit as a single 128-bit quantity. B picks up the
>     C top bit of each half; masking with POLY keeps only the bit from d[1]
>     C (POLY.d[0] has bit 0 clear, POLY.d[1] is 1), and ext moves it to
>     C bit 0 of d[0].
>     ushr           B.2d,H.2d,#63
>     and            B.16b,B.16b,POLY.16b
>     ext            B.16b,B.16b,B.16b,#8
>     shl            H.2d,H.2d,#1
>     orr            H.16b,H.16b,B.16b
>     C Fold in POLY when the bit shifted out of the top of d[0] was set.
>     eor            H.16b,H.16b,EMSB.16b
>
>     eor            ZERO.16b,ZERO.16b,ZERO.16b
>     C From here on both halves of POLY hold 0xC200000000000000.
>     dup            POLY.2d,POLY.d[0]
>     C H_t = H with its 64-bit halves exchanged.
>     ext            H_t.16b,H.16b,H.16b,#8
>
>     C ---- H2 = H * H ----
>     C Four 64x64 carry-less products: the two cross terms (H_m, H_m1),
>     C d[0]*d[0] (H_h) and d[1]*d[1] (H_l).
>     pmull          H_m.1q,H.1d,H_t.1d
>     pmull2         H_m1.1q,H.2d,H_t.2d
>     pmull          H_h.1q,H.1d,H.1d
>     pmull2         H_l.1q,H.2d,H.2d
>
>     C Combine the cross terms, split them across H_l and H_h, and apply
>     C the first multiplication by POLY (on H_l.d[0]).
>     eor            H_m.16b,H_m.16b,H_m1.16b
>     pmull          RP.1q,H_l.1d,POLY.1d
>     ext            Ml.16b,ZERO.16b,H_m.16b,#8
>     ext            Mh.16b,H_m.16b,ZERO.16b,#8
>     ext            RP.16b,RP.16b,RP.16b,#8
>     eor            H_l.16b,H_l.16b,Ml.16b
>     eor            H_h.16b,H_h.16b,Mh.16b
>     eor            H_l.16b,H_l.16b,RP.16b
>
>     C Second multiplication by POLY (on H_l.d[1]) yields the 128-bit
>     C result in H2_t; H2 is the same value with its halves exchanged.
>     pmull2         RP.1q,H_l.2d,POLY.2d
>     eor            H_h.16b,H_h.16b,H_l.16b
>     eor            H2_t.16b,H_h.16b,RP.16b
>     ext            H2.16b,H2_t.16b,H2_t.16b,#8
>
>     C Write the first four entries and advance TABLE by 64 bytes.
>     st1            {H.16b,H_t.16b,H2.16b,H2_t.16b},[TABLE],#64
>
>     C ---- H3 = H * H2 ---- (same multiply and reduce sequence)
>     pmull          H_m.1q,H.1d,H2_t.1d
>     pmull2         H_m1.1q,H.2d,H2_t.2d
>     pmull          H_h.1q,H.1d,H2.1d
>     pmull2         H_l.1q,H.2d,H2.2d
>
>     eor            H_m.16b,H_m.16b,H_m1.16b
>     pmull          RP.1q,H_l.1d,POLY.1d
>     ext            Ml.16b,ZERO.16b,H_m.16b,#8
>     ext            Mh.16b,H_m.16b,ZERO.16b,#8
>     ext            RP.16b,RP.16b,RP.16b,#8
>     eor            H_l.16b,H_l.16b,Ml.16b
>     eor            H_h.16b,H_h.16b,Mh.16b
>     eor            H_l.16b,H_l.16b,RP.16b
>
>     pmull2         RP.1q,H_l.2d,POLY.2d
>     eor            H_h.16b,H_h.16b,H_l.16b
>     eor            H3_t.16b,H_h.16b,RP.16b
>     ext            H3.16b,H3_t.16b,H3_t.16b,#8
>
>     C ---- H4 = H2 * H2 ---- (same multiply and reduce sequence)
>     pmull          H_m.1q,H2.1d,H2_t.1d
>     pmull2         H_m1.1q,H2.2d,H2_t.2d
>     pmull          H_h.1q,H2.1d,H2.1d
>     pmull2         H_l.1q,H2.2d,H2.2d
>
>     eor            H_m.16b,H_m.16b,H_m1.16b
>     pmull          RP.1q,H_l.1d,POLY.1d
>     ext            Ml.16b,ZERO.16b,H_m.16b,#8
>     ext            Mh.16b,H_m.16b,ZERO.16b,#8
>     ext            RP.16b,RP.16b,RP.16b,#8
>     eor            H_l.16b,H_l.16b,Ml.16b
>     eor            H_h.16b,H_h.16b,Mh.16b
>     eor            H_l.16b,H_l.16b,RP.16b
>
>     pmull2         RP.1q,H_l.2d,POLY.2d
>     eor            H_h.16b,H_h.16b,H_l.16b
>     eor            H4_t.16b,H_h.16b,RP.16b
>     ext            H4.16b,H4_t.16b,H4_t.16b,#8
>
>     C Write the remaining four entries at the advanced TABLE pointer.
>     st1            {H3.16b,H3_t.16b,H4.16b,H4_t.16b},[TABLE]
>
>     ret
> EPILOGUE(_nettle_gcm_init_key)
>
> C Register assignments for _nettle_gcm_hash. These redefine several names
> C that were used with different registers by _nettle_gcm_init_key.
> C Vector registers v0-v7 and v16-v31 are named; v8-v15 are not used.
>
> C Arguments, in the order of the C prototype.
> define(`TABLE', `x0')
> define(`X', `x1')
> define(`LENGTH', `x2')
> define(`DATA', `x3')
>
> C Constants.
> define(`POLY', `v0')
> define(`ZERO', `v1')
>
> C D is the running state; C0-C3 receive up to four input blocks and C0D
> C is the 64-bit scalar view (d3) of C0 (v3).
> define(`D', `v2')
> define(`C0', `v3')
> define(`C0D', `d3')
> define(`C1', `v4')
> define(`C2', `v5')
> define(`C3', `v6')
> define(`RP', `v7')
> C Table entries loaded from TABLE.
> define(`H', `v16')
> define(`H_t', `v17')
> define(`H2', `v18')
> define(`H2_t', `v19')
> define(`H3', `v20')
> define(`H3_t', `v21')
> define(`H4', `v22')
> define(`H4_t', `v23')
> C Partial-product accumulators (H_m..H_l) and per-block temporaries
> C (H_m2..H_l2).
> define(`H_m', `v24')
> define(`H_m1', `v25')
> define(`H_h', `v26')
> define(`H_l', `v27')
> define(`H_m2', `v28')
> define(`H_m3', `v29')
> define(`H_h2', `v30')
> define(`H_l2', `v31')
> C Ml and Mh deliberately reuse v4 and v5, the registers of C1 and C2.
> C They are only written in the reduction step, after every multiplication
> C that reads C1 or C2 has been issued.
> define(`Ml', `v4')
> define(`Mh', `v5')
>
>
>     C void gcm_hash (const struct gcm_key *key, union gcm_block *x,
>     C                size_t length, const uint8_t *data)
>     C
>     C In:     TABLE (x0) = key; table entries are read starting at offset 0.
>     C         X (x1) = 16-byte state block, read on entry, rewritten on exit.
>     C         LENGTH (x2) = number of input bytes.
>     C         DATA (x3) = input bytes.
>     C Out:    the 16 bytes at X hold the updated state.
>     C Clobb:  x2, x3, x7, x8, x9, x10, v0-v7, v16-v31, condition flags.
>     C Stack:  not used.
>     C
>     C The input is consumed in stages: 64 bytes per iteration of L4x_loop,
>     C then at most one 32-byte step (L2x), at most one 16-byte step (L1x),
>     C and finally 1 to 15 leftover bytes that are assembled in registers
>     C with zero padding and processed like a full block (Lmod).
>     C
>     C Every stage ends with the same reduction sequence. It writes Ml and
>     C Mh, which are the same registers as C1 and C2, so all multiplications
>     C that read C1 or C2 must come before it.
>
> PROLOGUE(_nettle_gcm_hash)
>     C Both halves of POLY = 0xC200000000000000; ZERO = 0.
>     mov            x10,#0xC200000000000000
>     mov            POLY.d[0],x10
>     dup            POLY.2d,POLY.d[0]
>     eor            ZERO.16b,ZERO.16b,ZERO.16b
>
>     C Load the current state and reverse the bytes of each 64-bit half.
>     ld1            {D.16b},[X]
>     rev64          D.16b,D.16b
>
>     C x10 = LENGTH rounded down to a multiple of 64; skip the 4-block
>     C loop when that is zero.
>     ands           x10,LENGTH,#-64
>     b.eq           L2x
>
>     C Load all eight table entries; x9 addresses the second group of four.
>     add            x9,TABLE,64
>     ld1            {H.16b,H_t.16b,H2.16b,H2_t.16b},[TABLE]
>     ld1            {H3.16b,H3_t.16b,H4.16b,H4_t.16b},[x9]
>
> L4x_loop:
>     C Fetch four blocks, advance DATA by 64, and byte-reverse each half.
>     ld1            {C0.16b,C1.16b,C2.16b,C3.16b},[DATA],#64
>     rev64          C0.16b,C0.16b
>     rev64          C1.16b,C1.16b
>     rev64          C2.16b,C2.16b
>     rev64          C3.16b,C3.16b
>
>     C The running state is folded into the first block only.
>     eor            C0.16b,C0.16b,D.16b
>
>     C C1 * H3
>     pmull          H_m.1q,C1.1d,H3_t.1d
>     pmull2         H_m1.1q,C1.2d,H3_t.2d
>     pmull          H_h.1q,C1.1d,H3.1d
>     pmull2         H_l.1q,C1.2d,H3.2d
>
>     C C2 * H2, accumulated into the partial products
>     pmull          H_m2.1q,C2.1d,H2_t.1d
>     pmull2         H_m3.1q,C2.2d,H2_t.2d
>     pmull          H_h2.1q,C2.1d,H2.1d
>     pmull2         H_l2.1q,C2.2d,H2.2d
>
>     eor            H_m.16b,H_m.16b,H_m2.16b
>     eor            H_m1.16b,H_m1.16b,H_m3.16b
>     eor            H_h.16b,H_h.16b,H_h2.16b
>     eor            H_l.16b,H_l.16b,H_l2.16b
>
>     C C3 * H, accumulated
>     pmull          H_m2.1q,C3.1d,H_t.1d
>     pmull2         H_m3.1q,C3.2d,H_t.2d
>     pmull          H_h2.1q,C3.1d,H.1d
>     pmull2         H_l2.1q,C3.2d,H.2d
>
>     eor            H_m.16b,H_m.16b,H_m2.16b
>     eor            H_m1.16b,H_m1.16b,H_m3.16b
>     eor            H_h.16b,H_h.16b,H_h2.16b
>     eor            H_l.16b,H_l.16b,H_l2.16b
>
>     C (C0 xor D) * H4, accumulated
>     pmull          H_m2.1q,C0.1d,H4_t.1d
>     pmull2         H_m3.1q,C0.2d,H4_t.2d
>     pmull          H_h2.1q,C0.1d,H4.1d
>     pmull2         H_l2.1q,C0.2d,H4.2d
>
>     eor            H_m.16b,H_m.16b,H_m2.16b
>     eor            H_m1.16b,H_m1.16b,H_m3.16b
>     eor            H_h.16b,H_h.16b,H_h2.16b
>     eor            H_l.16b,H_l.16b,H_l2.16b
>
>     C One reduction for the sum of all four products. C1 and C2 are dead
>     C here, so their registers can be reused as Ml and Mh.
>     eor            H_m.16b,H_m.16b,H_m1.16b
>     pmull          RP.1q,H_l.1d,POLY.1d
>     ext            Ml.16b,ZERO.16b,H_m.16b,#8
>     ext            Mh.16b,H_m.16b,ZERO.16b,#8
>     ext            RP.16b,RP.16b,RP.16b,#8
>     eor            H_l.16b,H_l.16b,Ml.16b
>     eor            H_h.16b,H_h.16b,Mh.16b
>     eor            H_l.16b,H_l.16b,RP.16b
>
>     C New state: reduced result with its 64-bit halves exchanged.
>     pmull2         RP.1q,H_l.2d,POLY.2d
>     eor            H_h.16b,H_h.16b,H_l.16b
>     eor            D.16b,H_h.16b,RP.16b
>     ext            D.16b,D.16b,D.16b,#8
>
>     C x10 counts the bytes still to be handled by this loop.
>     subs           x10,x10,64
>     b.ne           L4x_loop
>
>     C Fewer than 64 bytes remain.
>     and            LENGTH,LENGTH,#63
>
> L2x:
>     C LENGTH is below 64 here; take this step only if 32 or more remain.
>     tst            LENGTH,#-32
>     b.eq           L1x
>
>     ld1            {H.16b,H_t.16b,H2.16b,H2_t.16b},[TABLE]
>
>     ld1            {C0.16b,C1.16b},[DATA],#32
>     rev64          C0.16b,C0.16b
>     rev64          C1.16b,C1.16b
>
>     eor            C0.16b,C0.16b,D.16b
>
>     C C1 * H
>     pmull          H_m.1q,C1.1d,H_t.1d
>     pmull2         H_m1.1q,C1.2d,H_t.2d
>     pmull          H_h.1q,C1.1d,H.1d
>     pmull2         H_l.1q,C1.2d,H.2d
>
>     C (C0 xor D) * H2, accumulated
>     pmull          H_m2.1q,C0.1d,H2_t.1d
>     pmull2         H_m3.1q,C0.2d,H2_t.2d
>     pmull          H_h2.1q,C0.1d,H2.1d
>     pmull2         H_l2.1q,C0.2d,H2.2d
>
>     eor            H_m.16b,H_m.16b,H_m2.16b
>     eor            H_m1.16b,H_m1.16b,H_m3.16b
>     eor            H_h.16b,H_h.16b,H_h2.16b
>     eor            H_l.16b,H_l.16b,H_l2.16b
>
>     C Reduction, same sequence as in L4x_loop.
>     eor            H_m.16b,H_m.16b,H_m1.16b
>     pmull          RP.1q,H_l.1d,POLY.1d
>     ext            Ml.16b,ZERO.16b,H_m.16b,#8
>     ext            Mh.16b,H_m.16b,ZERO.16b,#8
>     ext            RP.16b,RP.16b,RP.16b,#8
>     eor            H_l.16b,H_l.16b,Ml.16b
>     eor            H_h.16b,H_h.16b,Mh.16b
>     eor            H_l.16b,H_l.16b,RP.16b
>
>     pmull2         RP.1q,H_l.2d,POLY.2d
>     eor            H_h.16b,H_h.16b,H_l.16b
>     eor            D.16b,H_h.16b,RP.16b
>     ext            D.16b,D.16b,D.16b,#8
>
>     C Fewer than 32 bytes remain.
>     and            LENGTH,LENGTH,#31
>
> L1x:
>     C LENGTH is below 32 here; take this step only if 16 or more remain.
>     tst            LENGTH,#-16
>     b.eq           Lmod
>
>     ld1            {H.16b,H_t.16b},[TABLE]
>
>     ld1            {C0.16b},[DATA],#16
>     rev64          C0.16b,C0.16b
>
>     eor            C0.16b,C0.16b,D.16b
>
>     C (C0 xor D) * H
>     pmull          H_m.1q,C0.1d,H_t.1d
>     pmull2         H_m1.1q,C0.2d,H_t.2d
>     pmull          H_h.1q,C0.1d,H.1d
>     pmull2         H_l.1q,C0.2d,H.2d
>
>     C Reduction, same sequence as in L4x_loop.
>     eor            H_m.16b,H_m.16b,H_m1.16b
>     pmull          RP.1q,H_l.1d,POLY.1d
>     ext            Ml.16b,ZERO.16b,H_m.16b,#8
>     ext            Mh.16b,H_m.16b,ZERO.16b,#8
>     ext            RP.16b,RP.16b,RP.16b,#8
>     eor            H_l.16b,H_l.16b,Ml.16b
>     eor            H_h.16b,H_h.16b,Mh.16b
>     eor            H_l.16b,H_l.16b,RP.16b
>
>     pmull2         RP.1q,H_l.2d,POLY.2d
>     eor            H_h.16b,H_h.16b,H_l.16b
>     eor            D.16b,H_h.16b,RP.16b
>     ext            D.16b,D.16b,D.16b,#8
>
> Lmod:
>     C LENGTH is not masked after the 16-byte step, so only its low four
>     C bits are examined from here on. Nothing to do if they are zero.
>     tst            LENGTH,#15
>     b.eq           Ldone
>
>     ld1            {H.16b,H_t.16b},[TABLE]
>
>     C Bit 3 of LENGTH set: at least 8 bytes are left. Load them into
>     C C0.d[0], byte-reverse, and clear C0.d[1].
>     tbz            LENGTH,3,Lmod_8
>     ldr            C0D,[DATA],#8
>     rev64          C0.16b,C0.16b
>     mov            x10,#0
>     mov            C0.d[1],x10
> Lmod_8:
>     C Skip the byte loop when no sub-8-byte tail exists.
>     tst            LENGTH,#7
>     b.eq           Lmod_8_done
>     C x9 = assembled tail, x8 = shift amount, x7 = bytes left (1 to 7).
>     mov            x9,#0
>     mov            x8,#64
>     and            x7,LENGTH,#7
> Lmod_8_loop:
>     C Place each byte below the previous one, starting at bits 63:56, so
>     C x9 matches what rev64 would give for a zero-padded 8-byte load.
>     mov            x10,#0
>     ldrb           w10,[DATA],#1
>     sub            x8,x8,#8
>     lsl            x10,x10,x8
>     orr            x9,x9,x10
>     subs           x7,x7,#1
>     b.ne           Lmod_8_loop
>     C With 8 bytes already in C0.d[0] the tail goes into C0.d[1];
>     C otherwise it is the whole block and goes into C0.d[0].
>     tbz            LENGTH,3,Lmod_8_load
>     mov            C0.d[1],x9
>     b              Lmod_8_done
> Lmod_8_load:
>     mov            x10,#0
>     mov            C0.d[0],x9
>     mov            C0.d[1],x10
> Lmod_8_done:
>     C C0 now holds the zero-padded final block; process it like L1x.
>     eor            C0.16b,C0.16b,D.16b
>
>     pmull          H_m.1q,C0.1d,H_t.1d
>     pmull2         H_m1.1q,C0.2d,H_t.2d
>     pmull          H_h.1q,C0.1d,H.1d
>     pmull2         H_l.1q,C0.2d,H.2d
>
>     eor            H_m.16b,H_m.16b,H_m1.16b
>     pmull          RP.1q,H_l.1d,POLY.1d
>     ext            Ml.16b,ZERO.16b,H_m.16b,#8
>     ext            Mh.16b,H_m.16b,ZERO.16b,#8
>     ext            RP.16b,RP.16b,RP.16b,#8
>     eor            H_l.16b,H_l.16b,Ml.16b
>     eor            H_h.16b,H_h.16b,Mh.16b
>     eor            H_l.16b,H_l.16b,RP.16b
>
>     pmull2         RP.1q,H_l.2d,POLY.2d
>     eor            H_h.16b,H_h.16b,H_l.16b
>     eor            D.16b,H_h.16b,RP.16b
>     ext            D.16b,D.16b,D.16b,#8
>
> Ldone:
>     C Undo the byte reversal applied on entry and store the state to X.
>     rev64          D.16b,D.16b
>     st1            {D.16b},[X]
>     ret
> EPILOGUE(_nettle_gcm_hash)
>
_______________________________________________
nettle-bugs mailing list
[email protected]
http://lists.lysator.liu.se/mailman/listinfo/nettle-bugs

Re: [AArch64] Optimize GHASH

Reply via email to