The branch main has been updated by jhb:

URL: https://cgit.FreeBSD.org/src/commit/?id=c0855eaa3ee9614804b6bd6a255aa9f71e095f43

commit c0855eaa3ee9614804b6bd6a255aa9f71e095f43
Author:     John Baldwin <[email protected]>
AuthorDate: 2023-08-29 21:44:15 +0000
Commit:     John Baldwin <[email protected]>
CommitDate: 2023-08-29 21:44:15 +0000

    ossl: Update the generated assembly files from OpenSSL 3.0.
    
    Tested with:    cryptocheck -d ossl0 -a all -z on amd64
    Reviewed by:    markj
    Differential Revision:  https://reviews.freebsd.org/D41568
---
 sys/crypto/openssl/aarch64/aes-gcm-armv8_64.S      | 6390 +++++++++++++++++++
 sys/crypto/openssl/aarch64/aesv8-armx.S            | 3014 ++++++++-
 sys/crypto/openssl/aarch64/arm64cpuid.S            |    7 +
 sys/crypto/openssl/aarch64/armv8-mont.S            |  732 ++-
 sys/crypto/openssl/aarch64/chacha-armv8.S          | 1553 ++---
 sys/crypto/openssl/aarch64/ecp_nistz256-armv8.S    |    8 +-
 sys/crypto/openssl/aarch64/ghashv8-armx.S          |    1 +
 sys/crypto/openssl/aarch64/keccak1600-armv8.S      |  190 +-
 sys/crypto/openssl/aarch64/poly1305-armv8.S        |   31 +-
 sys/crypto/openssl/aarch64/sha1-armv8.S            |   54 +-
 sys/crypto/openssl/aarch64/sha256-armv8.S          |   28 +-
 sys/crypto/openssl/aarch64/sha512-armv8.S          |   28 +-
 sys/crypto/openssl/aarch64/vpaes-armv8.S           |  276 +-
 sys/crypto/openssl/amd64/aes-x86_64.S              | 2680 ++++++++
 sys/crypto/openssl/amd64/aesni-gcm-x86_64.S        |   21 +
 sys/crypto/openssl/amd64/aesni-mb-x86_64.S         |  102 +
 sys/crypto/openssl/amd64/aesni-sha1-x86_64.S       |   21 +
 sys/crypto/openssl/amd64/aesni-sha256-x86_64.S     |   21 +
 sys/crypto/openssl/amd64/aesni-x86_64.S            |   32 +
 sys/crypto/openssl/amd64/bsaes-x86_64.S            | 2619 ++++++++
 sys/crypto/openssl/amd64/chacha-x86_64.S           |   21 +
 sys/crypto/openssl/amd64/cmll-x86_64.S             |   22 +
 sys/crypto/openssl/amd64/e_padlock-x86_64.S        |   21 +
 sys/crypto/openssl/amd64/ecp_nistz256-x86_64.S     |   21 +
 sys/crypto/openssl/amd64/ghash-x86_64.S            |   27 +
 sys/crypto/openssl/amd64/keccak1600-x86_64.S       |   21 +
 sys/crypto/openssl/amd64/md5-x86_64.S              |   29 +-
 sys/crypto/openssl/amd64/poly1305-x86_64.S         |   21 +
 sys/crypto/openssl/amd64/rc4-md5-x86_64.S          |   21 +
 sys/crypto/openssl/amd64/rc4-x86_64.S              |   24 +
 sys/crypto/openssl/amd64/rsaz-avx2.S               |   21 +
 sys/crypto/openssl/amd64/rsaz-avx512.S             |  902 +++
 sys/crypto/openssl/amd64/rsaz-x86_64.S             |   21 +
 sys/crypto/openssl/amd64/sha1-mb-x86_64.S          |   57 +
 sys/crypto/openssl/amd64/sha1-x86_64.S             |   21 +
 sys/crypto/openssl/amd64/sha256-mb-x86_64.S        |   57 +
 sys/crypto/openssl/amd64/sha256-x86_64.S           |   21 +
 sys/crypto/openssl/amd64/sha512-x86_64.S           |   21 +
 sys/crypto/openssl/amd64/vpaes-x86_64.S            |   26 +
 sys/crypto/openssl/amd64/wp-x86_64.S               |   21 +
 sys/crypto/openssl/amd64/x25519-x86_64.S           |   21 +
 sys/crypto/openssl/amd64/x86_64-gf2m.S             |   21 +
 sys/crypto/openssl/amd64/x86_64-mont.S             |   21 +
 sys/crypto/openssl/amd64/x86_64-mont5.S            |   21 +
 sys/crypto/openssl/amd64/x86_64cpuid.S             |   49 +
 sys/crypto/openssl/arm/aes-armv4.S                 |    7 +-
 sys/crypto/openssl/arm/aesv8-armx.S                |  776 ++-
 sys/crypto/openssl/arm/armv4-gf2m.S                |   13 +-
 sys/crypto/openssl/arm/armv4-mont.S                |   17 +-
 sys/crypto/openssl/arm/armv4cpuid.S                |    3 +-
 sys/crypto/openssl/arm/bsaes-armv7.S               |   47 +-
 sys/crypto/openssl/arm/chacha-armv4.S              |   11 +-
 sys/crypto/openssl/arm/ecp_nistz256-armv4.S        |    4 +-
 sys/crypto/openssl/arm/ghash-armv4.S               |    3 +-
 sys/crypto/openssl/arm/ghashv8-armx.S              |   64 +-
 sys/crypto/openssl/arm/keccak1600-armv4.S          |   34 +-
 sys/crypto/openssl/arm/poly1305-armv4.S            |   37 +-
 sys/crypto/openssl/arm/sha1-armv4-large.S          |   15 +-
 sys/crypto/openssl/arm/sha256-armv4.S              |   17 +-
 sys/crypto/openssl/arm/sha512-armv4.S              |   15 +-
 sys/crypto/openssl/i386/aes-586.S                  | 6644 ++++++++++++++++++++
 sys/crypto/openssl/i386/aesni-x86.S                |  254 +
 sys/crypto/openssl/i386/bf-586.S                   |  134 +
 sys/crypto/openssl/i386/bn-586.S                   |  104 +
 sys/crypto/openssl/i386/cast-586.S                 |  134 +
 sys/crypto/openssl/i386/chacha-x86.S               |   64 +
 sys/crypto/openssl/i386/cmll-x86.S                 |  144 +
 sys/crypto/openssl/i386/co-586.S                   |   74 +
 sys/crypto/openssl/i386/crypt586.S                 |   44 +
 sys/crypto/openssl/i386/des-586.S                  |  254 +
 sys/crypto/openssl/i386/e_padlock-x86.S            |  214 +
 sys/crypto/openssl/i386/ecp_nistz256-x86.S         |  254 +
 sys/crypto/openssl/i386/ghash-x86.S                |  104 +
 sys/crypto/openssl/i386/md5-586.S                  |   64 +-
 sys/crypto/openssl/i386/poly1305-x86.S             |  114 +
 sys/crypto/openssl/i386/rc4-586.S                  |   64 +
 sys/crypto/openssl/i386/rc5-586.S                  |  134 +
 sys/crypto/openssl/i386/rmd-586.S                  |   44 +
 sys/crypto/openssl/i386/sha1-586.S                 |   74 +
 sys/crypto/openssl/i386/sha256-586.S               |   44 +
 sys/crypto/openssl/i386/sha512-586.S               |   44 +
 sys/crypto/openssl/i386/vpaes-x86.S                |  164 +
 sys/crypto/openssl/i386/wp-mmx.S                   |   44 +
 sys/crypto/openssl/i386/x86-gf2m.S                 |   64 +
 sys/crypto/openssl/i386/x86-mont.S                 |   44 +
 sys/crypto/openssl/i386/x86cpuid.S                 |  154 +
 sys/crypto/openssl/powerpc/bn-ppc.S                | 1855 ++++++
 sys/crypto/openssl/powerpc/poly1305-ppc.S          | 1091 +++-
 sys/crypto/openssl/powerpc/vpaes-ppc.S             |   14 +-
 sys/crypto/openssl/powerpc64/bn-ppc.S              | 1876 ++++++
 sys/crypto/openssl/powerpc64/ecp_nistp521-ppc64.S  |  354 ++
 sys/crypto/openssl/powerpc64/keccak1600-ppc64.S    |   32 +-
 sys/crypto/openssl/powerpc64/poly1305-ppc.S        | 1011 ++-
 sys/crypto/openssl/powerpc64/vpaes-ppc.S           |   14 +-
 sys/crypto/openssl/powerpc64le/bn-ppc.S            | 1876 ++++++
 .../openssl/powerpc64le/ecp_nistp521-ppc64.S       |  354 ++
 sys/crypto/openssl/powerpc64le/keccak1600-ppc64.S  |   32 +-
 sys/crypto/openssl/powerpc64le/poly1305-ppc.S      | 1002 ++-
 sys/crypto/openssl/powerpc64le/vpaes-ppc.S         |   14 +-
 99 files changed, 37489 insertions(+), 1910 deletions(-)

diff --git a/sys/crypto/openssl/aarch64/aes-gcm-armv8_64.S b/sys/crypto/openssl/aarch64/aes-gcm-armv8_64.S
new file mode 100644
index 000000000000..eb85dbc9f996
--- /dev/null
+++ b/sys/crypto/openssl/aarch64/aes-gcm-armv8_64.S
@@ -0,0 +1,6390 @@
+/* Do not modify. This file is auto-generated from aes-gcm-armv8_64.pl. */
+#include "arm_arch.h"
+
+#if __ARM_MAX_ARCH__>=8
+.arch  armv8-a+crypto
+.text
+.globl aes_gcm_enc_128_kernel
+.type  aes_gcm_enc_128_kernel,%function
+.align 4
+aes_gcm_enc_128_kernel:
+       cbz     x1, .L128_enc_ret
+       stp     x19, x20, [sp, #-112]!
+       mov     x16, x4
+       mov     x8, x5
+       stp     x21, x22, [sp, #16]
+       stp     x23, x24, [sp, #32]
+       stp     d8, d9, [sp, #48]
+       stp     d10, d11, [sp, #64]
+       stp     d12, d13, [sp, #80]
+       stp     d14, d15, [sp, #96]
+
+       ldp     x10, x11, [x16]              //ctr96_b64, ctr96_t32
+#ifdef __AARCH64EB__
+       rev     x10, x10
+       rev     x11, x11
+#endif
+       ldp     x13, x14, [x8, #160]                     //load rk10
+#ifdef __AARCH64EB__
+       ror     x13, x13, #32
+       ror     x14, x14, #32
+#endif
+       ld1     {v11.16b}, [x3]
+       ext     v11.16b, v11.16b, v11.16b, #8
+       rev64   v11.16b, v11.16b
+       lsr     x5, x1, #3              //byte_len
+       mov     x15, x5
+
+       ld1     {v18.4s}, [x8], #16                                             
                  //load rk0
+       add     x4, x0, x1, lsr #3   //end_input_ptr
+       sub     x5, x5, #1      //byte_len - 1
+
+       lsr     x12, x11, #32
+       ldr     q15, [x3, #112]                        //load h4l | h4h
+#ifndef __AARCH64EB__
+       ext     v15.16b, v15.16b, v15.16b, #8
+#endif
+       fmov    d1, x10                               //CTR block 1
+       rev     w12, w12                                //rev_ctr32
+
+       add     w12, w12, #1                            //increment rev_ctr32
+       orr     w11, w11, w11
+       ld1     {v19.4s}, [x8], #16                                             
                  //load rk1
+
+       rev     w9, w12                                 //CTR block 1
+       add     w12, w12, #1                            //CTR block 1
+       fmov    d3, x10                               //CTR block 3
+
+       orr     x9, x11, x9, lsl #32            //CTR block 1
+       ld1     { v0.16b}, [x16]                             //special case 
vector load initial counter so we can start first AES block as quickly as 
possible
+
+       fmov    v1.d[1], x9                               //CTR block 1
+       rev     w9, w12                                 //CTR block 2
+
+       fmov    d2, x10                               //CTR block 2
+       orr     x9, x11, x9, lsl #32            //CTR block 2
+       add     w12, w12, #1                            //CTR block 2
+
+       fmov    v2.d[1], x9                               //CTR block 2
+       rev     w9, w12                                 //CTR block 3
+
+       orr     x9, x11, x9, lsl #32            //CTR block 3
+       ld1     {v20.4s}, [x8], #16                                             
                  //load rk2
+
+       add     w12, w12, #1                            //CTR block 3
+       fmov    v3.d[1], x9                               //CTR block 3
+
+       ldr     q14, [x3, #80]                         //load h3l | h3h
+#ifndef __AARCH64EB__
+       ext     v14.16b, v14.16b, v14.16b, #8
+#endif
+       aese    v1.16b, v18.16b
+       aesmc   v1.16b, v1.16b          //AES block 1 - round 0
+       ld1     {v21.4s}, [x8], #16                                             
                  //load rk3
+
+       aese    v2.16b, v18.16b
+       aesmc   v2.16b, v2.16b          //AES block 2 - round 0
+       ldr     q12, [x3, #32]                         //load h1l | h1h
+#ifndef __AARCH64EB__
+       ext     v12.16b, v12.16b, v12.16b, #8
+#endif
+
+       aese    v0.16b, v18.16b
+       aesmc   v0.16b, v0.16b          //AES block 0 - round 0
+       ld1     {v22.4s}, [x8], #16                                             
                  //load rk4
+
+       aese    v3.16b, v18.16b
+       aesmc   v3.16b, v3.16b          //AES block 3 - round 0
+       ld1     {v23.4s}, [x8], #16                                             
                  //load rk5
+
+       aese    v2.16b, v19.16b
+       aesmc   v2.16b, v2.16b          //AES block 2 - round 1
+       trn2    v17.2d,  v14.2d,    v15.2d                      //h4l | h3l
+
+       aese    v0.16b, v19.16b
+       aesmc   v0.16b, v0.16b          //AES block 0 - round 1
+       ld1     {v24.4s}, [x8], #16                                             
                  //load rk6
+
+       aese    v1.16b, v19.16b
+       aesmc   v1.16b, v1.16b          //AES block 1 - round 1
+       ld1     {v25.4s}, [x8], #16                                             
                  //load rk7
+
+       aese    v3.16b, v19.16b
+       aesmc   v3.16b, v3.16b          //AES block 3 - round 1
+       trn1    v9.2d, v14.2d,    v15.2d                      //h4h | h3h
+
+       aese    v0.16b, v20.16b
+       aesmc   v0.16b, v0.16b          //AES block 0 - round 2
+       ld1     {v26.4s}, [x8], #16                                             
                  //load rk8
+
+       aese    v1.16b, v20.16b
+       aesmc   v1.16b, v1.16b          //AES block 1 - round 2
+       ldr     q13, [x3, #64]                         //load h2l | h2h
+#ifndef __AARCH64EB__
+       ext     v13.16b, v13.16b, v13.16b, #8
+#endif
+
+       aese    v3.16b, v20.16b
+       aesmc   v3.16b, v3.16b          //AES block 3 - round 2
+
+       aese    v2.16b, v20.16b
+       aesmc   v2.16b, v2.16b          //AES block 2 - round 2
+       eor     v17.16b, v17.16b, v9.16b                  //h4k | h3k
+
+       aese    v0.16b, v21.16b
+       aesmc   v0.16b, v0.16b          //AES block 0 - round 3
+
+       aese    v1.16b, v21.16b
+       aesmc   v1.16b, v1.16b          //AES block 1 - round 3
+
+       aese    v2.16b, v21.16b
+       aesmc   v2.16b, v2.16b          //AES block 2 - round 3
+       ld1     {v27.4s}, [x8], #16                                             
                  //load rk9
+
+       aese    v3.16b, v21.16b
+       aesmc   v3.16b, v3.16b          //AES block 3 - round 3
+
+       and     x5, x5, #0xffffffffffffffc0    //number of bytes to be 
processed in main loop (at least 1 byte must be handled by tail)
+       trn2    v16.2d,  v12.2d,    v13.2d                      //h2l | h1l
+
+       aese    v3.16b, v22.16b
+       aesmc   v3.16b, v3.16b          //AES block 3 - round 4
+       add     x5, x5, x0
+
+       aese    v2.16b, v22.16b
+       aesmc   v2.16b, v2.16b          //AES block 2 - round 4
+       cmp     x0, x5                   //check if we have <= 4 blocks
+
+       aese    v0.16b, v22.16b
+       aesmc   v0.16b, v0.16b          //AES block 0 - round 4
+
+       aese    v3.16b, v23.16b
+       aesmc   v3.16b, v3.16b          //AES block 3 - round 5
+
+       aese    v2.16b, v23.16b
+       aesmc   v2.16b, v2.16b          //AES block 2 - round 5
+
+       aese    v0.16b, v23.16b
+       aesmc   v0.16b, v0.16b          //AES block 0 - round 5
+
+       aese    v3.16b, v24.16b
+       aesmc   v3.16b, v3.16b          //AES block 3 - round 6
+
+       aese    v1.16b, v22.16b
+       aesmc   v1.16b, v1.16b          //AES block 1 - round 4
+
+       aese    v2.16b, v24.16b
+       aesmc   v2.16b, v2.16b          //AES block 2 - round 6
+       trn1    v8.2d,    v12.2d,    v13.2d                      //h2h | h1h
+
+       aese    v0.16b, v24.16b
+       aesmc   v0.16b, v0.16b          //AES block 0 - round 6
+
+       aese    v1.16b, v23.16b
+       aesmc   v1.16b, v1.16b          //AES block 1 - round 5
+
+       aese    v3.16b, v25.16b
+       aesmc   v3.16b, v3.16b          //AES block 3 - round 7
+
+       aese    v0.16b, v25.16b
+       aesmc   v0.16b, v0.16b          //AES block 0 - round 7
+
+       aese    v1.16b, v24.16b
+       aesmc   v1.16b, v1.16b          //AES block 1 - round 6
+
+       aese    v2.16b, v25.16b
+       aesmc   v2.16b, v2.16b          //AES block 2 - round 7
+
+       aese    v0.16b, v26.16b
+       aesmc   v0.16b, v0.16b          //AES block 0 - round 8
+
+       aese    v1.16b, v25.16b
+       aesmc   v1.16b, v1.16b          //AES block 1 - round 7
+
+       aese    v2.16b, v26.16b
+       aesmc   v2.16b, v2.16b          //AES block 2 - round 8
+
+       aese    v3.16b, v26.16b
+       aesmc   v3.16b, v3.16b          //AES block 3 - round 8
+
+       aese    v1.16b, v26.16b
+       aesmc   v1.16b, v1.16b          //AES block 1 - round 8
+
+       aese    v2.16b, v27.16b                                      //AES 
block 2 - round 9
+
+       aese    v0.16b, v27.16b                                      //AES 
block 0 - round 9
+
+       eor     v16.16b, v16.16b, v8.16b                     //h2k | h1k
+
+       aese    v1.16b, v27.16b                                      //AES 
block 1 - round 9
+
+       aese    v3.16b, v27.16b                                      //AES 
block 3 - round 9
+       b.ge    .L128_enc_tail                                    //handle tail
+
+       ldp     x6, x7, [x0, #0]            //AES block 0 - load plaintext
+#ifdef __AARCH64EB__
+       rev     x6, x6
+       rev     x7, x7
+#endif
+       ldp     x21, x22, [x0, #32]           //AES block 2 - load plaintext
+#ifdef __AARCH64EB__
+       rev     x21, x21
+       rev     x22, x22
+#endif
+       ldp     x19, x20, [x0, #16]           //AES block 1 - load plaintext
+#ifdef __AARCH64EB__
+       rev     x19, x19
+       rev     x20, x20
+#endif
+       ldp     x23, x24, [x0, #48]           //AES block 3 - load plaintext
+#ifdef __AARCH64EB__
+       rev     x23, x23
+       rev     x24, x24
+#endif
+       eor     x6, x6, x13                     //AES block 0 - round 10 low
+       eor     x7, x7, x14                     //AES block 0 - round 10 high
+
+       eor     x21, x21, x13                     //AES block 2 - round 10 low
+       fmov    d4, x6                               //AES block 0 - mov low
+
+       eor     x19, x19, x13                     //AES block 1 - round 10 low
+       eor     x22, x22, x14                     //AES block 2 - round 10 high
+       fmov    v4.d[1], x7                           //AES block 0 - mov high
+
+       fmov    d5, x19                               //AES block 1 - mov low
+       eor     x20, x20, x14                     //AES block 1 - round 10 high
+
+       eor     x23, x23, x13                     //AES block 3 - round 10 low
+       fmov    v5.d[1], x20                           //AES block 1 - mov high
+
+       fmov    d6, x21                               //AES block 2 - mov low
+       eor     x24, x24, x14                     //AES block 3 - round 10 high
+       rev     w9, w12                                 //CTR block 4
+
+       fmov    v6.d[1], x22                           //AES block 2 - mov high
+       orr     x9, x11, x9, lsl #32            //CTR block 4
+
+       eor     v4.16b, v4.16b, v0.16b                          //AES block 0 - 
result
+       fmov    d0, x10                               //CTR block 4
+       add     w12, w12, #1                            //CTR block 4
+
+       fmov    v0.d[1], x9                               //CTR block 4
+       rev     w9, w12                                 //CTR block 5
+
+       eor     v5.16b, v5.16b, v1.16b                          //AES block 1 - 
result
+       fmov    d1, x10                               //CTR block 5
+       orr     x9, x11, x9, lsl #32            //CTR block 5
+
+       add     w12, w12, #1                            //CTR block 5
+       add     x0, x0, #64                       //AES input_ptr update
+       fmov    v1.d[1], x9                               //CTR block 5
+
+       fmov    d7, x23                               //AES block 3 - mov low
+       rev     w9, w12                                 //CTR block 6
+       st1     { v4.16b}, [x2], #16                     //AES block 0 - store 
result
+
+       fmov    v7.d[1], x24                           //AES block 3 - mov high
+       orr     x9, x11, x9, lsl #32            //CTR block 6
+
+       add     w12, w12, #1                            //CTR block 6
+       eor     v6.16b, v6.16b, v2.16b                          //AES block 2 - 
result
+       st1     { v5.16b}, [x2], #16                     //AES block 1 - store 
result
+
+       fmov    d2, x10                               //CTR block 6
+       cmp     x0, x5                   //check if we have <= 8 blocks
+
+       fmov    v2.d[1], x9                               //CTR block 6
+       rev     w9, w12                                 //CTR block 7
+       st1     { v6.16b}, [x2], #16                     //AES block 2 - store 
result
+
+       orr     x9, x11, x9, lsl #32            //CTR block 7
+
+       eor     v7.16b, v7.16b, v3.16b                          //AES block 3 - 
result
+       st1     { v7.16b}, [x2], #16                     //AES block 3 - store 
result
+       b.ge    .L128_enc_prepretail                              //do 
prepretail
+
+.L128_enc_main_loop:   //main  loop start
+       ldp     x23, x24, [x0, #48]           //AES block 4k+3 - load plaintext
+#ifdef __AARCH64EB__
+       rev     x23, x23
+       rev     x24, x24
+#endif
+       rev64   v4.16b, v4.16b                                    //GHASH block 
4k (only t0 is free)
+       rev64   v6.16b, v6.16b                                    //GHASH block 
4k+2 (t0, t1, and t2 free)
+
+       aese    v2.16b, v18.16b
+       aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 0
+       fmov    d3, x10                               //CTR block 4k+3
+
+       ext     v11.16b, v11.16b, v11.16b, #8                     //PRE 0
+       rev64   v5.16b, v5.16b                                    //GHASH block 
4k+1 (t0 and t1 free)
+
+       aese    v1.16b, v18.16b
+       aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 0
+       add     w12, w12, #1                            //CTR block 4k+3
+       fmov    v3.d[1], x9                               //CTR block 4k+3
+
+       aese    v0.16b, v18.16b
+       aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 0
+       mov     d31, v6.d[1]                                  //GHASH block 
4k+2 - mid
+
+       aese    v2.16b, v19.16b
+       aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 1
+       mov     d30, v5.d[1]                                  //GHASH block 
4k+1 - mid
+
+       aese    v1.16b, v19.16b
+       aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 1
+       eor     v4.16b, v4.16b, v11.16b                           //PRE 1
+
+       aese    v3.16b, v18.16b
+       aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 0
+       eor     x24, x24, x14                     //AES block 4k+3 - round 10 
high
+
+       pmull2  v28.1q, v5.2d, v14.2d                          //GHASH block 
4k+1 - high
+       eor     v31.8b, v31.8b, v6.8b                          //GHASH block 
4k+2 - mid
+       ldp     x6, x7, [x0, #0]            //AES block 4k+4 - load plaintext
+#ifdef __AARCH64EB__
+       rev     x6, x6
+       rev     x7, x7
+#endif
+       aese    v0.16b, v19.16b
+       aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 1
+       rev     w9, w12                                 //CTR block 4k+8
+
+       eor     v30.8b, v30.8b, v5.8b                          //GHASH block 
4k+1 - mid
+       mov     d8, v4.d[1]                                  //GHASH block 4k - 
mid
+       orr     x9, x11, x9, lsl #32            //CTR block 4k+8
+
+       pmull2  v9.1q, v4.2d, v15.2d                       //GHASH block 4k - 
high
+       add     w12, w12, #1                            //CTR block 4k+8
+       mov     d10, v17.d[1]                               //GHASH block 4k - 
mid
+
+       aese    v0.16b, v20.16b
+       aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 2
+
+       pmull   v11.1q, v4.1d, v15.1d                       //GHASH block 4k - 
low
+       eor     v8.8b, v8.8b, v4.8b                          //GHASH block 4k - 
mid
+
+       aese    v1.16b, v20.16b
+       aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 2
+
+       aese    v0.16b, v21.16b
+       aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 3
+       eor     v9.16b, v9.16b, v28.16b                         //GHASH block 
4k+1 - high
+
+       pmull   v28.1q, v6.1d, v13.1d                          //GHASH block 
4k+2 - low
+
+       pmull   v10.1q, v8.1d, v10.1d                      //GHASH block 4k - 
mid
+       rev64   v7.16b, v7.16b                                    //GHASH block 
4k+3 (t0, t1, t2 and t3 free)
+
+       pmull   v30.1q, v30.1d, v17.1d                          //GHASH block 
4k+1 - mid
+
+       pmull   v29.1q, v5.1d, v14.1d                          //GHASH block 
4k+1 - low
+       ins     v31.d[1], v31.d[0]                                //GHASH block 
4k+2 - mid
+
+       pmull2  v8.1q, v6.2d, v13.2d                          //GHASH block 
4k+2 - high
+       eor     x7, x7, x14                     //AES block 4k+4 - round 10 high
+
+       eor     v10.16b, v10.16b, v30.16b                         //GHASH block 
4k+1 - mid
+       mov     d30, v7.d[1]                                  //GHASH block 
4k+3 - mid
+
+       aese    v3.16b, v19.16b
+       aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 1
+       eor     v11.16b, v11.16b, v29.16b                         //GHASH block 
4k+1 - low
+
+       aese    v2.16b, v20.16b
+       aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 2
+       eor     x6, x6, x13                     //AES block 4k+4 - round 10 low
+
+       aese    v1.16b, v21.16b
+       aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 3
+       eor     v30.8b, v30.8b, v7.8b                          //GHASH block 
4k+3 - mid
+
+       pmull2  v4.1q, v7.2d, v12.2d                          //GHASH block 
4k+3 - high
+
+       aese    v2.16b, v21.16b
+       aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 3
+       eor     v9.16b, v9.16b, v8.16b                         //GHASH block 
4k+2 - high
+
+       pmull2  v31.1q, v31.2d, v16.2d                          //GHASH block 
4k+2 - mid
+
+       pmull   v29.1q, v7.1d, v12.1d                          //GHASH block 
4k+3 - low
+       movi    v8.8b, #0xc2
+
+       pmull   v30.1q, v30.1d, v16.1d                          //GHASH block 
4k+3 - mid
+       eor     v11.16b, v11.16b, v28.16b                         //GHASH block 
4k+2 - low
+
+       aese    v1.16b, v22.16b
+       aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 4
+
+       aese    v3.16b, v20.16b
+       aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 2
+       shl     d8, d8, #56               //mod_constant
+
+       aese    v0.16b, v22.16b
+       aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 4
+       eor     v9.16b, v9.16b, v4.16b                         //GHASH block 
4k+3 - high
+
+       aese    v1.16b, v23.16b
+       aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 5
+       ldp     x19, x20, [x0, #16]           //AES block 4k+5 - load plaintext
+#ifdef __AARCH64EB__
+       rev     x19, x19
+       rev     x20, x20
+#endif
+       aese    v3.16b, v21.16b
+       aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 3
+       eor     v10.16b, v10.16b, v31.16b                         //GHASH block 
4k+2 - mid
+
+       aese    v0.16b, v23.16b
+       aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 5
+       ldp     x21, x22, [x0, #32]           //AES block 4k+6 - load plaintext
+#ifdef __AARCH64EB__
+       rev     x21, x21
+       rev     x22, x22
+#endif
+       pmull   v31.1q, v9.1d, v8.1d            //MODULO - top 64b align with 
mid
+       eor     v11.16b, v11.16b, v29.16b                         //GHASH block 
4k+3 - low
+
+       aese    v2.16b, v22.16b
+       aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 4
+       eor     x19, x19, x13                     //AES block 4k+5 - round 10 
low
+
+       aese    v3.16b, v22.16b
+       aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 4
+       eor     v10.16b, v10.16b, v30.16b                         //GHASH block 
4k+3 - mid
+
+       aese    v1.16b, v24.16b
+       aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 6
+       eor     x23, x23, x13                     //AES block 4k+3 - round 10 
low
+
+       aese    v2.16b, v23.16b
+       aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 5
+       eor     v30.16b, v11.16b, v9.16b                         //MODULO - 
karatsuba tidy up
+
+       fmov    d4, x6                               //AES block 4k+4 - mov low
+       aese    v0.16b, v24.16b
+       aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 6
+       fmov    v4.d[1], x7                           //AES block 4k+4 - mov 
high
+
+       add     x0, x0, #64                       //AES input_ptr update
+       fmov    d7, x23                               //AES block 4k+3 - mov low
+       ext     v9.16b, v9.16b, v9.16b, #8                     //MODULO - other 
top alignment
+
+       aese    v3.16b, v23.16b
+       aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 5
+       fmov    d5, x19                               //AES block 4k+5 - mov low
+
+       aese    v0.16b, v25.16b
+       aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 7
+       eor     v10.16b, v10.16b, v30.16b                         //MODULO - 
karatsuba tidy up
+
+       aese    v2.16b, v24.16b
+       aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 6
+       eor     x20, x20, x14                     //AES block 4k+5 - round 10 
high
+
+       aese    v1.16b, v25.16b
+       aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 7
+       fmov    v5.d[1], x20                           //AES block 4k+5 - mov 
high
+
+       aese    v0.16b, v26.16b
+       aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 8
+       fmov    v7.d[1], x24                           //AES block 4k+3 - mov 
high
+
+       aese    v3.16b, v24.16b
+       aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 6
+       cmp     x0, x5                   //.LOOP CONTROL
+
+       aese    v1.16b, v26.16b
+       aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 8
+       eor     v10.16b, v10.16b, v31.16b                      //MODULO - fold 
into mid
+
+       aese    v0.16b, v27.16b                                      //AES 
block 4k+4 - round 9
+       eor     x21, x21, x13                     //AES block 4k+6 - round 10 
low
+       eor     x22, x22, x14                     //AES block 4k+6 - round 10 
high
+
+       aese    v3.16b, v25.16b
+       aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 7
+       fmov    d6, x21                               //AES block 4k+6 - mov low
+
+       aese    v1.16b, v27.16b                                      //AES 
block 4k+5 - round 9
+       fmov    v6.d[1], x22                           //AES block 4k+6 - mov 
high
+
+       aese    v2.16b, v25.16b
+       aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 7
+       eor     v4.16b, v4.16b, v0.16b                          //AES block 
4k+4 - result
+
+       fmov    d0, x10                               //CTR block 4k+8
+       aese    v3.16b, v26.16b
+       aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 8
+
+       fmov    v0.d[1], x9                               //CTR block 4k+8
+       rev     w9, w12                                 //CTR block 4k+9
+       eor     v10.16b, v10.16b, v9.16b                         //MODULO - 
fold into mid
+
+       aese    v2.16b, v26.16b
+       aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 8
+       eor     v5.16b, v5.16b, v1.16b                          //AES block 
4k+5 - result
+
+       add     w12, w12, #1                            //CTR block 4k+9
+       orr     x9, x11, x9, lsl #32            //CTR block 4k+9
+       fmov    d1, x10                               //CTR block 4k+9
+
+       pmull   v9.1q, v10.1d, v8.1d            //MODULO - mid 64b align with 
low
+       fmov    v1.d[1], x9                               //CTR block 4k+9
+       rev     w9, w12                                 //CTR block 4k+10
+
+       aese    v2.16b, v27.16b                                      //AES 
block 4k+6 - round 9
+       st1     { v4.16b}, [x2], #16                     //AES block 4k+4 - 
store result
+       eor     v6.16b, v6.16b, v2.16b                          //AES block 
4k+6 - result
+       orr     x9, x11, x9, lsl #32            //CTR block 4k+10
+
+       aese    v3.16b, v27.16b                                      //AES 
block 4k+7 - round 9
+       add     w12, w12, #1                            //CTR block 4k+10
+       ext     v10.16b, v10.16b, v10.16b, #8                     //MODULO - 
other mid alignment
+       fmov    d2, x10                               //CTR block 4k+10
+
+       eor     v11.16b, v11.16b, v9.16b                         //MODULO - 
fold into low
+       st1     { v5.16b}, [x2], #16                     //AES block 4k+5 - 
store result
+
+       fmov    v2.d[1], x9                               //CTR block 4k+10
+       st1     { v6.16b}, [x2], #16                     //AES block 4k+6 - 
store result
+       rev     w9, w12                                 //CTR block 4k+11
+
+       orr     x9, x11, x9, lsl #32            //CTR block 4k+11
+       eor     v7.16b, v7.16b, v3.16b                          //AES block 
4k+3 - result
+
+       eor     v11.16b, v11.16b, v10.16b                         //MODULO - 
fold into low
+       st1     { v7.16b}, [x2], #16                     //AES block 4k+3 - 
store result
+       b.lt    .L128_enc_main_loop
+
+.L128_enc_prepretail:  //PREPRETAIL - no loads or stores in this section: AES rounds 0-9 run on v0-v3 while v4-v7 are folded into the GHASH accumulators v9 (high), v10 (mid), v11 (low) and reduced into v11; control then falls through to .L128_enc_tail
+       rev64   v4.16b, v4.16b                                    //GHASH block 
4k (only t0 is free)
+       fmov    d3, x10                               //CTR block 4k+3
+       rev64   v5.16b, v5.16b                                    //GHASH block 
4k+1 (t0 and t1 free)
+
+       ext     v11.16b, v11.16b, v11.16b, #8                     //PRE 0
+       add     w12, w12, #1                            //CTR block 4k+3
+       fmov    v3.d[1], x9                               //CTR block 4k+3
+
+       aese    v1.16b, v18.16b
+       aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 0
+       rev64   v6.16b, v6.16b                                    //GHASH block 
4k+2 (t0, t1, and t2 free)
+
+       pmull   v29.1q, v5.1d, v14.1d                          //GHASH block 
4k+1 - low
+
+       rev64   v7.16b, v7.16b                                    //GHASH block 
4k+3 (t0, t1, t2 and t3 free)
+       eor     v4.16b, v4.16b, v11.16b                           //PRE 1
+
+       pmull2  v28.1q, v5.2d, v14.2d                          //GHASH block 
4k+1 - high
+
+       aese    v3.16b, v18.16b
+       aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 0
+       mov     d30, v5.d[1]                                  //GHASH block 
4k+1 - mid
+
+       pmull   v11.1q, v4.1d, v15.1d                       //GHASH block 4k - 
low
+       mov     d8, v4.d[1]                                  //GHASH block 4k - 
mid
+
+       mov     d31, v6.d[1]                                  //GHASH block 
4k+2 - mid
+       mov     d10, v17.d[1]                               //GHASH block 4k - 
mid
+
+       aese    v1.16b, v19.16b
+       aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 1
+       eor     v30.8b, v30.8b, v5.8b                          //GHASH block 
4k+1 - mid
+
+       eor     v8.8b, v8.8b, v4.8b                          //GHASH block 4k - 
mid
+
+       pmull2  v9.1q, v4.2d, v15.2d                       //GHASH block 4k - 
high
+       eor     v31.8b, v31.8b, v6.8b                          //GHASH block 
4k+2 - mid
+
+       aese    v3.16b, v19.16b
+       aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 1
+
+       pmull   v30.1q, v30.1d, v17.1d                          //GHASH block 
4k+1 - mid
+       eor     v11.16b, v11.16b, v29.16b                         //GHASH block 
4k+1 - low
+
+       pmull   v10.1q, v8.1d, v10.1d                      //GHASH block 4k - 
mid
+
+       aese    v0.16b, v18.16b
+       aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 0
+       ins     v31.d[1], v31.d[0]                                //GHASH block 
4k+2 - mid
+
+       aese    v2.16b, v18.16b
+       aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 0
+
+       eor     v10.16b, v10.16b, v30.16b                         //GHASH block 
4k+1 - mid
+       mov     d30, v7.d[1]                                  //GHASH block 
4k+3 - mid
+
+       aese    v0.16b, v19.16b
+       aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 1
+       eor     v9.16b, v9.16b, v28.16b                         //GHASH block 
4k+1 - high
+
+       pmull2  v31.1q, v31.2d, v16.2d                          //GHASH block 
4k+2 - mid
+
+       pmull2  v8.1q, v6.2d, v13.2d                          //GHASH block 
4k+2 - high
+       eor     v30.8b, v30.8b, v7.8b                          //GHASH block 
4k+3 - mid
+
+       pmull2  v4.1q, v7.2d, v12.2d                          //GHASH block 
4k+3 - high
+
+       pmull   v28.1q, v6.1d, v13.1d                          //GHASH block 
4k+2 - low
+
+       aese    v2.16b, v19.16b
+       aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 1
+       eor     v9.16b, v9.16b, v8.16b                         //GHASH block 
4k+2 - high
+
+       aese    v0.16b, v20.16b
+       aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 2
+
+       pmull   v29.1q, v7.1d, v12.1d                          //GHASH block 
4k+3 - low
+       movi    v8.8b, #0xc2               //mod_constant - 0xc2 in each of the low 8 bytes, shifted into place by the shl below
+
+       aese    v2.16b, v20.16b
+       aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 2
+       eor     v11.16b, v11.16b, v28.16b                         //GHASH block 
4k+2 - low
+
+       aese    v3.16b, v20.16b
+       aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 2
+
+       pmull   v30.1q, v30.1d, v16.1d                          //GHASH block 
4k+3 - mid
+       eor     v10.16b, v10.16b, v31.16b                         //GHASH block 
4k+2 - mid
+
+       aese    v2.16b, v21.16b
+       aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 3
+
+       aese    v1.16b, v20.16b
+       aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 2
+       eor     v9.16b, v9.16b, v4.16b                         //GHASH block 
4k+3 - high
+
+       aese    v0.16b, v21.16b
+       aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 3
+
+       eor     v10.16b, v10.16b, v30.16b                         //GHASH block 
4k+3 - mid
+       shl     d8, d8, #56               //mod_constant - d8 = 0xc200000000000000
+
+       aese    v1.16b, v21.16b
+       aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 3
+       eor     v11.16b, v11.16b, v29.16b                         //GHASH block 
4k+3 - low
+
+       aese    v0.16b, v22.16b
+       aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 4
+
+       pmull   v28.1q, v9.1d, v8.1d            //MODULO - low 64b of high (v9) times mod_constant
+       eor     v10.16b, v10.16b, v9.16b                         //karatsuba 
tidy up
+
+       aese    v1.16b, v22.16b
+       aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 4
+
+       aese    v0.16b, v23.16b
+       aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 5
+       ext     v9.16b, v9.16b, v9.16b, #8            //MODULO - swap the 64-bit halves of high
+
+       aese    v3.16b, v21.16b
+       aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 3
+
+       aese    v2.16b, v22.16b
+       aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 4
+       eor     v10.16b, v10.16b, v11.16b            //karatsuba tidy up - mid also takes low
+
+       aese    v0.16b, v24.16b
+       aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 6
+
+       aese    v3.16b, v22.16b
+       aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 4
+
+       aese    v1.16b, v23.16b
+       aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 5
+
+       aese    v2.16b, v23.16b
+       aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 5
+       eor     v10.16b, v10.16b, v28.16b            //MODULO - fold into mid
+
+       aese    v3.16b, v23.16b
+       aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 5
+
+       aese    v1.16b, v24.16b
+       aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 6
+
+       aese    v2.16b, v24.16b
+       aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 6
+
+       aese    v3.16b, v24.16b
+       aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 6
+       eor     v10.16b, v10.16b, v9.16b            //MODULO - fold into mid
+
+       aese    v0.16b, v25.16b
+       aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 7
+
+       aese    v2.16b, v25.16b
+       aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 7
+
+       aese    v3.16b, v25.16b
+       aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 7
+
+       pmull   v28.1q, v10.1d, v8.1d            //MODULO - mid 64b align with low
+
+       aese    v1.16b, v25.16b
+       aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 7
+       ext     v10.16b, v10.16b, v10.16b, #8            //MODULO - other mid alignment
+
+       aese    v3.16b, v26.16b
+       aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 8
+
+       aese    v0.16b, v26.16b
+       aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 8
+       eor     v11.16b, v11.16b, v28.16b            //MODULO - fold into low
+
+       aese    v1.16b, v26.16b
+       aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 8
+
+       aese    v3.16b, v27.16b                                      //AES 
block 4k+7 - round 9
+
+       aese    v2.16b, v26.16b
+       aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 8
+
+       aese    v0.16b, v27.16b                                      //AES 
block 4k+4 - round 9
+
+       aese    v1.16b, v27.16b                                      //AES 
block 4k+5 - round 9
+       eor     v11.16b, v11.16b, v10.16b            //MODULO - fold into low
+
+       aese    v2.16b, v27.16b                                      //AES 
block 4k+6 - round 9
+.L128_enc_tail:        //TAIL - x5 = x4 - x0 bytes remain; the first remaining block is encrypted with v0, then the code branches on more than 48, 32 or 16 bytes left
+
+       sub     x5, x4, x0   //main_end_input_ptr is number of bytes left to 
process
+       ldp     x6, x7, [x0], #16           //AES block 4k+4 - load plaintext
+#ifdef __AARCH64EB__
+       rev     x6, x6
+       rev     x7, x7
+#endif
+       cmp     x5, #48            //more than 3 blocks left? consumed by the b.gt further down
+
+       ext     v8.16b, v11.16b, v11.16b, #8                     //prepare 
final partial tag
+       eor     x6, x6, x13                     //AES block 4k+4 - round 10 low
+       eor     x7, x7, x14                     //AES block 4k+4 - round 10 high
+
+       fmov    d4, x6                               //AES block 4k+4 - mov low
+
+       fmov    v4.d[1], x7                           //AES block 4k+4 - mov 
high
+
+       eor     v5.16b, v4.16b, v0.16b                          //AES block 
4k+4 - result
+
+       b.gt    .L128_enc_blocks_more_than_3            //flags still come from cmp x5, #48 - the eor and fmov in between do not set flags
+
+       sub     w12, w12, #1            //step the counter in w12 back by one - presumably for a counter block that will not be consumed (TODO confirm)
+       movi    v11.8b, #0            //GHASH low accumulator starts at zero when the final-3 step is skipped
+       mov     v3.16b, v2.16b            //shift AES states down one slot: v3 = v2
+
+       cmp     x5, #32            //more than 2 blocks left?
+       mov     v2.16b, v1.16b            //v2 = v1 - the final-1 step XORs its input with v2
+       movi    v9.8b, #0            //GHASH high accumulator starts at zero
+
+       movi    v10.8b, #0            //GHASH mid accumulator starts at zero
+       b.gt    .L128_enc_blocks_more_than_2
+
+       mov     v3.16b, v1.16b            //v3 = v1 - presumably the state used by the last full block step, which is not visible here (TODO confirm)
+       cmp     x5, #16            //more than 1 block left?
+
+       sub     w12, w12, #1            //step the counter back once more
+       b.gt    .L128_enc_blocks_more_than_1
+
+       sub     w12, w12, #1            //and once more when 16 bytes or fewer remain
+       b       .L128_enc_blocks_less_than_1
+.L128_enc_blocks_more_than_3:  //blocks        left >  3 - store the final-3 result, start the GHASH accumulators v11 (low), v9 (high), v10 (mid) from it, and XOR the next input block with AES state v1; falls through to .L128_enc_blocks_more_than_2
+       st1     { v5.16b}, [x2], #16                     //AES final-3 block  - 
store result
+
+       ldp     x6, x7, [x0], #16           //AES final-2 block - load input 
low & high
+#ifdef __AARCH64EB__
+       rev     x6, x6
+       rev     x7, x7
+#endif
+       rev64   v4.16b, v5.16b                                    //GHASH 
final-3 block
+
+       eor     v4.16b, v4.16b, v8.16b                           //feed in 
partial tag
+       eor     x7, x7, x14                     //AES final-2 block - round 10 
high
+       eor     x6, x6, x13                     //AES final-2 block - round 10 
low
+
+       fmov    d5, x6                                 //AES final-2 block - 
mov low
+
+       movi    v8.8b, #0                                        //suppress 
further partial tag feed in
+       fmov    v5.d[1], x7                             //AES final-2 block - 
mov high
+       //NOTE: v22 is overwritten as GHASH scratch below; in the prepretail code it is the key operand of the round 4 aese instructions
+       pmull   v11.1q, v4.1d, v15.1d                       //GHASH final-3 
block - low
+       mov     d22, v4.d[1]                                 //GHASH final-3 
block - mid
+
+       pmull2  v9.1q, v4.2d, v15.2d                       //GHASH final-3 
block - high
+
+       mov     d10, v17.d[1]                               //GHASH final-3 
block - mid
+
+       eor     v5.16b, v5.16b, v1.16b                            //AES final-2 
block - result
+       eor     v22.8b, v22.8b, v4.8b                      //GHASH final-3 
block - mid
+
+       pmull   v10.1q, v22.1d, v10.1d                    //GHASH final-3 block 
- mid
+.L128_enc_blocks_more_than_2:  //blocks        left >  2 - store the final-2 result, add its GHASH products into v9 (high), v10 (mid), v11 (low), and XOR the next input block with AES state v2; falls through to .L128_enc_blocks_more_than_1
+
+       st1     { v5.16b}, [x2], #16                     //AES final-2 block - 
store result
+
+       rev64   v4.16b, v5.16b                                    //GHASH 
final-2 block
+       ldp     x6, x7, [x0], #16           //AES final-1 block - load input 
low & high
+#ifdef __AARCH64EB__
+       rev     x6, x6
+       rev     x7, x7
+#endif
+       eor     v4.16b, v4.16b, v8.16b                           //feed in 
partial tag
+
+       eor     x6, x6, x13                     //AES final-1 block - round 10 
low
+
+       fmov    d5, x6                                 //AES final-1 block - 
mov low
+       eor     x7, x7, x14                     //AES final-1 block - round 10 
high
+       //NOTE: v20, v21 and v22 are overwritten as GHASH scratch below; in the prepretail code they are the key operands of the round 2, 3 and 4 aese instructions
+       pmull2  v20.1q, v4.2d, v14.2d                          //GHASH final-2 
block - high
+       fmov    v5.d[1], x7                             //AES final-1 block - 
mov high
+
+       mov     d22, v4.d[1]                                 //GHASH final-2 
block - mid
+
+       pmull   v21.1q, v4.1d, v14.1d                          //GHASH final-2 
block - low
+
+       eor     v9.16b, v9.16b, v20.16b                            //GHASH 
final-2 block - high
+
+       eor     v22.8b, v22.8b, v4.8b                      //GHASH final-2 
block - mid
+
+       eor     v5.16b, v5.16b, v2.16b                            //AES final-1 
block - result
+
+       eor     v11.16b, v11.16b, v21.16b                            //GHASH 
final-2 block - low
+
+       pmull   v22.1q, v22.1d, v17.1d                      //GHASH final-2 
block - mid
+
+       movi    v8.8b, #0                                        //suppress 
further partial tag feed in
+
+       eor     v10.16b, v10.16b, v22.16b                       //GHASH final-2 
block - mid
+.L128_enc_blocks_more_than_1:  //blocks        left >  1
+
+       st1     { v5.16b}, [x2], #16                     //AES final-1 block - 
store result
+
+       rev64   v4.16b, v5.16b                                    //GHASH 
final-1 block
+       ldp     x6, x7, [x0], #16           //AES final block - load input low 
& high
*** 45312 LINES SKIPPED ***

Reply via email to