[PATCH 2/3 v2] RFC4106 AES-GCM Driver Using Intel New Instructions - fixed build with binutils 2.16

tadeusz . struk Fri, 10 Dec 2010 04:34:21 -0800

>From [email protected] Mon Sep 17 00:00:00 2001
From: root <[email protected]>
Date: Fri, 10 Dec 2010 12:10:57 +0000
Subject: [PATCH 2/3 v2] RFC4106 AES-GCM Driver Using Intel New Instructions - 
fixed build with binutils 2.16


Hi Herbert,
This patch fixes the build failure with binutils 2.16.
Regards,
Tadeusz

Signed-off-by: Aidan O'Mahony <[email protected]>
Signed-off-by: Adrian Hoban <[email protected]>
Signed-off-by: Gabriele Paoloni <[email protected]>
Signed-off-by: Tadeusz Struk <[email protected]>
---
 arch/x86/crypto/aesni-intel_asm.S |  598 ++++++++++++++++++++++++++++++++-----
 1 files changed, 519 insertions(+), 79 deletions(-)

diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index d528fde..8fe2a49 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -204,9 +204,9 @@ enc:        .octa 0x2
 * arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
 */
 
-.macro INITIAL_BLOCKS num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
-XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
 
+.macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
+XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
        mov        arg7, %r10           # %r10 = AAD
        mov        arg8, %r12           # %r12 = aadLen
        mov        %r12, %r11
@@ -228,19 +228,25 @@ _get_AAD_loop2\num_initial_blocks\operation:
        cmp        %r11, %r12
        jne        _get_AAD_loop2\num_initial_blocks\operation
 _get_AAD_loop2_done\num_initial_blocks\operation:
-       pshufb     SHUF_MASK(%rip), %xmm\i # byte-reflect the AAD data
+        movdqa     SHUF_MASK(%rip), %xmm14
+       PSHUFB_XMM   %xmm14, %xmm\i # byte-reflect the AAD data
+
        xor        %r11, %r11 # initialise the data pointer offset as zero
 
         # start AES for num_initial_blocks blocks
 
        mov        %arg5, %rax                      # %rax = *Y0
        movdqu     (%rax), \XMM0                    # XMM0 = Y0
-       pshufb     SHUF_MASK(%rip), \XMM0
-.if \i_seq != 0
+        movdqa     SHUF_MASK(%rip), %xmm14
+       PSHUFB_XMM   %xmm14, \XMM0
+
+.if (\i == 5) || (\i == 6) || (\i == 7)
 .irpc index, \i_seq
        paddd      ONE(%rip), \XMM0                 # INCR Y0
        movdqa     \XMM0, %xmm\index
-       pshufb     SHUF_MASK(%rip), %xmm\index      # perform a 16 byte swap
+        movdqa     SHUF_MASK(%rip), %xmm14
+       PSHUFB_XMM   %xmm14, %xmm\index      # perform a 16 byte swap
+
 .endr
 .irpc index, \i_seq
        pxor       16*0(%arg1), %xmm\index
@@ -291,10 +297,11 @@ _get_AAD_loop2_done\num_initial_blocks\operation:
        movdqu     %xmm\index, (%arg2 , %r11, 1)
        # write back plaintext/ciphertext for num_initial_blocks
        add        $16, %r11
-.if \operation == dec
+
        movdqa     \TMP1, %xmm\index
-.endif
-       pshufb     SHUF_MASK(%rip), %xmm\index
+        movdqa     SHUF_MASK(%rip), %xmm14
+       PSHUFB_XMM         %xmm14, %xmm\index
+
                # prepare plaintext/ciphertext for GHASH computation
 .endr
 .endif
@@ -327,16 +334,24 @@ _get_AAD_loop2_done\num_initial_blocks\operation:
 */
        paddd      ONE(%rip), \XMM0              # INCR Y0
        movdqa     \XMM0, \XMM1
-       pshufb     SHUF_MASK(%rip), \XMM1        # perform a 16 byte swap
+        movdqa     SHUF_MASK(%rip), %xmm14
+       PSHUFB_XMM  %xmm14, \XMM1        # perform a 16 byte swap
+
        paddd      ONE(%rip), \XMM0              # INCR Y0
        movdqa     \XMM0, \XMM2
-       pshufb     SHUF_MASK(%rip), \XMM2        # perform a 16 byte swap
+        movdqa     SHUF_MASK(%rip), %xmm14
+       PSHUFB_XMM  %xmm14, \XMM2        # perform a 16 byte swap
+
        paddd      ONE(%rip), \XMM0              # INCR Y0
        movdqa     \XMM0, \XMM3
-       pshufb     SHUF_MASK(%rip), \XMM3        # perform a 16 byte swap
+        movdqa     SHUF_MASK(%rip), %xmm14
+       PSHUFB_XMM %xmm14, \XMM3        # perform a 16 byte swap
+
        paddd      ONE(%rip), \XMM0              # INCR Y0
        movdqa     \XMM0, \XMM4
-       pshufb     SHUF_MASK(%rip), \XMM4        # perform a 16 byte swap
+        movdqa     SHUF_MASK(%rip), %xmm14
+       PSHUFB_XMM %xmm14, \XMM4        # perform a 16 byte swap
+
        pxor       16*0(%arg1), \XMM1
        pxor       16*0(%arg1), \XMM2
        pxor       16*0(%arg1), \XMM3
@@ -385,41 +400,268 @@ _get_AAD_loop2_done\num_initial_blocks\operation:
        AESENCLAST \TMP2, \XMM4
        movdqu     16*0(%arg3 , %r11 , 1), \TMP1
        pxor       \TMP1, \XMM1
-.if \operation == dec
        movdqu     \XMM1, 16*0(%arg2 , %r11 , 1)
        movdqa     \TMP1, \XMM1
-.endif
        movdqu     16*1(%arg3 , %r11 , 1), \TMP1
        pxor       \TMP1, \XMM2
-.if \operation == dec
        movdqu     \XMM2, 16*1(%arg2 , %r11 , 1)
        movdqa     \TMP1, \XMM2
-.endif
        movdqu     16*2(%arg3 , %r11 , 1), \TMP1
        pxor       \TMP1, \XMM3
-.if \operation == dec
        movdqu     \XMM3, 16*2(%arg2 , %r11 , 1)
        movdqa     \TMP1, \XMM3
-.endif
        movdqu     16*3(%arg3 , %r11 , 1), \TMP1
        pxor       \TMP1, \XMM4
-.if \operation == dec
        movdqu     \XMM4, 16*3(%arg2 , %r11 , 1)
        movdqa     \TMP1, \XMM4
-.else
+       add        $64, %r11
+        movdqa     SHUF_MASK(%rip), %xmm14
+       PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
+       pxor       \XMMDst, \XMM1
+# combine GHASHed value with the corresponding ciphertext
+        movdqa     SHUF_MASK(%rip), %xmm14
+       PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
+        movdqa     SHUF_MASK(%rip), %xmm14
+       PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
+        movdqa     SHUF_MASK(%rip), %xmm14
+       PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
+
+_initial_blocks_done\num_initial_blocks\operation:
+
+.endm
+
+
+/*
+* if a = number of total plaintext bytes
+* b = floor(a/16)
+* num_initial_blocks = b mod 4
+* encrypt the initial num_initial_blocks blocks and apply ghash on
+* the ciphertext
+* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 and %xmm14
+* (shuffle-mask scratch) registers are clobbered
+* %arg1, %arg2, %arg3, %r14 are used as pointers only, not modified
+*/
+
+
+.macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
+XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
+       mov        arg7, %r10           # %r10 = AAD
+       mov        arg8, %r12           # %r12 = aadLen
+       mov        %r12, %r11
+       pxor       %xmm\i, %xmm\i
+_get_AAD_loop\num_initial_blocks\operation:
+       movd       (%r10), \TMP1
+       pslldq     $12, \TMP1
+       psrldq     $4, %xmm\i
+       pxor       \TMP1, %xmm\i
+       add        $4, %r10
+       sub        $4, %r12
+       jne        _get_AAD_loop\num_initial_blocks\operation
+       cmp        $16, %r11
+       je         _get_AAD_loop2_done\num_initial_blocks\operation
+       mov        $16, %r12
+_get_AAD_loop2\num_initial_blocks\operation:
+       psrldq     $4, %xmm\i
+       sub        $4, %r12
+       cmp        %r11, %r12
+       jne        _get_AAD_loop2\num_initial_blocks\operation
+_get_AAD_loop2_done\num_initial_blocks\operation:
+        movdqa     SHUF_MASK(%rip), %xmm14
+       PSHUFB_XMM   %xmm14, %xmm\i # byte-reflect the AAD data
+
+       xor        %r11, %r11 # initialise the data pointer offset as zero
+
+        # start AES for num_initial_blocks blocks
+
+       mov        %arg5, %rax                      # %rax = *Y0
+       movdqu     (%rax), \XMM0                    # XMM0 = Y0
+        movdqa     SHUF_MASK(%rip), %xmm14
+       PSHUFB_XMM   %xmm14, \XMM0
+
+.if (\i == 5) || (\i == 6) || (\i == 7)
+.irpc index, \i_seq
+       paddd      ONE(%rip), \XMM0                 # INCR Y0
+       movdqa     \XMM0, %xmm\index
+        movdqa     SHUF_MASK(%rip), %xmm14
+       PSHUFB_XMM   %xmm14, %xmm\index      # perform a 16 byte swap
+
+.endr
+.irpc index, \i_seq
+       pxor       16*0(%arg1), %xmm\index
+.endr
+.irpc index, \i_seq
+       movaps 0x10(%rdi), \TMP1
+       AESENC     \TMP1, %xmm\index          # Round 1
+.endr
+.irpc index, \i_seq
+       movaps 0x20(%arg1), \TMP1
+       AESENC     \TMP1, %xmm\index          # Round 2
+.endr
+.irpc index, \i_seq
+       movaps 0x30(%arg1), \TMP1
+       AESENC     \TMP1, %xmm\index          # Round 3
+.endr
+.irpc index, \i_seq
+       movaps 0x40(%arg1), \TMP1
+       AESENC     \TMP1, %xmm\index          # Round 4
+.endr
+.irpc index, \i_seq
+       movaps 0x50(%arg1), \TMP1
+       AESENC     \TMP1, %xmm\index          # Round 5
+.endr
+.irpc index, \i_seq
+       movaps 0x60(%arg1), \TMP1
+       AESENC     \TMP1, %xmm\index          # Round 6
+.endr
+.irpc index, \i_seq
+       movaps 0x70(%arg1), \TMP1
+       AESENC     \TMP1, %xmm\index          # Round 7
+.endr
+.irpc index, \i_seq
+       movaps 0x80(%arg1), \TMP1
+       AESENC     \TMP1, %xmm\index          # Round 8
+.endr
+.irpc index, \i_seq
+       movaps 0x90(%arg1), \TMP1
+       AESENC     \TMP1, %xmm\index          # Round 9
+.endr
+.irpc index, \i_seq
+       movaps 0xa0(%arg1), \TMP1
+       AESENCLAST \TMP1, %xmm\index         # Round 10
+.endr
+.irpc index, \i_seq
+       movdqu     (%arg3 , %r11, 1), \TMP1
+       pxor       \TMP1, %xmm\index
+       movdqu     %xmm\index, (%arg2 , %r11, 1)
+       # write back plaintext/ciphertext for num_initial_blocks
+       add        $16, %r11
+
+        movdqa     SHUF_MASK(%rip), %xmm14
+       PSHUFB_XMM         %xmm14, %xmm\index
+
+               # prepare plaintext/ciphertext for GHASH computation
+.endr
+.endif
+       GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+        # apply GHASH on num_initial_blocks blocks
+
+.if \i == 5
+        pxor       %xmm5, %xmm6
+       GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+        pxor       %xmm6, %xmm7
+       GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+        pxor       %xmm7, %xmm8
+       GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+.elseif \i == 6
+        pxor       %xmm6, %xmm7
+       GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+        pxor       %xmm7, %xmm8
+       GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+.elseif \i == 7
+        pxor       %xmm7, %xmm8
+       GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+.endif
+       cmp        $64, %r13
+       jl      _initial_blocks_done\num_initial_blocks\operation
+       # no need for precomputed values
+/*
+*
+* Precomputations for HashKey parallel with encryption of first 4 blocks.
+* HashKey_i_k holds XORed values of the low and high parts of HashKey_i
+*/
+       paddd      ONE(%rip), \XMM0              # INCR Y0
+       movdqa     \XMM0, \XMM1
+        movdqa     SHUF_MASK(%rip), %xmm14
+       PSHUFB_XMM  %xmm14, \XMM1        # perform a 16 byte swap
+
+       paddd      ONE(%rip), \XMM0              # INCR Y0
+       movdqa     \XMM0, \XMM2
+        movdqa     SHUF_MASK(%rip), %xmm14
+       PSHUFB_XMM  %xmm14, \XMM2        # perform a 16 byte swap
+
+       paddd      ONE(%rip), \XMM0              # INCR Y0
+       movdqa     \XMM0, \XMM3
+        movdqa     SHUF_MASK(%rip), %xmm14
+       PSHUFB_XMM %xmm14, \XMM3        # perform a 16 byte swap
+
+       paddd      ONE(%rip), \XMM0              # INCR Y0
+       movdqa     \XMM0, \XMM4
+        movdqa     SHUF_MASK(%rip), %xmm14
+       PSHUFB_XMM %xmm14, \XMM4        # perform a 16 byte swap
+
+       pxor       16*0(%arg1), \XMM1
+       pxor       16*0(%arg1), \XMM2
+       pxor       16*0(%arg1), \XMM3
+       pxor       16*0(%arg1), \XMM4
+       movdqa     \TMP3, \TMP5
+       pshufd     $78, \TMP3, \TMP1
+       pxor       \TMP3, \TMP1
+       movdqa     \TMP1, HashKey_k(%rsp)
+       GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
+# TMP5 = HashKey^2<<1 (mod poly)
+       movdqa     \TMP5, HashKey_2(%rsp)
+# HashKey_2 = HashKey^2<<1 (mod poly)
+       pshufd     $78, \TMP5, \TMP1
+       pxor       \TMP5, \TMP1
+       movdqa     \TMP1, HashKey_2_k(%rsp)
+.irpc index, 1234 # do 4 rounds
+       movaps 0x10*\index(%arg1), \TMP1
+       AESENC     \TMP1, \XMM1
+       AESENC     \TMP1, \XMM2
+       AESENC     \TMP1, \XMM3
+       AESENC     \TMP1, \XMM4
+.endr
+       GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
+# TMP5 = HashKey^3<<1 (mod poly)
+       movdqa     \TMP5, HashKey_3(%rsp)
+       pshufd     $78, \TMP5, \TMP1
+       pxor       \TMP5, \TMP1
+       movdqa     \TMP1, HashKey_3_k(%rsp)
+.irpc index, 56789 # do next 5 rounds
+       movaps 0x10*\index(%arg1), \TMP1
+       AESENC     \TMP1, \XMM1
+       AESENC     \TMP1, \XMM2
+       AESENC     \TMP1, \XMM3
+       AESENC     \TMP1, \XMM4
+.endr
+       GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
+# TMP5 = HashKey^4<<1 (mod poly)
+       movdqa     \TMP5, HashKey_4(%rsp)
+       pshufd     $78, \TMP5, \TMP1
+       pxor       \TMP5, \TMP1
+       movdqa     \TMP1, HashKey_4_k(%rsp)
+       movaps 0xa0(%arg1), \TMP2
+       AESENCLAST \TMP2, \XMM1
+       AESENCLAST \TMP2, \XMM2
+       AESENCLAST \TMP2, \XMM3
+       AESENCLAST \TMP2, \XMM4
+       movdqu     16*0(%arg3 , %r11 , 1), \TMP1
+       pxor       \TMP1, \XMM1
+       movdqu     16*1(%arg3 , %r11 , 1), \TMP1
+       pxor       \TMP1, \XMM2
+       movdqu     16*2(%arg3 , %r11 , 1), \TMP1
+       pxor       \TMP1, \XMM3
+       movdqu     16*3(%arg3 , %r11 , 1), \TMP1
+       pxor       \TMP1, \XMM4
        movdqu     \XMM1, 16*0(%arg2 , %r11 , 1)
        movdqu     \XMM2, 16*1(%arg2 , %r11 , 1)
        movdqu     \XMM3, 16*2(%arg2 , %r11 , 1)
        movdqu     \XMM4, 16*3(%arg2 , %r11 , 1)
-.endif
+
        add        $64, %r11
-       pshufb     SHUF_MASK(%rip), \XMM1 # perform a 16 byte swap
+        movdqa     SHUF_MASK(%rip), %xmm14
+       PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
        pxor       \XMMDst, \XMM1
 # combine GHASHed value with the corresponding ciphertext
-       pshufb     SHUF_MASK(%rip), \XMM2 # perform a 16 byte swap
-       pshufb     SHUF_MASK(%rip), \XMM3 # perform a 16 byte swap
-       pshufb     SHUF_MASK(%rip), \XMM4 # perform a 16 byte swap
+        movdqa     SHUF_MASK(%rip), %xmm14
+       PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
+        movdqa     SHUF_MASK(%rip), %xmm14
+       PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
+        movdqa     SHUF_MASK(%rip), %xmm14
+       PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
+
 _initial_blocks_done\num_initial_blocks\operation:
+
 .endm
 
 /*
@@ -428,7 +670,199 @@ _initial_blocks_done\num_initial_blocks\operation:
 * arg1, %arg2, %arg3 are used as pointers only, not modified
 * %r11 is the data offset value
 */
-.macro GHASH_4_ENCRYPT_4_PARALLEL TMP1 TMP2 TMP3 TMP4 TMP5 \
+.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
+TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
+
+       movdqa    \XMM1, \XMM5
+       movdqa    \XMM2, \XMM6
+       movdqa    \XMM3, \XMM7
+       movdqa    \XMM4, \XMM8
+
+        movdqa    SHUF_MASK(%rip), %xmm15
+        # multiply XMM5 * HashKey_4 (loaded into TMP5) using karatsuba
+
+       movdqa    \XMM5, \TMP4
+       pshufd    $78, \XMM5, \TMP6
+       pxor      \XMM5, \TMP6
+       paddd     ONE(%rip), \XMM0              # INCR CNT
+       movdqa    HashKey_4(%rsp), \TMP5
+       PCLMULQDQ 0x11, \TMP5, \TMP4           # TMP4 = a1*b1
+       movdqa    \XMM0, \XMM1
+       paddd     ONE(%rip), \XMM0              # INCR CNT
+       movdqa    \XMM0, \XMM2
+       paddd     ONE(%rip), \XMM0              # INCR CNT
+       movdqa    \XMM0, \XMM3
+       paddd     ONE(%rip), \XMM0              # INCR CNT
+       movdqa    \XMM0, \XMM4
+       PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
+       PCLMULQDQ 0x00, \TMP5, \XMM5           # XMM5 = a0*b0
+       PSHUFB_XMM %xmm15, \XMM2        # perform a 16 byte swap
+       PSHUFB_XMM %xmm15, \XMM3        # perform a 16 byte swap
+       PSHUFB_XMM %xmm15, \XMM4        # perform a 16 byte swap
+
+       pxor      (%arg1), \XMM1
+       pxor      (%arg1), \XMM2
+       pxor      (%arg1), \XMM3
+       pxor      (%arg1), \XMM4
+       movdqa    HashKey_4_k(%rsp), \TMP5
+       PCLMULQDQ 0x00, \TMP5, \TMP6           # TMP6 = (a1+a0)*(b1+b0)
+       movaps 0x10(%arg1), \TMP1
+       AESENC    \TMP1, \XMM1              # Round 1
+       AESENC    \TMP1, \XMM2
+       AESENC    \TMP1, \XMM3
+       AESENC    \TMP1, \XMM4
+       movaps 0x20(%arg1), \TMP1
+       AESENC    \TMP1, \XMM1              # Round 2
+       AESENC    \TMP1, \XMM2
+       AESENC    \TMP1, \XMM3
+       AESENC    \TMP1, \XMM4
+       movdqa    \XMM6, \TMP1
+       pshufd    $78, \XMM6, \TMP2
+       pxor      \XMM6, \TMP2
+       movdqa    HashKey_3(%rsp), \TMP5
+       PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1 * b1
+       movaps 0x30(%arg1), \TMP3
+       AESENC    \TMP3, \XMM1              # Round 3
+       AESENC    \TMP3, \XMM2
+       AESENC    \TMP3, \XMM3
+       AESENC    \TMP3, \XMM4
+       PCLMULQDQ 0x00, \TMP5, \XMM6           # XMM6 = a0*b0
+       movaps 0x40(%arg1), \TMP3
+       AESENC    \TMP3, \XMM1              # Round 4
+       AESENC    \TMP3, \XMM2
+       AESENC    \TMP3, \XMM3
+       AESENC    \TMP3, \XMM4
+       movdqa    HashKey_3_k(%rsp), \TMP5
+       PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
+       movaps 0x50(%arg1), \TMP3
+       AESENC    \TMP3, \XMM1              # Round 5
+       AESENC    \TMP3, \XMM2
+       AESENC    \TMP3, \XMM3
+       AESENC    \TMP3, \XMM4
+       pxor      \TMP1, \TMP4
+# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
+       pxor      \XMM6, \XMM5
+       pxor      \TMP2, \TMP6
+       movdqa    \XMM7, \TMP1
+       pshufd    $78, \XMM7, \TMP2
+       pxor      \XMM7, \TMP2
+       movdqa    HashKey_2(%rsp ), \TMP5
+
+        # Multiply XMM7 * HashKey_2 (loaded into TMP5) using karatsuba
+
+       PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1*b1
+       movaps 0x60(%arg1), \TMP3
+       AESENC    \TMP3, \XMM1              # Round 6
+       AESENC    \TMP3, \XMM2
+       AESENC    \TMP3, \XMM3
+       AESENC    \TMP3, \XMM4
+       PCLMULQDQ 0x00, \TMP5, \XMM7           # XMM7 = a0*b0
+       movaps 0x70(%arg1), \TMP3
+       AESENC    \TMP3, \XMM1             # Round 7
+       AESENC    \TMP3, \XMM2
+       AESENC    \TMP3, \XMM3
+       AESENC    \TMP3, \XMM4
+       movdqa    HashKey_2_k(%rsp), \TMP5
+       PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
+       movaps 0x80(%arg1), \TMP3
+       AESENC    \TMP3, \XMM1             # Round 8
+       AESENC    \TMP3, \XMM2
+       AESENC    \TMP3, \XMM3
+       AESENC    \TMP3, \XMM4
+       pxor      \TMP1, \TMP4
+# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
+       pxor      \XMM7, \XMM5
+       pxor      \TMP2, \TMP6
+
+        # Multiply XMM8 * HashKey
+        # XMM8 and TMP5 hold the values for the two operands
+
+       movdqa    \XMM8, \TMP1
+       pshufd    $78, \XMM8, \TMP2
+       pxor      \XMM8, \TMP2
+       movdqa    HashKey(%rsp), \TMP5
+       PCLMULQDQ 0x11, \TMP5, \TMP1          # TMP1 = a1*b1
+       movaps 0x90(%arg1), \TMP3
+       AESENC    \TMP3, \XMM1            # Round 9
+       AESENC    \TMP3, \XMM2
+       AESENC    \TMP3, \XMM3
+       AESENC    \TMP3, \XMM4
+       PCLMULQDQ 0x00, \TMP5, \XMM8          # XMM8 = a0*b0
+       movaps 0xa0(%arg1), \TMP3
+       AESENCLAST \TMP3, \XMM1           # Round 10
+       AESENCLAST \TMP3, \XMM2
+       AESENCLAST \TMP3, \XMM3
+       AESENCLAST \TMP3, \XMM4
+       movdqa    HashKey_k(%rsp), \TMP5
+       PCLMULQDQ 0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
+       movdqu    (%arg3,%r11,1), \TMP3
+       pxor      \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
+       movdqu    16(%arg3,%r11,1), \TMP3
+       pxor      \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
+       movdqu    32(%arg3,%r11,1), \TMP3
+       pxor      \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
+       movdqu    48(%arg3,%r11,1), \TMP3
+       pxor      \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
+        movdqu    \XMM1, (%arg2,%r11,1)        # Write to the ciphertext buffer
+        movdqu    \XMM2, 16(%arg2,%r11,1)      # Write to the ciphertext buffer
+        movdqu    \XMM3, 32(%arg2,%r11,1)      # Write to the ciphertext buffer
+        movdqu    \XMM4, 48(%arg2,%r11,1)      # Write to the ciphertext buffer
+       PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
+       PSHUFB_XMM %xmm15, \XMM2        # perform a 16 byte swap
+       PSHUFB_XMM %xmm15, \XMM3        # perform a 16 byte swap
+       PSHUFB_XMM %xmm15, \XMM4        # perform a 16 byte swap
+
+       pxor      \TMP4, \TMP1
+       pxor      \XMM8, \XMM5
+       pxor      \TMP6, \TMP2
+       pxor      \TMP1, \TMP2
+       pxor      \XMM5, \TMP2
+       movdqa    \TMP2, \TMP3
+       pslldq    $8, \TMP3                    # left shift TMP3 2 DWs
+       psrldq    $8, \TMP2                    # right shift TMP2 2 DWs
+       pxor      \TMP3, \XMM5
+       pxor      \TMP2, \TMP1    # accumulate the results in TMP1:XMM5
+
+        # first phase of reduction
+
+       movdqa    \XMM5, \TMP2
+       movdqa    \XMM5, \TMP3
+       movdqa    \XMM5, \TMP4
+# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
+       pslld     $31, \TMP2                   # packed right shift << 31
+       pslld     $30, \TMP3                   # packed right shift << 30
+       pslld     $25, \TMP4                   # packed right shift << 25
+       pxor      \TMP3, \TMP2                 # xor the shifted versions
+       pxor      \TMP4, \TMP2
+       movdqa    \TMP2, \TMP5
+       psrldq    $4, \TMP5                    # right shift T5 1 DW
+       pslldq    $12, \TMP2                   # left shift T2 3 DWs
+       pxor      \TMP2, \XMM5
+
+        # second phase of reduction
+
+       movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
+       movdqa    \XMM5,\TMP3
+       movdqa    \XMM5,\TMP4
+       psrld     $1, \TMP2                    # packed left shift >>1
+       psrld     $2, \TMP3                    # packed left shift >>2
+       psrld     $7, \TMP4                    # packed left shift >>7
+       pxor      \TMP3,\TMP2                  # xor the shifted versions
+       pxor      \TMP4,\TMP2
+       pxor      \TMP5, \TMP2
+       pxor      \TMP2, \XMM5
+       pxor      \TMP1, \XMM5                 # result is in XMM5
+
+       pxor      \XMM5, \XMM1
+.endm
+
+/*
+* decrypt 4 blocks at a time
+* ghash the 4 previously decrypted ciphertext blocks
+* arg1, %arg2, %arg3 are used as pointers only, not modified
+* %r11 is the data offset value
+*/
+.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
 TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
 
        movdqa    \XMM1, \XMM5
@@ -436,6 +870,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
        movdqa    \XMM3, \XMM7
        movdqa    \XMM4, \XMM8
 
+        movdqa    SHUF_MASK(%rip), %xmm15
         # multiply TMP5 * HashKey using karatsuba
 
        movdqa    \XMM5, \TMP4
@@ -451,11 +886,12 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
        movdqa    \XMM0, \XMM3
        paddd     ONE(%rip), \XMM0              # INCR CNT
        movdqa    \XMM0, \XMM4
-       pshufb    SHUF_MASK(%rip), \XMM1        # perform a 16 byte swap
+       PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
        PCLMULQDQ 0x00, \TMP5, \XMM5           # XMM5 = a0*b0
-       pshufb    SHUF_MASK(%rip), \XMM2        # perform a 16 byte swap
-       pshufb    SHUF_MASK(%rip), \XMM3        # perform a 16 byte swap
-       pshufb    SHUF_MASK(%rip), \XMM4        # perform a 16 byte swap
+       PSHUFB_XMM %xmm15, \XMM2        # perform a 16 byte swap
+       PSHUFB_XMM %xmm15, \XMM3        # perform a 16 byte swap
+       PSHUFB_XMM %xmm15, \XMM4        # perform a 16 byte swap
+
        pxor      (%arg1), \XMM1
        pxor      (%arg1), \XMM2
        pxor      (%arg1), \XMM3
@@ -553,37 +989,24 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
        PCLMULQDQ 0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
        movdqu    (%arg3,%r11,1), \TMP3
        pxor      \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
-.if \operation == dec
        movdqu    \XMM1, (%arg2,%r11,1)        # Write to plaintext buffer
        movdqa    \TMP3, \XMM1
-.endif
        movdqu    16(%arg3,%r11,1), \TMP3
        pxor      \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
-.if \operation == dec
        movdqu    \XMM2, 16(%arg2,%r11,1)      # Write to plaintext buffer
        movdqa    \TMP3, \XMM2
-.endif
        movdqu    32(%arg3,%r11,1), \TMP3
        pxor      \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
-.if \operation == dec
        movdqu    \XMM3, 32(%arg2,%r11,1)      # Write to plaintext buffer
        movdqa    \TMP3, \XMM3
-.endif
        movdqu    48(%arg3,%r11,1), \TMP3
        pxor      \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
-.if \operation == dec
        movdqu    \XMM4, 48(%arg2,%r11,1)      # Write to plaintext buffer
        movdqa    \TMP3, \XMM4
-.else
-    movdqu    \XMM1, (%arg2,%r11,1)        # Write to the ciphertext buffer
-    movdqu    \XMM2, 16(%arg2,%r11,1)      # Write to the ciphertext buffer
-    movdqu    \XMM3, 32(%arg2,%r11,1)      # Write to the ciphertext buffer
-    movdqu    \XMM4, 48(%arg2,%r11,1)      # Write to the ciphertext buffer
-.endif
-       pshufb    SHUF_MASK(%rip), \XMM1       # perform a 16 byte swap
-       pshufb    SHUF_MASK(%rip), \XMM2       # perform a 16 byte swap
-       pshufb    SHUF_MASK(%rip), \XMM3       # perform a 16 byte swap
-       pshufb    SHUF_MASK(%rip), \XMM4       # perform a 16 byte sway
+       PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
+       PSHUFB_XMM %xmm15, \XMM2        # perform a 16 byte swap
+       PSHUFB_XMM %xmm15, \XMM3        # perform a 16 byte swap
+       PSHUFB_XMM %xmm15, \XMM4        # perform a 16 byte swap
 
        pxor      \TMP4, \TMP1
        pxor      \XMM8, \XMM5
@@ -853,7 +1276,9 @@ ENTRY(aesni_gcm_dec)
        and     $~63, %rsp                        # align rsp to 64 bytes
        mov     %arg6, %r12
        movdqu  (%r12), %xmm13                    # %xmm13 = HashKey
-       pshufb  SHUF_MASK(%rip), %xmm13
+        movdqa  SHUF_MASK(%rip), %xmm2
+       PSHUFB_XMM %xmm2, %xmm13
+
 
 # Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)
 
@@ -885,22 +1310,22 @@ ENTRY(aesni_gcm_dec)
        jb _initial_num_blocks_is_1_decrypt
        je _initial_num_blocks_is_2_decrypt
 _initial_num_blocks_is_3_decrypt:
-       INITIAL_BLOCKS 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+       INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
        sub     $48, %r13
        jmp     _initial_blocks_decrypted
 _initial_num_blocks_is_2_decrypt:
-       INITIAL_BLOCKS  2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+       INITIAL_BLOCKS_DEC      2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
        sub     $32, %r13
        jmp     _initial_blocks_decrypted
 _initial_num_blocks_is_1_decrypt:
-       INITIAL_BLOCKS  1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+       INITIAL_BLOCKS_DEC      1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
        sub     $16, %r13
        jmp     _initial_blocks_decrypted
 _initial_num_blocks_is_0_decrypt:
-       INITIAL_BLOCKS  0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+       INITIAL_BLOCKS_DEC      0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
 _initial_blocks_decrypted:
        cmp     $0, %r13
@@ -908,7 +1333,7 @@ _initial_blocks_decrypted:
        sub     $64, %r13
        je      _four_cipher_left_decrypt
 _decrypt_by_4:
-       GHASH_4_ENCRYPT_4_PARALLEL      %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
+       GHASH_4_ENCRYPT_4_PARALLEL_DEC  %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
 %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
        add     $64, %r11
        sub     $64, %r13
@@ -924,7 +1349,9 @@ _zero_cipher_left_decrypt:
         # Handle the last <16 byte block seperately
 
        paddd ONE(%rip), %xmm0         # increment CNT to get Yn
-       pshufb SHUF_MASK(%rip), %xmm0
+        movdqa SHUF_MASK(%rip), %xmm10
+       PSHUFB_XMM %xmm10, %xmm0
+
        ENCRYPT_SINGLE_BLOCK  %xmm0, %xmm1    # E(K, Yn)
        sub $16, %r11
        add %r13, %r11
@@ -934,14 +1361,17 @@ _zero_cipher_left_decrypt:
 # adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
 # (%r13 is the number of bytes in plaintext mod 16)
        movdqu (%r12), %xmm2           # get the appropriate shuffle mask
-       pshufb %xmm2, %xmm1            # right shift 16-%r13 butes
+       PSHUFB_XMM %xmm2, %xmm1            # right shift 16-%r13 bytes
+
        movdqa  %xmm1, %xmm2
        pxor %xmm1, %xmm0            # Ciphertext XOR E(K, Yn)
        movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
        # get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
        pand %xmm1, %xmm0            # mask out top 16-%r13 bytes of %xmm0
        pand    %xmm1, %xmm2
-       pshufb SHUF_MASK(%rip),%xmm2
+        movdqa SHUF_MASK(%rip), %xmm10
+       PSHUFB_XMM %xmm10 ,%xmm2
+
        pxor %xmm2, %xmm8
        GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
                  # GHASH computation for the last <16 byte block
@@ -949,13 +1379,13 @@ _zero_cipher_left_decrypt:
        add $16, %r11
 
         # output %r13 bytes
-       movq    %xmm0, %rax
+       MOVQ_R64_XMM    %xmm0, %rax
        cmp     $8, %r13
        jle     _less_than_8_bytes_left_decrypt
        mov     %rax, (%arg2 , %r11, 1)
        add     $8, %r11
        psrldq  $8, %xmm0
-       movq    %xmm0, %rax
+       MOVQ_R64_XMM    %xmm0, %rax
        sub     $8, %r13
 _less_than_8_bytes_left_decrypt:
        mov     %al,  (%arg2, %r11, 1)
@@ -968,13 +1398,15 @@ _multiple_of_16_bytes_decrypt:
        shl     $3, %r12                  # convert into number of bits
        movd    %r12d, %xmm15             # len(A) in %xmm15
        shl     $3, %arg4                 # len(C) in bits (*128)
-       movq    %arg4, %xmm1
+       MOVQ_R64_XMM    %arg4, %xmm1
        pslldq  $8, %xmm15                # %xmm15 = len(A)||0x0000000000000000
        pxor    %xmm1, %xmm15             # %xmm15 = len(A)||len(C)
        pxor    %xmm15, %xmm8
        GHASH_MUL       %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
                 # final GHASH computation
-       pshufb  SHUF_MASK(%rip), %xmm8
+        movdqa SHUF_MASK(%rip), %xmm10
+       PSHUFB_XMM %xmm10, %xmm8
+
        mov     %arg5, %rax               # %rax = *Y0
        movdqu  (%rax), %xmm0             # %xmm0 = Y0
        ENCRYPT_SINGLE_BLOCK    %xmm0,  %xmm1     # E(K, Y0)
@@ -987,11 +1419,11 @@ _return_T_decrypt:
        cmp     $12, %r11
        je      _T_12_decrypt
 _T_8_decrypt:
-       movq    %xmm0, %rax
+       MOVQ_R64_XMM    %xmm0, %rax
        mov     %rax, (%r10)
        jmp     _return_T_done_decrypt
 _T_12_decrypt:
-       movq    %xmm0, %rax
+       MOVQ_R64_XMM    %xmm0, %rax
        mov     %rax, (%r10)
        psrldq  $8, %xmm0
        movd    %xmm0, %eax
@@ -1103,7 +1535,9 @@ ENTRY(aesni_gcm_enc)
        and     $~63, %rsp
        mov     %arg6, %r12
        movdqu  (%r12), %xmm13
-       pshufb  SHUF_MASK(%rip), %xmm13
+        movdqa  SHUF_MASK(%rip), %xmm2
+       PSHUFB_XMM %xmm2, %xmm13
+
 
 # precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
 
@@ -1134,22 +1568,22 @@ ENTRY(aesni_gcm_enc)
        jb      _initial_num_blocks_is_1_encrypt
        je      _initial_num_blocks_is_2_encrypt
 _initial_num_blocks_is_3_encrypt:
-       INITIAL_BLOCKS  3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+       INITIAL_BLOCKS_ENC      3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
        sub     $48, %r13
        jmp     _initial_blocks_encrypted
 _initial_num_blocks_is_2_encrypt:
-       INITIAL_BLOCKS  2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+       INITIAL_BLOCKS_ENC      2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
        sub     $32, %r13
        jmp     _initial_blocks_encrypted
 _initial_num_blocks_is_1_encrypt:
-       INITIAL_BLOCKS  1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+       INITIAL_BLOCKS_ENC      1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
        sub     $16, %r13
        jmp     _initial_blocks_encrypted
 _initial_num_blocks_is_0_encrypt:
-       INITIAL_BLOCKS  0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+       INITIAL_BLOCKS_ENC      0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
 _initial_blocks_encrypted:
 
@@ -1160,7 +1594,7 @@ _initial_blocks_encrypted:
        sub     $64, %r13
        je      _four_cipher_left_encrypt
 _encrypt_by_4_encrypt:
-       GHASH_4_ENCRYPT_4_PARALLEL      %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
+       GHASH_4_ENCRYPT_4_PARALLEL_ENC  %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
 %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
        add     $64, %r11
        sub     $64, %r13
@@ -1175,7 +1609,9 @@ _zero_cipher_left_encrypt:
 
          # Handle the last <16 Byte block seperately
        paddd ONE(%rip), %xmm0                # INCR CNT to get Yn
-       pshufb SHUF_MASK(%rip), %xmm0
+        movdqa SHUF_MASK(%rip), %xmm10
+       PSHUFB_XMM %xmm10, %xmm0
+
        ENCRYPT_SINGLE_BLOCK    %xmm0, %xmm1        # Encrypt(K, Yn)
        sub $16, %r11
        add %r13, %r11
@@ -1185,29 +1621,31 @@ _zero_cipher_left_encrypt:
        # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
        # (%r13 is the number of bytes in plaintext mod 16)
        movdqu  (%r12), %xmm2           # get the appropriate shuffle mask
-       pshufb  %xmm2, %xmm1            # shift right 16-r13 byte
+       PSHUFB_XMM      %xmm2, %xmm1            # shift right 16-r13 byte
        pxor    %xmm1, %xmm0            # Plaintext XOR Encrypt(K, Yn)
        movdqu  ALL_F-SHIFT_MASK(%r12), %xmm1
        # get the appropriate mask to mask out top 16-r13 bytes of xmm0
        pand    %xmm1, %xmm0            # mask out top 16-r13 bytes of xmm0
+        movdqa SHUF_MASK(%rip), %xmm10
+       PSHUFB_XMM %xmm10,%xmm0
 
-       pshufb  SHUF_MASK(%rip),%xmm0
        pxor    %xmm0, %xmm8
        GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
        # GHASH computation for the last <16 byte block
        sub     %r13, %r11
        add     $16, %r11
-       pshufb SHUF_MASK(%rip), %xmm0
+       movdqa SHUF_MASK(%rip), %xmm10  # reload mask: %xmm10 was passed to GHASH_MUL above and may no longer hold it
+       PSHUFB_XMM %xmm10, %xmm0
        # shuffle xmm0 back to output as ciphertext
 
         # Output %r13 bytes
-       movq %xmm0, %rax
+       MOVQ_R64_XMM %xmm0, %rax
        cmp $8, %r13
        jle _less_than_8_bytes_left_encrypt
        mov %rax, (%arg2 , %r11, 1)
        add $8, %r11
        psrldq $8, %xmm0
-       movq %xmm0, %rax
+       MOVQ_R64_XMM %xmm0, %rax
        sub $8, %r13
 _less_than_8_bytes_left_encrypt:
        mov %al,  (%arg2, %r11, 1)
@@ -1220,14 +1658,15 @@ _multiple_of_16_bytes_encrypt:
        shl     $3, %r12
        movd    %r12d, %xmm15       # len(A) in %xmm15
        shl     $3, %arg4               # len(C) in bits (*128)
-       movq    %arg4, %xmm1
+       MOVQ_R64_XMM    %arg4, %xmm1
        pslldq  $8, %xmm15          # %xmm15 = len(A)||0x0000000000000000
        pxor    %xmm1, %xmm15       # %xmm15 = len(A)||len(C)
        pxor    %xmm15, %xmm8
        GHASH_MUL       %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
        # final GHASH computation
+        movdqa SHUF_MASK(%rip), %xmm10
+       PSHUFB_XMM %xmm10, %xmm8         # perform a 16 byte swap
 
-       pshufb  SHUF_MASK(%rip), %xmm8         # perform a 16 byte swap
        mov     %arg5, %rax                    # %rax  = *Y0
        movdqu  (%rax), %xmm0                  # %xmm0 = Y0
        ENCRYPT_SINGLE_BLOCK    %xmm0, %xmm15         # Encrypt(K, Y0)
@@ -1240,11 +1679,11 @@ _return_T_encrypt:
        cmp     $12, %r11
        je      _T_12_encrypt
 _T_8_encrypt:
-       movq    %xmm0, %rax
+       MOVQ_R64_XMM    %xmm0, %rax
        mov     %rax, (%r10)
        jmp     _return_T_done_encrypt
 _T_12_encrypt:
-       movq    %xmm0, %rax
+       MOVQ_R64_XMM    %xmm0, %rax
        mov     %rax, (%r10)
        psrldq  $8, %xmm0
        movd    %xmm0, %eax
@@ -1258,6 +1697,7 @@ _return_T_done_encrypt:
        pop     %r13
        pop     %r12
        ret
+
 #endif
 
 
-- 
1.6.4.2

--------------------------------------------------------------
Intel Shannon Limited
Registered in Ireland
Registered Office: Collinstown Industrial Park, Leixlip, County Kildare
Registered Number: 308263
Business address: Dromore House, East Park, Shannon, Co. Clare

This e-mail and any attachments may contain confidential material for the sole 
use of the intended recipient(s). Any review or distribution by others is 
strictly prohibited. If you are not the intended recipient, please contact the 
sender and delete all copies.


--
To unsubscribe from this list: send the line "unsubscribe linux-crypto" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 2/3 v2] RFC4106 AES-GCM Driver Using Intel New Instructions - fixed build with binutils 2.16

Reply via email to