It was reported that the sha1 AVX2 function(sha1_transform_avx2) is
reading ahead beyond its intended data, and causing a crash if the next
block is beyond page boundary:
http://marc.info/?l=linux-crypto-vger&m=149373371023377

This patch makes sure that there is no overflow for any buffer length.

It passes the tests written by Jan Stancek that revealed this problem:
https://github.com/jstancek/sha1-avx2-crash

Jan, can you verify this fix?
Herbert, can you re-enable sha1-avx2 once Jan has checked it out and
revert commit b82ce24426a4071da9529d726057e4e642948667 ?

Signed-off-by: Megha Dey <megha....@linux.intel.com>
Reported-by: Jan Stancek <jstan...@redhat.com>
---
 arch/x86/crypto/sha1_avx2_x86_64_asm.S | 67 ++++++++++++++++++----------------
 1 file changed, 36 insertions(+), 31 deletions(-)

diff --git a/arch/x86/crypto/sha1_avx2_x86_64_asm.S 
b/arch/x86/crypto/sha1_avx2_x86_64_asm.S
index 1cd792d..1eab79c 100644
--- a/arch/x86/crypto/sha1_avx2_x86_64_asm.S
+++ b/arch/x86/crypto/sha1_avx2_x86_64_asm.S
@@ -117,11 +117,10 @@
        .set T1, REG_T1
 .endm
 
-#define K_BASE         %r8
 #define HASH_PTR       %r9
+#define BLOCKS_CTR     %r8
 #define BUFFER_PTR     %r10
 #define BUFFER_PTR2    %r13
-#define BUFFER_END     %r11
 
 #define PRECALC_BUF    %r14
 #define WK_BUF         %r15
@@ -205,14 +204,14 @@
                 * blended AVX2 and ALU instruction scheduling
                 * 1 vector iteration per 8 rounds
                 */
-               vmovdqu ((i * 2) + PRECALC_OFFSET)(BUFFER_PTR), W_TMP
+               vmovdqu (i * 2)(BUFFER_PTR), W_TMP
        .elseif ((i & 7) == 1)
-               vinsertf128 $1, (((i-1) * 2)+PRECALC_OFFSET)(BUFFER_PTR2),\
+               vinsertf128 $1, ((i-1) * 2)(BUFFER_PTR2),\
                         WY_TMP, WY_TMP
        .elseif ((i & 7) == 2)
                vpshufb YMM_SHUFB_BSWAP, WY_TMP, WY
        .elseif ((i & 7) == 4)
-               vpaddd  K_XMM(K_BASE), WY, WY_TMP
+               vpaddd  K_XMM + K_XMM_AR(%rip), WY, WY_TMP
        .elseif ((i & 7) == 7)
                vmovdqu  WY_TMP, PRECALC_WK(i&~7)
 
@@ -255,7 +254,7 @@
                vpxor   WY, WY_TMP, WY_TMP
        .elseif ((i & 7) == 7)
                vpxor   WY_TMP2, WY_TMP, WY
-               vpaddd  K_XMM(K_BASE), WY, WY_TMP
+               vpaddd  K_XMM + K_XMM_AR(%rip), WY, WY_TMP
                vmovdqu WY_TMP, PRECALC_WK(i&~7)
 
                PRECALC_ROTATE_WY
@@ -291,7 +290,7 @@
                vpsrld  $30, WY, WY
                vpor    WY, WY_TMP, WY
        .elseif ((i & 7) == 7)
-               vpaddd  K_XMM(K_BASE), WY, WY_TMP
+               vpaddd  K_XMM + K_XMM_AR(%rip), WY, WY_TMP
                vmovdqu WY_TMP, PRECALC_WK(i&~7)
 
                PRECALC_ROTATE_WY
@@ -446,6 +445,16 @@
 
 .endm
 
+/* Add constant only if (%2 > %3) condition met (uses RTA as temp)
+ * %1 + %2 >= %3 ? %4 : 0
+ */
+.macro ADD_IF_GE a, b, c, d
+       mov     \a, RTA
+       add     $\d, RTA
+       cmp     $\c, \b
+       cmovge  RTA, \a
+.endm
+
 /*
  * macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining
  */
@@ -463,13 +472,16 @@
        lea     (2*4*80+32)(%rsp), WK_BUF
 
        # Precalc WK for first 2 blocks
-       PRECALC_OFFSET = 0
+       ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64
        .set i, 0
        .rept    160
                PRECALC i
                .set i, i + 1
        .endr
-       PRECALC_OFFSET = 128
+
+       /* Go to next block if needed */
+       ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 3, 128
+       ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
        xchg    WK_BUF, PRECALC_BUF
 
        .align 32
@@ -479,8 +491,8 @@ _loop:
         * we use K_BASE value as a signal of a last block,
         * it is set below by: cmovae BUFFER_PTR, K_BASE
         */
-       cmp     K_BASE, BUFFER_PTR
-       jne     _begin
+       test BLOCKS_CTR, BLOCKS_CTR
+       jnz _begin
        .align 32
        jmp     _end
        .align 32
@@ -512,10 +524,10 @@ _loop0:
                .set j, j+2
        .endr
 
-       add     $(2*64), BUFFER_PTR       /* move to next odd-64-byte block */
-       cmp     BUFFER_END, BUFFER_PTR    /* is current block the last one? */
-       cmovae  K_BASE, BUFFER_PTR      /* signal the last iteration smartly */
-
+       /* Update Counter */
+       sub $1, BLOCKS_CTR
+       /* Move to the next block only if needed*/
+       ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 4, 128
        /*
         * rounds
         * 60,62,64,66,68
@@ -532,8 +544,8 @@ _loop0:
        UPDATE_HASH     12(HASH_PTR), D
        UPDATE_HASH     16(HASH_PTR), E
 
-       cmp     K_BASE, BUFFER_PTR      /* is current block the last one? */
-       je      _loop
+       test    BLOCKS_CTR, BLOCKS_CTR
+       jz      _loop
 
        mov     TB, B
 
@@ -575,10 +587,10 @@ _loop2:
                .set j, j+2
        .endr
 
-       add     $(2*64), BUFFER_PTR2      /* move to next even-64-byte block */
-
-       cmp     BUFFER_END, BUFFER_PTR2   /* is current block the last one */
-       cmovae  K_BASE, BUFFER_PTR       /* signal the last iteration smartly */
+       /* update counter */
+       sub     $1, BLOCKS_CTR
+       /* Move to the next block only if needed*/
+       ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
 
        jmp     _loop3
 _loop3:
@@ -641,19 +653,12 @@ _loop3:
 
        avx2_zeroupper
 
-       lea     K_XMM_AR(%rip), K_BASE
-
+       /* Setup initial values */
        mov     CTX, HASH_PTR
        mov     BUF, BUFFER_PTR
-       lea     64(BUF), BUFFER_PTR2
-
-       shl     $6, CNT                 /* mul by 64 */
-       add     BUF, CNT
-       add     $64, CNT
-       mov     CNT, BUFFER_END
 
-       cmp     BUFFER_END, BUFFER_PTR2
-       cmovae  K_BASE, BUFFER_PTR2
+       mov     BUF, BUFFER_PTR2
+       mov     CNT, BLOCKS_CTR
 
        xmm_mov BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP
 
-- 
1.9.1

Reply via email to