diff --git a/Makefile b/Makefile
index c8b8e902d5a4..af101b556ba0 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
VERSION = 4
PATCHLEVEL = 15
-SUBLEVEL = 0
+SUBLEVEL = 1
EXTRAVERSION =
NAME = Fearless Coyote
diff --git a/arch/x86/crypto/aesni-intel_asm.S
b/arch/x86/crypto/aesni-intel_asm.S
index 3d09e3aca18d..12e8484a8ee7 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -90,30 +90,6 @@ SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
ALL_F: .octa 0x
.octa 0x
-.section .rodata
-.align 16
-.type aad_shift_arr, @object
-.size aad_shift_arr, 272
-aad_shift_arr:
-.octa 0x
-.octa 0xff0C
-.octa 0x0D0C
-.octa 0xff0E0D0C
-.octa 0x0F0E0D0C
-.octa 0xff0C0B0A0908
-.octa 0x0D0C0B0A0908
-.octa 0xff0E0D0C0B0A0908
-.octa 0x0F0E0D0C0B0A0908
-.octa 0xff0C0B0A090807060504
-.octa 0x0D0C0B0A090807060504
-.octa 0xff0E0D0C0B0A090807060504
-.octa 0x0F0E0D0C0B0A090807060504
-.octa 0xff0C0B0A09080706050403020100
-.octa 0x0D0C0B0A09080706050403020100
-.octa 0xff0E0D0C0B0A09080706050403020100
-.octa 0x0F0E0D0C0B0A09080706050403020100
-
-
.text
@@ -257,6 +233,37 @@ aad_shift_arr:
pxor \TMP1, \GH# result is in TMP1
.endm
+# Reads DLEN bytes starting at DPTR and stores them in XMMDst, where 0 < DLEN < 16 (DLEN = 0 is not handled).
+# Memory is touched only inside [DPTR, DPTR + DLEN): one 8-byte load when DLEN >= 8, then single-byte loads.
+# Clobbers %rax, DLEN (always left at 0), the flags, and XMM1 (written only when DLEN > 8).
+.macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
+cmp $8, \DLEN
+jl _read_lt8_\@  # fewer than 8 bytes (signed compare): take the byte-at-a-time path
+mov (\DPTR), %rax  # rax = the 8 bytes at offsets 0..7
+MOVQ_R64_XMM %rax, \XMMDst  # helper defined outside this view; presumably puts rax in the low qword of XMMDst - TODO confirm
+sub $8, \DLEN  # DLEN = number of bytes still to read (0..7)
+jz _done_read_partial_block_\@  # length was exactly 8: nothing left to read
+ xor %eax, %eax  # clear the accumulator; the 32-bit write zeroes all of rax
+_read_next_byte_\@:
+shl $8, %rax  # bytes gathered so far move up, freeing al
+mov 7(\DPTR, \DLEN, 1), %al  # al = byte at DPTR + 7 + DLEN: highest remaining offset first
+dec \DLEN
+jnz _read_next_byte_\@  # last pass loads offset 8, which ends up in the lowest byte of rax
+MOVQ_R64_XMM %rax, \XMM1
+ pslldq $8, \XMM1  # byte-shift XMM1 left by 8 so the tail sits above the first 8 bytes
+por \XMM1, \XMMDst  # merge the tail with the 8 bytes stored earlier
+ jmp _done_read_partial_block_\@
+_read_lt8_\@:
+ xor %eax, %eax  # clear the accumulator; the 32-bit write zeroes all of rax
+_read_next_byte_lt8_\@:
+shl $8, %rax  # bytes gathered so far move up, freeing al
+mov -1(\DPTR, \DLEN, 1), %al  # al = byte at DPTR + DLEN - 1: last byte first, offset 0 last
+dec \DLEN  # the loop body runs before the zero test, hence the DLEN > 0 requirement
+jnz _read_next_byte_lt8_\@
+MOVQ_R64_XMM %rax, \XMMDst  # rax holds the DLEN bytes with offset 0 in its lowest byte
+_done_read_partial_block_\@:
+.endm
+
/*
* if a = number of total plaintext bytes
* b = floor(a/16)
@@ -273,62 +280,30 @@ aad_shift_arr:
XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
MOVADQ SHUF_MASK(%rip), %xmm14
movarg7, %r10 # %r10 = AAD
- movarg8, %r12 # %r12 = aadLen
- mov%r12, %r11
+ movarg8, %r11 # %r11 = aadLen
pxor %xmm\i, %xmm\i
pxor \XMM2, \XMM2
cmp$16, %r11
- jl _get_AAD_rest8\num_initial_blocks\operation
+ jl _get_AAD_rest\num_initial_blocks\operation
_get_AAD_blocks\num_initial_blocks\operation:
movdqu (%r10), %xmm\i
PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
pxor %xmm\i, \XMM2
GHASH_MUL \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
add$16, %r10
- sub$16, %r12
sub$16, %r11
cmp$16, %r11
jge_get_AAD_blocks\num_initial_blocks\operation
movdqu \XMM2, %xmm\i
+
+ /* read the last <16B of AAD */
+_get_AAD_rest\num_initial_blocks\operation:
cmp$0, %r11
je _get_AAD_done\num_initial_blocks\operation
- pxor %xmm\i,%xmm\i
-
- /* read the last <16B of AAD. since we have at least 4B of
- data right after the AAD (the ICV, and maybe some CT), we can
- read 4B/8B blocks safely, and then get rid of the extra stuff */
-_get_AAD_rest8\num_initial_blocks\operation:
- cmp$4, %r11
- jle_get_AAD_rest4\num_initial_blocks\operation
- movq (%r10), \TMP1
- add$8, %r10
- sub$8, %r11
- pslldq $8, \TMP1
- psrldq $8, %xmm\i
- pxor \TMP1, %xmm\i
- jmp_get_AAD_rest8\num_initial_blocks\operation
-_get_AAD_rest4\num_initial_blocks\operation:
- cmp$0, %r11
- jle_get_AAD_rest0\num_initial_blocks\operation
- mov(%r10), %eax
- movq %rax, \TMP1
- add$4, %r10
- sub$4, %r10
- pslldq $12, \TMP1
- psrldq $4, %xmm\i
- pxor \TMP1, %xmm\i
-_get_AAD_rest0\num_initial_blocks\operation:
- /* finalize: shift out the extra bytes we read, and align
- lef