From: Eric Biggers <ebigg...@google.com>

The x86 assembly implementations of Salsa20 use the frame base pointer
register (%ebp or %rbp), which breaks frame pointer convention and
breaks stack traces when unwinding from an interrupt in the crypto code.
Recent (v4.10+) kernels will warn about this, e.g.

WARNING: kernel stack regs at 00000000a8291e69 in syzkaller047086:4677 has bad 'bp' value 000000001077994c
[...]
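
For reference, here is a minimal sketch (not part of this patch, and the
function name is made up for illustration) of the prologue/epilogue shape
that frame-pointer unwinding relies on.  The Salsa20 assembly instead
reuses %ebp/%rbp as an ordinary scratch/loop register for the duration of
the function, so the saved-%rbp chain the unwinder walks is broken:

    #include <linux/linkage.h>

    	.text
    ENTRY(frame_pointer_example)	# hypothetical example function
    	push	%rbp		# save the caller's frame pointer
    	mov	%rsp,%rbp	# %rbp now points at the saved %rbp
    				# (return address sits just above it)
    	# function body: %rbp must stay intact so that an interrupt
    	# taken here can still be unwound through this frame
    	pop	%rbp		# restore the caller's frame pointer
    	ret
    ENDPROC(frame_pointer_example)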

But after looking into it, I believe there's very little reason to still
retain the x86 Salsa20 code.  First, these are *not* vectorized
(SSE2/SSSE3/AVX2) implementations, which would be needed to get anywhere
close to the best Salsa20 performance on any remotely modern x86
processor; they're just regular x86 assembly.  Second, it's unclear
whether anyone is actually using the kernel's Salsa20 at all,
especially given that now ChaCha20 is supported too, and with much more
efficient SSSE3 and AVX2 implementations.  Finally, in benchmarks I did
on both Intel and AMD processors with both gcc 8.1.0 and gcc 4.9.4, the
x86_64 salsa20-asm is actually slightly *slower* than salsa20-generic
(~3% slower on Skylake, ~10% slower on Zen), while the i686 salsa20-asm
is only slightly faster than salsa20-generic (~15% faster on Skylake,
~20% faster on Zen).  The gcc version made little difference.

So, the x86_64 salsa20-asm is pretty clearly useless.  That leaves just
the i686 salsa20-asm, which based on my tests provides a 15-20% speed
boost.  But that's without updating the code to not use %ebp.  And given
the maintenance cost, the small speed difference vs. salsa20-generic,
the fact that few people still use i686 kernels, the doubt that anyone
is even using the kernel's Salsa20 at all, and the fact that an SSE2
implementation would almost certainly be much faster on any remotely
modern x86 processor yet no one has cared enough to add one, I don't
think it's worthwhile to keep.

Thus, just remove both the x86_64 and i686 salsa20-asm implementations.

Reported-by: syzbot+ffa3a158337bbc01f...@syzkaller.appspotmail.com
Signed-off-by: Eric Biggers <ebigg...@google.com>
---
 arch/x86/crypto/Makefile                |   4 -
 arch/x86/crypto/salsa20-i586-asm_32.S   | 938 ------------------------
 arch/x86/crypto/salsa20-x86_64-asm_64.S | 805 --------------------
 arch/x86/crypto/salsa20_glue.c          |  91 ---
 crypto/Kconfig                          |  28 -
 5 files changed, 1866 deletions(-)
 delete mode 100644 arch/x86/crypto/salsa20-i586-asm_32.S
 delete mode 100644 arch/x86/crypto/salsa20-x86_64-asm_64.S
 delete mode 100644 arch/x86/crypto/salsa20_glue.c

diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 3813e7cdaada..2e07a0e66314 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -15,7 +15,6 @@ obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o
 
 obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o
 obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o
-obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o
 obj-$(CONFIG_CRYPTO_SERPENT_SSE2_586) += serpent-sse2-i586.o
 
 obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
@@ -24,7 +23,6 @@ obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o
 obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
 obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
 obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o
-obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o
 obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha20-x86_64.o
 obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o
 obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
@@ -68,7 +66,6 @@ endif
 
 aes-i586-y := aes-i586-asm_32.o aes_glue.o
 twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o
-salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o
 serpent-sse2-i586-y := serpent-sse2-i586-asm_32.o serpent_sse2_glue.o
 
 aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o
@@ -77,7 +74,6 @@ camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o
 blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
 twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
 twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o
-salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o
 chacha20-x86_64-y := chacha20-ssse3-x86_64.o chacha20_glue.o
 serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o
 
diff --git a/arch/x86/crypto/salsa20-i586-asm_32.S b/arch/x86/crypto/salsa20-i586-asm_32.S
deleted file mode 100644
index 6014b7b9e52a..000000000000
--- a/arch/x86/crypto/salsa20-i586-asm_32.S
+++ /dev/null
@@ -1,938 +0,0 @@
-# Derived from:
-#      salsa20_pm.s version 20051229
-#      D. J. Bernstein
-#      Public domain.
-
-#include <linux/linkage.h>
-
-.text
-
-# enter salsa20_encrypt_bytes
-ENTRY(salsa20_encrypt_bytes)
-       mov     %esp,%eax
-       and     $31,%eax
-       add     $256,%eax
-       sub     %eax,%esp
-       # eax_stack = eax
-       movl    %eax,80(%esp)
-       # ebx_stack = ebx
-       movl    %ebx,84(%esp)
-       # esi_stack = esi
-       movl    %esi,88(%esp)
-       # edi_stack = edi
-       movl    %edi,92(%esp)
-       # ebp_stack = ebp
-       movl    %ebp,96(%esp)
-       # x = arg1
-       movl    4(%esp,%eax),%edx
-       # m = arg2
-       movl    8(%esp,%eax),%esi
-       # out = arg3
-       movl    12(%esp,%eax),%edi
-       # bytes = arg4
-       movl    16(%esp,%eax),%ebx
-       # bytes -= 0
-       sub     $0,%ebx
-       # goto done if unsigned<=
-       jbe     ._done
-._start:
-       # in0 = *(uint32 *) (x + 0)
-       movl    0(%edx),%eax
-       # in1 = *(uint32 *) (x + 4)
-       movl    4(%edx),%ecx
-       # in2 = *(uint32 *) (x + 8)
-       movl    8(%edx),%ebp
-       # j0 = in0
-       movl    %eax,164(%esp)
-       # in3 = *(uint32 *) (x + 12)
-       movl    12(%edx),%eax
-       # j1 = in1
-       movl    %ecx,168(%esp)
-       # in4 = *(uint32 *) (x + 16)
-       movl    16(%edx),%ecx
-       # j2 = in2
-       movl    %ebp,172(%esp)
-       # in5 = *(uint32 *) (x + 20)
-       movl    20(%edx),%ebp
-       # j3 = in3
-       movl    %eax,176(%esp)
-       # in6 = *(uint32 *) (x + 24)
-       movl    24(%edx),%eax
-       # j4 = in4
-       movl    %ecx,180(%esp)
-       # in7 = *(uint32 *) (x + 28)
-       movl    28(%edx),%ecx
-       # j5 = in5
-       movl    %ebp,184(%esp)
-       # in8 = *(uint32 *) (x + 32)
-       movl    32(%edx),%ebp
-       # j6 = in6
-       movl    %eax,188(%esp)
-       # in9 = *(uint32 *) (x + 36)
-       movl    36(%edx),%eax
-       # j7 = in7
-       movl    %ecx,192(%esp)
-       # in10 = *(uint32 *) (x + 40)
-       movl    40(%edx),%ecx
-       # j8 = in8
-       movl    %ebp,196(%esp)
-       # in11 = *(uint32 *) (x + 44)
-       movl    44(%edx),%ebp
-       # j9 = in9
-       movl    %eax,200(%esp)
-       # in12 = *(uint32 *) (x + 48)
-       movl    48(%edx),%eax
-       # j10 = in10
-       movl    %ecx,204(%esp)
-       # in13 = *(uint32 *) (x + 52)
-       movl    52(%edx),%ecx
-       # j11 = in11
-       movl    %ebp,208(%esp)
-       # in14 = *(uint32 *) (x + 56)
-       movl    56(%edx),%ebp
-       # j12 = in12
-       movl    %eax,212(%esp)
-       # in15 = *(uint32 *) (x + 60)
-       movl    60(%edx),%eax
-       # j13 = in13
-       movl    %ecx,216(%esp)
-       # j14 = in14
-       movl    %ebp,220(%esp)
-       # j15 = in15
-       movl    %eax,224(%esp)
-       # x_backup = x
-       movl    %edx,64(%esp)
-._bytesatleast1:
-       #   bytes - 64
-       cmp     $64,%ebx
-       #   goto nocopy if unsigned>=
-       jae     ._nocopy
-       #     ctarget = out
-       movl    %edi,228(%esp)
-       #     out = &tmp
-       leal    0(%esp),%edi
-       #     i = bytes
-       mov     %ebx,%ecx
-       #     while (i) { *out++ = *m++; --i }
-       rep     movsb
-       #     out = &tmp
-       leal    0(%esp),%edi
-       #     m = &tmp
-       leal    0(%esp),%esi
-._nocopy:
-       #   out_backup = out
-       movl    %edi,72(%esp)
-       #   m_backup = m
-       movl    %esi,68(%esp)
-       #   bytes_backup = bytes
-       movl    %ebx,76(%esp)
-       #   in0 = j0
-       movl    164(%esp),%eax
-       #   in1 = j1
-       movl    168(%esp),%ecx
-       #   in2 = j2
-       movl    172(%esp),%edx
-       #   in3 = j3
-       movl    176(%esp),%ebx
-       #   x0 = in0
-       movl    %eax,100(%esp)
-       #   x1 = in1
-       movl    %ecx,104(%esp)
-       #   x2 = in2
-       movl    %edx,108(%esp)
-       #   x3 = in3
-       movl    %ebx,112(%esp)
-       #   in4 = j4
-       movl    180(%esp),%eax
-       #   in5 = j5
-       movl    184(%esp),%ecx
-       #   in6 = j6
-       movl    188(%esp),%edx
-       #   in7 = j7
-       movl    192(%esp),%ebx
-       #   x4 = in4
-       movl    %eax,116(%esp)
-       #   x5 = in5
-       movl    %ecx,120(%esp)
-       #   x6 = in6
-       movl    %edx,124(%esp)
-       #   x7 = in7
-       movl    %ebx,128(%esp)
-       #   in8 = j8
-       movl    196(%esp),%eax
-       #   in9 = j9
-       movl    200(%esp),%ecx
-       #   in10 = j10
-       movl    204(%esp),%edx
-       #   in11 = j11
-       movl    208(%esp),%ebx
-       #   x8 = in8
-       movl    %eax,132(%esp)
-       #   x9 = in9
-       movl    %ecx,136(%esp)
-       #   x10 = in10
-       movl    %edx,140(%esp)
-       #   x11 = in11
-       movl    %ebx,144(%esp)
-       #   in12 = j12
-       movl    212(%esp),%eax
-       #   in13 = j13
-       movl    216(%esp),%ecx
-       #   in14 = j14
-       movl    220(%esp),%edx
-       #   in15 = j15
-       movl    224(%esp),%ebx
-       #   x12 = in12
-       movl    %eax,148(%esp)
-       #   x13 = in13
-       movl    %ecx,152(%esp)
-       #   x14 = in14
-       movl    %edx,156(%esp)
-       #   x15 = in15
-       movl    %ebx,160(%esp)
-       #   i = 20
-       mov     $20,%ebp
-       # p = x0
-       movl    100(%esp),%eax
-       # s = x5
-       movl    120(%esp),%ecx
-       # t = x10
-       movl    140(%esp),%edx
-       # w = x15
-       movl    160(%esp),%ebx
-._mainloop:
-       # x0 = p
-       movl    %eax,100(%esp)
-       #                               x10 = t
-       movl    %edx,140(%esp)
-       # p += x12
-       addl    148(%esp),%eax
-       #               x5 = s
-       movl    %ecx,120(%esp)
-       #                               t += x6
-       addl    124(%esp),%edx
-       #                                               x15 = w
-       movl    %ebx,160(%esp)
-       #               r = x1
-       movl    104(%esp),%esi
-       #               r += s
-       add     %ecx,%esi
-       #                                               v = x11
-       movl    144(%esp),%edi
-       #                                               v += w
-       add     %ebx,%edi
-       # p <<<= 7
-       rol     $7,%eax
-       # p ^= x4
-       xorl    116(%esp),%eax
-       #                               t <<<= 7
-       rol     $7,%edx
-       #                               t ^= x14
-       xorl    156(%esp),%edx
-       #               r <<<= 7
-       rol     $7,%esi
-       #               r ^= x9
-       xorl    136(%esp),%esi
-       #                                               v <<<= 7
-       rol     $7,%edi
-       #                                               v ^= x3
-       xorl    112(%esp),%edi
-       # x4 = p
-       movl    %eax,116(%esp)
-       #                               x14 = t
-       movl    %edx,156(%esp)
-       # p += x0
-       addl    100(%esp),%eax
-       #               x9 = r
-       movl    %esi,136(%esp)
-       #                               t += x10
-       addl    140(%esp),%edx
-       #                                               x3 = v
-       movl    %edi,112(%esp)
-       # p <<<= 9
-       rol     $9,%eax
-       # p ^= x8
-       xorl    132(%esp),%eax
-       #                               t <<<= 9
-       rol     $9,%edx
-       #                               t ^= x2
-       xorl    108(%esp),%edx
-       #               s += r
-       add     %esi,%ecx
-       #               s <<<= 9
-       rol     $9,%ecx
-       #               s ^= x13
-       xorl    152(%esp),%ecx
-       #                                               w += v
-       add     %edi,%ebx
-       #                                               w <<<= 9
-       rol     $9,%ebx
-       #                                               w ^= x7
-       xorl    128(%esp),%ebx
-       # x8 = p
-       movl    %eax,132(%esp)
-       #                               x2 = t
-       movl    %edx,108(%esp)
-       # p += x4
-       addl    116(%esp),%eax
-       #               x13 = s
-       movl    %ecx,152(%esp)
-       #                               t += x14
-       addl    156(%esp),%edx
-       #                                               x7 = w
-       movl    %ebx,128(%esp)
-       # p <<<= 13
-       rol     $13,%eax
-       # p ^= x12
-       xorl    148(%esp),%eax
-       #                               t <<<= 13
-       rol     $13,%edx
-       #                               t ^= x6
-       xorl    124(%esp),%edx
-       #               r += s
-       add     %ecx,%esi
-       #               r <<<= 13
-       rol     $13,%esi
-       #               r ^= x1
-       xorl    104(%esp),%esi
-       #                                               v += w
-       add     %ebx,%edi
-       #                                               v <<<= 13
-       rol     $13,%edi
-       #                                               v ^= x11
-       xorl    144(%esp),%edi
-       # x12 = p
-       movl    %eax,148(%esp)
-       #                               x6 = t
-       movl    %edx,124(%esp)
-       # p += x8
-       addl    132(%esp),%eax
-       #               x1 = r
-       movl    %esi,104(%esp)
-       #                               t += x2
-       addl    108(%esp),%edx
-       #                                               x11 = v
-       movl    %edi,144(%esp)
-       # p <<<= 18
-       rol     $18,%eax
-       # p ^= x0
-       xorl    100(%esp),%eax
-       #                               t <<<= 18
-       rol     $18,%edx
-       #                               t ^= x10
-       xorl    140(%esp),%edx
-       #               s += r
-       add     %esi,%ecx
-       #               s <<<= 18
-       rol     $18,%ecx
-       #               s ^= x5
-       xorl    120(%esp),%ecx
-       #                                               w += v
-       add     %edi,%ebx
-       #                                               w <<<= 18
-       rol     $18,%ebx
-       #                                               w ^= x15
-       xorl    160(%esp),%ebx
-       # x0 = p
-       movl    %eax,100(%esp)
-       #                               x10 = t
-       movl    %edx,140(%esp)
-       # p += x3
-       addl    112(%esp),%eax
-       # p <<<= 7
-       rol     $7,%eax
-       #               x5 = s
-       movl    %ecx,120(%esp)
-       #                               t += x9
-       addl    136(%esp),%edx
-       #                                               x15 = w
-       movl    %ebx,160(%esp)
-       #               r = x4
-       movl    116(%esp),%esi
-       #               r += s
-       add     %ecx,%esi
-       #                                               v = x14
-       movl    156(%esp),%edi
-       #                                               v += w
-       add     %ebx,%edi
-       # p ^= x1
-       xorl    104(%esp),%eax
-       #                               t <<<= 7
-       rol     $7,%edx
-       #                               t ^= x11
-       xorl    144(%esp),%edx
-       #               r <<<= 7
-       rol     $7,%esi
-       #               r ^= x6
-       xorl    124(%esp),%esi
-       #                                               v <<<= 7
-       rol     $7,%edi
-       #                                               v ^= x12
-       xorl    148(%esp),%edi
-       # x1 = p
-       movl    %eax,104(%esp)
-       #                               x11 = t
-       movl    %edx,144(%esp)
-       # p += x0
-       addl    100(%esp),%eax
-       #               x6 = r
-       movl    %esi,124(%esp)
-       #                               t += x10
-       addl    140(%esp),%edx
-       #                                               x12 = v
-       movl    %edi,148(%esp)
-       # p <<<= 9
-       rol     $9,%eax
-       # p ^= x2
-       xorl    108(%esp),%eax
-       #                               t <<<= 9
-       rol     $9,%edx
-       #                               t ^= x8
-       xorl    132(%esp),%edx
-       #               s += r
-       add     %esi,%ecx
-       #               s <<<= 9
-       rol     $9,%ecx
-       #               s ^= x7
-       xorl    128(%esp),%ecx
-       #                                               w += v
-       add     %edi,%ebx
-       #                                               w <<<= 9
-       rol     $9,%ebx
-       #                                               w ^= x13
-       xorl    152(%esp),%ebx
-       # x2 = p
-       movl    %eax,108(%esp)
-       #                               x8 = t
-       movl    %edx,132(%esp)
-       # p += x1
-       addl    104(%esp),%eax
-       #               x7 = s
-       movl    %ecx,128(%esp)
-       #                               t += x11
-       addl    144(%esp),%edx
-       #                                               x13 = w
-       movl    %ebx,152(%esp)
-       # p <<<= 13
-       rol     $13,%eax
-       # p ^= x3
-       xorl    112(%esp),%eax
-       #                               t <<<= 13
-       rol     $13,%edx
-       #                               t ^= x9
-       xorl    136(%esp),%edx
-       #               r += s
-       add     %ecx,%esi
-       #               r <<<= 13
-       rol     $13,%esi
-       #               r ^= x4
-       xorl    116(%esp),%esi
-       #                                               v += w
-       add     %ebx,%edi
-       #                                               v <<<= 13
-       rol     $13,%edi
-       #                                               v ^= x14
-       xorl    156(%esp),%edi
-       # x3 = p
-       movl    %eax,112(%esp)
-       #                               x9 = t
-       movl    %edx,136(%esp)
-       # p += x2
-       addl    108(%esp),%eax
-       #               x4 = r
-       movl    %esi,116(%esp)
-       #                               t += x8
-       addl    132(%esp),%edx
-       #                                               x14 = v
-       movl    %edi,156(%esp)
-       # p <<<= 18
-       rol     $18,%eax
-       # p ^= x0
-       xorl    100(%esp),%eax
-       #                               t <<<= 18
-       rol     $18,%edx
-       #                               t ^= x10
-       xorl    140(%esp),%edx
-       #               s += r
-       add     %esi,%ecx
-       #               s <<<= 18
-       rol     $18,%ecx
-       #               s ^= x5
-       xorl    120(%esp),%ecx
-       #                                               w += v
-       add     %edi,%ebx
-       #                                               w <<<= 18
-       rol     $18,%ebx
-       #                                               w ^= x15
-       xorl    160(%esp),%ebx
-       # x0 = p
-       movl    %eax,100(%esp)
-       #                               x10 = t
-       movl    %edx,140(%esp)
-       # p += x12
-       addl    148(%esp),%eax
-       #               x5 = s
-       movl    %ecx,120(%esp)
-       #                               t += x6
-       addl    124(%esp),%edx
-       #                                               x15 = w
-       movl    %ebx,160(%esp)
-       #               r = x1
-       movl    104(%esp),%esi
-       #               r += s
-       add     %ecx,%esi
-       #                                               v = x11
-       movl    144(%esp),%edi
-       #                                               v += w
-       add     %ebx,%edi
-       # p <<<= 7
-       rol     $7,%eax
-       # p ^= x4
-       xorl    116(%esp),%eax
-       #                               t <<<= 7
-       rol     $7,%edx
-       #                               t ^= x14
-       xorl    156(%esp),%edx
-       #               r <<<= 7
-       rol     $7,%esi
-       #               r ^= x9
-       xorl    136(%esp),%esi
-       #                                               v <<<= 7
-       rol     $7,%edi
-       #                                               v ^= x3
-       xorl    112(%esp),%edi
-       # x4 = p
-       movl    %eax,116(%esp)
-       #                               x14 = t
-       movl    %edx,156(%esp)
-       # p += x0
-       addl    100(%esp),%eax
-       #               x9 = r
-       movl    %esi,136(%esp)
-       #                               t += x10
-       addl    140(%esp),%edx
-       #                                               x3 = v
-       movl    %edi,112(%esp)
-       # p <<<= 9
-       rol     $9,%eax
-       # p ^= x8
-       xorl    132(%esp),%eax
-       #                               t <<<= 9
-       rol     $9,%edx
-       #                               t ^= x2
-       xorl    108(%esp),%edx
-       #               s += r
-       add     %esi,%ecx
-       #               s <<<= 9
-       rol     $9,%ecx
-       #               s ^= x13
-       xorl    152(%esp),%ecx
-       #                                               w += v
-       add     %edi,%ebx
-       #                                               w <<<= 9
-       rol     $9,%ebx
-       #                                               w ^= x7
-       xorl    128(%esp),%ebx
-       # x8 = p
-       movl    %eax,132(%esp)
-       #                               x2 = t
-       movl    %edx,108(%esp)
-       # p += x4
-       addl    116(%esp),%eax
-       #               x13 = s
-       movl    %ecx,152(%esp)
-       #                               t += x14
-       addl    156(%esp),%edx
-       #                                               x7 = w
-       movl    %ebx,128(%esp)
-       # p <<<= 13
-       rol     $13,%eax
-       # p ^= x12
-       xorl    148(%esp),%eax
-       #                               t <<<= 13
-       rol     $13,%edx
-       #                               t ^= x6
-       xorl    124(%esp),%edx
-       #               r += s
-       add     %ecx,%esi
-       #               r <<<= 13
-       rol     $13,%esi
-       #               r ^= x1
-       xorl    104(%esp),%esi
-       #                                               v += w
-       add     %ebx,%edi
-       #                                               v <<<= 13
-       rol     $13,%edi
-       #                                               v ^= x11
-       xorl    144(%esp),%edi
-       # x12 = p
-       movl    %eax,148(%esp)
-       #                               x6 = t
-       movl    %edx,124(%esp)
-       # p += x8
-       addl    132(%esp),%eax
-       #               x1 = r
-       movl    %esi,104(%esp)
-       #                               t += x2
-       addl    108(%esp),%edx
-       #                                               x11 = v
-       movl    %edi,144(%esp)
-       # p <<<= 18
-       rol     $18,%eax
-       # p ^= x0
-       xorl    100(%esp),%eax
-       #                               t <<<= 18
-       rol     $18,%edx
-       #                               t ^= x10
-       xorl    140(%esp),%edx
-       #               s += r
-       add     %esi,%ecx
-       #               s <<<= 18
-       rol     $18,%ecx
-       #               s ^= x5
-       xorl    120(%esp),%ecx
-       #                                               w += v
-       add     %edi,%ebx
-       #                                               w <<<= 18
-       rol     $18,%ebx
-       #                                               w ^= x15
-       xorl    160(%esp),%ebx
-       # x0 = p
-       movl    %eax,100(%esp)
-       #                               x10 = t
-       movl    %edx,140(%esp)
-       # p += x3
-       addl    112(%esp),%eax
-       # p <<<= 7
-       rol     $7,%eax
-       #               x5 = s
-       movl    %ecx,120(%esp)
-       #                               t += x9
-       addl    136(%esp),%edx
-       #                                               x15 = w
-       movl    %ebx,160(%esp)
-       #               r = x4
-       movl    116(%esp),%esi
-       #               r += s
-       add     %ecx,%esi
-       #                                               v = x14
-       movl    156(%esp),%edi
-       #                                               v += w
-       add     %ebx,%edi
-       # p ^= x1
-       xorl    104(%esp),%eax
-       #                               t <<<= 7
-       rol     $7,%edx
-       #                               t ^= x11
-       xorl    144(%esp),%edx
-       #               r <<<= 7
-       rol     $7,%esi
-       #               r ^= x6
-       xorl    124(%esp),%esi
-       #                                               v <<<= 7
-       rol     $7,%edi
-       #                                               v ^= x12
-       xorl    148(%esp),%edi
-       # x1 = p
-       movl    %eax,104(%esp)
-       #                               x11 = t
-       movl    %edx,144(%esp)
-       # p += x0
-       addl    100(%esp),%eax
-       #               x6 = r
-       movl    %esi,124(%esp)
-       #                               t += x10
-       addl    140(%esp),%edx
-       #                                               x12 = v
-       movl    %edi,148(%esp)
-       # p <<<= 9
-       rol     $9,%eax
-       # p ^= x2
-       xorl    108(%esp),%eax
-       #                               t <<<= 9
-       rol     $9,%edx
-       #                               t ^= x8
-       xorl    132(%esp),%edx
-       #               s += r
-       add     %esi,%ecx
-       #               s <<<= 9
-       rol     $9,%ecx
-       #               s ^= x7
-       xorl    128(%esp),%ecx
-       #                                               w += v
-       add     %edi,%ebx
-       #                                               w <<<= 9
-       rol     $9,%ebx
-       #                                               w ^= x13
-       xorl    152(%esp),%ebx
-       # x2 = p
-       movl    %eax,108(%esp)
-       #                               x8 = t
-       movl    %edx,132(%esp)
-       # p += x1
-       addl    104(%esp),%eax
-       #               x7 = s
-       movl    %ecx,128(%esp)
-       #                               t += x11
-       addl    144(%esp),%edx
-       #                                               x13 = w
-       movl    %ebx,152(%esp)
-       # p <<<= 13
-       rol     $13,%eax
-       # p ^= x3
-       xorl    112(%esp),%eax
-       #                               t <<<= 13
-       rol     $13,%edx
-       #                               t ^= x9
-       xorl    136(%esp),%edx
-       #               r += s
-       add     %ecx,%esi
-       #               r <<<= 13
-       rol     $13,%esi
-       #               r ^= x4
-       xorl    116(%esp),%esi
-       #                                               v += w
-       add     %ebx,%edi
-       #                                               v <<<= 13
-       rol     $13,%edi
-       #                                               v ^= x14
-       xorl    156(%esp),%edi
-       # x3 = p
-       movl    %eax,112(%esp)
-       #                               x9 = t
-       movl    %edx,136(%esp)
-       # p += x2
-       addl    108(%esp),%eax
-       #               x4 = r
-       movl    %esi,116(%esp)
-       #                               t += x8
-       addl    132(%esp),%edx
-       #                                               x14 = v
-       movl    %edi,156(%esp)
-       # p <<<= 18
-       rol     $18,%eax
-       # p ^= x0
-       xorl    100(%esp),%eax
-       #                               t <<<= 18
-       rol     $18,%edx
-       #                               t ^= x10
-       xorl    140(%esp),%edx
-       #               s += r
-       add     %esi,%ecx
-       #               s <<<= 18
-       rol     $18,%ecx
-       #               s ^= x5
-       xorl    120(%esp),%ecx
-       #                                               w += v
-       add     %edi,%ebx
-       #                                               w <<<= 18
-       rol     $18,%ebx
-       #                                               w ^= x15
-       xorl    160(%esp),%ebx
-       # i -= 4
-       sub     $4,%ebp
-       # goto mainloop if unsigned >
-       ja      ._mainloop
-       # x0 = p
-       movl    %eax,100(%esp)
-       # x5 = s
-       movl    %ecx,120(%esp)
-       # x10 = t
-       movl    %edx,140(%esp)
-       # x15 = w
-       movl    %ebx,160(%esp)
-       #   out = out_backup
-       movl    72(%esp),%edi
-       #   m = m_backup
-       movl    68(%esp),%esi
-       #   in0 = x0
-       movl    100(%esp),%eax
-       #   in1 = x1
-       movl    104(%esp),%ecx
-       #   in0 += j0
-       addl    164(%esp),%eax
-       #   in1 += j1
-       addl    168(%esp),%ecx
-       #   in0 ^= *(uint32 *) (m + 0)
-       xorl    0(%esi),%eax
-       #   in1 ^= *(uint32 *) (m + 4)
-       xorl    4(%esi),%ecx
-       #   *(uint32 *) (out + 0) = in0
-       movl    %eax,0(%edi)
-       #   *(uint32 *) (out + 4) = in1
-       movl    %ecx,4(%edi)
-       #   in2 = x2
-       movl    108(%esp),%eax
-       #   in3 = x3
-       movl    112(%esp),%ecx
-       #   in2 += j2
-       addl    172(%esp),%eax
-       #   in3 += j3
-       addl    176(%esp),%ecx
-       #   in2 ^= *(uint32 *) (m + 8)
-       xorl    8(%esi),%eax
-       #   in3 ^= *(uint32 *) (m + 12)
-       xorl    12(%esi),%ecx
-       #   *(uint32 *) (out + 8) = in2
-       movl    %eax,8(%edi)
-       #   *(uint32 *) (out + 12) = in3
-       movl    %ecx,12(%edi)
-       #   in4 = x4
-       movl    116(%esp),%eax
-       #   in5 = x5
-       movl    120(%esp),%ecx
-       #   in4 += j4
-       addl    180(%esp),%eax
-       #   in5 += j5
-       addl    184(%esp),%ecx
-       #   in4 ^= *(uint32 *) (m + 16)
-       xorl    16(%esi),%eax
-       #   in5 ^= *(uint32 *) (m + 20)
-       xorl    20(%esi),%ecx
-       #   *(uint32 *) (out + 16) = in4
-       movl    %eax,16(%edi)
-       #   *(uint32 *) (out + 20) = in5
-       movl    %ecx,20(%edi)
-       #   in6 = x6
-       movl    124(%esp),%eax
-       #   in7 = x7
-       movl    128(%esp),%ecx
-       #   in6 += j6
-       addl    188(%esp),%eax
-       #   in7 += j7
-       addl    192(%esp),%ecx
-       #   in6 ^= *(uint32 *) (m + 24)
-       xorl    24(%esi),%eax
-       #   in7 ^= *(uint32 *) (m + 28)
-       xorl    28(%esi),%ecx
-       #   *(uint32 *) (out + 24) = in6
-       movl    %eax,24(%edi)
-       #   *(uint32 *) (out + 28) = in7
-       movl    %ecx,28(%edi)
-       #   in8 = x8
-       movl    132(%esp),%eax
-       #   in9 = x9
-       movl    136(%esp),%ecx
-       #   in8 += j8
-       addl    196(%esp),%eax
-       #   in9 += j9
-       addl    200(%esp),%ecx
-       #   in8 ^= *(uint32 *) (m + 32)
-       xorl    32(%esi),%eax
-       #   in9 ^= *(uint32 *) (m + 36)
-       xorl    36(%esi),%ecx
-       #   *(uint32 *) (out + 32) = in8
-       movl    %eax,32(%edi)
-       #   *(uint32 *) (out + 36) = in9
-       movl    %ecx,36(%edi)
-       #   in10 = x10
-       movl    140(%esp),%eax
-       #   in11 = x11
-       movl    144(%esp),%ecx
-       #   in10 += j10
-       addl    204(%esp),%eax
-       #   in11 += j11
-       addl    208(%esp),%ecx
-       #   in10 ^= *(uint32 *) (m + 40)
-       xorl    40(%esi),%eax
-       #   in11 ^= *(uint32 *) (m + 44)
-       xorl    44(%esi),%ecx
-       #   *(uint32 *) (out + 40) = in10
-       movl    %eax,40(%edi)
-       #   *(uint32 *) (out + 44) = in11
-       movl    %ecx,44(%edi)
-       #   in12 = x12
-       movl    148(%esp),%eax
-       #   in13 = x13
-       movl    152(%esp),%ecx
-       #   in12 += j12
-       addl    212(%esp),%eax
-       #   in13 += j13
-       addl    216(%esp),%ecx
-       #   in12 ^= *(uint32 *) (m + 48)
-       xorl    48(%esi),%eax
-       #   in13 ^= *(uint32 *) (m + 52)
-       xorl    52(%esi),%ecx
-       #   *(uint32 *) (out + 48) = in12
-       movl    %eax,48(%edi)
-       #   *(uint32 *) (out + 52) = in13
-       movl    %ecx,52(%edi)
-       #   in14 = x14
-       movl    156(%esp),%eax
-       #   in15 = x15
-       movl    160(%esp),%ecx
-       #   in14 += j14
-       addl    220(%esp),%eax
-       #   in15 += j15
-       addl    224(%esp),%ecx
-       #   in14 ^= *(uint32 *) (m + 56)
-       xorl    56(%esi),%eax
-       #   in15 ^= *(uint32 *) (m + 60)
-       xorl    60(%esi),%ecx
-       #   *(uint32 *) (out + 56) = in14
-       movl    %eax,56(%edi)
-       #   *(uint32 *) (out + 60) = in15
-       movl    %ecx,60(%edi)
-       #   bytes = bytes_backup
-       movl    76(%esp),%ebx
-       #   in8 = j8
-       movl    196(%esp),%eax
-       #   in9 = j9
-       movl    200(%esp),%ecx
-       #   in8 += 1
-       add     $1,%eax
-       #   in9 += 0 + carry
-       adc     $0,%ecx
-       #   j8 = in8
-       movl    %eax,196(%esp)
-       #   j9 = in9
-       movl    %ecx,200(%esp)
-       #   bytes - 64
-       cmp     $64,%ebx
-       #   goto bytesatleast65 if unsigned>
-       ja      ._bytesatleast65
-       #     goto bytesatleast64 if unsigned>=
-       jae     ._bytesatleast64
-       #       m = out
-       mov     %edi,%esi
-       #       out = ctarget
-       movl    228(%esp),%edi
-       #       i = bytes
-       mov     %ebx,%ecx
-       #       while (i) { *out++ = *m++; --i }
-       rep     movsb
-._bytesatleast64:
-       #     x = x_backup
-       movl    64(%esp),%eax
-       #     in8 = j8
-       movl    196(%esp),%ecx
-       #     in9 = j9
-       movl    200(%esp),%edx
-       #     *(uint32 *) (x + 32) = in8
-       movl    %ecx,32(%eax)
-       #     *(uint32 *) (x + 36) = in9
-       movl    %edx,36(%eax)
-._done:
-       #     eax = eax_stack
-       movl    80(%esp),%eax
-       #     ebx = ebx_stack
-       movl    84(%esp),%ebx
-       #     esi = esi_stack
-       movl    88(%esp),%esi
-       #     edi = edi_stack
-       movl    92(%esp),%edi
-       #     ebp = ebp_stack
-       movl    96(%esp),%ebp
-       #     leave
-       add     %eax,%esp
-       ret
-._bytesatleast65:
-       #   bytes -= 64
-       sub     $64,%ebx
-       #   out += 64
-       add     $64,%edi
-       #   m += 64
-       add     $64,%esi
-       # goto bytesatleast1
-       jmp     ._bytesatleast1
-ENDPROC(salsa20_encrypt_bytes)
diff --git a/arch/x86/crypto/salsa20-x86_64-asm_64.S b/arch/x86/crypto/salsa20-x86_64-asm_64.S
deleted file mode 100644
index 03a4918f41ee..000000000000
--- a/arch/x86/crypto/salsa20-x86_64-asm_64.S
+++ /dev/null
@@ -1,805 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#include <linux/linkage.h>
-
-# enter salsa20_encrypt_bytes
-ENTRY(salsa20_encrypt_bytes)
-       mov     %rsp,%r11
-       and     $31,%r11
-       add     $256,%r11
-       sub     %r11,%rsp
-       # x = arg1
-       mov     %rdi,%r8
-       # m = arg2
-       mov     %rsi,%rsi
-       # out = arg3
-       mov     %rdx,%rdi
-       # bytes = arg4
-       mov     %rcx,%rdx
-       #               unsigned>? bytes - 0
-       cmp     $0,%rdx
-       # comment:fp stack unchanged by jump
-       # goto done if !unsigned>
-       jbe     ._done
-       # comment:fp stack unchanged by fallthrough
-# start:
-._start:
-       # r11_stack = r11
-       movq    %r11,0(%rsp)
-       # r12_stack = r12
-       movq    %r12,8(%rsp)
-       # r13_stack = r13
-       movq    %r13,16(%rsp)
-       # r14_stack = r14
-       movq    %r14,24(%rsp)
-       # r15_stack = r15
-       movq    %r15,32(%rsp)
-       # rbx_stack = rbx
-       movq    %rbx,40(%rsp)
-       # rbp_stack = rbp
-       movq    %rbp,48(%rsp)
-       # in0 = *(uint64 *) (x + 0)
-       movq    0(%r8),%rcx
-       # in2 = *(uint64 *) (x + 8)
-       movq    8(%r8),%r9
-       # in4 = *(uint64 *) (x + 16)
-       movq    16(%r8),%rax
-       # in6 = *(uint64 *) (x + 24)
-       movq    24(%r8),%r10
-       # in8 = *(uint64 *) (x + 32)
-       movq    32(%r8),%r11
-       # in10 = *(uint64 *) (x + 40)
-       movq    40(%r8),%r12
-       # in12 = *(uint64 *) (x + 48)
-       movq    48(%r8),%r13
-       # in14 = *(uint64 *) (x + 56)
-       movq    56(%r8),%r14
-       # j0 = in0
-       movq    %rcx,56(%rsp)
-       # j2 = in2
-       movq    %r9,64(%rsp)
-       # j4 = in4
-       movq    %rax,72(%rsp)
-       # j6 = in6
-       movq    %r10,80(%rsp)
-       # j8 = in8
-       movq    %r11,88(%rsp)
-       # j10 = in10
-       movq    %r12,96(%rsp)
-       # j12 = in12
-       movq    %r13,104(%rsp)
-       # j14 = in14
-       movq    %r14,112(%rsp)
-       # x_backup = x
-       movq    %r8,120(%rsp)
-# bytesatleast1:
-._bytesatleast1:
-       #                   unsigned<? bytes - 64
-       cmp     $64,%rdx
-       # comment:fp stack unchanged by jump
-       #   goto nocopy if !unsigned<
-       jae     ._nocopy
-       #     ctarget = out
-       movq    %rdi,128(%rsp)
-       #     out = &tmp
-       leaq    192(%rsp),%rdi
-       #     i = bytes
-       mov     %rdx,%rcx
-       #     while (i) { *out++ = *m++; --i }
-       rep     movsb
-       #     out = &tmp
-       leaq    192(%rsp),%rdi
-       #     m = &tmp
-       leaq    192(%rsp),%rsi
-       # comment:fp stack unchanged by fallthrough
-#   nocopy:
-._nocopy:
-       #   out_backup = out
-       movq    %rdi,136(%rsp)
-       #   m_backup = m
-       movq    %rsi,144(%rsp)
-       #   bytes_backup = bytes
-       movq    %rdx,152(%rsp)
-       #   x1 = j0
-       movq    56(%rsp),%rdi
-       #   x0 = x1
-       mov     %rdi,%rdx
-       #   (uint64) x1 >>= 32
-       shr     $32,%rdi
-       #               x3 = j2
-       movq    64(%rsp),%rsi
-       #               x2 = x3
-       mov     %rsi,%rcx
-       #               (uint64) x3 >>= 32
-       shr     $32,%rsi
-       #   x5 = j4
-       movq    72(%rsp),%r8
-       #   x4 = x5
-       mov     %r8,%r9
-       #   (uint64) x5 >>= 32
-       shr     $32,%r8
-       #   x5_stack = x5
-       movq    %r8,160(%rsp)
-       #               x7 = j6
-       movq    80(%rsp),%r8
-       #               x6 = x7
-       mov     %r8,%rax
-       #               (uint64) x7 >>= 32
-       shr     $32,%r8
-       #   x9 = j8
-       movq    88(%rsp),%r10
-       #   x8 = x9
-       mov     %r10,%r11
-       #   (uint64) x9 >>= 32
-       shr     $32,%r10
-       #               x11 = j10
-       movq    96(%rsp),%r12
-       #               x10 = x11
-       mov     %r12,%r13
-       #               x10_stack = x10
-       movq    %r13,168(%rsp)
-       #               (uint64) x11 >>= 32
-       shr     $32,%r12
-       #   x13 = j12
-       movq    104(%rsp),%r13
-       #   x12 = x13
-       mov     %r13,%r14
-       #   (uint64) x13 >>= 32
-       shr     $32,%r13
-       #               x15 = j14
-       movq    112(%rsp),%r15
-       #               x14 = x15
-       mov     %r15,%rbx
-       #               (uint64) x15 >>= 32
-       shr     $32,%r15
-       #               x15_stack = x15
-       movq    %r15,176(%rsp)
-       #   i = 20
-       mov     $20,%r15
-#   mainloop:
-._mainloop:
-       #   i_backup = i
-       movq    %r15,184(%rsp)
-       #               x5 = x5_stack
-       movq    160(%rsp),%r15
-       # a = x12 + x0
-       lea     (%r14,%rdx),%rbp
-       # (uint32) a <<<= 7
-       rol     $7,%ebp
-       # x4 ^= a
-       xor     %rbp,%r9
-       #               b = x1 + x5
-       lea     (%rdi,%r15),%rbp
-       #               (uint32) b <<<= 7
-       rol     $7,%ebp
-       #               x9 ^= b
-       xor     %rbp,%r10
-       # a = x0 + x4
-       lea     (%rdx,%r9),%rbp
-       # (uint32) a <<<= 9
-       rol     $9,%ebp
-       # x8 ^= a
-       xor     %rbp,%r11
-       #               b = x5 + x9
-       lea     (%r15,%r10),%rbp
-       #               (uint32) b <<<= 9
-       rol     $9,%ebp
-       #               x13 ^= b
-       xor     %rbp,%r13
-       # a = x4 + x8
-       lea     (%r9,%r11),%rbp
-       # (uint32) a <<<= 13
-       rol     $13,%ebp
-       # x12 ^= a
-       xor     %rbp,%r14
-       #               b = x9 + x13
-       lea     (%r10,%r13),%rbp
-       #               (uint32) b <<<= 13
-       rol     $13,%ebp
-       #               x1 ^= b
-       xor     %rbp,%rdi
-       # a = x8 + x12
-       lea     (%r11,%r14),%rbp
-       # (uint32) a <<<= 18
-       rol     $18,%ebp
-       # x0 ^= a
-       xor     %rbp,%rdx
-       #               b = x13 + x1
-       lea     (%r13,%rdi),%rbp
-       #               (uint32) b <<<= 18
-       rol     $18,%ebp
-       #               x5 ^= b
-       xor     %rbp,%r15
-       #                               x10 = x10_stack
-       movq    168(%rsp),%rbp
-       #               x5_stack = x5
-       movq    %r15,160(%rsp)
-       #                               c = x6 + x10
-       lea     (%rax,%rbp),%r15
-       #                               (uint32) c <<<= 7
-       rol     $7,%r15d
-       #                               x14 ^= c
-       xor     %r15,%rbx
-       #                               c = x10 + x14
-       lea     (%rbp,%rbx),%r15
-       #                               (uint32) c <<<= 9
-       rol     $9,%r15d
-       #                               x2 ^= c
-       xor     %r15,%rcx
-       #                               c = x14 + x2
-       lea     (%rbx,%rcx),%r15
-       #                               (uint32) c <<<= 13
-       rol     $13,%r15d
-       #                               x6 ^= c
-       xor     %r15,%rax
-       #                               c = x2 + x6
-       lea     (%rcx,%rax),%r15
-       #                               (uint32) c <<<= 18
-       rol     $18,%r15d
-       #                               x10 ^= c
-       xor     %r15,%rbp
-       #                                               x15 = x15_stack
-       movq    176(%rsp),%r15
-       #                               x10_stack = x10
-       movq    %rbp,168(%rsp)
-       #                                               d = x11 + x15
-       lea     (%r12,%r15),%rbp
-       #                                               (uint32) d <<<= 7
-       rol     $7,%ebp
-       #                                               x3 ^= d
-       xor     %rbp,%rsi
-       #                                               d = x15 + x3
-       lea     (%r15,%rsi),%rbp
-       #                                               (uint32) d <<<= 9
-       rol     $9,%ebp
-       #                                               x7 ^= d
-       xor     %rbp,%r8
-       #                                               d = x3 + x7
-       lea     (%rsi,%r8),%rbp
-       #                                               (uint32) d <<<= 13
-       rol     $13,%ebp
-       #                                               x11 ^= d
-       xor     %rbp,%r12
-       #                                               d = x7 + x11
-       lea     (%r8,%r12),%rbp
-       #                                               (uint32) d <<<= 18
-       rol     $18,%ebp
-       #                                               x15 ^= d
-       xor     %rbp,%r15
-       #                                               x15_stack = x15
-       movq    %r15,176(%rsp)
-       #               x5 = x5_stack
-       movq    160(%rsp),%r15
-       # a = x3 + x0
-       lea     (%rsi,%rdx),%rbp
-       # (uint32) a <<<= 7
-       rol     $7,%ebp
-       # x1 ^= a
-       xor     %rbp,%rdi
-       #               b = x4 + x5
-       lea     (%r9,%r15),%rbp
-       #               (uint32) b <<<= 7
-       rol     $7,%ebp
-       #               x6 ^= b
-       xor     %rbp,%rax
-       # a = x0 + x1
-       lea     (%rdx,%rdi),%rbp
-       # (uint32) a <<<= 9
-       rol     $9,%ebp
-       # x2 ^= a
-       xor     %rbp,%rcx
-       #               b = x5 + x6
-       lea     (%r15,%rax),%rbp
-       #               (uint32) b <<<= 9
-       rol     $9,%ebp
-       #               x7 ^= b
-       xor     %rbp,%r8
-       # a = x1 + x2
-       lea     (%rdi,%rcx),%rbp
-       # (uint32) a <<<= 13
-       rol     $13,%ebp
-       # x3 ^= a
-       xor     %rbp,%rsi
-       #               b = x6 + x7
-       lea     (%rax,%r8),%rbp
-       #               (uint32) b <<<= 13
-       rol     $13,%ebp
-       #               x4 ^= b
-       xor     %rbp,%r9
-       # a = x2 + x3
-       lea     (%rcx,%rsi),%rbp
-       # (uint32) a <<<= 18
-       rol     $18,%ebp
-       # x0 ^= a
-       xor     %rbp,%rdx
-       #               b = x7 + x4
-       lea     (%r8,%r9),%rbp
-       #               (uint32) b <<<= 18
-       rol     $18,%ebp
-       #               x5 ^= b
-       xor     %rbp,%r15
-       #                               x10 = x10_stack
-       movq    168(%rsp),%rbp
-       #               x5_stack = x5
-       movq    %r15,160(%rsp)
-       #                               c = x9 + x10
-       lea     (%r10,%rbp),%r15
-       #                               (uint32) c <<<= 7
-       rol     $7,%r15d
-       #                               x11 ^= c
-       xor     %r15,%r12
-       #                               c = x10 + x11
-       lea     (%rbp,%r12),%r15
-       #                               (uint32) c <<<= 9
-       rol     $9,%r15d
-       #                               x8 ^= c
-       xor     %r15,%r11
-       #                               c = x11 + x8
-       lea     (%r12,%r11),%r15
-       #                               (uint32) c <<<= 13
-       rol     $13,%r15d
-       #                               x9 ^= c
-       xor     %r15,%r10
-       #                               c = x8 + x9
-       lea     (%r11,%r10),%r15
-       #                               (uint32) c <<<= 18
-       rol     $18,%r15d
-       #                               x10 ^= c
-       xor     %r15,%rbp
-       #                                               x15 = x15_stack
-       movq    176(%rsp),%r15
-       #                               x10_stack = x10
-       movq    %rbp,168(%rsp)
-       #                                               d = x14 + x15
-       lea     (%rbx,%r15),%rbp
-       #                                               (uint32) d <<<= 7
-       rol     $7,%ebp
-       #                                               x12 ^= d
-       xor     %rbp,%r14
-       #                                               d = x15 + x12
-       lea     (%r15,%r14),%rbp
-       #                                               (uint32) d <<<= 9
-       rol     $9,%ebp
-       #                                               x13 ^= d
-       xor     %rbp,%r13
-       #                                               d = x12 + x13
-       lea     (%r14,%r13),%rbp
-       #                                               (uint32) d <<<= 13
-       rol     $13,%ebp
-       #                                               x14 ^= d
-       xor     %rbp,%rbx
-       #                                               d = x13 + x14
-       lea     (%r13,%rbx),%rbp
-       #                                               (uint32) d <<<= 18
-       rol     $18,%ebp
-       #                                               x15 ^= d
-       xor     %rbp,%r15
-       #                                               x15_stack = x15
-       movq    %r15,176(%rsp)
-       #               x5 = x5_stack
-       movq    160(%rsp),%r15
-       # a = x12 + x0
-       lea     (%r14,%rdx),%rbp
-       # (uint32) a <<<= 7
-       rol     $7,%ebp
-       # x4 ^= a
-       xor     %rbp,%r9
-       #               b = x1 + x5
-       lea     (%rdi,%r15),%rbp
-       #               (uint32) b <<<= 7
-       rol     $7,%ebp
-       #               x9 ^= b
-       xor     %rbp,%r10
-       # a = x0 + x4
-       lea     (%rdx,%r9),%rbp
-       # (uint32) a <<<= 9
-       rol     $9,%ebp
-       # x8 ^= a
-       xor     %rbp,%r11
-       #               b = x5 + x9
-       lea     (%r15,%r10),%rbp
-       #               (uint32) b <<<= 9
-       rol     $9,%ebp
-       #               x13 ^= b
-       xor     %rbp,%r13
-       # a = x4 + x8
-       lea     (%r9,%r11),%rbp
-       # (uint32) a <<<= 13
-       rol     $13,%ebp
-       # x12 ^= a
-       xor     %rbp,%r14
-       #               b = x9 + x13
-       lea     (%r10,%r13),%rbp
-       #               (uint32) b <<<= 13
-       rol     $13,%ebp
-       #               x1 ^= b
-       xor     %rbp,%rdi
-       # a = x8 + x12
-       lea     (%r11,%r14),%rbp
-       # (uint32) a <<<= 18
-       rol     $18,%ebp
-       # x0 ^= a
-       xor     %rbp,%rdx
-       #               b = x13 + x1
-       lea     (%r13,%rdi),%rbp
-       #               (uint32) b <<<= 18
-       rol     $18,%ebp
-       #               x5 ^= b
-       xor     %rbp,%r15
-       #                               x10 = x10_stack
-       movq    168(%rsp),%rbp
-       #               x5_stack = x5
-       movq    %r15,160(%rsp)
-       #                               c = x6 + x10
-       lea     (%rax,%rbp),%r15
-       #                               (uint32) c <<<= 7
-       rol     $7,%r15d
-       #                               x14 ^= c
-       xor     %r15,%rbx
-       #                               c = x10 + x14
-       lea     (%rbp,%rbx),%r15
-       #                               (uint32) c <<<= 9
-       rol     $9,%r15d
-       #                               x2 ^= c
-       xor     %r15,%rcx
-       #                               c = x14 + x2
-       lea     (%rbx,%rcx),%r15
-       #                               (uint32) c <<<= 13
-       rol     $13,%r15d
-       #                               x6 ^= c
-       xor     %r15,%rax
-       #                               c = x2 + x6
-       lea     (%rcx,%rax),%r15
-       #                               (uint32) c <<<= 18
-       rol     $18,%r15d
-       #                               x10 ^= c
-       xor     %r15,%rbp
-       #                                               x15 = x15_stack
-       movq    176(%rsp),%r15
-       #                               x10_stack = x10
-       movq    %rbp,168(%rsp)
-       #                                               d = x11 + x15
-       lea     (%r12,%r15),%rbp
-       #                                               (uint32) d <<<= 7
-       rol     $7,%ebp
-       #                                               x3 ^= d
-       xor     %rbp,%rsi
-       #                                               d = x15 + x3
-       lea     (%r15,%rsi),%rbp
-       #                                               (uint32) d <<<= 9
-       rol     $9,%ebp
-       #                                               x7 ^= d
-       xor     %rbp,%r8
-       #                                               d = x3 + x7
-       lea     (%rsi,%r8),%rbp
-       #                                               (uint32) d <<<= 13
-       rol     $13,%ebp
-       #                                               x11 ^= d
-       xor     %rbp,%r12
-       #                                               d = x7 + x11
-       lea     (%r8,%r12),%rbp
-       #                                               (uint32) d <<<= 18
-       rol     $18,%ebp
-       #                                               x15 ^= d
-       xor     %rbp,%r15
-       #                                               x15_stack = x15
-       movq    %r15,176(%rsp)
-       #               x5 = x5_stack
-       movq    160(%rsp),%r15
-       # a = x3 + x0
-       lea     (%rsi,%rdx),%rbp
-       # (uint32) a <<<= 7
-       rol     $7,%ebp
-       # x1 ^= a
-       xor     %rbp,%rdi
-       #               b = x4 + x5
-       lea     (%r9,%r15),%rbp
-       #               (uint32) b <<<= 7
-       rol     $7,%ebp
-       #               x6 ^= b
-       xor     %rbp,%rax
-       # a = x0 + x1
-       lea     (%rdx,%rdi),%rbp
-       # (uint32) a <<<= 9
-       rol     $9,%ebp
-       # x2 ^= a
-       xor     %rbp,%rcx
-       #               b = x5 + x6
-       lea     (%r15,%rax),%rbp
-       #               (uint32) b <<<= 9
-       rol     $9,%ebp
-       #               x7 ^= b
-       xor     %rbp,%r8
-       # a = x1 + x2
-       lea     (%rdi,%rcx),%rbp
-       # (uint32) a <<<= 13
-       rol     $13,%ebp
-       # x3 ^= a
-       xor     %rbp,%rsi
-       #               b = x6 + x7
-       lea     (%rax,%r8),%rbp
-       #               (uint32) b <<<= 13
-       rol     $13,%ebp
-       #               x4 ^= b
-       xor     %rbp,%r9
-       # a = x2 + x3
-       lea     (%rcx,%rsi),%rbp
-       # (uint32) a <<<= 18
-       rol     $18,%ebp
-       # x0 ^= a
-       xor     %rbp,%rdx
-       #               b = x7 + x4
-       lea     (%r8,%r9),%rbp
-       #               (uint32) b <<<= 18
-       rol     $18,%ebp
-       #               x5 ^= b
-       xor     %rbp,%r15
-       #                               x10 = x10_stack
-       movq    168(%rsp),%rbp
-       #               x5_stack = x5
-       movq    %r15,160(%rsp)
-       #                               c = x9 + x10
-       lea     (%r10,%rbp),%r15
-       #                               (uint32) c <<<= 7
-       rol     $7,%r15d
-       #                               x11 ^= c
-       xor     %r15,%r12
-       #                               c = x10 + x11
-       lea     (%rbp,%r12),%r15
-       #                               (uint32) c <<<= 9
-       rol     $9,%r15d
-       #                               x8 ^= c
-       xor     %r15,%r11
-       #                               c = x11 + x8
-       lea     (%r12,%r11),%r15
-       #                               (uint32) c <<<= 13
-       rol     $13,%r15d
-       #                               x9 ^= c
-       xor     %r15,%r10
-       #                               c = x8 + x9
-       lea     (%r11,%r10),%r15
-       #                               (uint32) c <<<= 18
-       rol     $18,%r15d
-       #                               x10 ^= c
-       xor     %r15,%rbp
-       #                                               x15 = x15_stack
-       movq    176(%rsp),%r15
-       #                               x10_stack = x10
-       movq    %rbp,168(%rsp)
-       #                                               d = x14 + x15
-       lea     (%rbx,%r15),%rbp
-       #                                               (uint32) d <<<= 7
-       rol     $7,%ebp
-       #                                               x12 ^= d
-       xor     %rbp,%r14
-       #                                               d = x15 + x12
-       lea     (%r15,%r14),%rbp
-       #                                               (uint32) d <<<= 9
-       rol     $9,%ebp
-       #                                               x13 ^= d
-       xor     %rbp,%r13
-       #                                               d = x12 + x13
-       lea     (%r14,%r13),%rbp
-       #                                               (uint32) d <<<= 13
-       rol     $13,%ebp
-       #                                               x14 ^= d
-       xor     %rbp,%rbx
-       #                                               d = x13 + x14
-       lea     (%r13,%rbx),%rbp
-       #                                               (uint32) d <<<= 18
-       rol     $18,%ebp
-       #                                               x15 ^= d
-       xor     %rbp,%r15
-       #                                               x15_stack = x15
-       movq    %r15,176(%rsp)
-       #   i = i_backup
-       movq    184(%rsp),%r15
-       #                  unsigned>? i -= 4
-       sub     $4,%r15
-       # comment:fp stack unchanged by jump
-       # goto mainloop if unsigned>
-       ja      ._mainloop
-       #   (uint32) x2 += j2
-       addl    64(%rsp),%ecx
-       #   x3 <<= 32
-       shl     $32,%rsi
-       #   x3 += j2
-       addq    64(%rsp),%rsi
-       #   (uint64) x3 >>= 32
-       shr     $32,%rsi
-       #   x3 <<= 32
-       shl     $32,%rsi
-       #   x2 += x3
-       add     %rsi,%rcx
-       #   (uint32) x6 += j6
-       addl    80(%rsp),%eax
-       #   x7 <<= 32
-       shl     $32,%r8
-       #   x7 += j6
-       addq    80(%rsp),%r8
-       #   (uint64) x7 >>= 32
-       shr     $32,%r8
-       #   x7 <<= 32
-       shl     $32,%r8
-       #   x6 += x7
-       add     %r8,%rax
-       #   (uint32) x8 += j8
-       addl    88(%rsp),%r11d
-       #   x9 <<= 32
-       shl     $32,%r10
-       #   x9 += j8
-       addq    88(%rsp),%r10
-       #   (uint64) x9 >>= 32
-       shr     $32,%r10
-       #   x9 <<= 32
-       shl     $32,%r10
-       #   x8 += x9
-       add     %r10,%r11
-       #   (uint32) x12 += j12
-       addl    104(%rsp),%r14d
-       #   x13 <<= 32
-       shl     $32,%r13
-       #   x13 += j12
-       addq    104(%rsp),%r13
-       #   (uint64) x13 >>= 32
-       shr     $32,%r13
-       #   x13 <<= 32
-       shl     $32,%r13
-       #   x12 += x13
-       add     %r13,%r14
-       #   (uint32) x0 += j0
-       addl    56(%rsp),%edx
-       #   x1 <<= 32
-       shl     $32,%rdi
-       #   x1 += j0
-       addq    56(%rsp),%rdi
-       #   (uint64) x1 >>= 32
-       shr     $32,%rdi
-       #   x1 <<= 32
-       shl     $32,%rdi
-       #   x0 += x1
-       add     %rdi,%rdx
-       #   x5 = x5_stack
-       movq    160(%rsp),%rdi
-       #   (uint32) x4 += j4
-       addl    72(%rsp),%r9d
-       #   x5 <<= 32
-       shl     $32,%rdi
-       #   x5 += j4
-       addq    72(%rsp),%rdi
-       #   (uint64) x5 >>= 32
-       shr     $32,%rdi
-       #   x5 <<= 32
-       shl     $32,%rdi
-       #   x4 += x5
-       add     %rdi,%r9
-       #   x10 = x10_stack
-       movq    168(%rsp),%r8
-       #   (uint32) x10 += j10
-       addl    96(%rsp),%r8d
-       #   x11 <<= 32
-       shl     $32,%r12
-       #   x11 += j10
-       addq    96(%rsp),%r12
-       #   (uint64) x11 >>= 32
-       shr     $32,%r12
-       #   x11 <<= 32
-       shl     $32,%r12
-       #   x10 += x11
-       add     %r12,%r8
-       #   x15 = x15_stack
-       movq    176(%rsp),%rdi
-       #   (uint32) x14 += j14
-       addl    112(%rsp),%ebx
-       #   x15 <<= 32
-       shl     $32,%rdi
-       #   x15 += j14
-       addq    112(%rsp),%rdi
-       #   (uint64) x15 >>= 32
-       shr     $32,%rdi
-       #   x15 <<= 32
-       shl     $32,%rdi
-       #   x14 += x15
-       add     %rdi,%rbx
-       #   out = out_backup
-       movq    136(%rsp),%rdi
-       #   m = m_backup
-       movq    144(%rsp),%rsi
-       #   x0 ^= *(uint64 *) (m + 0)
-       xorq    0(%rsi),%rdx
-       #   *(uint64 *) (out + 0) = x0
-       movq    %rdx,0(%rdi)
-       #   x2 ^= *(uint64 *) (m + 8)
-       xorq    8(%rsi),%rcx
-       #   *(uint64 *) (out + 8) = x2
-       movq    %rcx,8(%rdi)
-       #   x4 ^= *(uint64 *) (m + 16)
-       xorq    16(%rsi),%r9
-       #   *(uint64 *) (out + 16) = x4
-       movq    %r9,16(%rdi)
-       #   x6 ^= *(uint64 *) (m + 24)
-       xorq    24(%rsi),%rax
-       #   *(uint64 *) (out + 24) = x6
-       movq    %rax,24(%rdi)
-       #   x8 ^= *(uint64 *) (m + 32)
-       xorq    32(%rsi),%r11
-       #   *(uint64 *) (out + 32) = x8
-       movq    %r11,32(%rdi)
-       #   x10 ^= *(uint64 *) (m + 40)
-       xorq    40(%rsi),%r8
-       #   *(uint64 *) (out + 40) = x10
-       movq    %r8,40(%rdi)
-       #   x12 ^= *(uint64 *) (m + 48)
-       xorq    48(%rsi),%r14
-       #   *(uint64 *) (out + 48) = x12
-       movq    %r14,48(%rdi)
-       #   x14 ^= *(uint64 *) (m + 56)
-       xorq    56(%rsi),%rbx
-       #   *(uint64 *) (out + 56) = x14
-       movq    %rbx,56(%rdi)
-       #   bytes = bytes_backup
-       movq    152(%rsp),%rdx
-       #   in8 = j8
-       movq    88(%rsp),%rcx
-       #   in8 += 1
-       add     $1,%rcx
-       #   j8 = in8
-       movq    %rcx,88(%rsp)
-       #                          unsigned>? unsigned<? bytes - 64
-       cmp     $64,%rdx
-       # comment:fp stack unchanged by jump
-       #   goto bytesatleast65 if unsigned>
-       ja      ._bytesatleast65
-       # comment:fp stack unchanged by jump
-       #     goto bytesatleast64 if !unsigned<
-       jae     ._bytesatleast64
-       #       m = out
-       mov     %rdi,%rsi
-       #       out = ctarget
-       movq    128(%rsp),%rdi
-       #       i = bytes
-       mov     %rdx,%rcx
-       #       while (i) { *out++ = *m++; --i }
-       rep     movsb
-       # comment:fp stack unchanged by fallthrough
-#     bytesatleast64:
-._bytesatleast64:
-       #     x = x_backup
-       movq    120(%rsp),%rdi
-       #     in8 = j8
-       movq    88(%rsp),%rsi
-       #     *(uint64 *) (x + 32) = in8
-       movq    %rsi,32(%rdi)
-       #     r11 = r11_stack
-       movq    0(%rsp),%r11
-       #     r12 = r12_stack
-       movq    8(%rsp),%r12
-       #     r13 = r13_stack
-       movq    16(%rsp),%r13
-       #     r14 = r14_stack
-       movq    24(%rsp),%r14
-       #     r15 = r15_stack
-       movq    32(%rsp),%r15
-       #     rbx = rbx_stack
-       movq    40(%rsp),%rbx
-       #     rbp = rbp_stack
-       movq    48(%rsp),%rbp
-       # comment:fp stack unchanged by fallthrough
-#     done:
-._done:
-       #     leave
-       add     %r11,%rsp
-       mov     %rdi,%rax
-       mov     %rsi,%rdx
-       ret
-#   bytesatleast65:
-._bytesatleast65:
-       #   bytes -= 64
-       sub     $64,%rdx
-       #   out += 64
-       add     $64,%rdi
-       #   m += 64
-       add     $64,%rsi
-       # comment:fp stack unchanged by jump
-       # goto bytesatleast1
-       jmp     ._bytesatleast1
-ENDPROC(salsa20_encrypt_bytes)
diff --git a/arch/x86/crypto/salsa20_glue.c b/arch/x86/crypto/salsa20_glue.c
deleted file mode 100644
index b07d7d959806..000000000000
--- a/arch/x86/crypto/salsa20_glue.c
+++ /dev/null
@@ -1,91 +0,0 @@
-/*
- * Glue code for optimized assembly version of  Salsa20.
- *
- * Copyright (c) 2007 Tan Swee Heng <thesweeh...@gmail.com>
- *
- * The assembly codes are public domain assembly codes written by Daniel. J.
- * Bernstein <d...@cr.yp.to>. The codes are modified to include indentation
- * and to remove extraneous comments and functions that are not needed.
- * - i586 version, renamed as salsa20-i586-asm_32.S
- *   available from <http://cr.yp.to/snuffle/salsa20/x86-pm/salsa20.s>
- * - x86-64 version, renamed as salsa20-x86_64-asm_64.S
- *   available from <http://cr.yp.to/snuffle/salsa20/amd64-3/salsa20.s>
- *
- * Also modified to set up the initial state using the generic C code rather
- * than in assembly.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- *
- */
-
-#include <asm/unaligned.h>
-#include <crypto/internal/skcipher.h>
-#include <crypto/salsa20.h>
-#include <linux/module.h>
-
-asmlinkage void salsa20_encrypt_bytes(u32 state[16], const u8 *src, u8 *dst,
-                                     u32 bytes);
-
-static int salsa20_asm_crypt(struct skcipher_request *req)
-{
-       struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
-       const struct salsa20_ctx *ctx = crypto_skcipher_ctx(tfm);
-       struct skcipher_walk walk;
-       u32 state[16];
-       int err;
-
-       err = skcipher_walk_virt(&walk, req, true);
-
-       crypto_salsa20_init(state, ctx, walk.iv);
-
-       while (walk.nbytes > 0) {
-               unsigned int nbytes = walk.nbytes;
-
-               if (nbytes < walk.total)
-                       nbytes = round_down(nbytes, walk.stride);
-
-               salsa20_encrypt_bytes(state, walk.src.virt.addr,
-                                     walk.dst.virt.addr, nbytes);
-               err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
-       }
-
-       return err;
-}
-
-static struct skcipher_alg alg = {
-       .base.cra_name          = "salsa20",
-       .base.cra_driver_name   = "salsa20-asm",
-       .base.cra_priority      = 200,
-       .base.cra_blocksize     = 1,
-       .base.cra_ctxsize       = sizeof(struct salsa20_ctx),
-       .base.cra_module        = THIS_MODULE,
-
-       .min_keysize            = SALSA20_MIN_KEY_SIZE,
-       .max_keysize            = SALSA20_MAX_KEY_SIZE,
-       .ivsize                 = SALSA20_IV_SIZE,
-       .chunksize              = SALSA20_BLOCK_SIZE,
-       .setkey                 = crypto_salsa20_setkey,
-       .encrypt                = salsa20_asm_crypt,
-       .decrypt                = salsa20_asm_crypt,
-};
-
-static int __init init(void)
-{
-       return crypto_register_skcipher(&alg);
-}
-
-static void __exit fini(void)
-{
-       crypto_unregister_skcipher(&alg);
-}
-
-module_init(init);
-module_exit(fini);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION ("Salsa20 stream cipher algorithm (optimized assembly version)");
-MODULE_ALIAS_CRYPTO("salsa20");
-MODULE_ALIAS_CRYPTO("salsa20-asm");
diff --git a/crypto/Kconfig b/crypto/Kconfig
index b5d754db0b16..b659d05fc113 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -1434,34 +1434,6 @@ config CRYPTO_SALSA20
          The Salsa20 stream cipher algorithm is designed by Daniel J.
          Bernstein <d...@cr.yp.to>. See <http://cr.yp.to/snuffle.html>
 
-config CRYPTO_SALSA20_586
-       tristate "Salsa20 stream cipher algorithm (i586)"
-       depends on (X86 || UML_X86) && !64BIT
-       select CRYPTO_BLKCIPHER
-       select CRYPTO_SALSA20
-       help
-         Salsa20 stream cipher algorithm.
-
-         Salsa20 is a stream cipher submitted to eSTREAM, the ECRYPT
-         Stream Cipher Project. See <http://www.ecrypt.eu.org/stream/>
-
-         The Salsa20 stream cipher algorithm is designed by Daniel J.
-         Bernstein <d...@cr.yp.to>. See <http://cr.yp.to/snuffle.html>
-
-config CRYPTO_SALSA20_X86_64
-       tristate "Salsa20 stream cipher algorithm (x86_64)"
-       depends on (X86 || UML_X86) && 64BIT
-       select CRYPTO_BLKCIPHER
-       select CRYPTO_SALSA20
-       help
-         Salsa20 stream cipher algorithm.
-
-         Salsa20 is a stream cipher submitted to eSTREAM, the ECRYPT
-         Stream Cipher Project. See <http://www.ecrypt.eu.org/stream/>
-
-         The Salsa20 stream cipher algorithm is designed by Daniel J.
-         Bernstein <d...@cr.yp.to>. See <http://cr.yp.to/snuffle.html>
-
 config CRYPTO_CHACHA20
        tristate "ChaCha20 cipher algorithm"
        select CRYPTO_BLKCIPHER
-- 
2.17.0
