From: Eric Biggers <ebigg...@google.com>

Add an ARM scalar optimized implementation of BLAKE2s.

NEON isn't very useful for BLAKE2s because the BLAKE2s block size is too
small for NEON to help.  Each NEON instruction would depend on the
previous one, resulting in poor performance.

With scalar instructions, on the other hand, we can take advantage of
ARM's "free" rotations (like I did in chacha-scalar-core.S) to get an
implementation that runs much faster than the C implementation.

Performance results on Cortex-A7 in cycles per byte using the shash API:

        4096-byte messages:
                blake2s-256-arm:     18.8
                blake2s-256-generic: 26.0

        500-byte messages:
                blake2s-256-arm:     20.3
                blake2s-256-generic: 27.9

        100-byte messages:
                blake2s-256-arm:     29.7
                blake2s-256-generic: 39.2

        32-byte messages:
                blake2s-256-arm:     50.6
                blake2s-256-generic: 66.2

Except on very short messages, this is still slower than the NEON
implementation of BLAKE2b which I've written; that is 14.0, 16.4, 25.8,
and 76.1 cpb on 4096, 500, 100, and 32-byte messages, respectively.
However, optimized BLAKE2s is useful for cases where BLAKE2s is used
instead of BLAKE2b, such as WireGuard.

This new implementation is added in the form of a new module
blake2s-arm.ko, which is analogous to blake2s-x86_64.ko in that it
provides blake2s_compress_arch() for use by the library API as well as
optionally registering the algorithms with the shash API.

Acked-by: Ard Biesheuvel <a...@kernel.org>
Signed-off-by: Eric Biggers <ebigg...@google.com>
---
 arch/arm/crypto/Kconfig        |   9 ++
 arch/arm/crypto/Makefile       |   2 +
 arch/arm/crypto/blake2s-core.S | 285 +++++++++++++++++++++++++++++++++
 arch/arm/crypto/blake2s-glue.c |  78 +++++++++
 4 files changed, 374 insertions(+)
 create mode 100644 arch/arm/crypto/blake2s-core.S
 create mode 100644 arch/arm/crypto/blake2s-glue.c

diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig
index c9bf2df85cb90..281c829c12d0b 100644
--- a/arch/arm/crypto/Kconfig
+++ b/arch/arm/crypto/Kconfig
@@ -62,6 +62,15 @@ config CRYPTO_SHA512_ARM
          SHA-512 secure hash standard (DFIPS 180-2) implemented
          using optimized ARM assembler and NEON, when available.
 
+config CRYPTO_BLAKE2S_ARM
+       tristate "BLAKE2s digest algorithm (ARM)"
+       select CRYPTO_ARCH_HAVE_LIB_BLAKE2S
+       help
+         BLAKE2s digest algorithm optimized with ARM scalar instructions.  This
+         is faster than the generic implementations of BLAKE2s and BLAKE2b, but
+         slower than the NEON implementation of BLAKE2b.  (There is no NEON
+         implementation of BLAKE2s, since NEON doesn't really help with it.)
+
 config CRYPTO_AES_ARM
        tristate "Scalar AES cipher for ARM"
        select CRYPTO_ALGAPI
diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
index b745c17d356fe..5ad1e985a718b 100644
--- a/arch/arm/crypto/Makefile
+++ b/arch/arm/crypto/Makefile
@@ -9,6 +9,7 @@ obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o
 obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
 obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
 obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha512-arm.o
+obj-$(CONFIG_CRYPTO_BLAKE2S_ARM) += blake2s-arm.o
 obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
 obj-$(CONFIG_CRYPTO_POLY1305_ARM) += poly1305-arm.o
 obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o
@@ -29,6 +30,7 @@ sha256-arm-neon-$(CONFIG_KERNEL_MODE_NEON) := sha256_neon_glue.o
 sha256-arm-y   := sha256-core.o sha256_glue.o $(sha256-arm-neon-y)
 sha512-arm-neon-$(CONFIG_KERNEL_MODE_NEON) := sha512-neon-glue.o
 sha512-arm-y   := sha512-core.o sha512-glue.o $(sha512-arm-neon-y)
+blake2s-arm-y   := blake2s-core.o blake2s-glue.o
 sha1-arm-ce-y  := sha1-ce-core.o sha1-ce-glue.o
 sha2-arm-ce-y  := sha2-ce-core.o sha2-ce-glue.o
 aes-arm-ce-y   := aes-ce-core.o aes-ce-glue.o
diff --git a/arch/arm/crypto/blake2s-core.S b/arch/arm/crypto/blake2s-core.S
new file mode 100644
index 0000000000000..bed897e9a181a
--- /dev/null
+++ b/arch/arm/crypto/blake2s-core.S
@@ -0,0 +1,285 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * BLAKE2s digest algorithm, ARM scalar implementation
+ *
+ * Copyright 2020 Google LLC
+ *
+ * Author: Eric Biggers <ebigg...@google.com>
+ */
+
+#include <linux/linkage.h>
+
+       // Registers used to hold message words temporarily.  There aren't
+       // enough ARM registers to hold the whole message block, so we have to
+       // load the words on-demand.
+       M_0             .req    r12
+       M_1             .req    r14
+
+// The BLAKE2s initialization vector
+.Lblake2s_IV:
+       .word   0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A
+       .word   0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
+
+.macro __ldrd          a, b, src, offset
+#if __LINUX_ARM_ARCH__ >= 6
+       ldrd            \a, \b, [\src, #\offset]
+#else
+       ldr             \a, [\src, #\offset]
+       ldr             \b, [\src, #\offset + 4]
+#endif
+.endm
+
+.macro __strd          a, b, dst, offset
+#if __LINUX_ARM_ARCH__ >= 6
+       strd            \a, \b, [\dst, #\offset]
+#else
+       str             \a, [\dst, #\offset]
+       str             \b, [\dst, #\offset + 4]
+#endif
+.endm
+
+// Execute a quarter-round of BLAKE2s by mixing two columns or two diagonals.
+// (a0, b0, c0, d0) and (a1, b1, c1, d1) give the registers containing the two
+// columns/diagonals.  s0-s1 are the word offsets to the message words the first
+// column/diagonal needs, and likewise s2-s3 for the second column/diagonal.
+// M_0 and M_1 are free to use, and the message block can be found at sp + 32.
+//
+// Note that to save instructions, the rotations don't happen when the
+// pseudocode says they should, but rather they are delayed until the values are
+// used.  See the comment above _blake2s_round().
+.macro _blake2s_quarterround  a0, b0, c0, d0,  a1, b1, c1, d1,  s0, s1, s2, s3
+
+       ldr             M_0, [sp, #32 + 4 * \s0]
+       ldr             M_1, [sp, #32 + 4 * \s2]
+
+       // a += b + m[blake2s_sigma[r][2*i + 0]];
+       add             \a0, \a0, \b0, ror #brot
+       add             \a1, \a1, \b1, ror #brot
+       add             \a0, \a0, M_0
+       add             \a1, \a1, M_1
+
+       // d = ror32(d ^ a, 16);
+       eor             \d0, \a0, \d0, ror #drot
+       eor             \d1, \a1, \d1, ror #drot
+
+       // c += d;
+       add             \c0, \c0, \d0, ror #16
+       add             \c1, \c1, \d1, ror #16
+
+       // b = ror32(b ^ c, 12);
+       eor             \b0, \c0, \b0, ror #brot
+       eor             \b1, \c1, \b1, ror #brot
+
+       ldr             M_0, [sp, #32 + 4 * \s1]
+       ldr             M_1, [sp, #32 + 4 * \s3]
+
+       // a += b + m[blake2s_sigma[r][2*i + 1]];
+       add             \a0, \a0, \b0, ror #12
+       add             \a1, \a1, \b1, ror #12
+       add             \a0, \a0, M_0
+       add             \a1, \a1, M_1
+
+       // d = ror32(d ^ a, 8);
+       eor             \d0, \a0, \d0, ror#16
+       eor             \d1, \a1, \d1, ror#16
+
+       // c += d;
+       add             \c0, \c0, \d0, ror#8
+       add             \c1, \c1, \d1, ror#8
+
+       // b = ror32(b ^ c, 7);
+       eor             \b0, \c0, \b0, ror#12
+       eor             \b1, \c1, \b1, ror#12
+.endm
+
+// Execute one round of BLAKE2s by updating the state matrix v[0..15].  v[0..9]
+// are in r0..r9.  The stack pointer points to 8 bytes of scratch space for
+// spilling v[8..9], then to v[9..15], then to the message block.  r10-r12 and
+// r14 are free to use.  The macro arguments s0-s15 give the order in which the
+// message words are used in this round.
+//
+// All rotates are performed using the implicit rotate operand accepted by the
+// 'add' and 'eor' instructions.  This is faster than using explicit rotate
+// instructions.  To make this work, we allow the values in the second and last
+// rows of the BLAKE2s state matrix (rows 'b' and 'd') to temporarily have the
+// wrong rotation amount.  The rotation amount is then fixed up just in time
+// when the values are used.  'brot' is the number of bits the values in row 'b'
+// need to be rotated right to arrive at the correct values, and 'drot'
+// similarly for row 'd'.  (brot, drot) start out as (0, 0) but we make it such
+// that they end up as (7, 8) after every round.
+.macro _blake2s_round  s0, s1, s2, s3, s4, s5, s6, s7, \
+                       s8, s9, s10, s11, s12, s13, s14, s15
+
+       // Mix first two columns:
+       // (v[0], v[4], v[8], v[12]) and (v[1], v[5], v[9], v[13]).
+       __ldrd          r10, r11, sp, 16        // load v[12] and v[13]
+       _blake2s_quarterround   r0, r4, r8, r10,  r1, r5, r9, r11, \
+                               \s0, \s1, \s2, \s3
+       __strd          r8, r9, sp, 0
+       __strd          r10, r11, sp, 16
+
+       // Mix second two columns:
+       // (v[2], v[6], v[10], v[14]) and (v[3], v[7], v[11], v[15]).
+       __ldrd          r8, r9, sp, 8           // load v[10] and v[11]
+       __ldrd          r10, r11, sp, 24        // load v[14] and v[15]
+       _blake2s_quarterround   r2, r6, r8, r10,  r3, r7, r9, r11, \
+                               \s4, \s5, \s6, \s7
+       str             r10, [sp, #24]          // store v[14]
+       // v[10], v[11], and v[15] are used below, so no need to store them yet.
+
+       .set brot, 7
+       .set drot, 8
+
+       // Mix first two diagonals:
+       // (v[0], v[5], v[10], v[15]) and (v[1], v[6], v[11], v[12]).
+       ldr             r10, [sp, #16]          // load v[12]
+       _blake2s_quarterround   r0, r5, r8, r11,  r1, r6, r9, r10, \
+                               \s8, \s9, \s10, \s11
+       __strd          r8, r9, sp, 8
+       str             r11, [sp, #28]
+       str             r10, [sp, #16]
+
+       // Mix second two diagonals:
+       // (v[2], v[7], v[8], v[13]) and (v[3], v[4], v[9], v[14]).
+       __ldrd          r8, r9, sp, 0           // load v[8] and v[9]
+       __ldrd          r10, r11, sp, 20        // load v[13] and v[14]
+       _blake2s_quarterround   r2, r7, r8, r10,  r3, r4, r9, r11, \
+                               \s12, \s13, \s14, \s15
+       __strd          r10, r11, sp, 20
+.endm
+
+//
+// void blake2s_compress_arch(struct blake2s_state *state,
+//                           const u8 *block, size_t nblocks, u32 inc);
+//
+// Only the first three fields of struct blake2s_state are used:
+//     u32 h[8];       (inout)
+//     u32 t[2];       (inout)
+//     u32 f[2];       (in)
+//
+       .align          5
+ENTRY(blake2s_compress_arch)
+       push            {r0-r2,r4-r11,lr}       // keep this an even number
+
+.Lnext_block:
+       // r0 is 'state'
+       // r1 is 'block'
+       // r3 is 'inc'
+
+       // Load and increment the counter t[0..1].
+       __ldrd          r10, r11, r0, 32
+       adds            r10, r10, r3
+       adc             r11, r11, #0
+       __strd          r10, r11, r0, 32
+
+       // _blake2s_round is very short on registers, so copy the message block
+       // to the stack to save a register during the rounds.  This also has the
+       // advantage that misalignment only needs to be dealt with in one place.
+       sub             sp, sp, #64
+       mov             r12, sp
+       tst             r1, #3
+       bne             .Lcopy_block_misaligned
+       ldmia           r1!, {r2-r9}
+       stmia           r12!, {r2-r9}
+       ldmia           r1!, {r2-r9}
+       stmia           r12, {r2-r9}
+.Lcopy_block_done:
+       str             r1, [sp, #68]           // Update message pointer
+
+       // Calculate v[8..15].  Push v[9..15] onto the stack, and leave space
+       // for spilling v[8..9].  Leave v[8..9] in r8-r9.
+       mov             r14, r0                 // r14 = state
+       adr             r12, .Lblake2s_IV
+       ldmia           r12!, {r8-r9}           // load IV[0..1]
+       __ldrd          r0, r1, r14, 40         // load f[0..1]
+       ldm             r12, {r2-r7}            // load IV[3..7]
+       eor             r4, r4, r10             // v[12] = IV[4] ^ t[0]
+       eor             r5, r5, r11             // v[13] = IV[5] ^ t[1]
+       eor             r6, r6, r0              // v[14] = IV[6] ^ f[0]
+       eor             r7, r7, r1              // v[15] = IV[7] ^ f[1]
+       push            {r2-r7}                 // push v[9..15]
+       sub             sp, sp, #8              // leave space for v[8..9]
+
+       // Load h[0..7] == v[0..7].
+       ldm             r14, {r0-r7}
+
+       // Execute the rounds.  Each round is provided the order in which it
+       // needs to use the message words.
+       .set brot, 0
+       .set drot, 0
+       _blake2s_round  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+       _blake2s_round  14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3
+       _blake2s_round  11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4
+       _blake2s_round  7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8
+       _blake2s_round  9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13
+       _blake2s_round  2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9
+       _blake2s_round  12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11
+       _blake2s_round  13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10
+       _blake2s_round  6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5
+       _blake2s_round  10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0
+
+       // Fold the final state matrix into the hash chaining value:
+       //
+       //      for (i = 0; i < 8; i++)
+       //              h[i] ^= v[i] ^ v[i + 8];
+       //
+       ldr             r14, [sp, #96]          // r14 = &h[0]
+       add             sp, sp, #8              // v[8..9] are already loaded.
+       pop             {r10-r11}               // load v[10..11]
+       eor             r0, r0, r8
+       eor             r1, r1, r9
+       eor             r2, r2, r10
+       eor             r3, r3, r11
+       ldm             r14, {r8-r11}           // load h[0..3]
+       eor             r0, r0, r8
+       eor             r1, r1, r9
+       eor             r2, r2, r10
+       eor             r3, r3, r11
+       stmia           r14!, {r0-r3}           // store new h[0..3]
+       ldm             r14, {r0-r3}            // load old h[4..7]
+       pop             {r8-r11}                // load v[12..15]
+       eor             r0, r0, r4, ror #brot
+       eor             r1, r1, r5, ror #brot
+       eor             r2, r2, r6, ror #brot
+       eor             r3, r3, r7, ror #brot
+       eor             r0, r0, r8, ror #drot
+       eor             r1, r1, r9, ror #drot
+       eor             r2, r2, r10, ror #drot
+       eor             r3, r3, r11, ror #drot
+         add           sp, sp, #64             // skip copy of message block
+       stm             r14, {r0-r3}            // store new h[4..7]
+
+       // Advance to the next block, if there is one.  Note that if there are
+       // multiple blocks, then 'inc' (the counter increment amount) must be
+       // 64.  So we can simply set it to 64 without re-loading it.
+       ldm             sp, {r0, r1, r2}        // load (state, block, nblocks)
+       mov             r3, #64                 // set 'inc'
+       subs            r2, r2, #1              // nblocks--
+       str             r2, [sp, #8]
+       bne             .Lnext_block            // nblocks != 0?
+
+       pop             {r0-r2,r4-r11,pc}
+
+       // The next message block (pointed to by r1) isn't 4-byte aligned, so it
+       // can't be loaded using ldmia.  Copy it to the stack buffer (pointed to
+       // by r12) using an alternative method.  r2-r9 are free to use.
+.Lcopy_block_misaligned:
+       mov             r2, #64
+1:
+#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+       ldr             r3, [r1], #4
+#else
+       ldrb            r3, [r1, #0]
+       ldrb            r4, [r1, #1]
+       ldrb            r5, [r1, #2]
+       ldrb            r6, [r1, #3]
+       add             r1, r1, #4
+       orr             r3, r3, r4, lsl #8
+       orr             r3, r3, r5, lsl #16
+       orr             r3, r3, r6, lsl #24
+#endif
+       subs            r2, r2, #4
+       str             r3, [r12], #4
+       bne             1b
+       b               .Lcopy_block_done
+ENDPROC(blake2s_compress_arch)
diff --git a/arch/arm/crypto/blake2s-glue.c b/arch/arm/crypto/blake2s-glue.c
new file mode 100644
index 0000000000000..f2cc1e5fc9ec1
--- /dev/null
+++ b/arch/arm/crypto/blake2s-glue.c
@@ -0,0 +1,78 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * BLAKE2s digest algorithm, ARM scalar implementation
+ *
+ * Copyright 2020 Google LLC
+ */
+
+#include <crypto/internal/blake2s.h>
+#include <crypto/internal/hash.h>
+
+#include <linux/module.h>
+
+/* defined in blake2s-core.S */
+EXPORT_SYMBOL(blake2s_compress_arch);
+
+static int crypto_blake2s_update_arm(struct shash_desc *desc,
+                                    const u8 *in, unsigned int inlen)
+{
+       return crypto_blake2s_update(desc, in, inlen, blake2s_compress_arch);
+}
+
+static int crypto_blake2s_final_arm(struct shash_desc *desc, u8 *out)
+{
+       return crypto_blake2s_final(desc, out, blake2s_compress_arch);
+}
+
+#define BLAKE2S_ALG(name, driver_name, digest_size)                    \
+       {                                                               \
+               .base.cra_name          = name,                         \
+               .base.cra_driver_name   = driver_name,                  \
+               .base.cra_priority      = 200,                          \
+               .base.cra_flags         = CRYPTO_ALG_OPTIONAL_KEY,      \
+               .base.cra_blocksize     = BLAKE2S_BLOCK_SIZE,           \
+               .base.cra_ctxsize       = sizeof(struct blake2s_tfm_ctx), \
+               .base.cra_module        = THIS_MODULE,                  \
+               .digestsize             = digest_size,                  \
+               .setkey                 = crypto_blake2s_setkey,        \
+               .init                   = crypto_blake2s_init,          \
+               .update                 = crypto_blake2s_update_arm,    \
+               .final                  = crypto_blake2s_final_arm,     \
+               .descsize               = sizeof(struct blake2s_state), \
+       }
+
+static struct shash_alg blake2s_arm_algs[] = {
+       BLAKE2S_ALG("blake2s-128", "blake2s-128-arm", BLAKE2S_128_HASH_SIZE),
+       BLAKE2S_ALG("blake2s-160", "blake2s-160-arm", BLAKE2S_160_HASH_SIZE),
+       BLAKE2S_ALG("blake2s-224", "blake2s-224-arm", BLAKE2S_224_HASH_SIZE),
+       BLAKE2S_ALG("blake2s-256", "blake2s-256-arm", BLAKE2S_256_HASH_SIZE),
+};
+
+static int __init blake2s_arm_mod_init(void)
+{
+       return IS_REACHABLE(CONFIG_CRYPTO_HASH) ?
+               crypto_register_shashes(blake2s_arm_algs,
+                                       ARRAY_SIZE(blake2s_arm_algs)) : 0;
+}
+
+static void __exit blake2s_arm_mod_exit(void)
+{
+       if (IS_REACHABLE(CONFIG_CRYPTO_HASH))
+               crypto_unregister_shashes(blake2s_arm_algs,
+                                         ARRAY_SIZE(blake2s_arm_algs));
+}
+
+module_init(blake2s_arm_mod_init);
+module_exit(blake2s_arm_mod_exit);
+
+MODULE_DESCRIPTION("BLAKE2s digest algorithm, ARM scalar implementation");
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Eric Biggers <ebigg...@google.com>");
+MODULE_ALIAS_CRYPTO("blake2s-128");
+MODULE_ALIAS_CRYPTO("blake2s-128-arm");
+MODULE_ALIAS_CRYPTO("blake2s-160");
+MODULE_ALIAS_CRYPTO("blake2s-160-arm");
+MODULE_ALIAS_CRYPTO("blake2s-224");
+MODULE_ALIAS_CRYPTO("blake2s-224-arm");
+MODULE_ALIAS_CRYPTO("blake2s-256");
+MODULE_ALIAS_CRYPTO("blake2s-256-arm");
-- 
2.29.2

Reply via email to