crypto: arm64/ghash - add NEON accelerated fallback for 64-bit PMULL

Ard Biesheuvel Mon, 24 Jul 2017 03:29:49 -0700

Implement a NEON fallback for systems that do support NEON but have
no support for the optional 64x64->128 polynomial multiplication
instruction that is part of the ARMv8 Crypto Extensions. It is based
on the paper "Fast Software Polynomial Multiplication on ARM Processors
Using the NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and
Ricardo Dahab (https://hal.inria.fr/hal-01506572), but has been reworked
extensively for the AArch64 ISA.


On a low-end core such as the Cortex-A53 found in the Raspberry Pi 3, the
NEON-based implementation is 4x faster than the table-based one, and
is time invariant as well, making it less vulnerable to timing attacks.
When combined with the bit-sliced NEON implementation of AES-CTR, the
AES-GCM performance improves by 2x (from 58 to 29 cycles per byte).

Signed-off-by: Ard Biesheuvel <ard.biesheu...@linaro.org>
---
 arch/arm64/crypto/ghash-ce-core.S | 248 +++++++++++++++++---
 arch/arm64/crypto/ghash-ce-glue.c |  40 +++-
 2 files changed, 252 insertions(+), 36 deletions(-)

diff --git a/arch/arm64/crypto/ghash-ce-core.S b/arch/arm64/crypto/ghash-ce-core.S
index cb22459eba85..11ebf1ae248a 100644
--- a/arch/arm64/crypto/ghash-ce-core.S
+++ b/arch/arm64/crypto/ghash-ce-core.S
@@ -1,7 +1,7 @@
 /*
  * Accelerated GHASH implementation with ARMv8 PMULL instructions.
  *
- * Copyright (C) 2014 Linaro Ltd. <ard.biesheu...@linaro.org>
+ * Copyright (C) 2014 - 2017 Linaro Ltd. <ard.biesheu...@linaro.org>
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License version 2 as published
@@ -11,31 +11,215 @@
 #include <linux/linkage.h>
 #include <asm/assembler.h>
 
-       SHASH   .req    v0
-       SHASH2  .req    v1
-       T1      .req    v2
-       T2      .req    v3
-       MASK    .req    v4
-       XL      .req    v5
-       XM      .req    v6
-       XH      .req    v7
-       IN1     .req    v7
+       SHASH           .req    v0
+       SHASH2          .req    v1
+       T1              .req    v2
+       T2              .req    v3
+       MASK            .req    v4
+       XL              .req    v5
+       XM              .req    v6
+       XH              .req    v7
+       IN1             .req    v7
+
+       k00_16          .req    v8
+       k32_48          .req    v9
+
+       t3              .req    v10
+       t4              .req    v11
+       t5              .req    v12
+       t6              .req    v13
+       t7              .req    v14
+       t8              .req    v15
+       t9              .req    v16
+
+       perm1           .req    v17
+       perm2           .req    v18
+       perm3           .req    v19
+
+       sh1             .req    v20
+       sh2             .req    v21
+       sh3             .req    v22
+       sh4             .req    v23
+
+       ss1             .req    v24
+       ss2             .req    v25
+       ss3             .req    v26
+       ss4             .req    v27
 
        .text
        .arch           armv8-a+crypto
 
-       /*
-        * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
-        *                         struct ghash_key const *k, const char *head)
-        */
-ENTRY(pmull_ghash_update)
+       .macro          __pmull_p64, rd, rn, rm
+       pmull           \rd\().1q, \rn\().1d, \rm\().1d
+       .endm
+
+       .macro          __pmull2_p64, rd, rn, rm
+       pmull2          \rd\().1q, \rn\().2d, \rm\().2d
+       .endm
+
+       .macro          __pmull_p8, rq, ad, bd
+       ext             t3.8b, \ad\().8b, \ad\().8b, #1         // A1
+       ext             t5.8b, \ad\().8b, \ad\().8b, #2         // A2
+       ext             t7.8b, \ad\().8b, \ad\().8b, #3         // A3
+
+       __pmull_p8_\bd  \rq, \ad
+       .endm
+
+       .macro          __pmull2_p8, rq, ad, bd
+       tbl             t3.16b, {\ad\().16b}, perm1.16b         // A1
+       tbl             t5.16b, {\ad\().16b}, perm2.16b         // A2
+       tbl             t7.16b, {\ad\().16b}, perm3.16b         // A3
+
+       __pmull2_p8_\bd \rq, \ad
+       .endm
+
+       .macro          __pmull_p8_SHASH, rq, ad
+       __pmull_p8_tail \rq, \ad\().8b, SHASH.8b, 8b,, sh1, sh2, sh3, sh4
+       .endm
+
+       .macro          __pmull_p8_SHASH2, rq, ad
+       __pmull_p8_tail \rq, \ad\().8b, SHASH2.8b, 8b,, ss1, ss2, ss3, ss4
+       .endm
+
+       .macro          __pmull2_p8_SHASH, rq, ad
+       __pmull_p8_tail \rq, \ad\().16b, SHASH.16b, 16b, 2, sh1, sh2, sh3, sh4
+       .endm
+
+       .macro          __pmull_p8_tail, rq, ad, bd, nb, t, b1, b2, b3, b4
+       pmull\t         t3.8h, t3.\nb, \bd                      // F = A1*B
+       pmull\t         t4.8h, \ad, \b1\().\nb                  // E = A*B1
+       pmull\t         t5.8h, t5.\nb, \bd                      // H = A2*B
+       pmull\t         t6.8h, \ad, \b2\().\nb                  // G = A*B2
+       pmull\t         t7.8h, t7.\nb, \bd                      // J = A3*B
+       pmull\t         t8.8h, \ad, \b3\().\nb                  // I = A*B3
+       pmull\t         t9.8h, \ad, \b4\().\nb                  // K = A*B4
+       pmull\t         \rq\().8h, \ad, \bd                     // D = A*B
+
+       eor             t3.16b, t3.16b, t4.16b                  // L = E + F
+       eor             t5.16b, t5.16b, t6.16b                  // M = G + H
+       eor             t7.16b, t7.16b, t8.16b                  // N = I + J
+
+       uzp1            t4.2d, t3.2d, t5.2d
+       uzp2            t3.2d, t3.2d, t5.2d
+       uzp1            t6.2d, t7.2d, t9.2d
+       uzp2            t7.2d, t7.2d, t9.2d
+
+       // t3 = (L) (P0 + P1) << 8
+       // t5 = (M) (P2 + P3) << 16
+       eor             t4.16b, t4.16b, t3.16b
+       and             t3.16b, t3.16b, k32_48.16b
+
+       // t7 = (N) (P4 + P5) << 24
+       // t9 = (K) (P6 + P7) << 32
+       eor             t6.16b, t6.16b, t7.16b
+       and             t7.16b, t7.16b, k00_16.16b
+
+       eor             t4.16b, t4.16b, t3.16b
+       eor             t6.16b, t6.16b, t7.16b
+
+       zip2            t5.2d, t4.2d, t3.2d
+       zip1            t3.2d, t4.2d, t3.2d
+       zip2            t9.2d, t6.2d, t7.2d
+       zip1            t7.2d, t6.2d, t7.2d
+
+       ext             t3.16b, t3.16b, t3.16b, #15
+       ext             t5.16b, t5.16b, t5.16b, #14
+       ext             t7.16b, t7.16b, t7.16b, #13
+       ext             t9.16b, t9.16b, t9.16b, #12
+
+       eor             t3.16b, t3.16b, t5.16b
+       eor             t7.16b, t7.16b, t9.16b
+       eor             \rq\().16b, \rq\().16b, t3.16b
+       eor             \rq\().16b, \rq\().16b, t7.16b
+       .endm
+
+       .macro          __pmull_pre_p64
+       movi            MASK.16b, #0xe1
+       shl             MASK.2d, MASK.2d, #57
+       .endm
+
+       .macro          __pmull_pre_p8
+       // k00_16 := 0x0000000000000000_000000000000ffff
+       // k32_48 := 0x00000000ffffffff_0000ffffffffffff
+       movi            k32_48.2d, #0xffffffff
+       mov             k32_48.h[2], k32_48.h[0]
+       ushr            k00_16.2d, k32_48.2d, #32
+
+       // prepare the permutation vectors
+       mov_q           x5, 0x080f0e0d0c0b0a09
+       movi            T1.8b, #8
+       dup             perm1.2d, x5
+       eor             perm1.16b, perm1.16b, T1.16b
+       ushr            perm2.2d, perm1.2d, #8
+       ushr            perm3.2d, perm1.2d, #16
+       ushr            T1.2d, perm1.2d, #24
+       sli             perm2.2d, perm1.2d, #56
+       sli             perm3.2d, perm1.2d, #48
+       sli             T1.2d, perm1.2d, #40
+
+       // precompute loop invariants
+       tbl             sh1.16b, {SHASH.16b}, perm1.16b
+       tbl             sh2.16b, {SHASH.16b}, perm2.16b
+       tbl             sh3.16b, {SHASH.16b}, perm3.16b
+       tbl             sh4.16b, {SHASH.16b}, T1.16b
+       ext             ss1.8b, SHASH2.8b, SHASH2.8b, #1
+       ext             ss2.8b, SHASH2.8b, SHASH2.8b, #2
+       ext             ss3.8b, SHASH2.8b, SHASH2.8b, #3
+       ext             ss4.8b, SHASH2.8b, SHASH2.8b, #4
+       .endm
+
+       //
+       // PMULL (64x64->128) based reduction for CPUs that can do
+       // it in a single instruction.
+       //
+       .macro          __pmull_reduce_p64
+       pmull           T2.1q, XL.1d, MASK.1d
+       eor             XM.16b, XM.16b, T1.16b
+
+       mov             XH.d[0], XM.d[1]
+       mov             XM.d[1], XL.d[0]
+
+       eor             XL.16b, XM.16b, T2.16b
+       ext             T2.16b, XL.16b, XL.16b, #8
+       pmull           XL.1q, XL.1d, MASK.1d
+       .endm
+
+       //
+       // Alternative reduction for CPUs that lack support for the
+       // 64x64->128 PMULL instruction
+       //
+       .macro          __pmull_reduce_p8
+       eor             XM.16b, XM.16b, T1.16b
+
+       mov             XL.d[1], XM.d[0]
+       mov             XH.d[0], XM.d[1]
+
+       shl             T1.2d, XL.2d, #57
+       shl             T2.2d, XL.2d, #62
+       eor             T2.16b, T2.16b, T1.16b
+       shl             T1.2d, XL.2d, #63
+       eor             T2.16b, T2.16b, T1.16b
+       ext             T1.16b, XL.16b, XH.16b, #8
+       eor             T2.16b, T2.16b, T1.16b
+
+       mov             XL.d[1], T2.d[0]
+       mov             XH.d[0], T2.d[1]
+
+       ushr            T2.2d, XL.2d, #1
+       eor             XH.16b, XH.16b, XL.16b
+       eor             XL.16b, XL.16b, T2.16b
+       ushr            T2.2d, T2.2d, #6
+       ushr            XL.2d, XL.2d, #1
+       .endm
+
+       .macro          __pmull_ghash, pn
        ld1             {SHASH.2d}, [x3]
        ld1             {XL.2d}, [x1]
-       movi            MASK.16b, #0xe1
        ext             SHASH2.16b, SHASH.16b, SHASH.16b, #8
-       shl             MASK.2d, MASK.2d, #57
        eor             SHASH2.16b, SHASH2.16b, SHASH.16b
 
+       __pmull_pre_\pn
+
        /* do the head block first, if supplied */
        cbz             x4, 0f
        ld1             {T1.2d}, [x4]
@@ -52,23 +236,17 @@ CPU_LE(    rev64           T1.16b, T1.16b  )
        eor             T1.16b, T1.16b, T2.16b
        eor             XL.16b, XL.16b, IN1.16b
 
-       pmull2          XH.1q, SHASH.2d, XL.2d          // a1 * b1
+       __pmull2_\pn    XH, XL, SHASH                   // a1 * b1
        eor             T1.16b, T1.16b, XL.16b
-       pmull           XL.1q, SHASH.1d, XL.1d          // a0 * b0
-       pmull           XM.1q, SHASH2.1d, T1.1d         // (a1 + a0)(b1 + b0)
+       __pmull_\pn     XL, XL, SHASH                   // a0 * b0
+       __pmull_\pn     XM, T1, SHASH2                  // (a1 + a0)(b1 + b0)
 
-       ext             T1.16b, XL.16b, XH.16b, #8
        eor             T2.16b, XL.16b, XH.16b
-       eor             XM.16b, XM.16b, T1.16b
+       ext             T1.16b, XL.16b, XH.16b, #8
        eor             XM.16b, XM.16b, T2.16b
-       pmull           T2.1q, XL.1d, MASK.1d
 
-       mov             XH.d[0], XM.d[1]
-       mov             XM.d[1], XL.d[0]
+       __pmull_reduce_\pn
 
-       eor             XL.16b, XM.16b, T2.16b
-       ext             T2.16b, XL.16b, XL.16b, #8
-       pmull           XL.1q, XL.1d, MASK.1d
        eor             T2.16b, T2.16b, XH.16b
        eor             XL.16b, XL.16b, T2.16b
 
@@ -76,7 +254,19 @@ CPU_LE(     rev64           T1.16b, T1.16b  )
 
        st1             {XL.2d}, [x1]
        ret
-ENDPROC(pmull_ghash_update)
+       .endm
+
+       /*
+        * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
+        *                         struct ghash_key const *k, const char *head)
+        */
+ENTRY(pmull_ghash_update_p64)
+       __pmull_ghash   p64
+ENDPROC(pmull_ghash_update_p64)
+
+ENTRY(pmull_ghash_update_p8)
+       __pmull_ghash   p8
+ENDPROC(pmull_ghash_update_p8)
 
        KS              .req    v8
        CTR             .req    v9
diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c
index ee6aaac05905..cfc9c92814fd 100644
--- a/arch/arm64/crypto/ghash-ce-glue.c
+++ b/arch/arm64/crypto/ghash-ce-glue.c
@@ -26,6 +26,7 @@
 MODULE_DESCRIPTION("GHASH and AES-GCM using ARMv8 Crypto Extensions");
 MODULE_AUTHOR("Ard Biesheuvel <ard.biesheu...@linaro.org>");
 MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_CRYPTO("ghash");
 
 #define GHASH_BLOCK_SIZE       16
 #define GHASH_DIGEST_SIZE      16
@@ -48,8 +49,17 @@ struct gcm_aes_ctx {
        struct ghash_key        ghash_key;
 };
 
-asmlinkage void pmull_ghash_update(int blocks, u64 dg[], const char *src,
-                                  struct ghash_key const *k, const char *head);
+asmlinkage void pmull_ghash_update_p64(int blocks, u64 dg[], const char *src,
+                                      struct ghash_key const *k,
+                                      const char *head);
+
+asmlinkage void pmull_ghash_update_p8(int blocks, u64 dg[], const char *src,
+                                     struct ghash_key const *k,
+                                     const char *head);
+
+static void (*pmull_ghash_update)(int blocks, u64 dg[], const char *src,
+                                 struct ghash_key const *k,
+                                 const char *head);
 
 asmlinkage void pmull_gcm_encrypt(int blocks, u64 dg[], u8 dst[],
                                  const u8 src[], struct ghash_key const *k,
@@ -557,13 +567,24 @@ static int __init ghash_ce_mod_init(void)
 {
        int ret;
 
-       ret = crypto_register_aead(&gcm_aes_alg);
-       if (ret)
-               return ret;
+       if (!(elf_hwcap & HWCAP_ASIMD))
+               return -ENODEV;
+
+       if (elf_hwcap & HWCAP_PMULL)
+               pmull_ghash_update = pmull_ghash_update_p64;
+
+       else
+               pmull_ghash_update = pmull_ghash_update_p8;
 
        ret = crypto_register_shash(&ghash_alg);
        if (ret)
-               crypto_unregister_aead(&gcm_aes_alg);
+               return ret;
+
+       if (elf_hwcap & HWCAP_PMULL) {
+               ret = crypto_register_aead(&gcm_aes_alg);
+               if (ret)
+                       crypto_unregister_shash(&ghash_alg);
+       }
        return ret;
 }
 
@@ -573,5 +594,10 @@ static void __exit ghash_ce_mod_exit(void)
        crypto_unregister_aead(&gcm_aes_alg);
 }
 
-module_cpu_feature_match(PMULL, ghash_ce_mod_init);
+static const struct cpu_feature ghash_cpu_feature[] = {
+       { cpu_feature(PMULL) }, { }
+};
+MODULE_DEVICE_TABLE(cpu, ghash_cpu_feature);
+
+module_init(ghash_ce_mod_init);
 module_exit(ghash_ce_mod_exit);
-- 
2.9.3

[PATCH resend 16/18] crypto: arm64/ghash - add NEON accelerated fallback for 64-bit PMULL

Reply via email to