This is better than my simple fix, thank you.

Out of curiosity, why doesn't NEON code use barrier() to prevent
reordering?

On Tue, 21 Nov 2017 13:40:17 +0000
Ard Biesheuvel <ard.biesheu...@linaro.org> wrote:

> Most crypto drivers involving kernel mode NEON take care to put the
> code that actually touches the NEON register file in a separate
> compilation unit, to prevent the compiler from reordering code that
> preserves or restores the NEON context with code that may corrupt it.
> This is necessary because we currently have no way to express the
> restrictions imposed upon use of the NEON in kernel mode in a way
> that the compiler understands.
> 
> However, in the case of aes-ce-cipher, it did not seem unreasonable to
> deviate from this rule, given how it does not seem possible for the
> compiler to reorder cross object function calls with asm blocks whose
> in- and output constraints reflect that it reads from and writes to
> memory.
> 
> Now that LTO is being proposed for the arm64 kernel, it is time to
> revisit this. The link time optimization may replace the function
> calls to kernel_neon_begin() and kernel_neon_end() with instantiations
> of the IR that make up its implementation, allowing further reordering
> with the asm block.
> 
> So let's clean this up, and move the asm() blocks into a separate .S
> file.
> 
> Signed-off-by: Ard Biesheuvel <ard.biesheu...@linaro.org>
> ---
>  arch/arm64/crypto/Makefile                         |   2 +-
>  arch/arm64/crypto/aes-ce-core.S                    |  87
> ++++++++++++++++ .../crypto/{aes-ce-cipher.c => aes-ce-glue.c}      |
> 115 +++------------------ 3 files changed, 100 insertions(+), 104
> deletions(-) create mode 100644 arch/arm64/crypto/aes-ce-core.S
>  rename arch/arm64/crypto/{aes-ce-cipher.c => aes-ce-glue.c} (62%)
> 
> diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile
> index b5edc5918c28..f5e8295fd756 100644
> --- a/arch/arm64/crypto/Makefile
> +++ b/arch/arm64/crypto/Makefile
> @@ -24,7 +24,7 @@ obj-$(CONFIG_CRYPTO_CRC32_ARM64_CE) += crc32-ce.o
>  crc32-ce-y:= crc32-ce-core.o crc32-ce-glue.o
>  
>  obj-$(CONFIG_CRYPTO_AES_ARM64_CE) += aes-ce-cipher.o
> -CFLAGS_aes-ce-cipher.o += -march=armv8-a+crypto
> +aes-ce-cipher-y := aes-ce-core.o aes-ce-glue.o
>  
>  obj-$(CONFIG_CRYPTO_AES_ARM64_CE_CCM) += aes-ce-ccm.o
>  aes-ce-ccm-y := aes-ce-ccm-glue.o aes-ce-ccm-core.o
> diff --git a/arch/arm64/crypto/aes-ce-core.S
> b/arch/arm64/crypto/aes-ce-core.S new file mode 100644
> index 000000000000..8efdfdade393
> --- /dev/null
> +++ b/arch/arm64/crypto/aes-ce-core.S
> @@ -0,0 +1,87 @@
> +/*
> + * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheu...@linaro.org>
> + *
> + * This program is free software; you can redistribute it and/or
> modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +
> +#include <linux/linkage.h>
> +#include <asm/assembler.h>
> +
> +     .arch           armv8-a+crypto
> +
> +ENTRY(__aes_ce_encrypt)
> +     sub             w3, w3, #2
> +     ld1             {v0.16b}, [x2]
> +     ld1             {v1.4s}, [x0], #16
> +     cmp             w3, #10
> +     bmi             0f
> +     bne             3f
> +     mov             v3.16b, v1.16b
> +     b               2f
> +0:   mov             v2.16b, v1.16b
> +     ld1             {v3.4s}, [x0], #16
> +1:   aese            v0.16b, v2.16b
> +     aesmc           v0.16b, v0.16b
> +2:   ld1             {v1.4s}, [x0], #16
> +     aese            v0.16b, v3.16b
> +     aesmc           v0.16b, v0.16b
> +3:   ld1             {v2.4s}, [x0], #16
> +     subs            w3, w3, #3
> +     aese            v0.16b, v1.16b
> +     aesmc           v0.16b, v0.16b
> +     ld1             {v3.4s}, [x0], #16
> +     bpl             1b
> +     aese            v0.16b, v2.16b
> +     eor             v0.16b, v0.16b, v3.16b
> +     st1             {v0.16b}, [x1]
> +     ret
> +ENDPROC(__aes_ce_encrypt)
> +
> +ENTRY(__aes_ce_decrypt)
> +     sub             w3, w3, #2
> +     ld1             {v0.16b}, [x2]
> +     ld1             {v1.4s}, [x0], #16
> +     cmp             w3, #10
> +     bmi             0f
> +     bne             3f
> +     mov             v3.16b, v1.16b
> +     b               2f
> +0:   mov             v2.16b, v1.16b
> +     ld1             {v3.4s}, [x0], #16
> +1:   aesd            v0.16b, v2.16b
> +     aesimc          v0.16b, v0.16b
> +2:   ld1             {v1.4s}, [x0], #16
> +     aesd            v0.16b, v3.16b
> +     aesimc          v0.16b, v0.16b
> +3:   ld1             {v2.4s}, [x0], #16
> +     subs            w3, w3, #3
> +     aesd            v0.16b, v1.16b
> +     aesimc          v0.16b, v0.16b
> +     ld1             {v3.4s}, [x0], #16
> +     bpl             1b
> +     aesd            v0.16b, v2.16b
> +     eor             v0.16b, v0.16b, v3.16b
> +     st1             {v0.16b}, [x1]
> +     ret
> +ENDPROC(__aes_ce_decrypt)
> +
> +/*
> + * __aes_ce_sub() - use the aese instruction to perform the AES sbox
> + *                  substitution on each byte in 'input'
> + */
> +ENTRY(__aes_ce_sub)
> +     dup             v1.4s, w0
> +     movi            v0.16b, #0
> +     aese            v0.16b, v1.16b
> +     umov            w0, v0.s[0]
> +     ret
> +ENDPROC(__aes_ce_sub)
> +
> +ENTRY(__aes_ce_invert)
> +     ld1             {v0.4s}, [x1]
> +     aesimc          v1.16b, v0.16b
> +     st1             {v1.4s}, [x0]
> +     ret
> +ENDPROC(__aes_ce_invert)
> diff --git a/arch/arm64/crypto/aes-ce-cipher.c
> b/arch/arm64/crypto/aes-ce-glue.c similarity index 62%
> rename from arch/arm64/crypto/aes-ce-cipher.c
> rename to arch/arm64/crypto/aes-ce-glue.c
> index 6a75cd75ed11..e6b3227bbf57 100644
> --- a/arch/arm64/crypto/aes-ce-cipher.c
> +++ b/arch/arm64/crypto/aes-ce-glue.c
> @@ -29,6 +29,13 @@ struct aes_block {
>       u8 b[AES_BLOCK_SIZE];
>  };
>  
> +asmlinkage void __aes_ce_encrypt(u32 *rk, u8 *out, const u8 *in, int
> rounds); +asmlinkage void __aes_ce_decrypt(u32 *rk, u8 *out, const u8
> *in, int rounds); +
> +asmlinkage u32 __aes_ce_sub(u32 l);
> +asmlinkage void __aes_ce_invert(struct aes_block *out,
> +                             const struct aes_block *in);
> +
>  static int num_rounds(struct crypto_aes_ctx *ctx)
>  {
>       /*
> @@ -44,10 +51,6 @@ static int num_rounds(struct crypto_aes_ctx *ctx)
>  static void aes_cipher_encrypt(struct crypto_tfm *tfm, u8 dst[], u8
> const src[]) {
>       struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
> -     struct aes_block *out = (struct aes_block *)dst;
> -     struct aes_block const *in = (struct aes_block *)src;
> -     void *dummy0;
> -     int dummy1;
>  
>       if (!may_use_simd()) {
>               __aes_arm64_encrypt(ctx->key_enc, dst, src,
> num_rounds(ctx)); @@ -55,49 +58,13 @@ static void
> aes_cipher_encrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[]) }
>  
>       kernel_neon_begin();
> -
> -     __asm__("       ld1     {v0.16b},
> %[in]                 ;"
> -             "       ld1     {v1.4s}, [%[key]],
> #16           ;"
> -             "       cmp     %w[rounds],
> #10                   ;"
> -             "       bmi
> 0f                            ;"
> -             "       bne
> 3f                            ;"
> -             "       mov     v3.16b,
> v1.16b                        ;"
> -             "       b
> 2f                            ;"
> -             "0:     mov     v2.16b,
> v1.16b                        ;"
> -             "       ld1     {v3.4s}, [%[key]],
> #16           ;"
> -             "1:     aese    v0.16b,
> v2.16b                        ;"
> -             "       aesmc   v0.16b,
> v0.16b                        ;"
> -             "2:     ld1     {v1.4s}, [%[key]],
> #16           ;"
> -             "       aese    v0.16b,
> v3.16b                        ;"
> -             "       aesmc   v0.16b,
> v0.16b                        ;"
> -             "3:     ld1     {v2.4s}, [%[key]],
> #16           ;"
> -             "       subs    %w[rounds], %w[rounds],
> #3    ;"
> -             "       aese    v0.16b,
> v1.16b                        ;"
> -             "       aesmc   v0.16b,
> v0.16b                        ;"
> -             "       ld1     {v3.4s}, [%[key]],
> #16           ;"
> -             "       bpl
> 1b                            ;"
> -             "       aese    v0.16b,
> v2.16b                        ;"
> -             "       eor     v0.16b, v0.16b,
> v3.16b                ;"
> -             "       st1     {v0.16b},
> %[out]                ;" -
> -     :       [out]           "=Q"(*out),
> -             [key]           "=r"(dummy0),
> -             [rounds]        "=r"(dummy1)
> -     :       [in]            "Q"(*in),
> -                             "1"(ctx->key_enc),
> -                             "2"(num_rounds(ctx) - 2)
> -     :       "cc");
> -
> +     __aes_ce_encrypt(ctx->key_enc, dst, src, num_rounds(ctx));
>       kernel_neon_end();
>  }
>  
>  static void aes_cipher_decrypt(struct crypto_tfm *tfm, u8 dst[], u8
> const src[]) {
>       struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
> -     struct aes_block *out = (struct aes_block *)dst;
> -     struct aes_block const *in = (struct aes_block *)src;
> -     void *dummy0;
> -     int dummy1;
>  
>       if (!may_use_simd()) {
>               __aes_arm64_decrypt(ctx->key_dec, dst, src,
> num_rounds(ctx)); @@ -105,62 +72,10 @@ static void
> aes_cipher_decrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[]) }
>  
>       kernel_neon_begin();
> -
> -     __asm__("       ld1     {v0.16b},
> %[in]                 ;"
> -             "       ld1     {v1.4s}, [%[key]],
> #16           ;"
> -             "       cmp     %w[rounds],
> #10                   ;"
> -             "       bmi
> 0f                            ;"
> -             "       bne
> 3f                            ;"
> -             "       mov     v3.16b,
> v1.16b                        ;"
> -             "       b
> 2f                            ;"
> -             "0:     mov     v2.16b,
> v1.16b                        ;"
> -             "       ld1     {v3.4s}, [%[key]],
> #16           ;"
> -             "1:     aesd    v0.16b,
> v2.16b                        ;"
> -             "       aesimc  v0.16b,
> v0.16b                        ;"
> -             "2:     ld1     {v1.4s}, [%[key]],
> #16           ;"
> -             "       aesd    v0.16b,
> v3.16b                        ;"
> -             "       aesimc  v0.16b,
> v0.16b                        ;"
> -             "3:     ld1     {v2.4s}, [%[key]],
> #16           ;"
> -             "       subs    %w[rounds], %w[rounds],
> #3    ;"
> -             "       aesd    v0.16b,
> v1.16b                        ;"
> -             "       aesimc  v0.16b,
> v0.16b                        ;"
> -             "       ld1     {v3.4s}, [%[key]],
> #16           ;"
> -             "       bpl
> 1b                            ;"
> -             "       aesd    v0.16b,
> v2.16b                        ;"
> -             "       eor     v0.16b, v0.16b,
> v3.16b                ;"
> -             "       st1     {v0.16b},
> %[out]                ;" -
> -     :       [out]           "=Q"(*out),
> -             [key]           "=r"(dummy0),
> -             [rounds]        "=r"(dummy1)
> -     :       [in]            "Q"(*in),
> -                             "1"(ctx->key_dec),
> -                             "2"(num_rounds(ctx) - 2)
> -     :       "cc");
> -
> +     __aes_ce_decrypt(ctx->key_dec, dst, src, num_rounds(ctx));
>       kernel_neon_end();
>  }
>  
> -/*
> - * aes_sub() - use the aese instruction to perform the AES sbox
> substitution
> - *             on each byte in 'input'
> - */
> -static u32 aes_sub(u32 input)
> -{
> -     u32 ret;
> -
> -     __asm__("dup    v1.4s, %w[in]           ;"
> -             "movi   v0.16b, #0              ;"
> -             "aese   v0.16b, v1.16b          ;"
> -             "umov   %w[out], v0.4s[0]       ;"
> -
> -     :       [out]   "=r"(ret)
> -     :       [in]    "r"(input)
> -     :               "v0","v1");
> -
> -     return ret;
> -}
> -
>  int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key,
>                    unsigned int key_len)
>  {
> @@ -189,7 +104,7 @@ int ce_aes_expandkey(struct crypto_aes_ctx *ctx,
> const u8 *in_key, u32 *rki = ctx->key_enc + (i * kwords);
>               u32 *rko = rki + kwords;
>  
> -             rko[0] = ror32(aes_sub(rki[kwords - 1]), 8) ^
> rcon[i] ^ rki[0];
> +             rko[0] = ror32(__aes_ce_sub(rki[kwords - 1]), 8) ^
> rcon[i] ^ rki[0]; rko[1] = rko[0] ^ rki[1];
>               rko[2] = rko[1] ^ rki[2];
>               rko[3] = rko[2] ^ rki[3];
> @@ -202,7 +117,7 @@ int ce_aes_expandkey(struct crypto_aes_ctx *ctx,
> const u8 *in_key, } else if (key_len == AES_KEYSIZE_256) {
>                       if (i >= 6)
>                               break;
> -                     rko[4] = aes_sub(rko[3]) ^ rki[4];
> +                     rko[4] = __aes_ce_sub(rko[3]) ^ rki[4];
>                       rko[5] = rko[4] ^ rki[5];
>                       rko[6] = rko[5] ^ rki[6];
>                       rko[7] = rko[6] ^ rki[7];
> @@ -221,13 +136,7 @@ int ce_aes_expandkey(struct crypto_aes_ctx *ctx,
> const u8 *in_key, 
>       key_dec[0] = key_enc[j];
>       for (i = 1, j--; j > 0; i++, j--)
> -             __asm__("ld1    {v0.4s}, %[in]          ;"
> -                     "aesimc v1.16b,
> v0.16b                ;"
> -                     "st1    {v1.4s}, %[out] ;"
> -
> -             :       [out]   "=Q"(key_dec[i])
> -             :       [in]    "Q"(key_enc[j])
> -             :               "v0","v1");
> +             __aes_ce_invert(key_dec + i, key_enc + j);
>       key_dec[i] = key_enc[0];
>  
>       kernel_neon_end();

Regards,
Alex

Reply via email to