Re: [PATCH v3 2/2] crypto: arm/aes - add some hardening against cache-timing attacks

2018-10-19 Thread Ard Biesheuvel
On 20 October 2018 at 04:39, Eric Biggers  wrote:
> On Fri, Oct 19, 2018 at 05:54:12PM +0800, Ard Biesheuvel wrote:
>> On 19 October 2018 at 13:41, Ard Biesheuvel  
>> wrote:
>> > On 18 October 2018 at 12:37, Eric Biggers  wrote:
>> >> From: Eric Biggers 
>> >>
>> >> Make the ARM scalar AES implementation closer to constant-time by
>> >> disabling interrupts and prefetching the tables into L1 cache.  This is
>> >> feasible because due to ARM's "free" rotations, the main tables are only
>> >> 1024 bytes instead of the usual 4096 used by most AES implementations.
>> >>
>> >> On ARM Cortex-A7, the speed loss is only about 5%.  The resulting code
>> >> is still over twice as fast as aes_ti.c.  Responsiveness is potentially
>> >> a concern, but interrupts are only disabled for a single AES block.
>> >>
>> >
>> > So that would be in the order of 700 cycles, based on the numbers you
>> > shared in v1 of the aes_ti.c patch. Does that sound about right? So
>> > that would be around 1 microsecond, which is really not a number to
>> > obsess about imo.
>> >
>> > I considered another option, which is to detect whether an interrupt
>> > has been taken (by writing some canary value below that stack pointer
>> > in the location where the exception handler will preserve the value of
>> > sp, and checking at the end whether it has been modified) and doing a
>> > usleep_range(x, y) if that is the case.
>> >
>> > But this is much simpler so let's only go there if we must.
>> >
>>
>> I played around a bit and implemented it for discussion purposes, but
>> restarting the operation if it gets interrupted, as suggested in the
>> paper (whitespace corruption courtesy of Gmail)
>>
>>
>> diff --git a/arch/arm/crypto/aes-cipher-core.S b/arch/arm/crypto/aes-cipher-core.S
>> index 184d6c2d15d5..2e8a84a47784 100644
>> --- a/arch/arm/crypto/aes-cipher-core.S
>> +++ b/arch/arm/crypto/aes-cipher-core.S
>> @@ -10,6 +10,7 @@
>>   */
>>
>>  #include 
>> +#include 
>>  #include 
>>
>>   .text
>> @@ -139,6 +140,34 @@
>>
>>   __adrl ttab, \ttab
>>
>> + /*
>> + * Set a canary that will allow us to tell whether any
>> + * interrupts were taken while this function was executing.
>> + * The zero value will be overwritten with the program counter
>> + * value at the point where the IRQ exception is taken.
>> + */
>> + mov t0, #0
>> + str t0, [sp, #-(SVC_REGS_SIZE - S_PC)]
>> +
>> + /*
>> + * Prefetch the 1024-byte 'ft' or 'it' table into L1 cache,
>> + * assuming cacheline size >= 32.  This is a hardening measure
>> + * intended to make cache-timing attacks more difficult.
>> + * They may not be fully prevented, however; see the paper
>> + * https://cr.yp.to/antiforgery/cachetiming-20050414.pdf
>> + * ("Cache-timing attacks on AES") for a discussion of the many
>> + * difficulties involved in writing truly constant-time AES
>> + * software.
>> + */
>> + .set i, 0
>> + .rept 1024 / 128
>> + ldr r8, [ttab, #i + 0]
>> + ldr r9, [ttab, #i + 32]
>> + ldr r10, [ttab, #i + 64]
>> + ldr r11, [ttab, #i + 96]
>> + .set i, i + 128
>> + .endr
>> +
>>   tst rounds, #2
>>   bne 1f
>>
>> @@ -154,6 +183,8 @@
>>  2: __adrl ttab, \ltab
>>   \round r4, r5, r6, r7, r8, r9, r10, r11, \bsz, b
>>
>> + ldr r0, [sp, #-(SVC_REGS_SIZE - S_PC)] // check canary
>> +
>>  #ifdef CONFIG_CPU_BIG_ENDIAN
>>   __rev r4, r4
>>   __rev r5, r5
>> diff --git a/arch/arm/crypto/aes-cipher-glue.c b/arch/arm/crypto/aes-cipher-glue.c
>> index c222f6e072ad..de8f32121511 100644
>> --- a/arch/arm/crypto/aes-cipher-glue.c
>> +++ b/arch/arm/crypto/aes-cipher-glue.c
>> @@ -11,28 +11,39 @@
>>
>>  #include 
>>  #include 
>> +#include 
>>  #include 
>>
>> -asmlinkage void __aes_arm_encrypt(u32 *rk, int rounds, const u8 *in, u8 *out);
>> +asmlinkage int __aes_arm_encrypt(u32 *rk, int rounds, const u8 *in, u8 *out);
>>  EXPORT_SYMBOL(__aes_arm_encrypt);
>>
>> -asmlinkage void __aes_arm_decrypt(u32 *rk, int rounds, const u8 *in, u8 *out);
>> +asmlinkage int __aes_arm_decrypt(u32 *rk, int rounds, const u8 *in, u8 *out);
>>  EXPORT_SYMBOL(__aes_arm_decrypt);
>>
>>  static void aes_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
>>  {
>>   struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
>>   int rounds = 6 + ctx->key_length / 4;
>> + u8 buf[AES_BLOCK_SIZE];
>>
>> - __aes_arm_encrypt(ctx->key_enc, rounds, in, out);
>> + if (out == in)
>> +   in = memcpy(buf, in, AES_BLOCK_SIZE);
>> +
>> + while (unlikely(__aes_arm_encrypt(ctx->key_enc, rounds, in, out)))
>> +   cpu_relax();
>>  }
>>
>>  static void aes_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
>>  {
>>   struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
>>   int rounds = 6 + ctx->key_length / 4;
>> + u8 buf[AES_BLOCK_SIZE];
>> +
>> + if (out == in)
>> +   in = memcpy(buf, in, AES_BLOCK_SIZE);
>>
>> - __aes_arm_decrypt(ctx->key_dec, rounds, in, out);
>> + while (unlikely(__aes_arm_decrypt(ctx->key_dec, rounds, in, out)))
>> +   cpu_relax();
>>  }
>>
>>  static struct crypto_alg aes_alg = {
>
> It's an interesting 

[PATCH 1/2] crypto: fix cfb mode decryption

2018-10-19 Thread Dmitry Eremin-Solenikov
crypto_cfb_decrypt_segment() incorrectly XORed the generated keystream with
the IV rather than with the ciphertext (data stream), resulting in incorrect decryption.
Test vectors will be added in the next patch.

Signed-off-by: Dmitry Eremin-Solenikov 
Cc: sta...@vger.kernel.org
---
 crypto/cfb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crypto/cfb.c b/crypto/cfb.c
index a0d68c09e1b9..fd4e8500e121 100644
--- a/crypto/cfb.c
+++ b/crypto/cfb.c
@@ -144,7 +144,7 @@ static int crypto_cfb_decrypt_segment(struct skcipher_walk *walk,
 
do {
crypto_cfb_encrypt_one(tfm, iv, dst);
-   crypto_xor(dst, iv, bsize);
+   crypto_xor(dst, src, bsize);
iv = src;
 
src += bsize;
-- 
2.19.1
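
For reference, full-block CFB decryption as defined in NIST SP800-38A can be
sketched as follows (illustration only, not kernel code; aes_encrypt_block()
and the key type are placeholders).  The block cipher always runs in the
forward (encrypt) direction, and the keystream must be XORed into the
ciphertext, which is exactly what the one-line change above restores:

/*
 * Minimal sketch of full-block CFB decryption per NIST SP800-38A.
 * aes_encrypt_block() stands in for the underlying block cipher run in
 * the forward direction; bsize is the cipher block size (16 for AES).
 */
static void cfb_decrypt_blocks(const void *key, u8 *iv,
			       const u8 *src, u8 *dst,
			       unsigned int nblocks, unsigned int bsize)
{
	while (nblocks--) {
		u8 ks[16];
		unsigned int i;

		aes_encrypt_block(key, ks, iv);		/* keystream = E_k(IV) */
		for (i = 0; i < bsize; i++)
			dst[i] = src[i] ^ ks[i];	/* XOR with ciphertext, not IV */
		memcpy(iv, src, bsize);			/* next IV = this ciphertext block */
		src += bsize;
		dst += bsize;
	}
}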



[PATCH 2/2] crypto: testmgr: add AES-CFB tests

2018-10-19 Thread Dmitry Eremin-Solenikov
Add AES128/192/256-CFB testvectors from NIST SP800-38A.

Signed-off-by: Dmitry Eremin-Solenikov 
Cc: sta...@vger.kernel.org
Signed-off-by: Dmitry Eremin-Solenikov 
---
 crypto/tcrypt.c  |  5 
 crypto/testmgr.c |  7 +
 crypto/testmgr.h | 76 
 3 files changed, 88 insertions(+)

diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c
index bdde95e8d369..a6315827d240 100644
--- a/crypto/tcrypt.c
+++ b/crypto/tcrypt.c
@@ -1733,6 +1733,7 @@ static int do_test(const char *alg, u32 type, u32 mask, int m, u32 num_mb)
ret += tcrypt_test("xts(aes)");
ret += tcrypt_test("ctr(aes)");
ret += tcrypt_test("rfc3686(ctr(aes))");
+   ret += tcrypt_test("cfb(aes)");
break;
 
case 11:
@@ -2059,6 +2060,10 @@ static int do_test(const char *alg, u32 type, u32 mask, int m, u32 num_mb)
speed_template_16_24_32);
test_cipher_speed("ctr(aes)", DECRYPT, sec, NULL, 0,
speed_template_16_24_32);
+   test_cipher_speed("cfb(aes)", ENCRYPT, sec, NULL, 0,
+   speed_template_16_24_32);
+   test_cipher_speed("cfb(aes)", DECRYPT, sec, NULL, 0,
+   speed_template_16_24_32);
break;
 
case 201:
diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index a1d42245082a..016d61c419fc 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -2684,6 +2684,13 @@ static const struct alg_test_desc alg_test_descs[] = {
.dec = __VECS(aes_ccm_dec_tv_template)
}
}
+   }, {
+   .alg = "cfb(aes)",
+   .test = alg_test_skcipher,
+   .fips_allowed = 1,
+   .suite = {
+   .cipher = __VECS(aes_cfb_tv_template)
+   },
}, {
.alg = "chacha20",
.test = alg_test_skcipher,
diff --git a/crypto/testmgr.h b/crypto/testmgr.h
index 173111c70746..19b6d184c8fb 100644
--- a/crypto/testmgr.h
+++ b/crypto/testmgr.h
@@ -12081,6 +12081,82 @@ static const struct cipher_testvec aes_cbc_tv_template[] = {
},
 };
 
+static const struct cipher_testvec aes_cfb_tv_template[] = {
+   { /* From NIST SP800-38A */
+   .key= "\x2b\x7e\x15\x16\x28\xae\xd2\xa6"
+ "\xab\xf7\x15\x88\x09\xcf\x4f\x3c",
+   .klen   = 16,
+   .iv = "\x00\x01\x02\x03\x04\x05\x06\x07"
+ "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f",
+   .ptext  = "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96"
+ "\xe9\x3d\x7e\x11\x73\x93\x17\x2a"
+ "\xae\x2d\x8a\x57\x1e\x03\xac\x9c"
+ "\x9e\xb7\x6f\xac\x45\xaf\x8e\x51"
+ "\x30\xc8\x1c\x46\xa3\x5c\xe4\x11"
+ "\xe5\xfb\xc1\x19\x1a\x0a\x52\xef"
+ "\xf6\x9f\x24\x45\xdf\x4f\x9b\x17"
+ "\xad\x2b\x41\x7b\xe6\x6c\x37\x10",
+   .ctext  = "\x3b\x3f\xd9\x2e\xb7\x2d\xad\x20"
+ "\x33\x34\x49\xf8\xe8\x3c\xfb\x4a"
+ "\xc8\xa6\x45\x37\xa0\xb3\xa9\x3f"
+ "\xcd\xe3\xcd\xad\x9f\x1c\xe5\x8b"
+ "\x26\x75\x1f\x67\xa3\xcb\xb1\x40"
+ "\xb1\x80\x8c\xf1\x87\xa4\xf4\xdf"
+ "\xc0\x4b\x05\x35\x7c\x5d\x1c\x0e"
+ "\xea\xc4\xc6\x6f\x9f\xf7\xf2\xe6",
+   .len= 64,
+   }, {
+   .key= "\x8e\x73\xb0\xf7\xda\x0e\x64\x52"
+ "\xc8\x10\xf3\x2b\x80\x90\x79\xe5"
+ "\x62\xf8\xea\xd2\x52\x2c\x6b\x7b",
+   .klen   = 24,
+   .iv = "\x00\x01\x02\x03\x04\x05\x06\x07"
+ "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f",
+   .ptext  = "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96"
+ "\xe9\x3d\x7e\x11\x73\x93\x17\x2a"
+ "\xae\x2d\x8a\x57\x1e\x03\xac\x9c"
+ "\x9e\xb7\x6f\xac\x45\xaf\x8e\x51"
+ "\x30\xc8\x1c\x46\xa3\x5c\xe4\x11"
+ "\xe5\xfb\xc1\x19\x1a\x0a\x52\xef"
+ "\xf6\x9f\x24\x45\xdf\x4f\x9b\x17"
+ "\xad\x2b\x41\x7b\xe6\x6c\x37\x10",
+   .ctext  = "\xcd\xc8\x0d\x6f\xdd\xf1\x8c\xab"
+ "\x34\xc2\x59\x09\xc9\x9a\x41\x74"
+ "\x67\xce\x7f\x7f\x81\x17\x36\x21"
+ "\x96\x1a\x2b\x70\x17\x1d\x3d\x7a"
+ "\x2e\x1e\x8a\x1d\xd5\x9b\x88\xb1"
+ "\xc8\xe6\x0f\xed\x1e\xfa\xc4\xc9"
+ "\xc0\x5f\x9f\x9c\xa9\x83\x4f\xa0"
+ "\x42\xae\x8f\xba\x58\x4b\x09\xff",
+  
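
Once the entry above is in place, these vectors are exercised by testmgr via
alg_test_skcipher.  The same check could be reproduced from a minimal kernel
module along the following lines (a sketch only, using the synchronous-wait
skcipher API; key/iv/ptext/ctext would be one of the SP800-38A vectors above,
and error handling is abbreviated):

#include <crypto/skcipher.h>
#include <linux/crypto.h>
#include <linux/scatterlist.h>
#include <linux/slab.h>
#include <linux/string.h>

static int check_cfb_vector(const u8 *key, unsigned int klen, const u8 *iv,
			    const u8 *ptext, const u8 *ctext, unsigned int len)
{
	struct crypto_skcipher *tfm;
	struct skcipher_request *req = NULL;
	struct scatterlist sg;
	DECLARE_CRYPTO_WAIT(wait);
	u8 ivbuf[16];
	u8 *buf = NULL;
	int err;

	tfm = crypto_alloc_skcipher("cfb(aes)", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	err = crypto_skcipher_setkey(tfm, key, klen);
	if (err)
		goto out;

	req = skcipher_request_alloc(tfm, GFP_KERNEL);
	buf = kmemdup(ptext, len, GFP_KERNEL);	/* sg must not point at the stack */
	if (!req || !buf) {
		err = -ENOMEM;
		goto out;
	}

	memcpy(ivbuf, iv, sizeof(ivbuf));	/* the IV is updated in place */
	sg_init_one(&sg, buf, len);
	skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP |
				      CRYPTO_TFM_REQ_MAY_BACKLOG,
				      crypto_req_done, &wait);
	skcipher_request_set_crypt(req, &sg, &sg, len, ivbuf);

	err = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);
	if (!err && memcmp(buf, ctext, len))
		err = -EINVAL;	/* ciphertext does not match the vector */
out:
	kfree(buf);
	skcipher_request_free(req);
	crypto_free_skcipher(tfm);
	return err;
}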

Re: [PATCH v3 2/2] crypto: arm/aes - add some hardening against cache-timing attacks

2018-10-19 Thread Eric Biggers
On Fri, Oct 19, 2018 at 05:54:12PM +0800, Ard Biesheuvel wrote:
> On 19 October 2018 at 13:41, Ard Biesheuvel  wrote:
> > On 18 October 2018 at 12:37, Eric Biggers  wrote:
> >> From: Eric Biggers 
> >>
> >> Make the ARM scalar AES implementation closer to constant-time by
> >> disabling interrupts and prefetching the tables into L1 cache.  This is
> >> feasible because due to ARM's "free" rotations, the main tables are only
> >> 1024 bytes instead of the usual 4096 used by most AES implementations.
> >>
> >> On ARM Cortex-A7, the speed loss is only about 5%.  The resulting code
> >> is still over twice as fast as aes_ti.c.  Responsiveness is potentially
> >> a concern, but interrupts are only disabled for a single AES block.
> >>
> >
> > So that would be in the order of 700 cycles, based on the numbers you
> > shared in v1 of the aes_ti.c patch. Does that sound about right? So
> > that would be around 1 microsecond, which is really not a number to
> > obsess about imo.
> >
> > I considered another option, which is to detect whether an interrupt
> > has been taken (by writing some canary value below that stack pointer
> > in the location where the exception handler will preserve the value of
> > sp, and checking at the end whether it has been modified) and doing a
> > usleep_range(x, y) if that is the case.
> >
> > But this is much simpler so let's only go there if we must.
> >
> 
> I played around a bit and implemented it for discussion purposes, but
> restarting the operation if it gets interrupted, as suggested in the
> paper (whitespace corruption courtesy of Gmail)
> 
> 
> diff --git a/arch/arm/crypto/aes-cipher-core.S b/arch/arm/crypto/aes-cipher-core.S
> index 184d6c2d15d5..2e8a84a47784 100644
> --- a/arch/arm/crypto/aes-cipher-core.S
> +++ b/arch/arm/crypto/aes-cipher-core.S
> @@ -10,6 +10,7 @@
>   */
> 
>  #include 
> +#include 
>  #include 
> 
>   .text
> @@ -139,6 +140,34 @@
> 
>   __adrl ttab, \ttab
> 
> + /*
> + * Set a canary that will allow us to tell whether any
> + * interrupts were taken while this function was executing.
> + * The zero value will be overwritten with the program counter
> + * value at the point where the IRQ exception is taken.
> + */
> + mov t0, #0
> + str t0, [sp, #-(SVC_REGS_SIZE - S_PC)]
> +
> + /*
> + * Prefetch the 1024-byte 'ft' or 'it' table into L1 cache,
> + * assuming cacheline size >= 32.  This is a hardening measure
> + * intended to make cache-timing attacks more difficult.
> + * They may not be fully prevented, however; see the paper
> + * https://cr.yp.to/antiforgery/cachetiming-20050414.pdf
> + * ("Cache-timing attacks on AES") for a discussion of the many
> + * difficulties involved in writing truly constant-time AES
> + * software.
> + */
> + .set i, 0
> + .rept 1024 / 128
> + ldr r8, [ttab, #i + 0]
> + ldr r9, [ttab, #i + 32]
> + ldr r10, [ttab, #i + 64]
> + ldr r11, [ttab, #i + 96]
> + .set i, i + 128
> + .endr
> +
>   tst rounds, #2
>   bne 1f
> 
> @@ -154,6 +183,8 @@
>  2: __adrl ttab, \ltab
>   \round r4, r5, r6, r7, r8, r9, r10, r11, \bsz, b
> 
> + ldr r0, [sp, #-(SVC_REGS_SIZE - S_PC)] // check canary
> +
>  #ifdef CONFIG_CPU_BIG_ENDIAN
>   __rev r4, r4
>   __rev r5, r5
> diff --git a/arch/arm/crypto/aes-cipher-glue.c b/arch/arm/crypto/aes-cipher-glue.c
> index c222f6e072ad..de8f32121511 100644
> --- a/arch/arm/crypto/aes-cipher-glue.c
> +++ b/arch/arm/crypto/aes-cipher-glue.c
> @@ -11,28 +11,39 @@
> 
>  #include 
>  #include 
> +#include 
>  #include 
> 
> -asmlinkage void __aes_arm_encrypt(u32 *rk, int rounds, const u8 *in, u8 *out);
> +asmlinkage int __aes_arm_encrypt(u32 *rk, int rounds, const u8 *in, u8 *out);
>  EXPORT_SYMBOL(__aes_arm_encrypt);
> 
> -asmlinkage void __aes_arm_decrypt(u32 *rk, int rounds, const u8 *in, u8 *out);
> +asmlinkage int __aes_arm_decrypt(u32 *rk, int rounds, const u8 *in, u8 *out);
>  EXPORT_SYMBOL(__aes_arm_decrypt);
> 
>  static void aes_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
>  {
>   struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
>   int rounds = 6 + ctx->key_length / 4;
> + u8 buf[AES_BLOCK_SIZE];
> 
> - __aes_arm_encrypt(ctx->key_enc, rounds, in, out);
> + if (out == in)
> +   in = memcpy(buf, in, AES_BLOCK_SIZE);
> +
> + while (unlikely(__aes_arm_encrypt(ctx->key_enc, rounds, in, out)))
> +   cpu_relax();
>  }
> 
>  static void aes_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
>  {
>   struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
>   int rounds = 6 + ctx->key_length / 4;
> + u8 buf[AES_BLOCK_SIZE];
> +
> + if (out == in)
> +   in = memcpy(buf, in, AES_BLOCK_SIZE);
> 
> - __aes_arm_decrypt(ctx->key_dec, rounds, in, out);
> + while (unlikely(__aes_arm_decrypt(ctx->key_dec, rounds, in, out)))
> +   cpu_relax();
>  }
> 
>  static struct crypto_alg aes_alg = {

It's an interesting idea, but the main thing I don't like about this is that the
time it takes to do the encryption/decryption is unbounded, since it could get
livelocked with a high rate of interrupts.  To fix 
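
One way to bound the worst case, sketched below for illustration only (not
taken from the thread; names follow the diff above and the retry limit is
arbitrary), would be to retry a few times and then fall back to a single
attempt with interrupts disabled:

/*
 * Illustrative sketch only: bound the number of restarts, then fall back
 * to one attempt with IRQs off so the operation cannot be livelocked by
 * a high interrupt rate.  With IRQs disabled the canary is never touched,
 * so the final call is guaranteed to report success.
 */
#define AES_ENC_MAX_RETRIES	3	/* arbitrary */

static void aes_encrypt_bounded(struct crypto_aes_ctx *ctx, int rounds,
				const u8 *in, u8 *out)
{
	unsigned long flags;
	int i;

	for (i = 0; i < AES_ENC_MAX_RETRIES; i++)
		if (!__aes_arm_encrypt(ctx->key_enc, rounds, in, out))
			return;

	/* Still being interrupted: run once with interrupts disabled. */
	local_irq_save(flags);
	__aes_arm_encrypt(ctx->key_enc, rounds, in, out);
	local_irq_restore(flags);
}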

Re: [PATCH v3 2/2] crypto: arm/aes - add some hardening against cache-timing attacks

2018-10-19 Thread Eric Biggers
On Fri, Oct 19, 2018 at 01:41:35PM +0800, Ard Biesheuvel wrote:
> On 18 October 2018 at 12:37, Eric Biggers  wrote:
> > From: Eric Biggers 
> >
> > Make the ARM scalar AES implementation closer to constant-time by
> > disabling interrupts and prefetching the tables into L1 cache.  This is
> > feasible because due to ARM's "free" rotations, the main tables are only
> > 1024 bytes instead of the usual 4096 used by most AES implementations.
> >
> > On ARM Cortex-A7, the speed loss is only about 5%.  The resulting code
> > is still over twice as fast as aes_ti.c.  Responsiveness is potentially
> > a concern, but interrupts are only disabled for a single AES block.
> >
> 
> So that would be in the order of 700 cycles, based on the numbers you
> shared in v1 of the aes_ti.c patch. Does that sound about right? So
> that would be around 1 microsecond, which is really not a number to
> obsess about imo.
> 

Correct, on ARM Cortex-A7 I'm seeing slightly over 700 cycles per block
encrypted or decrypted, including the prefetching.

- Eric


Re: [PATCH v3 2/2] crypto: arm/aes - add some hardening against cache-timing attacks

2018-10-19 Thread Ard Biesheuvel
On 19 October 2018 at 13:41, Ard Biesheuvel  wrote:
> On 18 October 2018 at 12:37, Eric Biggers  wrote:
>> From: Eric Biggers 
>>
>> Make the ARM scalar AES implementation closer to constant-time by
>> disabling interrupts and prefetching the tables into L1 cache.  This is
>> feasible because due to ARM's "free" rotations, the main tables are only
>> 1024 bytes instead of the usual 4096 used by most AES implementations.
>>
>> On ARM Cortex-A7, the speed loss is only about 5%.  The resulting code
>> is still over twice as fast as aes_ti.c.  Responsiveness is potentially
>> a concern, but interrupts are only disabled for a single AES block.
>>
>
> So that would be in the order of 700 cycles, based on the numbers you
> shared in v1 of the aes_ti.c patch. Does that sound about right? So
> that would be around 1 microsecond, which is really not a number to
> obsess about imo.
>
> I considered another option, which is to detect whether an interrupt
> has been taken (by writing some canary value below that stack pointer
> in the location where the exception handler will preserve the value of
> sp, and checking at the end whether it has been modified) and doing a
> usleep_range(x, y) if that is the case.
>
> But this is much simpler so let's only go there if we must.
>

I played around a bit and implemented it for discussion purposes, but
restarting the operation if it gets interrupted, as suggested in the
paper (whitespace corruption courtesy of Gmail)


diff --git a/arch/arm/crypto/aes-cipher-core.S b/arch/arm/crypto/aes-cipher-core.S
index 184d6c2d15d5..2e8a84a47784 100644
--- a/arch/arm/crypto/aes-cipher-core.S
+++ b/arch/arm/crypto/aes-cipher-core.S
@@ -10,6 +10,7 @@
  */

 #include 
+#include 
 #include 

  .text
@@ -139,6 +140,34 @@

  __adrl ttab, \ttab

+ /*
+ * Set a canary that will allow us to tell whether any
+ * interrupts were taken while this function was executing.
+ * The zero value will be overwritten with the program counter
+ * value at the point where the IRQ exception is taken.
+ */
+ mov t0, #0
+ str t0, [sp, #-(SVC_REGS_SIZE - S_PC)]
+
+ /*
+ * Prefetch the 1024-byte 'ft' or 'it' table into L1 cache,
+ * assuming cacheline size >= 32.  This is a hardening measure
+ * intended to make cache-timing attacks more difficult.
+ * They may not be fully prevented, however; see the paper
+ * https://cr.yp.to/antiforgery/cachetiming-20050414.pdf
+ * ("Cache-timing attacks on AES") for a discussion of the many
+ * difficulties involved in writing truly constant-time AES
+ * software.
+ */
+ .set i, 0
+ .rept 1024 / 128
+ ldr r8, [ttab, #i + 0]
+ ldr r9, [ttab, #i + 32]
+ ldr r10, [ttab, #i + 64]
+ ldr r11, [ttab, #i + 96]
+ .set i, i + 128
+ .endr
+
  tst rounds, #2
  bne 1f

@@ -154,6 +183,8 @@
 2: __adrl ttab, \ltab
  \round r4, r5, r6, r7, r8, r9, r10, r11, \bsz, b

+ ldr r0, [sp, #-(SVC_REGS_SIZE - S_PC)] // check canary
+
 #ifdef CONFIG_CPU_BIG_ENDIAN
  __rev r4, r4
  __rev r5, r5
diff --git a/arch/arm/crypto/aes-cipher-glue.c b/arch/arm/crypto/aes-cipher-glue.c
index c222f6e072ad..de8f32121511 100644
--- a/arch/arm/crypto/aes-cipher-glue.c
+++ b/arch/arm/crypto/aes-cipher-glue.c
@@ -11,28 +11,39 @@

 #include 
 #include 
+#include 
 #include 

-asmlinkage void __aes_arm_encrypt(u32 *rk, int rounds, const u8 *in, u8 *out);
+asmlinkage int __aes_arm_encrypt(u32 *rk, int rounds, const u8 *in, u8 *out);
 EXPORT_SYMBOL(__aes_arm_encrypt);

-asmlinkage void __aes_arm_decrypt(u32 *rk, int rounds, const u8 *in, u8 *out);
+asmlinkage int __aes_arm_decrypt(u32 *rk, int rounds, const u8 *in, u8 *out);
 EXPORT_SYMBOL(__aes_arm_decrypt);

 static void aes_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
 {
  struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
  int rounds = 6 + ctx->key_length / 4;
+ u8 buf[AES_BLOCK_SIZE];

- __aes_arm_encrypt(ctx->key_enc, rounds, in, out);
+ if (out == in)
+   in = memcpy(buf, in, AES_BLOCK_SIZE);
+
+ while (unlikely(__aes_arm_encrypt(ctx->key_enc, rounds, in, out)))
+   cpu_relax();
 }

 static void aes_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
 {
  struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
  int rounds = 6 + ctx->key_length / 4;
+ u8 buf[AES_BLOCK_SIZE];
+
+ if (out == in)
+   in = memcpy(buf, in, AES_BLOCK_SIZE);

- __aes_arm_decrypt(ctx->key_dec, rounds, in, out);
+ while (unlikely(__aes_arm_decrypt(ctx->key_dec, rounds, in, out)))
+   cpu_relax();
 }

 static struct crypto_alg aes_alg = {
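
For comparison, the approach of the v3 patch under discussion, disabling
interrupts around each single-block call while the 1024-byte table is
prefetched into L1, can be sketched roughly as follows (illustration only;
the actual patch differs in detail, and the prefetch is assumed to happen
inside the assembly routine):

/*
 * Rough sketch of the v3 hardening approach (illustrative only):
 * interrupts are disabled for the duration of one AES block so that the
 * 1024-byte table, once prefetched into L1, cannot be evicted by an
 * interrupt handler part-way through the block.
 */
static void aes_encrypt_irqs_off(struct crypto_aes_ctx *ctx, int rounds,
				 const u8 *in, u8 *out)
{
	unsigned long flags;

	local_irq_save(flags);
	__aes_arm_encrypt(ctx->key_enc, rounds, in, out);	/* asm prefetches, then encrypts */
	local_irq_restore(flags);
}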