Re: [PATCH] crypto: aes_ti - disable interrupts while accessing sbox

2018-10-16 Thread Eric Biggers
Hi Ard,

On Thu, Oct 04, 2018 at 08:55:14AM +0200, Ard Biesheuvel wrote:
> Hi Eric,
> 
> On 4 October 2018 at 06:07, Eric Biggers  wrote:
> > From: Eric Biggers 
> >
> > The generic constant-time AES implementation is supposed to preload the
> > AES S-box into the CPU's L1 data cache.  But, an interrupt handler can
> > run on the CPU and muck with the cache.  Worse, on preemptible kernels
> > the process can even be preempted and moved to a different CPU.  So the
> > implementation may actually still be vulnerable to cache-timing attacks.
> >
> > Make it more robust by disabling interrupts while the sbox is used.
> >
> > In some quick tests on x86 and ARM, this doesn't affect performance
> > significantly.  Responsiveness is also a concern, but interrupts are
> > only disabled for a single AES block which even on ARM Cortex-A7 is
> > "only" ~1500 cycles to encrypt or ~2600 cycles to decrypt.
> >
> 
> I share your concern, but that is quite a big hammer.
> 
> Also, does it really take ~100 cycles per byte? That is terrible :-)
> 
> Given that the full lookup table is only 1024 bytes (or 1024+256 bytes
> for decryption), I wonder if something like the below would be a
> better option for A7 (apologies for the mangled whitespace)
> 
> diff --git a/arch/arm/crypto/aes-cipher-core.S b/arch/arm/crypto/aes-cipher-core.S
> index 184d6c2d15d5..83e893f7e581 100644
> --- a/arch/arm/crypto/aes-cipher-core.S
> +++ b/arch/arm/crypto/aes-cipher-core.S
> @@ -139,6 +139,13 @@
> 
>   __adrl ttab, \ttab
> 
> + .irpc r, 01234567
> + ldr r8, [ttab, #(32 * \r)]
> + ldr r9, [ttab, #(32 * \r) + 256]
> + ldr r10, [ttab, #(32 * \r) + 512]
> + ldr r11, [ttab, #(32 * \r) + 768]
> + .endr
> +
>   tst rounds, #2
>   bne 1f
> 
> @@ -180,6 +187,12 @@ ENDPROC(__aes_arm_encrypt)
> 
>   .align 5
>  ENTRY(__aes_arm_decrypt)
> + __adrl ttab, __aes_arm_inverse_sbox
> +
> + .irpc r, 01234567
> + ldr r8, [ttab, #(32 * \r)]
> + .endr
> +
>   do_crypt iround, crypto_it_tab, __aes_arm_inverse_sbox, 0
>  ENDPROC(__aes_arm_decrypt)
> 
> diff --git a/arch/arm/crypto/aes-cipher-glue.c b/arch/arm/crypto/aes-cipher-glue.c
> index c222f6e072ad..630e1a436f1d 100644
> --- a/arch/arm/crypto/aes-cipher-glue.c
> +++ b/arch/arm/crypto/aes-cipher-glue.c
> @@ -23,16 +23,22 @@ static void aes_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
>  {
>   struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
>   int rounds = 6 + ctx->key_length / 4;
> + unsigned long flags;
> 
> + local_irq_save(flags);
>   __aes_arm_encrypt(ctx->key_enc, rounds, in, out);
> + local_irq_restore(flags);
>  }
> 
>  static void aes_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
>  {
>   struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
>   int rounds = 6 + ctx->key_length / 4;
> + unsigned long flags;
> 
> + local_irq_save(flags);
>   __aes_arm_decrypt(ctx->key_dec, rounds, in, out);
> + local_irq_restore(flags);
>  }
> 
>  static struct crypto_alg aes_alg = {
> diff --git a/crypto/aes_generic.c b/crypto/aes_generic.c
> index ca554d57d01e..82fa860c9cb9 100644
> --- a/crypto/aes_generic.c
> +++ b/crypto/aes_generic.c
> @@ -63,7 +63,7 @@ static inline u8 byte(const u32 x, const unsigned n)
> 
>  static const u32 rco_tab[10] = { 1, 2, 4, 8, 16, 32, 64, 128, 27, 54 };
> 
> -__visible const u32 crypto_ft_tab[4][256] = {
> +__visible const u32 crypto_ft_tab[4][256] __cacheline_aligned = {
>   {
>   0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6,
>   0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591,
> @@ -327,7 +327,7 @@ __visible const u32 crypto_ft_tab[4][256] = {
>   }
>  };
> 
> -__visible const u32 crypto_fl_tab[4][256] = {
> +__visible const u32 crypto_fl_tab[4][256] __cacheline_aligned = {
>   {
>   0x00000063, 0x0000007c, 0x00000077, 0x0000007b,
>   0x000000f2, 0x0000006b, 0x0000006f, 0x000000c5,
> @@ -591,7 +591,7 @@ __visible const u32 crypto_fl_tab[4][256] = {
>   }
>  };
> 
> -__visible const u32 crypto_it_tab[4][256] = {
> +__visible const u32 crypto_it_tab[4][256] __cacheline_aligned = {
>   {
>   0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a,
>   0xcb6bab3b, 0xf1459d1f, 0xab58faac, 0x9303e34b,
> @@ -855,7 +855,7 @@ __visible const u32 crypto_it_tab[4][256] = {
>   }
>  };
> 
> -__visible const u32 crypto_il_tab[4][256] = {
> +__visible const u32 crypto_il_tab[4][256] __cacheline_aligned = {
>   {
>   0x00000052, 0x00000009, 0x0000006a, 0x000000d5,
>   0x00000030, 0x00000036, 0x000000a5, 0x00000038,
> 

Thanks for the suggestion -- this turns out to work pretty well.  At least in a
microbenchmark, loading the larger table only slows it down by around 5%, so
it's still around 2-3 times faster than aes_ti.c on ARM Cortex-A7.  It also
noticeably improves Adiantum performance (Adiantum-XChaCha12-AES):

                             Before (cpb)   After (cpb)
Adiantum (enc), size=4096:      10.91          10.58
Adiantum (dec), size=4096:      11.04          10.62
Adiantum (enc), size=512:       18.07          15.57
Adiantum (dec), size=512:       19.10          15.83

(Those 

[PATCH] crypto: aes_ti - disable interrupts while accessing sbox

2018-10-03 Thread Eric Biggers
From: Eric Biggers 

The generic constant-time AES implementation is supposed to preload the
AES S-box into the CPU's L1 data cache.  But, an interrupt handler can
run on the CPU and muck with the cache.  Worse, on preemptible kernels
the process can even be preempted and moved to a different CPU.  So the
implementation may actually still be vulnerable to cache-timing attacks.

Make it more robust by disabling interrupts while the sbox is used.

In some quick tests on x86 and ARM, this doesn't affect performance
significantly.  Responsiveness is also a concern, but interrupts are
only disabled for a single AES block which even on ARM Cortex-A7 is
"only" ~1500 cycles to encrypt or ~2600 cycles to decrypt.

Fixes: b5e0b032b6c3 ("crypto: aes - add generic time invariant AES cipher")
Signed-off-by: Eric Biggers 
---
 crypto/aes_ti.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/crypto/aes_ti.c b/crypto/aes_ti.c
index 03023b2290e8e..81b604419ee0e 100644
--- a/crypto/aes_ti.c
+++ b/crypto/aes_ti.c
@@ -269,6 +269,7 @@ static void aesti_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
const u32 *rkp = ctx->key_enc + 4;
int rounds = 6 + ctx->key_length / 4;
u32 st0[4], st1[4];
+   unsigned long flags;
int round;
 
st0[0] = ctx->key_enc[0] ^ get_unaligned_le32(in);
@@ -276,6 +277,12 @@ static void aesti_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
st0[2] = ctx->key_enc[2] ^ get_unaligned_le32(in + 8);
st0[3] = ctx->key_enc[3] ^ get_unaligned_le32(in + 12);
 
+   /*
+* Disable interrupts (including preemption) while the sbox is loaded
+* into L1 cache and used for encryption on this CPU.
+*/
+   local_irq_save(flags);
+
st0[0] ^= __aesti_sbox[ 0] ^ __aesti_sbox[128];
st0[1] ^= __aesti_sbox[32] ^ __aesti_sbox[160];
st0[2] ^= __aesti_sbox[64] ^ __aesti_sbox[192];
@@ -300,6 +307,8 @@ static void aesti_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
put_unaligned_le32(subshift(st1, 1) ^ rkp[5], out + 4);
put_unaligned_le32(subshift(st1, 2) ^ rkp[6], out + 8);
put_unaligned_le32(subshift(st1, 3) ^ rkp[7], out + 12);
+
+   local_irq_restore(flags);
 }
 
 static void aesti_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
@@ -308,6 +317,7 @@ static void aesti_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
const u32 *rkp = ctx->key_dec + 4;
int rounds = 6 + ctx->key_length / 4;
u32 st0[4], st1[4];
+   unsigned long flags;
int round;
 
st0[0] = ctx->key_dec[0] ^ get_unaligned_le32(in);
@@ -315,6 +325,12 @@ static void aesti_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
st0[2] = ctx->key_dec[2] ^ get_unaligned_le32(in + 8);
st0[3] = ctx->key_dec[3] ^ get_unaligned_le32(in + 12);
 
+   /*
+* Disable interrupts (including preemption) while the sbox is loaded
+* into L1 cache and used for decryption on this CPU.
+*/
+   local_irq_save(flags);
+
st0[0] ^= __aesti_inv_sbox[ 0] ^ __aesti_inv_sbox[128];
st0[1] ^= __aesti_inv_sbox[32] ^ __aesti_inv_sbox[160];
st0[2] ^= __aesti_inv_sbox[64] ^ __aesti_inv_sbox[192];
@@ -339,6 +355,8 @@ static void aesti_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
put_unaligned_le32(inv_subshift(st1, 1) ^ rkp[5], out + 4);
put_unaligned_le32(inv_subshift(st1, 2) ^ rkp[6], out + 8);
put_unaligned_le32(inv_subshift(st1, 3) ^ rkp[7], out + 12);
+
+   local_irq_restore(flags);
 }
 
 static struct crypto_alg aes_alg = {
-- 
2.19.0
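
P.S. For anyone who wants to poke at this from a quick test module, a
minimal sketch of driving the fixed-time cipher for a single block through
the single-block cipher API is below.  It is untested and purely
illustrative: it assumes the driver name registered by aes_ti.c is
"aes-fixed-time", and the all-zero key and plaintext are just placeholders.

#include <linux/crypto.h>
#include <linux/err.h>
#include <linux/types.h>

static int aes_ti_smoke_test(void)
{
	static const u8 key[16];	/* placeholder all-zero key */
	u8 in[16] = { 0 }, out[16];	/* placeholder plaintext block */
	struct crypto_cipher *tfm;
	int err;

	/* Request the fixed-time implementation by driver name. */
	tfm = crypto_alloc_cipher("aes-fixed-time", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	err = crypto_cipher_setkey(tfm, key, sizeof(key));
	if (!err)
		/* With this patch, interrupts are disabled internally
		 * while the sbox is loaded and used. */
		crypto_cipher_encrypt_one(tfm, out, in);

	crypto_free_cipher(tfm);
	return err;
}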